Blame - renderscript/clang-include/xmmintrin.h - platform/prebuilts/sdk

blob: 3110e8babf9463af40870cf8286efa18a350e314 [file] [log] [blame]

Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	23
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	24	#ifndef __XMMINTRIN_H
				25	#define __XMMINTRIN_H
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	26
				27	#include <mmintrin.h>
				28
				29	typedef int __v4si __attribute__((__vector_size__(16)));
				30	typedef float __v4sf __attribute__((__vector_size__(16)));
				31	typedef float __m128 __attribute__((__vector_size__(16)));
				32
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	33	/* Unsigned types */
				34	typedef unsigned int __v4su __attribute__((__vector_size__(16)));
				35
Stephen Hines	990d2fc	2014-07-23 10:40:48 -0700	[diff] [blame]	36	/* This header should only be included in a hosted environment as it depends on
				37	* a standard library to provide allocation routines. */
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	38	#if __STDC_HOSTED__
				39	#include <mm_malloc.h>
				40	#endif
				41
/* Attributes applied to every intrinsic defined in this file: always inline,
 * no debug info, and compile the body with the "sse" target feature. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
				44
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	45	/// \brief Adds the 32-bit float values in the low-order bits of the operands.
				46	///
				47	/// \headerfile <x86intrin.h>
				48	///
				49	/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
				50	///
				51	/// \param __a
				52	/// A 128-bit vector of [4 x float] containing one of the source operands.
				53	/// The lower 32 bits of this operand are used in the calculation.
				54	/// \param __b
				55	/// A 128-bit vector of [4 x float] containing one of the source operands.
				56	/// The lower 32 bits of this operand are used in the calculation.
				57	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
				58	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
				59	/// the upper 96 bits of the first source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	60	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	61	_mm_add_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	62	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	63	__a[0] += __b[0];
				64	return __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	65	}
				66
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	67	/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
				68	/// the addition.
				69	///
				70	/// \headerfile <x86intrin.h>
				71	///
				72	/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
				73	///
				74	/// \param __a
				75	/// A 128-bit vector of [4 x float] containing one of the source operands.
				76	/// \param __b
				77	/// A 128-bit vector of [4 x float] containing one of the source operands.
				78	/// \returns A 128-bit vector of [4 x float] containing the sums of both
				79	/// operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	80	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	81	_mm_add_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	82	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	83	return (__m128)((__v4sf)__a + (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	84	}
				85
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	86	/// \brief Subtracts the 32-bit float value in the low-order bits of the second
				87	/// operand from the corresponding value in the first operand.
				88	///
				89	/// \headerfile <x86intrin.h>
				90	///
				91	/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
				92	///
				93	/// \param __a
				94	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
				95	/// of this operand are used in the calculation.
				96	/// \param __b
				97	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
				98	/// bits of this operand are used in the calculation.
				99	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				100	/// difference of the lower 32 bits of both operands. The upper 96 bits are
				101	/// copied from the upper 96 bits of the first source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	102	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	103	_mm_sub_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	104	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	105	__a[0] -= __b[0];
				106	return __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	107	}
				108
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	109	/// \brief Subtracts each of the values of the second operand from the first
				110	/// operand, both of which are 128-bit vectors of [4 x float] and returns
				111	/// the results of the subtraction.
				112	///
				113	/// \headerfile <x86intrin.h>
				114	///
				115	/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
				116	///
				117	/// \param __a
				118	/// A 128-bit vector of [4 x float] containing the minuend.
				119	/// \param __b
				120	/// A 128-bit vector of [4 x float] containing the subtrahend.
				121	/// \returns A 128-bit vector of [4 x float] containing the differences between
				122	/// both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	123	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	124	_mm_sub_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	125	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	126	return (__m128)((__v4sf)__a - (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	127	}
				128
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	129	/// \brief Multiplies two 32-bit float values in the low-order bits of the
				130	/// operands.
				131	///
				132	/// \headerfile <x86intrin.h>
				133	///
				134	/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
				135	///
				136	/// \param __a
				137	/// A 128-bit vector of [4 x float] containing one of the source operands.
				138	/// The lower 32 bits of this operand are used in the calculation.
				139	/// \param __b
				140	/// A 128-bit vector of [4 x float] containing one of the source operands.
				141	/// The lower 32 bits of this operand are used in the calculation.
				142	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
				143	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
				144	/// bits of the first source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	145	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	146	_mm_mul_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	147	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	148	__a[0] *= __b[0];
				149	return __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	150	}
				151
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	152	/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
				153	/// results of the multiplication.
				154	///
				155	/// \headerfile <x86intrin.h>
				156	///
				157	/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
				158	///
				159	/// \param __a
				160	/// A 128-bit vector of [4 x float] containing one of the source operands.
				161	/// \param __b
				162	/// A 128-bit vector of [4 x float] containing one of the source operands.
				163	/// \returns A 128-bit vector of [4 x float] containing the products of both
				164	/// operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	165	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	166	_mm_mul_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	167	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	168	return (__m128)((__v4sf)__a * (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	169	}
				170
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	171	/// \brief Divides the value in the low-order 32 bits of the first operand by
				172	/// the corresponding value in the second operand.
				173	///
				174	/// \headerfile <x86intrin.h>
				175	///
				176	/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
				177	///
				178	/// \param __a
				179	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
				180	/// bits of this operand are used in the calculation.
				181	/// \param __b
				182	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
				183	/// of this operand are used in the calculation.
				184	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
				185	/// lower 32 bits of both operands. The upper 96 bits are copied from the
				186	/// upper 96 bits of the first source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	187	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	188	_mm_div_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	189	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	190	__a[0] /= __b[0];
				191	return __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	192	}
				193
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	194	/// \brief Divides two 128-bit vectors of [4 x float].
				195	///
				196	/// \headerfile <x86intrin.h>
				197	///
				198	/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
				199	///
				200	/// \param __a
				201	/// A 128-bit vector of [4 x float] containing the dividend.
				202	/// \param __b
				203	/// A 128-bit vector of [4 x float] containing the divisor.
				204	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
				205	/// operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	206	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	207	_mm_div_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	208	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	209	return (__m128)((__v4sf)__a / (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	210	}
				211
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	212	/// \brief Calculates the square root of the value stored in the low-order bits
				213	/// of a 128-bit vector of [4 x float].
				214	///
				215	/// \headerfile <x86intrin.h>
				216	///
				217	/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
				218	///
				219	/// \param __a
				220	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				221	/// used in the calculation.
				222	/// \returns A 128-bit vector of [4 x float] containing the square root of the
				223	/// value in the low-order bits of the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	224	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	225	_mm_sqrt_ss(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	226	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	227	__m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	228	return (__m128) { __c[0], __a[1], __a[2], __a[3] };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	229	}
				230
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	231	/// \brief Calculates the square roots of the values stored in a 128-bit vector
				232	/// of [4 x float].
				233	///
				234	/// \headerfile <x86intrin.h>
				235	///
				236	/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
				237	///
				238	/// \param __a
				239	/// A 128-bit vector of [4 x float].
				240	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
				241	/// values in the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	242	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	243	_mm_sqrt_ps(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	244	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	245	return __builtin_ia32_sqrtps((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	246	}
				247
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	248	/// \brief Calculates the approximate reciprocal of the value stored in the
				249	/// low-order bits of a 128-bit vector of [4 x float].
				250	///
				251	/// \headerfile <x86intrin.h>
				252	///
				253	/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
				254	///
				255	/// \param __a
				256	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				257	/// used in the calculation.
				258	/// \returns A 128-bit vector of [4 x float] containing the approximate
				259	/// reciprocal of the value in the low-order bits of the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	260	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	261	_mm_rcp_ss(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	262	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	263	__m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	264	return (__m128) { __c[0], __a[1], __a[2], __a[3] };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	265	}
				266
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	267	/// \brief Calculates the approximate reciprocals of the values stored in a
				268	/// 128-bit vector of [4 x float].
				269	///
				270	/// \headerfile <x86intrin.h>
				271	///
				272	/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
				273	///
				274	/// \param __a
				275	/// A 128-bit vector of [4 x float].
				276	/// \returns A 128-bit vector of [4 x float] containing the approximate
				277	/// reciprocals of the values in the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	278	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	279	_mm_rcp_ps(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	280	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	281	return __builtin_ia32_rcpps((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	282	}
				283
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	284	/// \brief Calculates the approximate reciprocal of the square root of the value
				285	/// stored in the low-order bits of a 128-bit vector of [4 x float].
				286	///
				287	/// \headerfile <x86intrin.h>
				288	///
				289	/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
				290	///
				291	/// \param __a
				292	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				293	/// used in the calculation.
				294	/// \returns A 128-bit vector of [4 x float] containing the approximate
				295	/// reciprocal of the square root of the value in the low-order bits of the
				296	/// operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	297	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	298	_mm_rsqrt_ss(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	299	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	300	__m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	301	return (__m128) { __c[0], __a[1], __a[2], __a[3] };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	302	}
				303
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	304	/// \brief Calculates the approximate reciprocals of the square roots of the
				305	/// values stored in a 128-bit vector of [4 x float].
				306	///
				307	/// \headerfile <x86intrin.h>
				308	///
				309	/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
				310	///
				311	/// \param __a
				312	/// A 128-bit vector of [4 x float].
				313	/// \returns A 128-bit vector of [4 x float] containing the approximate
				314	/// reciprocals of the square roots of the values in the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	315	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	316	_mm_rsqrt_ps(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	317	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	318	return __builtin_ia32_rsqrtps((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	319	}
				320
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	321	/// \brief Compares two 32-bit float values in the low-order bits of both
				322	/// operands and returns the lesser value in the low-order bits of the
				323	/// vector of [4 x float].
				324	///
				325	/// \headerfile <x86intrin.h>
				326	///
				327	/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
				328	///
				329	/// \param __a
				330	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				331	/// 32 bits of this operand are used in the comparison.
				332	/// \param __b
				333	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				334	/// 32 bits of this operand are used in the comparison.
				335	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				336	/// minimum value between both operands. The upper 96 bits are copied from
				337	/// the upper 96 bits of the first source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	338	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	339	_mm_min_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	340	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	341	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	342	}
				343
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	344	/// \brief Compares two 128-bit vectors of [4 x float] and returns the
				345	/// lesser of each pair of values.
				346	///
				347	/// \headerfile <x86intrin.h>
				348	///
				349	/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
				350	///
				351	/// \param __a
				352	/// A 128-bit vector of [4 x float] containing one of the operands.
				353	/// \param __b
				354	/// A 128-bit vector of [4 x float] containing one of the operands.
				355	/// \returns A 128-bit vector of [4 x float] containing the minimum values
				356	/// between both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	357	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	358	_mm_min_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	359	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	360	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	361	}
				362
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	363	/// \brief Compares two 32-bit float values in the low-order bits of both
				364	/// operands and returns the greater value in the low-order bits of
				365	/// a vector [4 x float].
				366	///
				367	/// \headerfile <x86intrin.h>
				368	///
				369	/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
				370	///
				371	/// \param __a
				372	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				373	/// 32 bits of this operand are used in the comparison.
				374	/// \param __b
				375	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				376	/// 32 bits of this operand are used in the comparison.
				377	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				378	/// maximum value between both operands. The upper 96 bits are copied from
				379	/// the upper 96 bits of the first source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	380	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	381	_mm_max_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	382	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	383	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	384	}
				385
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	386	/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
				387	/// of each pair of values.
				388	///
				389	/// \headerfile <x86intrin.h>
				390	///
				391	/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
				392	///
				393	/// \param __a
				394	/// A 128-bit vector of [4 x float] containing one of the operands.
				395	/// \param __b
				396	/// A 128-bit vector of [4 x float] containing one of the operands.
				397	/// \returns A 128-bit vector of [4 x float] containing the maximum values
				398	/// between both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	399	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	400	_mm_max_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	401	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	402	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	403	}
				404
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	405	/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
				406	///
				407	/// \headerfile <x86intrin.h>
				408	///
				409	/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
				410	///
				411	/// \param __a
				412	/// A 128-bit vector containing one of the source operands.
				413	/// \param __b
				414	/// A 128-bit vector containing one of the source operands.
				415	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				416	/// values between both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	417	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	418	_mm_and_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	419	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	420	return (__m128)((__v4su)__a & (__v4su)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	421	}
				422
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	423	/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
				424	/// the one's complement of the values contained in the first source
				425	/// operand.
				426	///
				427	/// \headerfile <x86intrin.h>
				428	///
				429	/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
				430	///
				431	/// \param __a
				432	/// A 128-bit vector of [4 x float] containing the first source operand. The
				433	/// one's complement of this value is used in the bitwise AND.
				434	/// \param __b
				435	/// A 128-bit vector of [4 x float] containing the second source operand.
				436	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
				437	/// one's complement of the first operand and the values in the second
				438	/// operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	439	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	440	_mm_andnot_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	441	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	442	return (__m128)(~(__v4su)__a & (__v4su)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	443	}
				444
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	445	/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
				446	///
				447	/// \headerfile <x86intrin.h>
				448	///
				449	/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
				450	///
				451	/// \param __a
				452	/// A 128-bit vector of [4 x float] containing one of the source operands.
				453	/// \param __b
				454	/// A 128-bit vector of [4 x float] containing one of the source operands.
				455	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
				456	/// values between both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	457	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	458	_mm_or_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	459	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	460	return (__m128)((__v4su)__a \| (__v4su)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	461	}
				462
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	463	/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
				464	/// [4 x float].
				465	///
				466	/// \headerfile <x86intrin.h>
				467	///
				468	/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
				469	///
				470	/// \param __a
				471	/// A 128-bit vector of [4 x float] containing one of the source operands.
				472	/// \param __b
				473	/// A 128-bit vector of [4 x float] containing one of the source operands.
				474	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
				475	/// of the values between both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	476	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	477	_mm_xor_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	478	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	479	return (__m128)((__v4su)__a ^ (__v4su)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	480	}
				481
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	482	/// \brief Compares two 32-bit float values in the low-order bits of both
				483	/// operands for equality and returns the result of the comparison in the
				484	/// low-order bits of a vector [4 x float].
				485	///
				486	/// \headerfile <x86intrin.h>
				487	///
				488	/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
				489	///
				490	/// \param __a
				491	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				492	/// 32 bits of this operand are used in the comparison.
				493	/// \param __b
				494	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				495	/// 32 bits of this operand are used in the comparison.
				496	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				497	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	498	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	499	_mm_cmpeq_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	500	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	501	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	502	}
				503
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	504	/// \brief Compares each of the corresponding 32-bit float values of the
				505	/// 128-bit vectors of [4 x float] for equality.
				506	///
				507	/// \headerfile <x86intrin.h>
				508	///
				509	/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
				510	///
				511	/// \param __a
				512	/// A 128-bit vector of [4 x float].
				513	/// \param __b
				514	/// A 128-bit vector of [4 x float].
				515	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	516	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	517	_mm_cmpeq_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	518	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	519	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	520	}
				521
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	522	/// \brief Compares two 32-bit float values in the low-order bits of both
				523	/// operands to determine if the value in the first operand is less than the
				524	/// corresponding value in the second operand and returns the result of the
				525	/// comparison in the low-order bits of a vector of [4 x float].
				526	///
				527	/// \headerfile <x86intrin.h>
				528	///
				529	/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
				530	///
				531	/// \param __a
				532	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				533	/// 32 bits of this operand are used in the comparison.
				534	/// \param __b
				535	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				536	/// 32 bits of this operand are used in the comparison.
				537	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				538	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	539	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	540	_mm_cmplt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	541	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	542	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	543	}
				544
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	545	/// \brief Compares each of the corresponding 32-bit float values of the
				546	/// 128-bit vectors of [4 x float] to determine if the values in the first
				547	/// operand are less than those in the second operand.
				548	///
				549	/// \headerfile <x86intrin.h>
				550	///
				551	/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
				552	///
				553	/// \param __a
				554	/// A 128-bit vector of [4 x float].
				555	/// \param __b
				556	/// A 128-bit vector of [4 x float].
				557	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	558	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	559	_mm_cmplt_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	560	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	561	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	562	}
				563
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	564	/// \brief Compares two 32-bit float values in the low-order bits of both
				565	/// operands to determine if the value in the first operand is less than or
				566	/// equal to the corresponding value in the second operand and returns the
				567	/// result of the comparison in the low-order bits of a vector of
				568	/// [4 x float].
				569	///
				570	/// \headerfile <x86intrin.h>
				571	///
				572	/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
				573	///
				574	/// \param __a
				575	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				576	/// 32 bits of this operand are used in the comparison.
				577	/// \param __b
				578	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				579	/// 32 bits of this operand are used in the comparison.
				580	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				581	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	582	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	583	_mm_cmple_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	584	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	585	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	586	}
				587
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	588	/// \brief Compares each of the corresponding 32-bit float values of the
				589	/// 128-bit vectors of [4 x float] to determine if the values in the first
				590	/// operand are less than or equal to those in the second operand.
				591	///
				592	/// \headerfile <x86intrin.h>
				593	///
				594	/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
				595	///
				596	/// \param __a
				597	/// A 128-bit vector of [4 x float].
				598	/// \param __b
				599	/// A 128-bit vector of [4 x float].
				600	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	601	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	602	_mm_cmple_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	603	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	604	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	605	}
				606
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	607	/// \brief Compares two 32-bit float values in the low-order bits of both
				608	/// operands to determine if the value in the first operand is greater than
				609	/// the corresponding value in the second operand and returns the result of
				610	/// the comparison in the low-order bits of a vector of [4 x float].
				611	///
				612	/// \headerfile <x86intrin.h>
				613	///
				614	/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
				615	///
				616	/// \param __a
				617	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				618	/// 32 bits of this operand are used in the comparison.
				619	/// \param __b
				620	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				621	/// 32 bits of this operand are used in the comparison.
				622	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				623	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	624	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	625	_mm_cmpgt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	626	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	627	return (__m128)__builtin_shufflevector((__v4sf)__a,
				628	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
Stephen Hines	996e4dc	2013-08-13 01:04:14 -0700	[diff] [blame]	629	4, 1, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	630	}
				631
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	632	/// \brief Compares each of the corresponding 32-bit float values of the
				633	/// 128-bit vectors of [4 x float] to determine if the values in the first
				634	/// operand are greater than those in the second operand.
				635	///
				636	/// \headerfile <x86intrin.h>
				637	///
				638	/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
				639	///
				640	/// \param __a
				641	/// A 128-bit vector of [4 x float].
				642	/// \param __b
				643	/// A 128-bit vector of [4 x float].
				644	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	645	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	646	_mm_cmpgt_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	647	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	648	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	649	}
				650
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	651	/// \brief Compares two 32-bit float values in the low-order bits of both
				652	/// operands to determine if the value in the first operand is greater than
				653	/// or equal to the corresponding value in the second operand and returns
				654	/// the result of the comparison in the low-order bits of a vector of
				655	/// [4 x float].
				656	///
				657	/// \headerfile <x86intrin.h>
				658	///
				659	/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
				660	///
				661	/// \param __a
				662	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				663	/// 32 bits of this operand are used in the comparison.
				664	/// \param __b
				665	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				666	/// 32 bits of this operand are used in the comparison.
				667	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				668	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	669	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	670	_mm_cmpge_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	671	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	672	return (__m128)__builtin_shufflevector((__v4sf)__a,
				673	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
Stephen Hines	996e4dc	2013-08-13 01:04:14 -0700	[diff] [blame]	674	4, 1, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	675	}
				676
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	677	/// \brief Compares each of the corresponding 32-bit float values of the
				678	/// 128-bit vectors of [4 x float] to determine if the values in the first
				679	/// operand are greater than or equal to those in the second operand.
				680	///
				681	/// \headerfile <x86intrin.h>
				682	///
				683	/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
				684	///
				685	/// \param __a
				686	/// A 128-bit vector of [4 x float].
				687	/// \param __b
				688	/// A 128-bit vector of [4 x float].
				689	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	690	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	691	_mm_cmpge_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	692	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	693	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	694	}
				695
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	696	/// \brief Compares two 32-bit float values in the low-order bits of both
				697	/// operands for inequality and returns the result of the comparison in the
				698	/// low-order bits of a vector of [4 x float].
				699	///
				700	/// \headerfile <x86intrin.h>
				701	///
				702	/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
				703	///
				704	/// \param __a
				705	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				706	/// 32 bits of this operand are used in the comparison.
				707	/// \param __b
				708	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				709	/// 32 bits of this operand are used in the comparison.
				710	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				711	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	712	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	713	_mm_cmpneq_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	714	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	715	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	716	}
				717
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	718	/// \brief Compares each of the corresponding 32-bit float values of the
				719	/// 128-bit vectors of [4 x float] for inequality.
				720	///
				721	/// \headerfile <x86intrin.h>
				722	///
				723	/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
				724	///
				725	/// \param __a
				726	/// A 128-bit vector of [4 x float].
				727	/// \param __b
				728	/// A 128-bit vector of [4 x float].
				729	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	730	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	731	_mm_cmpneq_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	732	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	733	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	734	}
				735
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	736	/// \brief Compares two 32-bit float values in the low-order bits of both
				737	/// operands to determine if the value in the first operand is not less than
				738	/// the corresponding value in the second operand and returns the result of
				739	/// the comparison in the low-order bits of a vector of [4 x float].
				740	///
				741	/// \headerfile <x86intrin.h>
				742	///
				743	/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
				744	///
				745	/// \param __a
				746	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				747	/// 32 bits of this operand are used in the comparison.
				748	/// \param __b
				749	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				750	/// 32 bits of this operand are used in the comparison.
				751	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				752	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	753	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	754	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	755	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	756	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	757	}
				758
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	759	/// \brief Compares each of the corresponding 32-bit float values of the
				760	/// 128-bit vectors of [4 x float] to determine if the values in the first
				761	/// operand are not less than those in the second operand.
				762	///
				763	/// \headerfile <x86intrin.h>
				764	///
				765	/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
				766	///
				767	/// \param __a
				768	/// A 128-bit vector of [4 x float].
				769	/// \param __b
				770	/// A 128-bit vector of [4 x float].
				771	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	772	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	773	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	774	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	775	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	776	}
				777
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	778	/// \brief Compares two 32-bit float values in the low-order bits of both
				779	/// operands to determine if the value in the first operand is not less than
				780	/// or equal to the corresponding value in the second operand and returns
				781	/// the result of the comparison in the low-order bits of a vector of
				782	/// [4 x float].
				783	///
				784	/// \headerfile <x86intrin.h>
				785	///
				786	/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
				787	///
				788	/// \param __a
				789	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				790	/// 32 bits of this operand are used in the comparison.
				791	/// \param __b
				792	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				793	/// 32 bits of this operand are used in the comparison.
				794	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				795	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	796	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	797	_mm_cmpnle_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	798	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	799	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	800	}
				801
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	802	/// \brief Compares each of the corresponding 32-bit float values of the
				803	/// 128-bit vectors of [4 x float] to determine if the values in the first
				804	/// operand are not less than or equal to those in the second operand.
				805	///
				806	/// \headerfile <x86intrin.h>
				807	///
				808	/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
				809	///
				810	/// \param __a
				811	/// A 128-bit vector of [4 x float].
				812	/// \param __b
				813	/// A 128-bit vector of [4 x float].
				814	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	815	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	816	_mm_cmpnle_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	817	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	818	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	819	}
				820
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	821	/// \brief Compares two 32-bit float values in the low-order bits of both
				822	/// operands to determine if the value in the first operand is not greater
				823	/// than the corresponding value in the second operand and returns the
				824	/// result of the comparison in the low-order bits of a vector of
				825	/// [4 x float].
				826	///
				827	/// \headerfile <x86intrin.h>
				828	///
				829	/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
				830	///
				831	/// \param __a
				832	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				833	/// 32 bits of this operand are used in the comparison.
				834	/// \param __b
				835	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				836	/// 32 bits of this operand are used in the comparison.
				837	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				838	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	839	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	840	_mm_cmpngt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	841	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	842	return (__m128)__builtin_shufflevector((__v4sf)__a,
				843	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
Stephen Hines	996e4dc	2013-08-13 01:04:14 -0700	[diff] [blame]	844	4, 1, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	845	}
				846
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	847	/// \brief Compares each of the corresponding 32-bit float values of the
				848	/// 128-bit vectors of [4 x float] to determine if the values in the first
				849	/// operand are not greater than those in the second operand.
				850	///
				851	/// \headerfile <x86intrin.h>
				852	///
				853	/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
				854	///
				855	/// \param __a
				856	/// A 128-bit vector of [4 x float].
				857	/// \param __b
				858	/// A 128-bit vector of [4 x float].
				859	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	860	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	861	_mm_cmpngt_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	862	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	863	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	864	}
				865
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	866	/// \brief Compares two 32-bit float values in the low-order bits of both
				867	/// operands to determine if the value in the first operand is not greater
				868	/// than or equal to the corresponding value in the second operand and
				869	/// returns the result of the comparison in the low-order bits of a vector
				870	/// of [4 x float].
				871	///
				872	/// \headerfile <x86intrin.h>
				873	///
				874	/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
				875	///
				876	/// \param __a
				877	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				878	/// 32 bits of this operand are used in the comparison.
				879	/// \param __b
				880	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				881	/// 32 bits of this operand are used in the comparison.
				882	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				883	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	884	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	885	_mm_cmpnge_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	886	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	887	return (__m128)__builtin_shufflevector((__v4sf)__a,
				888	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
Stephen Hines	996e4dc	2013-08-13 01:04:14 -0700	[diff] [blame]	889	4, 1, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	890	}
				891
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	892	/// \brief Compares each of the corresponding 32-bit float values of the
				893	/// 128-bit vectors of [4 x float] to determine if the values in the first
				894	/// operand are not greater than or equal to those in the second operand.
				895	///
				896	/// \headerfile <x86intrin.h>
				897	///
				898	/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
				899	///
				900	/// \param __a
				901	/// A 128-bit vector of [4 x float].
				902	/// \param __b
				903	/// A 128-bit vector of [4 x float].
				904	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	905	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	906	_mm_cmpnge_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	907	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	908	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	909	}
				910
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	911	/// \brief Compares two 32-bit float values in the low-order bits of both
				912	/// operands to determine if the value in the first operand is ordered with
				913	/// respect to the corresponding value in the second operand and returns the
				914	/// result of the comparison in the low-order bits of a vector of
				915	/// [4 x float].
				916	///
				917	/// \headerfile <x86intrin.h>
				918	///
				919	/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
				920	///
				921	/// \param __a
				922	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				923	/// 32 bits of this operand are used in the comparison.
				924	/// \param __b
				925	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				926	/// 32 bits of this operand are used in the comparison.
				927	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				928	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	929	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	930	_mm_cmpord_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	931	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	932	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	933	}
				934
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	935	/// \brief Compares each of the corresponding 32-bit float values of the
				936	/// 128-bit vectors of [4 x float] to determine if the values in the first
				937	/// operand are ordered with respect to those in the second operand.
				938	///
				939	/// \headerfile <x86intrin.h>
				940	///
				941	/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
				942	///
				943	/// \param __a
				944	/// A 128-bit vector of [4 x float].
				945	/// \param __b
				946	/// A 128-bit vector of [4 x float].
				947	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	948	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	949	_mm_cmpord_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	950	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	951	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	952	}
				953
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	954	/// \brief Compares two 32-bit float values in the low-order bits of both
				955	/// operands to determine if the value in the first operand is unordered
				956	/// with respect to the corresponding value in the second operand and
				957	/// returns the result of the comparison in the low-order bits of a vector
				958	/// of [4 x float].
				959	///
				960	/// \headerfile <x86intrin.h>
				961	///
				962	/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
				963	///
				964	/// \param __a
				965	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				966	/// 32 bits of this operand are used in the comparison.
				967	/// \param __b
				968	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
				969	/// 32 bits of this operand are used in the comparison.
				970	/// \returns A 128-bit vector of [4 x float] containing the comparison results
				971	/// in the low-order bits.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	972	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	973	_mm_cmpunord_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	974	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	975	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	976	}
				977
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	978	/// \brief Compares each of the corresponding 32-bit float values of the
				979	/// 128-bit vectors of [4 x float] to determine if the values in the first
				980	/// operand are unordered with respect to those in the second operand.
				981	///
				982	/// \headerfile <x86intrin.h>
				983	///
				984	/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
				985	///
				986	/// \param __a
				987	/// A 128-bit vector of [4 x float].
				988	/// \param __b
				989	/// A 128-bit vector of [4 x float].
				990	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	991	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	992	_mm_cmpunord_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	993	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	994	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	995	}
				996
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	997	/// \brief Compares two 32-bit float values in the low-order bits of both
				998	/// operands for equality and returns the result of the comparison.
				999	///
				1000	/// \headerfile <x86intrin.h>
				1001	///
				1002	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1003	///
				1004	/// \param __a
				1005	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1006	/// used in the comparison.
				1007	/// \param __b
				1008	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1009	/// used in the comparison.
				1010	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1011	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1012	_mm_comieq_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1013	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1014	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1015	}
				1016
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1017	/// \brief Compares two 32-bit float values in the low-order bits of both
				1018	/// operands to determine if the first operand is less than the second
				1019	/// operand and returns the result of the comparison.
				1020	///
				1021	/// \headerfile <x86intrin.h>
				1022	///
				1023	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1024	///
				1025	/// \param __a
				1026	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1027	/// used in the comparison.
				1028	/// \param __b
				1029	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1030	/// used in the comparison.
				1031	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1032	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1033	_mm_comilt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1034	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1035	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1036	}
				1037
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1038	/// \brief Compares two 32-bit float values in the low-order bits of both
				1039	/// operands to determine if the first operand is less than or equal to the
				1040	/// second operand and returns the result of the comparison.
				1041	///
				1042	/// \headerfile <x86intrin.h>
				1043	///
				1044	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1045	///
				1046	/// \param __a
				1047	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1048	/// used in the comparison.
				1049	/// \param __b
				1050	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1051	/// used in the comparison.
				1052	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1053	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1054	_mm_comile_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1055	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1056	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1057	}
				1058
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1059	/// \brief Compares two 32-bit float values in the low-order bits of both
				1060	/// operands to determine if the first operand is greater than the second
				1061	/// operand and returns the result of the comparison.
				1062	///
				1063	/// \headerfile <x86intrin.h>
				1064	///
				1065	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1066	///
				1067	/// \param __a
				1068	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1069	/// used in the comparison.
				1070	/// \param __b
				1071	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1072	/// used in the comparison.
				1073	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1074	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1075	_mm_comigt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1076	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1077	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1078	}
				1079
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1080	/// \brief Compares two 32-bit float values in the low-order bits of both
				1081	/// operands to determine if the first operand is greater than or equal to
				1082	/// the second operand and returns the result of the comparison.
				1083	///
				1084	/// \headerfile <x86intrin.h>
				1085	///
				1086	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1087	///
				1088	/// \param __a
				1089	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1090	/// used in the comparison.
				1091	/// \param __b
				1092	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1093	/// used in the comparison.
				1094	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1095	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1096	_mm_comige_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1097	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1098	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1099	}
				1100
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1101	/// \brief Compares two 32-bit float values in the low-order bits of both
				1102	/// operands to determine if the first operand is not equal to the second
				1103	/// operand and returns the result of the comparison.
				1104	///
				1105	/// \headerfile <x86intrin.h>
				1106	///
				1107	/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
				1108	///
				1109	/// \param __a
				1110	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1111	/// used in the comparison.
				1112	/// \param __b
				1113	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1114	/// used in the comparison.
				1115	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1116	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1117	_mm_comineq_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1118	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1119	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1120	}
				1121
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1122	/// \brief Performs an unordered comparison of two 32-bit float values using
				1123	/// the low-order bits of both operands to determine equality and returns
				1124	/// the result of the comparison.
				1125	///
				1126	/// \headerfile <x86intrin.h>
				1127	///
				1128	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1129	///
				1130	/// \param __a
				1131	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1132	/// used in the comparison.
				1133	/// \param __b
				1134	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1135	/// used in the comparison.
				1136	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1137	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1138	_mm_ucomieq_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1139	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1140	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1141	}
				1142
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1143	/// \brief Performs an unordered comparison of two 32-bit float values using
				1144	/// the low-order bits of both operands to determine if the first operand is
				1145	/// less than the second operand and returns the result of the comparison.
				1146	///
				1147	/// \headerfile <x86intrin.h>
				1148	///
				1149	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1150	///
				1151	/// \param __a
				1152	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1153	/// used in the comparison.
				1154	/// \param __b
				1155	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1156	/// used in the comparison.
				1157	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1158	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1159	_mm_ucomilt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1160	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1161	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1162	}
				1163
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1164	/// \brief Performs an unordered comparison of two 32-bit float values using
				1165	/// the low-order bits of both operands to determine if the first operand
				1166	/// is less than or equal to the second operand and returns the result of
				1167	/// the comparison.
				1168	///
				1169	/// \headerfile <x86intrin.h>
				1170	///
				1171	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1172	///
				1173	/// \param __a
				1174	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1175	/// used in the comparison.
				1176	/// \param __b
				1177	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1178	/// used in the comparison.
				1179	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1180	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1181	_mm_ucomile_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1182	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1183	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1184	}
				1185
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1186	/// \brief Performs an unordered comparison of two 32-bit float values using
				1187	/// the low-order bits of both operands to determine if the first operand
				1188	/// is greater than the second operand and returns the result of the
				1189	/// comparison.
				1190	///
				1191	/// \headerfile <x86intrin.h>
				1192	///
				1193	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1194	///
				1195	/// \param __a
				1196	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1197	/// used in the comparison.
				1198	/// \param __b
				1199	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1200	/// used in the comparison.
				1201	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1202	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1203	_mm_ucomigt_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1204	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1205	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1206	}
				1207
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1208	/// \brief Performs an unordered comparison of two 32-bit float values using
				1209	/// the low-order bits of both operands to determine if the first operand is
				1210	/// greater than or equal to the second operand and returns the result of
				1211	/// the comparison.
				1212	///
				1213	/// \headerfile <x86intrin.h>
				1214	///
				1215	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1216	///
				1217	/// \param __a
				1218	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1219	/// used in the comparison.
				1220	/// \param __b
				1221	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1222	/// used in the comparison.
				1223	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1224	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1225	_mm_ucomige_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1226	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1227	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1228	}
				1229
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1230	/// \brief Performs an unordered comparison of two 32-bit float values using
				1231	/// the low-order bits of both operands to determine inequality and returns
				1232	/// the result of the comparison.
				1233	///
				1234	/// \headerfile <x86intrin.h>
				1235	///
				1236	/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
				1237	///
				1238	/// \param __a
				1239	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1240	/// used in the comparison.
				1241	/// \param __b
				1242	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1243	/// used in the comparison.
				1244	/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1245	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1246	_mm_ucomineq_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1247	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1248	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1249	}
				1250
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1251	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1252	/// [4 x float] into a 32-bit integer.
				1253	///
				1254	/// \headerfile <x86intrin.h>
				1255	///
				1256	/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
				1257	///
				1258	/// \param __a
				1259	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1260	/// used in the conversion.
				1261	/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1262	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1263	_mm_cvtss_si32(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1264	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1265	return __builtin_ia32_cvtss2si((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1266	}
				1267
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1268	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1269	/// [4 x float] into a 32-bit integer.
				1270	///
				1271	/// \headerfile <x86intrin.h>
				1272	///
				1273	/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
				1274	///
				1275	/// \param __a
				1276	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1277	/// used in the conversion.
				1278	/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1279	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1280	_mm_cvt_ss2si(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1281	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1282	return _mm_cvtss_si32(__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1283	}
				1284
				1285	#ifdef __x86_64__
				1286
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1287	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1288	/// [4 x float] into a 64-bit integer.
				1289	///
				1290	/// \headerfile <x86intrin.h>
				1291	///
				1292	/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
				1293	///
				1294	/// \param __a
				1295	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1296	/// used in the conversion.
				1297	/// \returns A 64-bit integer containing the converted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1298	static __inline__ long long __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1299	_mm_cvtss_si64(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1300	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1301	return __builtin_ia32_cvtss2si64((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1302	}
				1303
				1304	#endif
				1305
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1306	/// \brief Converts two low-order float values in a 128-bit vector of
				1307	/// [4 x float] into a 64-bit vector of [2 x i32].
				1308	///
				1309	/// \headerfile <x86intrin.h>
				1310	///
				1311	/// This intrinsic corresponds to the \c CVTPS2PI instruction.
				1312	///
				1313	/// \param __a
				1314	/// A 128-bit vector of [4 x float].
				1315	/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1316	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1317	_mm_cvtps_pi32(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1318	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1319	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1320	}
				1321
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1322	/// \brief Converts two low-order float values in a 128-bit vector of
				1323	/// [4 x float] into a 64-bit vector of [2 x i32].
				1324	///
				1325	/// \headerfile <x86intrin.h>
				1326	///
				1327	/// This intrinsic corresponds to the \c CVTPS2PI instruction.
				1328	///
				1329	/// \param __a
				1330	/// A 128-bit vector of [4 x float].
				1331	/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1332	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1333	_mm_cvt_ps2pi(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1334	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1335	return _mm_cvtps_pi32(__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1336	}
				1337
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1338	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1339	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1340	/// inexact.
				1341	///
				1342	/// \headerfile <x86intrin.h>
				1343	///
				1344	/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
				1345	///
				1346	/// \param __a
				1347	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1348	/// used in the conversion.
				1349	/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1350	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1351	_mm_cvttss_si32(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1352	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1353	return __a[0];
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1354	}
				1355
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1356	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1357	/// [4 x float] into a 32-bit integer, truncating the result when it is
				1358	/// inexact.
				1359	///
				1360	/// \headerfile <x86intrin.h>
				1361	///
				1362	/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
				1363	///
				1364	/// \param __a
				1365	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1366	/// used in the conversion.
				1367	/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1368	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1369	_mm_cvtt_ss2si(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1370	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1371	return _mm_cvttss_si32(__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1372	}
				1373
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1374	/// \brief Converts a float value contained in the lower 32 bits of a vector of
				1375	/// [4 x float] into a 64-bit integer, truncating the result when it is
				1376	/// inexact.
				1377	///
				1378	/// \headerfile <x86intrin.h>
				1379	///
				1380	/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
				1381	///
				1382	/// \param __a
				1383	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1384	/// used in the conversion.
				1385	/// \returns A 64-bit integer containing the converted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1386	static __inline__ long long __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1387	_mm_cvttss_si64(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1388	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1389	return __a[0];
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1390	}
				1391
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1392	/// \brief Converts two low-order float values in a 128-bit vector of
				1393	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
				1394	/// when it is inexact.
				1395	///
				1396	/// \headerfile <x86intrin.h>
				1397	///
				1398	/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
				1399	///
				1400	/// \param __a
				1401	/// A 128-bit vector of [4 x float].
				1402	/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1403	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1404	_mm_cvttps_pi32(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1405	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1406	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1407	}
				1408
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1409	/// \brief Converts two low-order float values in a 128-bit vector of [4 x
				1410	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
				1411	/// is inexact.
				1412	///
				1413	/// \headerfile <x86intrin.h>
				1414	///
				1415	/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
				1416	///
				1417	/// \param __a
				1418	/// A 128-bit vector of [4 x float].
				1419	/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1420	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1421	_mm_cvtt_ps2pi(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1422	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1423	return _mm_cvttps_pi32(__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1424	}
				1425
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1426	/// \brief Converts a 32-bit signed integer value into a floating point value
				1427	/// and writes it to the lower 32 bits of the destination. The remaining
				1428	/// higher order elements of the destination vector are copied from the
				1429	/// corresponding elements in the first operand.
				1430	///
				1431	/// \headerfile <x86intrin.h>
				1432	///
				1433	/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
				1434	///
				1435	/// \param __a
				1436	/// A 128-bit vector of [4 x float].
				1437	/// \param __b
				1438	/// A 32-bit signed integer operand containing the value to be converted.
				1439	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1440	/// converted value of the second operand. The upper 96 bits are copied from
				1441	/// the upper 96 bits of the first operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1442	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1443	_mm_cvtsi32_ss(__m128 __a, int __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1444	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1445	__a[0] = __b;
				1446	return __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1447	}
				1448
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1449	/// \brief Converts a 32-bit signed integer value into a floating point value
				1450	/// and writes it to the lower 32 bits of the destination. The remaining
				1451	/// higher order elements of the destination are copied from the
				1452	/// corresponding elements in the first operand.
				1453	///
				1454	/// \headerfile <x86intrin.h>
				1455	///
				1456	/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
				1457	///
				1458	/// \param __a
				1459	/// A 128-bit vector of [4 x float].
				1460	/// \param __b
				1461	/// A 32-bit signed integer operand containing the value to be converted.
				1462	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1463	/// converted value of the second operand. The upper 96 bits are copied from
				1464	/// the upper 96 bits of the first operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1465	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1466	_mm_cvt_si2ss(__m128 __a, int __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1467	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1468	return _mm_cvtsi32_ss(__a, __b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1469	}
				1470
				1471	#ifdef __x86_64__
				1472
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1473	/// \brief Converts a 64-bit signed integer value into a floating point value
				1474	/// and writes it to the lower 32 bits of the destination. The remaining
				1475	/// higher order elements of the destination are copied from the
				1476	/// corresponding elements in the first operand.
				1477	///
				1478	/// \headerfile <x86intrin.h>
				1479	///
				1480	/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
				1481	///
				1482	/// \param __a
				1483	/// A 128-bit vector of [4 x float].
				1484	/// \param __b
				1485	/// A 64-bit signed integer operand containing the value to be converted.
				1486	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
				1487	/// converted value of the second operand. The upper 96 bits are copied from
				1488	/// the upper 96 bits of the first operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1489	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1490	_mm_cvtsi64_ss(__m128 __a, long long __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1491	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1492	__a[0] = __b;
				1493	return __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1494	}
				1495
				1496	#endif
				1497
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1498	/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
				1499	/// floating point values and writes them to the lower 64-bits of the
				1500	/// destination. The remaining higher order elements of the destination are
				1501	/// copied from the corresponding elements in the first operand.
				1502	///
				1503	/// \headerfile <x86intrin.h>
				1504	///
				1505	/// This intrinsic corresponds to the \c CVTPI2PS instruction.
				1506	///
				1507	/// \param __a
				1508	/// A 128-bit vector of [4 x float].
				1509	/// \param __b
				1510	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1511	/// and written to the corresponding low-order elements in the destination.
				1512	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1513	/// converted value of the second operand. The upper 64 bits are copied from
				1514	/// the upper 64 bits of the first operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1515	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1516	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1517	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1518	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1519	}
				1520
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1521	/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
				1522	/// floating point values and writes them to the lower 64-bits of the
				1523	/// destination. The remaining higher order elements of the destination are
				1524	/// copied from the corresponding elements in the first operand.
				1525	///
				1526	/// \headerfile <x86intrin.h>
				1527	///
				1528	/// This intrinsic corresponds to the \c CVTPI2PS instruction.
				1529	///
				1530	/// \param __a
				1531	/// A 128-bit vector of [4 x float].
				1532	/// \param __b
				1533	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
				1534	/// and written to the corresponding low-order elements in the destination.
				1535	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				1536	/// converted value from the second operand. The upper 64 bits are copied
				1537	/// from the upper 64 bits of the first operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1538	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1539	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1540	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1541	return _mm_cvtpi32_ps(__a, __b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1542	}
				1543
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1544	/// \brief Extracts a float value contained in the lower 32 bits of a vector of
				1545	/// [4 x float].
				1546	///
				1547	/// \headerfile <x86intrin.h>
				1548	///
				1549	/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
				1550	///
				1551	/// \param __a
				1552	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
				1553	/// used in the extraction.
				1554	/// \returns A 32-bit float containing the extracted value.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1555	static __inline__ float __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1556	_mm_cvtss_f32(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1557	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1558	return __a[0];
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1559	}
				1560
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1561	/// \brief Loads two packed float values from the address __p into the
				1562	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
				1563	/// are copied from the low-order bits of the first operand.
				1564	///
				1565	/// \headerfile <x86intrin.h>
				1566	///
				1567	/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
				1568	///
				1569	/// \param __a
				1570	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
				1571	/// of the destination.
				1572	/// \param __p
				1573	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1574	/// [127:64] of the destination.
				1575	/// \returns A 128-bit vector of [4 x float] containing the moved values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1576	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1577	_mm_loadh_pi(__m128 __a, const __m64 *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1578	{
				1579	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
				1580	struct __mm_loadh_pi_struct {
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1581	__mm_loadh_pi_v2f32 __u;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1582	} __attribute__((__packed__, __may_alias__));
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1583	__mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
				1584	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1585	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1586	}
				1587
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1588	/// \brief Loads two packed float values from the address __p into the low-order
				1589	/// bits of a 128-bit vector of [4 x float]. The high-order bits are copied
				1590	/// from the high-order bits of the first operand.
				1591	///
				1592	/// \headerfile <x86intrin.h>
				1593	///
				1594	/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
				1595	///
				1596	/// \param __a
				1597	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
				1598	/// [127:64] of the destination.
				1599	/// \param __p
				1600	/// A pointer to two packed float values. Bits [63:0] are written to bits
				1601	/// [63:0] of the destination.
				1602	/// \returns A 128-bit vector of [4 x float] containing the moved values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1603	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1604	_mm_loadl_pi(__m128 __a, const __m64 *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1605	{
				1606	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
				1607	struct __mm_loadl_pi_struct {
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1608	__mm_loadl_pi_v2f32 __u;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1609	} __attribute__((__packed__, __may_alias__));
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1610	__mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
				1611	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
				1612	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1613	}
				1614
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1615	/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
				1616	/// 32 bits of the vector are initialized with the single-precision
				1617	/// floating-point value loaded from a specified memory location. The upper
				1618	/// 96 bits are set to zero.
				1619	///
				1620	/// \headerfile <x86intrin.h>
				1621	///
				1622	/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
				1623	///
				1624	/// \param __p
				1625	/// A pointer to a 32-bit memory location containing a single-precision
				1626	/// floating-point value.
				1627	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1628	/// lower 32 bits contain the value loaded from the memory location. The
				1629	/// upper 96 bits are set to zero.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1630	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1631	_mm_load_ss(const float *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1632	{
				1633	struct __mm_load_ss_struct {
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1634	float __u;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1635	} __attribute__((__packed__, __may_alias__));
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1636	float __u = ((struct __mm_load_ss_struct*)__p)->__u;
				1637	return (__m128){ __u, 0, 0, 0 };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1638	}
				1639
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1640	/// \brief Loads a 32-bit float value and duplicates it to all four vector
				1641	/// elements of a 128-bit vector of [4 x float].
				1642	///
				1643	/// \headerfile <x86intrin.h>
				1644	///
				1645	/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
				1646	/// instruction.
				1647	///
				1648	/// \param __p
				1649	/// A pointer to a float value to be loaded and duplicated.
				1650	/// \returns A 128-bit vector of [4 x float] containing the loaded
				1651	/// and duplicated values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1652	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1653	_mm_load1_ps(const float *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1654	{
				1655	struct __mm_load1_ps_struct {
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1656	float __u;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1657	} __attribute__((__packed__, __may_alias__));
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1658	float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
				1659	return (__m128){ __u, __u, __u, __u };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1660	}
				1661
				1662	#define _mm_load_ps1(p) _mm_load1_ps(p)
				1663
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1664	/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
				1665	/// memory location.
				1666	///
				1667	/// \headerfile <x86intrin.h>
				1668	///
				1669	/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
				1670	///
				1671	/// \param __p
				1672	/// A pointer to a 128-bit memory location. The address of the memory
				1673	/// location has to be 128-bit aligned.
				1674	/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1675	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1676	_mm_load_ps(const float *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1677	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1678	return (__m128)__p;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1679	}
				1680
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1681	/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
				1682	/// unaligned memory location.
				1683	///
				1684	/// \headerfile <x86intrin.h>
				1685	///
				1686	/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
				1687	///
				1688	/// \param __p
				1689	/// A pointer to a 128-bit memory location. The address of the memory
				1690	/// location does not have to be aligned.
				1691	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1692	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1693	_mm_loadu_ps(const float *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1694	{
				1695	struct __loadu_ps {
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1696	__m128 __v;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1697	} __attribute__((__packed__, __may_alias__));
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1698	return ((struct __loadu_ps*)__p)->__v;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1699	}
				1700
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1701	/// \brief Loads four packed float values, in reverse order, from an aligned
				1702	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
				1703	///
				1704	/// \headerfile <x86intrin.h>
				1705	///
				1706	/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
				1707	/// instruction.
				1708	///
				1709	/// \param __p
				1710	/// A pointer to a 128-bit memory location. The address of the memory
				1711	/// location has to be 128-bit aligned.
				1712	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
				1713	/// in reverse order.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1714	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1715	_mm_loadr_ps(const float *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1716	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1717	__m128 __a = _mm_load_ps(__p);
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1718	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1719	}
				1720
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1721	/// \brief Create a 128-bit vector of [4 x float] with undefined values.
				1722	///
				1723	/// \headerfile <x86intrin.h>
				1724	///
				1725	/// This intrinsic has no corresponding instruction.
				1726	///
				1727	/// \returns A 128-bit vector of [4 x float] containing undefined values.
				1728
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1729	static __inline__ __m128 __DEFAULT_FN_ATTRS
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1730	_mm_undefined_ps(void)
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1731	{
				1732	return (__m128)__builtin_ia32_undef128();
				1733	}
				1734
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1735	/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
				1736	/// 32 bits of the vector are initialized with the specified single-precision
				1737	/// floating-point value. The upper 96 bits are set to zero.
				1738	///
				1739	/// \headerfile <x86intrin.h>
				1740	///
				1741	/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
				1742	///
				1743	/// \param __w
				1744	/// A single-precision floating-point value used to initialize the lower 32
				1745	/// bits of the result.
				1746	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
				1747	/// lower 32 bits contain the value provided in the source operand. The
				1748	/// upper 96 bits are set to zero.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1749	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1750	_mm_set_ss(float __w)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1751	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1752	return (__m128){ __w, 0, 0, 0 };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1753	}
				1754
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1755	/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
				1756	/// of the four single-precision floating-point vector elements set to the
				1757	/// specified single-precision floating-point value.
				1758	///
				1759	/// \headerfile <x86intrin.h>
				1760	///
				1761	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				1762	///
				1763	/// \param __w
				1764	/// A single-precision floating-point value used to initialize each vector
				1765	/// element of the result.
				1766	/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1767	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1768	_mm_set1_ps(float __w)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1769	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1770	return (__m128){ __w, __w, __w, __w };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1771	}
				1772
Stephen Hines	990d2fc	2014-07-23 10:40:48 -0700	[diff] [blame]	1773	/* Microsoft specific. */
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1774	/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
				1775	/// of the four single-precision floating-point vector elements set to the
				1776	/// specified single-precision floating-point value.
				1777	///
				1778	/// \headerfile <x86intrin.h>
				1779	///
				1780	/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
				1781	///
				1782	/// \param __w
				1783	/// A single-precision floating-point value used to initialize each vector
				1784	/// element of the result.
				1785	/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1786	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1787	_mm_set_ps1(float __w)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1788	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1789	return _mm_set1_ps(__w);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1790	}
				1791
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1792	/// \brief Constructs a 128-bit floating-point vector of [4 x float]
				1793	/// initialized with the specified single-precision floating-point values.
				1794	///
				1795	/// \headerfile <x86intrin.h>
				1796	///
				1797	/// This intrinsic is a utility function and does not correspond to a specific
				1798	/// instruction.
				1799	///
				1800	/// \param __z
				1801	/// A single-precision floating-point value used to initialize bits [127:96]
				1802	/// of the result.
				1803	/// \param __y
				1804	/// A single-precision floating-point value used to initialize bits [95:64]
				1805	/// of the result.
				1806	/// \param __x
				1807	/// A single-precision floating-point value used to initialize bits [63:32]
				1808	/// of the result.
				1809	/// \param __w
				1810	/// A single-precision floating-point value used to initialize bits [31:0]
				1811	/// of the result.
				1812	/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1813	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1814	_mm_set_ps(float __z, float __y, float __x, float __w)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1815	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1816	return (__m128){ __w, __x, __y, __z };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1817	}
				1818
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1819	/// \brief Constructs a 128-bit floating-point vector of [4 x float],
				1820	/// initialized in reverse order with the specified 32-bit single-precision
				1821	/// float-point values.
				1822	///
				1823	/// \headerfile <x86intrin.h>
				1824	///
				1825	/// This intrinsic is a utility function and does not correspond to a specific
				1826	/// instruction.
				1827	///
				1828	/// \param __z
				1829	/// A single-precision floating-point value used to initialize bits [31:0]
				1830	/// of the result.
				1831	/// \param __y
				1832	/// A single-precision floating-point value used to initialize bits [63:32]
				1833	/// of the result.
				1834	/// \param __x
				1835	/// A single-precision floating-point value used to initialize bits [95:64]
				1836	/// of the result.
				1837	/// \param __w
				1838	/// A single-precision floating-point value used to initialize bits [127:96]
				1839	/// of the result.
				1840	/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1841	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1842	_mm_setr_ps(float __z, float __y, float __x, float __w)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1843	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1844	return (__m128){ __z, __y, __x, __w };
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1845	}
				1846
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1847	/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
				1848	/// to zero.
				1849	///
				1850	/// \headerfile <x86intrin.h>
				1851	///
				1852	/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
				1853	///
				1854	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
				1855	/// all elements set to zero.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1856	static __inline__ __m128 __DEFAULT_FN_ATTRS
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1857	_mm_setzero_ps(void)
				1858	{
				1859	return (__m128){ 0, 0, 0, 0 };
				1860	}
				1861
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1862	/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
				1863	/// memory location.
				1864	///
				1865	/// \headerfile <x86intrin.h>
				1866	///
				1867	/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
				1868	///
				1869	/// \param __p
				1870	/// A pointer to a 64-bit memory location.
				1871	/// \param __a
				1872	/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1873	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1874	_mm_storeh_pi(__m64 *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1875	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1876	__builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1877	}
				1878
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1879	/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
				1880	/// memory location.
				1881	///
				1882	/// \headerfile <x86intrin.h>
				1883	///
				1884	/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
				1885	///
				1886	/// \param __p
				1887	/// A pointer to a memory location that will receive the float values.
				1888	/// \param __a
				1889	/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1890	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1891	_mm_storel_pi(__m64 *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1892	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1893	__builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1894	}
				1895
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1896	/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
				1897	/// memory location.
				1898	///
				1899	/// \headerfile <x86intrin.h>
				1900	///
				1901	/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
				1902	///
				1903	/// \param __p
				1904	/// A pointer to a 32-bit memory location.
				1905	/// \param __a
				1906	/// A 128-bit vector of [4 x float] containing the value to be stored.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1907	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1908	_mm_store_ss(float *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1909	{
				1910	struct __mm_store_ss_struct {
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1911	float __u;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1912	} __attribute__((__packed__, __may_alias__));
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1913	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1914	}
				1915
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1916	/// \brief Stores float values from a 128-bit vector of [4 x float] to an
				1917	/// unaligned memory location.
				1918	///
				1919	/// \headerfile <x86intrin.h>
				1920	///
				1921	/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
				1922	///
				1923	/// \param __p
				1924	/// A pointer to a 128-bit memory location. The address of the memory
				1925	/// location does not have to be aligned.
				1926	/// \param __a
				1927	/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1928	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1929	_mm_storeu_ps(float *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1930	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1931	struct __storeu_ps {
				1932	__m128 __v;
				1933	} __attribute__((__packed__, __may_alias__));
				1934	((struct __storeu_ps*)__p)->__v = __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1935	}
				1936
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1937	/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
				1938	/// four contiguous elements in an aligned memory location.
				1939	///
				1940	/// \headerfile <x86intrin.h>
				1941	///
				1942	/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
				1943	/// instruction.
				1944	///
				1945	/// \param __p
				1946	/// A pointer to a 128-bit memory location.
				1947	/// \param __a
				1948	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
				1949	/// of the four contiguous elements pointed by __p.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	1950	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	1951	_mm_store_ps(float *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1952	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1953	(__m128)__p = __a;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	1954	}
				1955
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	1956	/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
				1957	/// four contiguous elements in an aligned memory location.
				1958	///
				1959	/// \headerfile <x86intrin.h>
				1960	///
				1961	/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
				1962	/// instruction.
				1963	///
				1964	/// \param __p
				1965	/// A pointer to a 128-bit memory location.
				1966	/// \param __a
				1967	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
				1968	/// of the four contiguous elements pointed by __p.
				1969	static __inline__ void __DEFAULT_FN_ATTRS
				1970	_mm_store1_ps(float *__p, __m128 __a)
				1971	{
				1972	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
				1973	_mm_store_ps(__p, __a);
				1974	}
				1975
				1976	/// \brief Stores float values from a 128-bit vector of [4 x float] to an
				1977	/// aligned memory location.
				1978	///
				1979	/// \headerfile <x86intrin.h>
				1980	///
				1981	/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
				1982	///
				1983	/// \param __p
				1984	/// A pointer to a 128-bit memory location. The address of the memory
				1985	/// location has to be 128-bit aligned.
				1986	/// \param __a
				1987	/// A 128-bit vector of [4 x float] containing the values to be stored.
				1988	static __inline__ void __DEFAULT_FN_ATTRS
				1989	_mm_store_ps1(float *__p, __m128 __a)
				1990	{
				1991	return _mm_store1_ps(__p, __a);
				1992	}
				1993
				1994	/// \brief Stores float values from a 128-bit vector of [4 x float] to an
				1995	/// aligned memory location in reverse order.
				1996	///
				1997	/// \headerfile <x86intrin.h>
				1998	///
				1999	/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
				2000	/// instruction.
				2001	///
				2002	/// \param __p
				2003	/// A pointer to a 128-bit memory location. The address of the memory
				2004	/// location has to be 128-bit aligned.
				2005	/// \param __a
				2006	/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2007	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2008	_mm_storer_ps(float *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2009	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2010	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2011	_mm_store_ps(__p, __a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2012	}
				2013
				2014	#define _MM_HINT_T0 3
				2015	#define _MM_HINT_T1 2
				2016	#define _MM_HINT_T2 1
				2017	#define _MM_HINT_NTA 0
				2018
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// \brief Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the \c PREFETCHNTA instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointed-to data is not modified, so pointers to const are accepted.
/// \param sel
///    A predefined integer constant specifying the type of prefetch operation:
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
///    The PREFETCHNTA instruction will be generated.
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated.
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated.
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
/* The cast keeps the const qualifier so that passing a pointer to const data
   does not discard it; the middle argument 0 is passed through unchanged. */
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), 0, (sel)))
#endif
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2048
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2049	/// \brief Stores a 64-bit integer in the specified aligned memory location. To
				2050	/// minimize caching, the data is flagged as non-temporal (unlikely to be
				2051	/// used again soon).
				2052	///
				2053	/// \headerfile <x86intrin.h>
				2054	///
				2055	/// This intrinsic corresponds to the \c MOVNTQ instruction.
				2056	///
				2057	/// \param __p
				2058	/// A pointer to an aligned memory location used to store the register value.
				2059	/// \param __a
				2060	/// A 64-bit integer containing the value to be stored.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2061	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2062	_mm_stream_pi(__m64 *__p, __m64 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2063	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2064	__builtin_ia32_movntq(__p, __a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2065	}
				2066
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2067	/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
				2068	/// 128-bit aligned memory location. To minimize caching, the data is flagged
				2069	/// as non-temporal (unlikely to be used again soon).
				2070	///
				2071	/// \headerfile <x86intrin.h>
				2072	///
				2073	/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
				2074	///
				2075	/// \param __p
				2076	/// A pointer to a 128-bit aligned memory location that will receive the
				2077	/// integer values.
				2078	/// \param __a
				2079	/// A 128-bit vector of [4 x float] containing the values to be moved.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2080	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2081	_mm_stream_ps(float *__p, __m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2082	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2083	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2084	}
				2085
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2086	/// \brief Forces strong memory ordering (serialization) between store
				2087	/// instructions preceding this instruction and store instructions following
				2088	/// this instruction, ensuring the system completes all previous stores
				2089	/// before executing subsequent stores.
				2090	///
				2091	/// \headerfile <x86intrin.h>
				2092	///
				2093	/// This intrinsic corresponds to the \c SFENCE instruction.
				2094	///
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2095	static __inline__ void __DEFAULT_FN_ATTRS
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2096	_mm_sfence(void)
				2097	{
				2098	__builtin_ia32_sfence();
				2099	}
				2100
/// \brief Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
//
// Both macro arguments are parenthesized before being cast so that a compound
// argument (for example a conditional expression) is cast as a whole rather
// than having the cast bind to its first token only.
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2119
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16 bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the inserted value:
///    0: The value is written to bits [15:0] of the destination.
///    1: The value is written to bits [31:16] of the destination.
///    2: The value is written to bits [47:32] of the destination.
///    3: The value is written to bits [63:48] of the destination.
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
//
// Every macro argument is parenthesized before being cast so that a compound
// argument is cast as a whole rather than having the cast bind to its first
// token only.
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2146
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2147	/// \brief Compares each of the corresponding packed 16-bit integer values of
				2148	/// the 64-bit integer vectors, and writes the greater value to the
				2149	/// corresponding bits in the destination.
				2150	///
				2151	/// \headerfile <x86intrin.h>
				2152	///
				2153	/// This intrinsic corresponds to the \c PMAXSW instruction.
				2154	///
				2155	/// \param __a
				2156	/// A 64-bit integer vector containing one of the source operands.
				2157	/// \param __b
				2158	/// A 64-bit integer vector containing one of the source operands.
				2159	/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2160	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2161	_mm_max_pi16(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2162	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2163	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2164	}
				2165
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2166	/// \brief Compares each of the corresponding packed 8-bit unsigned integer
				2167	/// values of the 64-bit integer vectors, and writes the greater value to the
				2168	/// corresponding bits in the destination.
				2169	///
				2170	/// \headerfile <x86intrin.h>
				2171	///
				2172	/// This intrinsic corresponds to the \c PMAXUB instruction.
				2173	///
				2174	/// \param __a
				2175	/// A 64-bit integer vector containing one of the source operands.
				2176	/// \param __b
				2177	/// A 64-bit integer vector containing one of the source operands.
				2178	/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2179	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2180	_mm_max_pu8(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2181	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2182	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2183	}
				2184
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2185	/// \brief Compares each of the corresponding packed 16-bit integer values of
				2186	/// the 64-bit integer vectors, and writes the lesser value to the
				2187	/// corresponding bits in the destination.
				2188	///
				2189	/// \headerfile <x86intrin.h>
				2190	///
				2191	/// This intrinsic corresponds to the \c PMINSW instruction.
				2192	///
				2193	/// \param __a
				2194	/// A 64-bit integer vector containing one of the source operands.
				2195	/// \param __b
				2196	/// A 64-bit integer vector containing one of the source operands.
				2197	/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2198	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2199	_mm_min_pi16(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2200	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2201	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2202	}
				2203
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2204	/// \brief Compares each of the corresponding packed 8-bit unsigned integer
				2205	/// values of the 64-bit integer vectors, and writes the lesser value to the
				2206	/// corresponding bits in the destination.
				2207	///
				2208	/// \headerfile <x86intrin.h>
				2209	///
				2210	/// This intrinsic corresponds to the \c PMINUB instruction.
				2211	///
				2212	/// \param __a
				2213	/// A 64-bit integer vector containing one of the source operands.
				2214	/// \param __b
				2215	/// A 64-bit integer vector containing one of the source operands.
				2216	/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2217	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2218	_mm_min_pu8(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2219	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2220	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2221	}
				2222
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2223	/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
				2224	/// integer vector to create a 16-bit mask value. Zero-extends the value to
				2225	/// 32-bit integer and writes it to the destination.
				2226	///
				2227	/// \headerfile <x86intrin.h>
				2228	///
				2229	/// This intrinsic corresponds to the \c PMOVMSKB instruction.
				2230	///
				2231	/// \param __a
				2232	/// A 64-bit integer vector containing the values with bits to be extracted.
				2233	/// \returns The most significant bit from each 8-bit element in the operand,
				2234	/// written to bits [15:0].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2235	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2236	_mm_movemask_pi8(__m64 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2237	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2238	return __builtin_ia32_pmovmskb((__v8qi)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2239	}
				2240
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2241	/// \brief Multiplies packed 16-bit unsigned integer values and writes the
				2242	/// high-order 16 bits of each 32-bit product to the corresponding bits in
				2243	/// the destination.
				2244	///
				2245	/// \headerfile <x86intrin.h>
				2246	///
				2247	/// This intrinsic corresponds to the \c PMULHUW instruction.
				2248	///
				2249	/// \param __a
				2250	/// A 64-bit integer vector containing one of the source operands.
				2251	/// \param __b
				2252	/// A 64-bit integer vector containing one of the source operands.
				2253	/// \returns A 64-bit integer vector containing the products of both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2254	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2255	_mm_mulhi_pu16(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2256	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2257	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2258	}
				2259
/// \brief Rearranges the four 16-bit integers of a 64-bit integer vector into
///    the destination, in the order selected by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSHUFW instruction.
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. Each 2-bit field of \a n selects the source element for
///    one 16-bit element of the destination:
///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
///    Bit value assignments:
///    00: assigned from bits [15:0] of \a a.
///    01: assigned from bits [31:16] of \a a.
///    10: assigned from bits [47:32] of \a a.
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                 \
  __extension__ ({                                                             \
    (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n));                     \
  })
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2289
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2290	/// \brief Conditionally copies the values from each 8-bit element in the first
				2291	/// 64-bit integer vector operand to the specified memory location, as
				2292	/// specified by the most significant bit in the corresponding element in the
				2293	/// second 64-bit integer vector operand. To minimize caching, the data is
				2294	/// flagged as non-temporal (unlikely to be used again soon).
				2295	///
				2296	/// \headerfile <x86intrin.h>
				2297	///
				2298	/// This intrinsic corresponds to the \c MASKMOVQ instruction.
				2299	///
				2300	/// \param __d
				2301	/// A 64-bit integer vector containing the values with elements to be copied.
				2302	/// \param __n
				2303	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
				2304	/// element determines whether the corresponding element in operand __d is
				2305	/// copied. If the most significant bit of a given element is 1, the
				2306	/// corresponding element in operand __d is copied.
				2307	/// \param __p
				2308	/// A pointer to a 64-bit memory location that will receive the conditionally
				2309	/// copied integer values. The address of the memory location does not have
				2310	/// to be aligned.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2311	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2312	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2313	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2314	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2315	}
				2316
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2317	/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
				2318	/// values and writes the averages to the corresponding bits in the
				2319	/// destination.
				2320	///
				2321	/// \headerfile <x86intrin.h>
				2322	///
				2323	/// This intrinsic corresponds to the \c PAVGB instruction.
				2324	///
				2325	/// \param __a
				2326	/// A 64-bit integer vector containing one of the source operands.
				2327	/// \param __b
				2328	/// A 64-bit integer vector containing one of the source operands.
				2329	/// \returns A 64-bit integer vector containing the averages of both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2330	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2331	_mm_avg_pu8(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2332	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2333	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2334	}
				2335
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2336	/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
				2337	/// values and writes the averages to the corresponding bits in the
				2338	/// destination.
				2339	///
				2340	/// \headerfile <x86intrin.h>
				2341	///
				2342	/// This intrinsic corresponds to the \c PAVGW instruction.
				2343	///
				2344	/// \param __a
				2345	/// A 64-bit integer vector containing one of the source operands.
				2346	/// \param __b
				2347	/// A 64-bit integer vector containing one of the source operands.
				2348	/// \returns A 64-bit integer vector containing the averages of both operands.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2349	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2350	_mm_avg_pu16(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2351	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2352	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2353	}
				2354
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2355	/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
				2356	/// 64-bit vector operands and computes the absolute value for each of the
				2357	/// difference. Then sum of the 8 absolute differences is written to the
				2358	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
				2359	///
				2360	/// \headerfile <x86intrin.h>
				2361	///
				2362	/// This intrinsic corresponds to the \c PSADBW instruction.
				2363	///
				2364	/// \param __a
				2365	/// A 64-bit integer vector containing one of the source operands.
				2366	/// \param __b
				2367	/// A 64-bit integer vector containing one of the source operands.
				2368	/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
				2369	/// sets of absolute differences between both operands. The upper bits are
				2370	/// cleared.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2371	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2372	_mm_sad_pu8(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2373	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2374	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2375	}
				2376
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2377	/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
				2378	/// integer value. There are several groups of macros associated with this
				2379	/// intrinsic, including:
				2380	/// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
				2381	/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
				2382	/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
				2383	/// _MM_GET_EXCEPTION_STATE().
				2384	/// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
				2385	/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
				2386	/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
				2387	/// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
				2388	/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
				2389	/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
				2390	/// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
				2391	/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
				2392	/// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
				2393	/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
				2394	/// _MM_GET_DENORMALS_ZERO_MODE().
				2395	///
				2396	/// For example, the expression below checks if an overflow exception has
				2397	/// occurred:
				2398	/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
				2399	///
				2400	/// The following example gets the current rounding mode:
				2401	/// _MM_GET_ROUNDING_MODE()
				2402	///
				2403	/// \headerfile <x86intrin.h>
				2404	///
				2405	/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
				2406	///
				2407	/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
				2408	/// register.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2409	static __inline__ unsigned int __DEFAULT_FN_ATTRS
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2410	_mm_getcsr(void)
				2411	{
				2412	return __builtin_ia32_stmxcsr();
				2413	}
				2414
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2415	/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
				2416	/// are several groups of macros associated with this intrinsic, including:
				2417	/// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
				2418	/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
				2419	/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
				2420	/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
				2421	/// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
				2422	/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
				2423	/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
				2424	/// of these macros.
				2425	/// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
				2426	/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
				2427	/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
				2428	/// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
				2429	/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
				2430	/// one of these macros.
				2431	/// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
				2432	/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
				2433	/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
				2434	///
				2435	/// For example, the following expression causes subsequent floating-point
				2436	/// operations to round up:
				2437	/// _mm_setcsr(_mm_getcsr() \| _MM_ROUND_UP)
				2438	///
				2439	/// The following example sets the DAZ and FTZ flags:
				2440	/// void setFlags() {
				2441	/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
				2442	/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
				2443	/// }
				2444	///
				2445	/// \headerfile <x86intrin.h>
				2446	///
				2447	/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
				2448	///
				2449	/// \param __i
				2450	/// A 32-bit unsigned integer value to be written to the MXCSR register.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2451	static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2452	_mm_setcsr(unsigned int __i)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2453	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2454	__builtin_ia32_ldmxcsr(__i);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2455	}
				2456
/// \brief Builds a 128-bit vector of [4 x float] from four float values picked
///    out of the two 128-bit operands of [4 x float], as specified by the
///    immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b.
///    Bits [3:0] specify the values copied from operand \a a.
///    Bits [7:4] specify the values copied from operand \a b. The destinations
///    within the 128-bit destination are assigned values as follows:
///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
///    Bit value assignments:
///    00: Bits [31:0] copied from the specified operand.
///    01: Bits [63:32] copied from the specified operand.
///    10: Bits [95:64] copied from the specified operand.
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
//
// Shuffle indices 0-3 address the elements of a and indices 4-7 address the
// elements of b, so the two upper selectors are biased by 4.
#define _mm_shuffle_ps(a, b, mask)                                             \
  __extension__ ({                                                             \
    (__m128)__builtin_shufflevector((__v4sf)(__m128)(a),                       \
                                    (__v4sf)(__m128)(b),                       \
                                    ((mask) & 0x3),                            \
                                    (((mask) >> 2) & 0x3),                     \
                                    4 + (((mask) >> 4) & 0x3),                 \
                                    4 + (((mask) >> 6) & 0x3));                \
  })
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2494
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2495	/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
				2496	/// [4 x float] and interleaves them into a 128-bit vector of [4 x
				2497	/// float].
				2498	///
				2499	/// \headerfile <x86intrin.h>
				2500	///
				2501	/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
				2502	///
				2503	/// \param __a
				2504	/// A 128-bit vector of [4 x float].
				2505	/// Bits [95:64] are written to bits [31:0] of the destination.
				2506	/// Bits [127:96] are written to bits [95:64] of the destination.
				2507	/// \param __b
				2508	/// A 128-bit vector of [4 x float].
				2509	/// Bits [95:64] are written to bits [63:32] of the destination.
				2510	/// Bits [127:96] are written to bits [127:96] of the destination.
				2511	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2512	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2513	_mm_unpackhi_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2514	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2515	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2516	}
				2517
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2518	/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
				2519	/// [4 x float] and interleaves them into a 128-bit vector of [4 x
				2520	/// float].
				2521	///
				2522	/// \headerfile <x86intrin.h>
				2523	///
				2524	/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
				2525	///
				2526	/// \param __a
				2527	/// A 128-bit vector of [4 x float].
				2528	/// Bits [31:0] are written to bits [31:0] of the destination.
				2529	/// Bits [63:32] are written to bits [95:64] of the destination.
				2530	/// \param __b
				2531	/// A 128-bit vector of [4 x float].
				2532	/// Bits [31:0] are written to bits [63:32] of the destination.
				2533	/// Bits [63:32] are written to bits [127:96] of the destination.
				2534	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2535	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2536	_mm_unpacklo_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2537	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2538	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2539	}
				2540
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2541	/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
				2542	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
				2543	/// 96 bits are set to the upper 96 bits of the first parameter.
				2544	///
				2545	/// \headerfile <x86intrin.h>
				2546	///
				2547	/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
				2548	///
				2549	/// \param __a
				2550	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
				2551	/// written to the upper 96 bits of the result.
				2552	/// \param __b
				2553	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
				2554	/// written to the lower 32 bits of the result.
				2555	/// \returns A 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2556	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2557	_mm_move_ss(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2558	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2559	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2560	}
				2561
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2562	/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
				2563	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
				2564	/// 64 bits are set to the upper 64 bits of the first parameter.
				2565	///
				2566	/// \headerfile <x86intrin.h>
				2567	///
				2568	/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
				2569	///
				2570	/// \param __a
				2571	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2572	/// written to the upper 64 bits of the result.
				2573	/// \param __b
				2574	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
				2575	/// written to the lower 64 bits of the result.
				2576	/// \returns A 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2577	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2578	_mm_movehl_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2579	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2580	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2581	}
				2582
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2583	/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
				2584	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
				2585	/// 64 bits are set to the lower 64 bits of the second parameter.
				2586	///
				2587	/// \headerfile <x86intrin.h>
				2588	///
				2589	/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
				2590	///
				2591	/// \param __a
				2592	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2593	/// written to the lower 64 bits of the result.
				2594	/// \param __b
				2595	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
				2596	/// written to the upper 64 bits of the result.
				2597	/// \returns A 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2598	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2599	_mm_movelh_ps(__m128 __a, __m128 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2600	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2601	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2602	}
				2603
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2604	/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
				2605	/// float].
				2606	///
				2607	/// \headerfile <x86intrin.h>
				2608	///
				2609	/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
				2610	///
				2611	/// \param __a
				2612	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
				2613	/// from the corresponding elements in this operand.
				2614	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2615	/// values from the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2616	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2617	_mm_cvtpi16_ps(__m64 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2618	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2619	__m64 __b, __c;
				2620	__m128 __r;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2621
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2622	__b = _mm_setzero_si64();
				2623	__b = _mm_cmpgt_pi16(__b, __a);
				2624	__c = _mm_unpackhi_pi16(__a, __b);
				2625	__r = _mm_setzero_ps();
				2626	__r = _mm_cvtpi32_ps(__r, __c);
				2627	__r = _mm_movelh_ps(__r, __r);
				2628	__c = _mm_unpacklo_pi16(__a, __b);
				2629	__r = _mm_cvtpi32_ps(__r, __c);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2630
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2631	return __r;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2632	}
				2633
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2634	/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
				2635	/// 128-bit vector of [4 x float].
				2636	///
				2637	/// \headerfile <x86intrin.h>
				2638	///
				2639	/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
				2640	///
				2641	/// \param __a
				2642	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
				2643	/// destination are copied from the corresponding elements in this operand.
				2644	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2645	/// values from the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2646	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2647	_mm_cvtpu16_ps(__m64 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2648	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2649	__m64 __b, __c;
				2650	__m128 __r;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2651
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2652	__b = _mm_setzero_si64();
				2653	__c = _mm_unpackhi_pi16(__a, __b);
				2654	__r = _mm_setzero_ps();
				2655	__r = _mm_cvtpi32_ps(__r, __c);
				2656	__r = _mm_movelh_ps(__r, __r);
				2657	__c = _mm_unpacklo_pi16(__a, __b);
				2658	__r = _mm_cvtpi32_ps(__r, __c);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2659
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2660	return __r;
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2661	}
				2662
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2663	/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
				2664	/// into a 128-bit vector of [4 x float].
				2665	///
				2666	/// \headerfile <x86intrin.h>
				2667	///
				2668	/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
				2669	///
				2670	/// \param __a
				2671	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
				2672	/// from the corresponding lower 4 elements in this operand.
				2673	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2674	/// values from the operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2675	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2676	_mm_cvtpi8_ps(__m64 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2677	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2678	__m64 __b;
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2679
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2680	__b = _mm_setzero_si64();
				2681	__b = _mm_cmpgt_pi8(__b, __a);
				2682	__b = _mm_unpacklo_pi8(__a, __b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2683
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2684	return _mm_cvtpi16_ps(__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2685	}
				2686
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2687	/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
				2688	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
				2689	///
				2690	/// \headerfile <x86intrin.h>
				2691	///
				2692	/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
				2693	///
				2694	/// \param __a
				2695	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
				2696	/// destination are copied from the corresponding lower 4 elements in this
				2697	/// operand.
				2698	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
				2699	/// values from the source operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2700	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2701	_mm_cvtpu8_ps(__m64 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2702	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2703	__m64 __b;
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2704
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2705	__b = _mm_setzero_si64();
				2706	__b = _mm_unpacklo_pi8(__a, __b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2707
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2708	return _mm_cvtpi16_ps(__b);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2709	}
				2710
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2711	/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
				2712	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
				2713	///
				2714	/// \headerfile <x86intrin.h>
				2715	///
				2716	/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
				2717	///
				2718	/// \param __a
				2719	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
				2720	/// copied from the elements in this operand.
				2721	/// \param __b
				2722	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
				2723	/// copied from the elements in this operand.
				2724	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
				2725	/// copied and converted values from the first operand. The upper 64 bits
				2726	/// contain the copied and converted values from the second operand.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2727	static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2728	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2729	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2730	__m128 __c;
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2731
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2732	__c = _mm_setzero_ps();
				2733	__c = _mm_cvtpi32_ps(__c, __b);
				2734	__c = _mm_movelh_ps(__c, __c);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2735
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2736	return _mm_cvtpi32_ps(__c, __a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2737	}
				2738
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2739	/// \brief Converts each single-precision floating-point element of a 128-bit
				2740	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
				2741	/// packs the results into a 64-bit integer vector of [4 x i16]. If the
				2742	/// floating-point element is NaN or infinity, or if the floating-point
				2743	/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
				2744	/// to 0x8000. Otherwise if the floating-point element is greater
				2745	/// than 0x7FFF, it is converted to 0x7FFF.
				2746	///
				2747	/// \headerfile <x86intrin.h>
				2748	///
				2749	/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
				2750	///
				2751	/// \param __a
				2752	/// A 128-bit floating-point vector of [4 x float].
				2753	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
				2754	/// values.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2755	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2756	_mm_cvtps_pi16(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2757	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2758	__m64 __b, __c;
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2759
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2760	__b = _mm_cvtps_pi32(__a);
				2761	__a = _mm_movehl_ps(__a, __a);
				2762	__c = _mm_cvtps_pi32(__a);
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2763
Stephen Hines	e65db13	2014-05-30 13:26:31 -0700	[diff] [blame]	2764	return _mm_packs_pi32(__b, __c);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2765	}
				2766
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2767	/// \brief Converts each single-precision floating-point element of a 128-bit
				2768	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
				2769	/// packs the results into the lower 32 bits of a 64-bit integer vector of
				2770	/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
				2771	/// floating-point element is NaN or infinity, or if the floating-point
				2772	/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
				2773	/// to 0x80. Otherwise if the floating-point element is greater
				2774	/// than 0x7F, it is converted to 0x7F.
				2775	///
				2776	/// \headerfile <x86intrin.h>
				2777	///
				2778	/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
				2779	///
				2780	/// \param __a
				2781	/// A 128-bit floating-point vector of [4 x float].
				2782	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
				2783	/// converted values and the upper 32 bits are set to zero.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2784	static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2785	_mm_cvtps_pi8(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2786	{
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2787	__m64 __b, __c;
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2788
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2789	__b = _mm_cvtps_pi16(__a);
				2790	__c = _mm_setzero_si64();
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2791
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2792	return _mm_packs_pi16(__b, __c);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2793	}
				2794
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2795	/// \brief Extracts the sign bits from each single-precision floating-point
				2796	/// element of a 128-bit floating-point vector of [4 x float] and returns the
				2797	/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
				2798	/// to zero.
				2799	///
				2800	/// \headerfile <x86intrin.h>
				2801	///
				2802	/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
				2803	///
				2804	/// \param __a
				2805	/// A 128-bit floating-point vector of [4 x float].
				2806	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
				2807	/// single-precision floating-point element of the parameter. Bits [31:4] are
				2808	/// set to zero.
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2809	static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hines	c6ee7df	2013-04-02 18:41:57 -0700	[diff] [blame]	2810	_mm_movemask_ps(__m128 __a)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2811	{
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2812	return __builtin_ia32_movmskps((__v4sf)__a);
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2813	}
				2814
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2815
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2816	#define _MM_ALIGN16 __attribute__((aligned(16)))
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2817
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2818	#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
				2819
				2820	#define _MM_EXCEPT_INVALID (0x0001)
				2821	#define _MM_EXCEPT_DENORM (0x0002)
				2822	#define _MM_EXCEPT_DIV_ZERO (0x0004)
				2823	#define _MM_EXCEPT_OVERFLOW (0x0008)
				2824	#define _MM_EXCEPT_UNDERFLOW (0x0010)
				2825	#define _MM_EXCEPT_INEXACT (0x0020)
				2826	#define _MM_EXCEPT_MASK (0x003f)
				2827
				2828	#define _MM_MASK_INVALID (0x0080)
				2829	#define _MM_MASK_DENORM (0x0100)
				2830	#define _MM_MASK_DIV_ZERO (0x0200)
				2831	#define _MM_MASK_OVERFLOW (0x0400)
				2832	#define _MM_MASK_UNDERFLOW (0x0800)
				2833	#define _MM_MASK_INEXACT (0x1000)
				2834	#define _MM_MASK_MASK (0x1f80)
				2835
				2836	#define _MM_ROUND_NEAREST (0x0000)
				2837	#define _MM_ROUND_DOWN (0x2000)
				2838	#define _MM_ROUND_UP (0x4000)
				2839	#define _MM_ROUND_TOWARD_ZERO (0x6000)
				2840	#define _MM_ROUND_MASK (0x6000)
				2841
				2842	#define _MM_FLUSH_ZERO_MASK (0x8000)
				2843	#define _MM_FLUSH_ZERO_ON (0x8000)
Ying Wang	6099914	2013-01-07 13:59:36 -0800	[diff] [blame]	2844	#define _MM_FLUSH_ZERO_OFF (0x0000)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2845
				2846	#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
				2847	#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
				2848	#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
				2849	#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
				2850
				2851	#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
				2852	#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
				2853	#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
				2854	#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
				2855
				2856	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
				2857	do { \
				2858	__m128 tmp3, tmp2, tmp1, tmp0; \
				2859	tmp0 = _mm_unpacklo_ps((row0), (row1)); \
				2860	tmp2 = _mm_unpacklo_ps((row2), (row3)); \
				2861	tmp1 = _mm_unpackhi_ps((row0), (row1)); \
				2862	tmp3 = _mm_unpackhi_ps((row2), (row3)); \
				2863	(row0) = _mm_movelh_ps(tmp0, tmp2); \
				2864	(row1) = _mm_movehl_ps(tmp2, tmp0); \
				2865	(row2) = _mm_movelh_ps(tmp1, tmp3); \
				2866	(row3) = _mm_movehl_ps(tmp3, tmp1); \
				2867	} while (0)
				2868
				2869	/* Aliases for compatibility. */
				2870	#define _m_pextrw _mm_extract_pi16
				2871	#define _m_pinsrw _mm_insert_pi16
				2872	#define _m_pmaxsw _mm_max_pi16
				2873	#define _m_pmaxub _mm_max_pu8
				2874	#define _m_pminsw _mm_min_pi16
				2875	#define _m_pminub _mm_min_pu8
				2876	#define _m_pmovmskb _mm_movemask_pi8
				2877	#define _m_pmulhuw _mm_mulhi_pu16
				2878	#define _m_pshufw _mm_shuffle_pi16
				2879	#define _m_maskmovq _mm_maskmove_si64
				2880	#define _m_pavgb _mm_avg_pu8
				2881	#define _m_pavgw _mm_avg_pu16
				2882	#define _m_psadbw _mm_sad_pu8
				2883	#define _m_ _mm_
				2884	#define _m_ _mm_
				2885
Pirama Arumuga Nainar	4e74a02	2016-03-17 18:03:02 -0700	[diff] [blame]	2886	#undef __DEFAULT_FN_ATTRS
				2887
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2888	/* Ugly hack for backwards-compatibility (compatible with gcc) */
Pirama Arumuga Nainar	bb4374f	2016-10-20 16:43:03 -0700	[diff] [blame]	2889	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2890	#include <emmintrin.h>
				2891	#endif
				2892
Ying Wang	a672014	2011-12-20 14:43:20 -0800	[diff] [blame]	2893	#endif /* __XMMINTRIN_H */