Blame - linux-x86/lib64/clang/14.0.0/include/smmintrin.h - platform/prebuilts/clang-tools

blob: 8913a196144bb53c5f85e98dbcc79418be67c907 [file] [log] [blame]

Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1	/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
				2	*
Logan Chien	df4f766	2019-09-04 16:45:23 -0700	[diff] [blame]	3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	* See https://llvm.org/LICENSE.txt for license information.
				5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	6	*
				7	*===-----------------------------------------------------------------------===
				8	*/
				9
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	10	#ifndef __SMMINTRIN_H
				11	#define __SMMINTRIN_H
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	12
				13	#include <tmmintrin.h>
				14
				15	/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))

/* SSE4 Rounding macros.
 *
 * These constants build the immediate rounding-control operand (the M
 * argument) accepted by the _mm_round_* macros in this header.
 */

/* Bits [1:0] select the rounding direction; setting bit [2] instead selects
 * the current MXCSR rounding setting. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Bit [3] is the precision exception control: 0 uses a normal PE exception,
 * 1 leaves the PE field un-updated. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Convenience combinations of one exception-control flag with one rounding
 * mode. The operator must be a plain bitwise OR ('|'). */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
				34
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	35	/// Rounds up each element of the 128-bit vector of [4 x float] to an
				36	/// integer and returns the rounded values in a 128-bit vector of
				37	/// [4 x float].
				38	///
				39	/// \headerfile <x86intrin.h>
				40	///
				41	/// \code
				42	/// __m128 _mm_ceil_ps(__m128 X);
				43	/// \endcode
				44	///
				45	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
				46	///
				47	/// \param X
				48	/// A 128-bit vector of [4 x float] values to be rounded up.
				49	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	50	#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	51
				52	/// Rounds up each element of the 128-bit vector of [2 x double] to an
				53	/// integer and returns the rounded values in a 128-bit vector of
				54	/// [2 x double].
				55	///
				56	/// \headerfile <x86intrin.h>
				57	///
				58	/// \code
				59	/// __m128d _mm_ceil_pd(__m128d X);
				60	/// \endcode
				61	///
				62	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
				63	///
				64	/// \param X
				65	/// A 128-bit vector of [2 x double] values to be rounded up.
				66	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	67	#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	68
				69	/// Copies three upper elements of the first 128-bit vector operand to
				70	/// the corresponding three upper elements of the 128-bit result vector of
				71	/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
				72	/// operand to an integer and copies it to the lowest element of the 128-bit
				73	/// result vector of [4 x float].
				74	///
				75	/// \headerfile <x86intrin.h>
				76	///
				77	/// \code
				78	/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
				79	/// \endcode
				80	///
				81	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
				82	///
				83	/// \param X
				84	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
				85	/// copied to the corresponding bits of the result.
				86	/// \param Y
				87	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
				88	/// rounded up to the nearest integer and copied to the corresponding bits
				89	/// of the result.
				90	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
				91	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	92	#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	93
				94	/// Copies the upper element of the first 128-bit vector operand to the
				95	/// corresponding upper element of the 128-bit result vector of [2 x double].
				96	/// Rounds up the lower element of the second 128-bit vector operand to an
				97	/// integer and copies it to the lower element of the 128-bit result vector
				98	/// of [2 x double].
				99	///
				100	/// \headerfile <x86intrin.h>
				101	///
				102	/// \code
				103	/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
				104	/// \endcode
				105	///
				106	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
				107	///
				108	/// \param X
				109	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
				110	/// copied to the corresponding bits of the result.
				111	/// \param Y
				112	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
				113	/// rounded up to the nearest integer and copied to the corresponding bits
				114	/// of the result.
				115	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
				116	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	117	#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
				118
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	119	/// Rounds down each element of the 128-bit vector of [4 x float] to an
				120	/// integer and returns the rounded values in a 128-bit vector of
				121	/// [4 x float].
				122	///
				123	/// \headerfile <x86intrin.h>
				124	///
				125	/// \code
				126	/// __m128 _mm_floor_ps(__m128 X);
				127	/// \endcode
				128	///
				129	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
				130	///
				131	/// \param X
				132	/// A 128-bit vector of [4 x float] values to be rounded down.
				133	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	134	#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	135
				136	/// Rounds down each element of the 128-bit vector of [2 x double] to an
				137	/// integer and returns the rounded values in a 128-bit vector of
				138	/// [2 x double].
				139	///
				140	/// \headerfile <x86intrin.h>
				141	///
				142	/// \code
				143	/// __m128d _mm_floor_pd(__m128d X);
				144	/// \endcode
				145	///
				146	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
				147	///
				148	/// \param X
				149	/// A 128-bit vector of [2 x double] values to be rounded down.
				150	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	151	#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	152
				153	/// Copies three upper elements of the first 128-bit vector operand to
				154	/// the corresponding three upper elements of the 128-bit result vector of
				155	/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
				156	/// operand to an integer and copies it to the lowest element of the 128-bit
				157	/// result vector of [4 x float].
				158	///
				159	/// \headerfile <x86intrin.h>
				160	///
				161	/// \code
				162	/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
				163	/// \endcode
				164	///
				165	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
				166	///
				167	/// \param X
				168	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
				169	/// copied to the corresponding bits of the result.
				170	/// \param Y
				171	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
				172	/// rounded down to the nearest integer and copied to the corresponding bits
				173	/// of the result.
				174	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
				175	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	176	#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	177
				178	/// Copies the upper element of the first 128-bit vector operand to the
				179	/// corresponding upper element of the 128-bit result vector of [2 x double].
				180	/// Rounds down the lower element of the second 128-bit vector operand to an
				181	/// integer and copies it to the lower element of the 128-bit result vector
				182	/// of [2 x double].
				183	///
				184	/// \headerfile <x86intrin.h>
				185	///
				186	/// \code
				187	/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
				188	/// \endcode
				189	///
				190	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
				191	///
				192	/// \param X
				193	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
				194	/// copied to the corresponding bits of the result.
				195	/// \param Y
				196	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
				197	/// rounded down to the nearest integer and copied to the corresponding bits
				198	/// of the result.
				199	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
				200	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	201	#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
				202
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	203	/// Rounds each element of the 128-bit vector of [4 x float] to an
				204	/// integer value according to the rounding control specified by the second
				205	/// argument and returns the rounded values in a 128-bit vector of
				206	/// [4 x float].
				207	///
				208	/// \headerfile <x86intrin.h>
				209	///
				210	/// \code
				211	/// __m128 _mm_round_ps(__m128 X, const int M);
				212	/// \endcode
				213	///
				214	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
				215	///
				216	/// \param X
				217	/// A 128-bit vector of [4 x float].
				218	/// \param M
				219	/// An integer value that specifies the rounding operation. \n
				220	/// Bits [7:4] are reserved. \n
				221	/// Bit [3] is a precision exception value: \n
				222	/// 0: A normal PE exception is used \n
				223	/// 1: The PE field is not updated \n
				224	/// Bit [2] is the rounding control source: \n
				225	/// 0: Use bits [1:0] of \a M \n
				226	/// 1: Use the current MXCSR setting \n
				227	/// Bits [1:0] contain the rounding control definition: \n
				228	/// 00: Nearest \n
				229	/// 01: Downward (toward negative infinity) \n
				230	/// 10: Upward (toward positive infinity) \n
				231	/// 11: Truncated
				232	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
				233	#define _mm_round_ps(X, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	234	((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	235
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	236	/// Copies three upper elements of the first 128-bit vector operand to
				237	/// the corresponding three upper elements of the 128-bit result vector of
				238	/// [4 x float]. Rounds the lowest element of the second 128-bit vector
				239	/// operand to an integer value according to the rounding control specified
				240	/// by the third argument and copies it to the lowest element of the 128-bit
				241	/// result vector of [4 x float].
				242	///
				243	/// \headerfile <x86intrin.h>
				244	///
				245	/// \code
				246	/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
				247	/// \endcode
				248	///
				249	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
				250	///
				251	/// \param X
				252	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
				253	/// copied to the corresponding bits of the result.
				254	/// \param Y
				255	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
				256	/// rounded to the nearest integer using the specified rounding control and
				257	/// copied to the corresponding bits of the result.
				258	/// \param M
				259	/// An integer value that specifies the rounding operation. \n
				260	/// Bits [7:4] are reserved. \n
				261	/// Bit [3] is a precision exception value: \n
				262	/// 0: A normal PE exception is used \n
				263	/// 1: The PE field is not updated \n
				264	/// Bit [2] is the rounding control source: \n
				265	/// 0: Use bits [1:0] of \a M \n
				266	/// 1: Use the current MXCSR setting \n
				267	/// Bits [1:0] contain the rounding control definition: \n
				268	/// 00: Nearest \n
				269	/// 01: Downward (toward negative infinity) \n
				270	/// 10: Upward (toward positive infinity) \n
				271	/// 11: Truncated
				272	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
				273	/// values.
				274	#define _mm_round_ss(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	275	((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
				276	(__v4sf)(__m128)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	277
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	278	/// Rounds each element of the 128-bit vector of [2 x double] to an
				279	/// integer value according to the rounding control specified by the second
				280	/// argument and returns the rounded values in a 128-bit vector of
				281	/// [2 x double].
				282	///
				283	/// \headerfile <x86intrin.h>
				284	///
				285	/// \code
				286	/// __m128d _mm_round_pd(__m128d X, const int M);
				287	/// \endcode
				288	///
				289	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
				290	///
				291	/// \param X
				292	/// A 128-bit vector of [2 x double].
				293	/// \param M
				294	/// An integer value that specifies the rounding operation. \n
				295	/// Bits [7:4] are reserved. \n
				296	/// Bit [3] is a precision exception value: \n
				297	/// 0: A normal PE exception is used \n
				298	/// 1: The PE field is not updated \n
				299	/// Bit [2] is the rounding control source: \n
				300	/// 0: Use bits [1:0] of \a M \n
				301	/// 1: Use the current MXCSR setting \n
				302	/// Bits [1:0] contain the rounding control definition: \n
				303	/// 00: Nearest \n
				304	/// 01: Downward (toward negative infinity) \n
				305	/// 10: Upward (toward positive infinity) \n
				306	/// 11: Truncated
				307	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
				308	#define _mm_round_pd(X, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	309	((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	310
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	311	/// Copies the upper element of the first 128-bit vector operand to the
				312	/// corresponding upper element of the 128-bit result vector of [2 x double].
				313	/// Rounds the lower element of the second 128-bit vector operand to an
				314	/// integer value according to the rounding control specified by the third
				315	/// argument and copies it to the lower element of the 128-bit result vector
				316	/// of [2 x double].
				317	///
				318	/// \headerfile <x86intrin.h>
				319	///
				320	/// \code
				321	/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
				322	/// \endcode
				323	///
				324	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
				325	///
				326	/// \param X
				327	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
				328	/// copied to the corresponding bits of the result.
				329	/// \param Y
				330	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
				331	/// rounded to the nearest integer using the specified rounding control and
				332	/// copied to the corresponding bits of the result.
				333	/// \param M
				334	/// An integer value that specifies the rounding operation. \n
				335	/// Bits [7:4] are reserved. \n
				336	/// Bit [3] is a precision exception value: \n
				337	/// 0: A normal PE exception is used \n
				338	/// 1: The PE field is not updated \n
				339	/// Bit [2] is the rounding control source: \n
				340	/// 0: Use bits [1:0] of \a M \n
				341	/// 1: Use the current MXCSR setting \n
				342	/// Bits [1:0] contain the rounding control definition: \n
				343	/// 00: Nearest \n
				344	/// 01: Downward (toward negative infinity) \n
				345	/// 10: Upward (toward positive infinity) \n
				346	/// 11: Truncated
				347	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
				348	/// values.
				349	#define _mm_round_sd(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	350	((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
				351	(__v2df)(__m128d)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	352
				353	/* SSE4 Packed Blending Intrinsics. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	354	/// Returns a 128-bit vector of [2 x double] where the values are
				355	/// selected from either the first or second operand as specified by the
				356	/// third operand, the control mask.
				357	///
				358	/// \headerfile <x86intrin.h>
				359	///
				360	/// \code
				361	/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
				362	/// \endcode
				363	///
				364	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
				365	///
				366	/// \param V1
				367	/// A 128-bit vector of [2 x double].
				368	/// \param V2
				369	/// A 128-bit vector of [2 x double].
				370	/// \param M
				371	/// An immediate integer operand, with mask bits [1:0] specifying how the
				372	/// values are to be copied. The position of the mask bit corresponds to the
				373	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				374	/// element in operand \a V1 is copied to the same position in the result.
				375	/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
				376	/// is copied to the same position in the result.
				377	/// \returns A 128-bit vector of [2 x double] containing the copied values.
				378	#define _mm_blend_pd(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	379	((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
				380	(__v2df)(__m128d)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	381
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	382	/// Returns a 128-bit vector of [4 x float] where the values are selected
				383	/// from either the first or second operand as specified by the third
				384	/// operand, the control mask.
				385	///
				386	/// \headerfile <x86intrin.h>
				387	///
				388	/// \code
				389	/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
				390	/// \endcode
				391	///
				392	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
				393	///
				394	/// \param V1
				395	/// A 128-bit vector of [4 x float].
				396	/// \param V2
				397	/// A 128-bit vector of [4 x float].
				398	/// \param M
				399	/// An immediate integer operand, with mask bits [3:0] specifying how the
				400	/// values are to be copied. The position of the mask bit corresponds to the
				401	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				402	/// element in operand \a V1 is copied to the same position in the result.
				403	/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
				404	/// is copied to the same position in the result.
				405	/// \returns A 128-bit vector of [4 x float] containing the copied values.
				406	#define _mm_blend_ps(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	407	((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
				408	(__v4sf)(__m128)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	409
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	410	/// Returns a 128-bit vector of [2 x double] where the values are
				411	/// selected from either the first or second operand as specified by the
				412	/// third operand, the control mask.
				413	///
				414	/// \headerfile <x86intrin.h>
				415	///
				416	/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
				417	///
				418	/// \param __V1
				419	/// A 128-bit vector of [2 x double].
				420	/// \param __V2
				421	/// A 128-bit vector of [2 x double].
				422	/// \param __M
				423	/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
				424	/// values are to be copied. The position of the mask bit corresponds to the
				425	/// most significant bit of a copied value. When a mask bit is 0, the
				426	/// corresponding 64-bit element in operand \a __V1 is copied to the same
				427	/// position in the result. When a mask bit is 1, the corresponding 64-bit
				428	/// element in operand \a __V2 is copied to the same position in the result.
				429	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	430	static __inline__ __m128d __DEFAULT_FN_ATTRS
				431	_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
				432	{
				433	return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
				434	(__v2df)__M);
				435	}
				436
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	437	/// Returns a 128-bit vector of [4 x float] where the values are
				438	/// selected from either the first or second operand as specified by the
				439	/// third operand, the control mask.
				440	///
				441	/// \headerfile <x86intrin.h>
				442	///
				443	/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
				444	///
				445	/// \param __V1
				446	/// A 128-bit vector of [4 x float].
				447	/// \param __V2
				448	/// A 128-bit vector of [4 x float].
				449	/// \param __M
				450	/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
				451	/// how the values are to be copied. The position of the mask bit corresponds
				452	/// to the most significant bit of a copied value. When a mask bit is 0, the
				453	/// corresponding 32-bit element in operand \a __V1 is copied to the same
				454	/// position in the result. When a mask bit is 1, the corresponding 32-bit
				455	/// element in operand \a __V2 is copied to the same position in the result.
				456	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	457	static __inline__ __m128 __DEFAULT_FN_ATTRS
				458	_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
				459	{
				460	return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
				461	(__v4sf)__M);
				462	}
				463
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	464	/// Returns a 128-bit vector of [16 x i8] where the values are selected
				465	/// from either of the first or second operand as specified by the third
				466	/// operand, the control mask.
				467	///
				468	/// \headerfile <x86intrin.h>
				469	///
				470	/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
				471	///
				472	/// \param __V1
				473	/// A 128-bit vector of [16 x i8].
				474	/// \param __V2
				475	/// A 128-bit vector of [16 x i8].
				476	/// \param __M
				477	/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
				478	/// how the values are to be copied. The position of the mask bit corresponds
				479	/// to the most significant bit of a copied value. When a mask bit is 0, the
				480	/// corresponding 8-bit element in operand \a __V1 is copied to the same
				481	/// position in the result. When a mask bit is 1, the corresponding 8-bit
				482	/// element in operand \a __V2 is copied to the same position in the result.
				483	/// \returns A 128-bit vector of [16 x i8] containing the copied values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	484	static __inline__ __m128i __DEFAULT_FN_ATTRS
				485	_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
				486	{
				487	return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
				488	(__v16qi)__M);
				489	}
				490
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	491	/// Returns a 128-bit vector of [8 x i16] where the values are selected
				492	/// from either of the first or second operand as specified by the third
				493	/// operand, the control mask.
				494	///
				495	/// \headerfile <x86intrin.h>
				496	///
				497	/// \code
				498	/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
				499	/// \endcode
				500	///
				501	/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
				502	///
				503	/// \param V1
				504	/// A 128-bit vector of [8 x i16].
				505	/// \param V2
				506	/// A 128-bit vector of [8 x i16].
				507	/// \param M
				508	/// An immediate integer operand, with mask bits [7:0] specifying how the
				509	/// values are to be copied. The position of the mask bit corresponds to the
				510	/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
				511	/// element in operand \a V1 is copied to the same position in the result.
				512	/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
				513	/// is copied to the same position in the result.
				514	/// \returns A 128-bit vector of [8 x i16] containing the copied values.
				515	#define _mm_blend_epi16(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	516	((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
				517	(__v8hi)(__m128i)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	518
				519	/* SSE4 Dword Multiply Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	520	/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
				521	/// and returns the lower 32 bits of each product in a 128-bit vector of
				522	/// [4 x i32].
				523	///
				524	/// \headerfile <x86intrin.h>
				525	///
				526	/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
				527	///
				528	/// \param __V1
				529	/// A 128-bit integer vector.
				530	/// \param __V2
				531	/// A 128-bit integer vector.
				532	/// \returns A 128-bit integer vector containing the products of both operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	533	static __inline__ __m128i __DEFAULT_FN_ATTRS
				534	_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
				535	{
				536	return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
				537	}
				538
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	539	/// Multiplies corresponding even-indexed elements of two 128-bit
				540	/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
				541	/// containing the products.
				542	///
				543	/// \headerfile <x86intrin.h>
				544	///
				545	/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
				546	///
				547	/// \param __V1
				548	/// A 128-bit vector of [4 x i32].
				549	/// \param __V2
				550	/// A 128-bit vector of [4 x i32].
				551	/// \returns A 128-bit vector of [2 x i64] containing the products of both
				552	/// operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	553	static __inline__ __m128i __DEFAULT_FN_ATTRS
				554	_mm_mul_epi32 (__m128i __V1, __m128i __V2)
				555	{
				556	return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
				557	}
				558
				559	/* SSE4 Floating Point Dot Product Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	560	/// Computes the dot product of the two 128-bit vectors of [4 x float]
				561	/// and returns it in the elements of the 128-bit result vector of
				562	/// [4 x float].
				563	///
				564	/// The immediate integer operand controls which input elements
				565	/// will contribute to the dot product, and where the final results are
				566	/// returned.
				567	///
				568	/// \headerfile <x86intrin.h>
				569	///
				570	/// \code
				571	/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
				572	/// \endcode
				573	///
				574	/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
				575	///
				576	/// \param X
				577	/// A 128-bit vector of [4 x float].
				578	/// \param Y
				579	/// A 128-bit vector of [4 x float].
				580	/// \param M
				581	/// An immediate integer operand. Mask bits [7:4] determine which elements
				582	/// of the input vectors are used, with bit [4] corresponding to the lowest
				583	/// element and bit [7] corresponding to the highest element of each [4 x
				584	/// float] vector. If a bit is set, the corresponding elements from the two
				585	/// input vectors are used as an input for dot product; otherwise that input
				586	/// is treated as zero. Bits [3:0] determine which elements of the result
				587	/// will receive a copy of the final dot product, with bit [0] corresponding
				588	/// to the lowest element and bit [3] corresponding to the highest element of
				589	/// each [4 x float] subvector. If a bit is set, the dot product is returned
				590	/// in the corresponding element; otherwise that element is set to zero.
				591	/// \returns A 128-bit vector of [4 x float] containing the dot product.
				592	#define _mm_dp_ps(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	593	((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
				594	(__v4sf)(__m128)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	595
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	596	/// Computes the dot product of the two 128-bit vectors of [2 x double]
				597	/// and returns it in the elements of the 128-bit result vector of
				598	/// [2 x double].
				599	///
				600	/// The immediate integer operand controls which input
				601	/// elements will contribute to the dot product, and where the final results
				602	/// are returned.
				603	///
				604	/// \headerfile <x86intrin.h>
				605	///
				606	/// \code
				607	/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
				608	/// \endcode
				609	///
				610	/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
				611	///
				612	/// \param X
				613	/// A 128-bit vector of [2 x double].
				614	/// \param Y
				615	/// A 128-bit vector of [2 x double].
				616	/// \param M
				617	/// An immediate integer operand. Mask bits [5:4] determine which elements
				618	/// of the input vectors are used, with bit [4] corresponding to the lowest
				619	/// element and bit [5] corresponding to the highest element of each [2 x
				620	/// double] vector. If a bit is set, the corresponding elements from the two
				621	/// input vectors are used as an input for dot product; otherwise that input
				622	/// is treated as zero. Bits [1:0] determine which elements of the result
				623	/// will receive a copy of the final dot product, with bit [0] corresponding
				624	/// to the lowest element and bit [1] corresponding to the highest element of
				625	/// each [2 x double] vector. If a bit is set, the dot product is returned in
				626	/// the corresponding element; otherwise that element is set to zero.
				627	#define _mm_dp_pd(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	628	((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
				629	(__v2df)(__m128d)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	630
				631	/* SSE4 Streaming Load Hint Instruction. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	632	/// Loads integer values from a 128-bit aligned memory location to a
				633	/// 128-bit integer vector.
				634	///
				635	/// \headerfile <x86intrin.h>
				636	///
				637	/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
				638	///
				639	/// \param __V
				640	/// A pointer to a 128-bit aligned memory location that contains the integer
				641	/// values.
				642	/// \returns A 128-bit integer vector containing the data stored at the
				643	/// specified memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	644	static __inline__ __m128i __DEFAULT_FN_ATTRS
				645	_mm_stream_load_si128 (__m128i const *__V)
				646	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	647	return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	648	}
				649
				650	/* SSE4 Packed Integer Min/Max Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	651	/// Compares the corresponding elements of two 128-bit vectors of
				652	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
				653	/// of the two values.
				654	///
				655	/// \headerfile <x86intrin.h>
				656	///
				657	/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
				658	///
				659	/// \param __V1
				660	/// A 128-bit vector of [16 x i8].
				661	/// \param __V2
				662	/// A 128-bit vector of [16 x i8].
				663	/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	664	static __inline__ __m128i __DEFAULT_FN_ATTRS
				665	_mm_min_epi8 (__m128i __V1, __m128i __V2)
				666	{
				667	return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
				668	}
				669
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	670	/// Compares the corresponding elements of two 128-bit vectors of
				671	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
				672	/// greater value of the two.
				673	///
				674	/// \headerfile <x86intrin.h>
				675	///
				676	/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
				677	///
				678	/// \param __V1
				679	/// A 128-bit vector of [16 x i8].
				680	/// \param __V2
				681	/// A 128-bit vector of [16 x i8].
				682	/// \returns A 128-bit vector of [16 x i8] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	683	static __inline__ __m128i __DEFAULT_FN_ATTRS
				684	_mm_max_epi8 (__m128i __V1, __m128i __V2)
				685	{
				686	return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
				687	}
				688
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	689	/// Compares the corresponding elements of two 128-bit vectors of
				690	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
				691	/// value of the two.
				692	///
				693	/// \headerfile <x86intrin.h>
				694	///
				695	/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
				696	///
				697	/// \param __V1
				698	/// A 128-bit vector of [8 x u16].
				699	/// \param __V2
				700	/// A 128-bit vector of [8 x u16].
				701	/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	702	static __inline__ __m128i __DEFAULT_FN_ATTRS
				703	_mm_min_epu16 (__m128i __V1, __m128i __V2)
				704	{
				705	return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
				706	}
				707
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	708	/// Compares the corresponding elements of two 128-bit vectors of
				709	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
				710	/// greater value of the two.
				711	///
				712	/// \headerfile <x86intrin.h>
				713	///
				714	/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
				715	///
				716	/// \param __V1
				717	/// A 128-bit vector of [8 x u16].
				718	/// \param __V2
				719	/// A 128-bit vector of [8 x u16].
				720	/// \returns A 128-bit vector of [8 x u16] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	721	static __inline__ __m128i __DEFAULT_FN_ATTRS
				722	_mm_max_epu16 (__m128i __V1, __m128i __V2)
				723	{
				724	return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
				725	}
				726
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	727	/// Compares the corresponding elements of two 128-bit vectors of
				728	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
				729	/// value of the two.
				730	///
				731	/// \headerfile <x86intrin.h>
				732	///
				733	/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
				734	///
				735	/// \param __V1
				736	/// A 128-bit vector of [4 x i32].
				737	/// \param __V2
				738	/// A 128-bit vector of [4 x i32].
				739	/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	740	static __inline__ __m128i __DEFAULT_FN_ATTRS
				741	_mm_min_epi32 (__m128i __V1, __m128i __V2)
				742	{
				743	return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
				744	}
				745
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	746	/// Compares the corresponding elements of two 128-bit vectors of
				747	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
				748	/// greater value of the two.
				749	///
				750	/// \headerfile <x86intrin.h>
				751	///
				752	/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
				753	///
				754	/// \param __V1
				755	/// A 128-bit vector of [4 x i32].
				756	/// \param __V2
				757	/// A 128-bit vector of [4 x i32].
				758	/// \returns A 128-bit vector of [4 x i32] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	759	static __inline__ __m128i __DEFAULT_FN_ATTRS
				760	_mm_max_epi32 (__m128i __V1, __m128i __V2)
				761	{
				762	return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
				763	}
				764
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	765	/// Compares the corresponding elements of two 128-bit vectors of
				766	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
				767	/// value of the two.
				768	///
				769	/// \headerfile <x86intrin.h>
				770	///
				771	/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
				772	///
				773	/// \param __V1
				774	/// A 128-bit vector of [4 x u32].
				775	/// \param __V2
				776	/// A 128-bit vector of [4 x u32].
				777	/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	778	static __inline__ __m128i __DEFAULT_FN_ATTRS
				779	_mm_min_epu32 (__m128i __V1, __m128i __V2)
				780	{
				781	return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
				782	}
				783
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	784	/// Compares the corresponding elements of two 128-bit vectors of
				785	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
				786	/// greater value of the two.
				787	///
				788	/// \headerfile <x86intrin.h>
				789	///
				790	/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
				791	///
				792	/// \param __V1
				793	/// A 128-bit vector of [4 x u32].
				794	/// \param __V2
				795	/// A 128-bit vector of [4 x u32].
				796	/// \returns A 128-bit vector of [4 x u32] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	797	static __inline__ __m128i __DEFAULT_FN_ATTRS
				798	_mm_max_epu32 (__m128i __V1, __m128i __V2)
				799	{
				800	return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
				801	}
				802
				803	/* SSE4 Insertion and Extraction from XMM Register Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	804	/// Takes the first argument \a X and inserts an element from the second
				805	/// argument \a Y as selected by the third argument \a N. That result then
				806	/// has elements zeroed out also as selected by the third argument \a N. The
				807	/// resulting 128-bit vector of [4 x float] is then returned.
				808	///
				809	/// \headerfile <x86intrin.h>
				810	///
				811	/// \code
				812	/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
				813	/// \endcode
				814	///
				815	/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
				816	///
				817	/// \param X
				818	/// A 128-bit vector source operand of [4 x float]. With the exception of
				819	/// those bits in the result copied from parameter \a Y and zeroed by bits
				820	/// [3:0] of \a N, all bits from this parameter are copied to the result.
				821	/// \param Y
				822	/// A 128-bit vector source operand of [4 x float]. One single-precision
				823	/// floating-point element from this source, as determined by the immediate
				824	/// parameter, is copied to the result.
				825	/// \param N
				826	/// Specifies which bits from operand \a Y will be copied, which bits in the
				827	/// result they will be copied to, and which bits in the result will be
				828	/// cleared. The following assignments are made: \n
				829	/// Bits [7:6] specify the bits to copy from operand \a Y: \n
				830	/// 00: Selects bits [31:0] from operand \a Y. \n
				831	/// 01: Selects bits [63:32] from operand \a Y. \n
				832	/// 10: Selects bits [95:64] from operand \a Y. \n
				833	/// 11: Selects bits [127:96] from operand \a Y. \n
				834	/// Bits [5:4] specify the bits in the result to which the selected bits
				835	/// from operand \a Y are copied: \n
				836	/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
				837	/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
				838	/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
				839	/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
				840	/// Bits [3:0]: If any of these bits are set, the corresponding result
				841	/// element is cleared.
				842	/// \returns A 128-bit vector of [4 x float] containing the copied
				843	/// single-precision floating-point elements from the operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	844	#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	845
				846	/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
				847	/// returns it, using the immediate value parameter \a N as a selector.
				848	///
				849	/// \headerfile <x86intrin.h>
				850	///
				851	/// \code
				852	/// int _mm_extract_ps(__m128 X, const int N);
				853	/// \endcode
				854	///
				855	/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
				856	/// instruction.
				857	///
				858	/// \param X
				859	/// A 128-bit vector of [4 x float].
				860	/// \param N
				861	/// An immediate value. Bits [1:0] determine which bits from the argument
				862	/// \a X are extracted and returned: \n
				863	/// 00: Bits [31:0] of parameter \a X are returned. \n
				864	/// 01: Bits [63:32] of parameter \a X are returned. \n
				865	/// 10: Bits [95:64] of parameter \a X are returned. \n
				866	/// 11: Bits [127:96] of parameter \a X are returned.
				867	/// \returns A 32-bit integer containing the extracted 32 bits of float data.
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	868	#define _mm_extract_ps(X, N) \
				869	__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	870
				871	/* Miscellaneous insert and extract macros. */
				872	/* Extract a single-precision float from X at index N into D. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	873	#define _MM_EXTRACT_FLOAT(D, X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	874	do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	875
				876	/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
				877	an index suitable for _mm_insert_ps. */
				878	#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
				879
				880	/* Extract a float from X at index N into the first index of the return. */
				881	#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
				882	_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
				883
				884	/* Insert int into packed integer array at index. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	885	/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
				886	/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
				887	/// of an integer parameter \a I into an offset specified by the immediate
				888	/// value parameter \a N.
				889	///
				890	/// \headerfile <x86intrin.h>
				891	///
				892	/// \code
				893	/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
				894	/// \endcode
				895	///
				896	/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
				897	///
				898	/// \param X
				899	/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
				900	/// result and then one of the sixteen elements in the result vector is
				901	/// replaced by the lower 8 bits of \a I.
				902	/// \param I
				903	/// An integer. The lower 8 bits of this operand are written to the result
				904	/// beginning at the offset specified by \a N.
				905	/// \param N
				906	/// An immediate value. Bits [3:0] specify the bit offset in the result at
				907	/// which the lower 8 bits of \a I are written. \n
				908	/// 0000: Bits [7:0] of the result are used for insertion. \n
				909	/// 0001: Bits [15:8] of the result are used for insertion. \n
				910	/// 0010: Bits [23:16] of the result are used for insertion. \n
				911	/// 0011: Bits [31:24] of the result are used for insertion. \n
				912	/// 0100: Bits [39:32] of the result are used for insertion. \n
				913	/// 0101: Bits [47:40] of the result are used for insertion. \n
				914	/// 0110: Bits [55:48] of the result are used for insertion. \n
				915	/// 0111: Bits [63:56] of the result are used for insertion. \n
				916	/// 1000: Bits [71:64] of the result are used for insertion. \n
				917	/// 1001: Bits [79:72] of the result are used for insertion. \n
				918	/// 1010: Bits [87:80] of the result are used for insertion. \n
				919	/// 1011: Bits [95:88] of the result are used for insertion. \n
				920	/// 1100: Bits [103:96] of the result are used for insertion. \n
				921	/// 1101: Bits [111:104] of the result are used for insertion. \n
				922	/// 1110: Bits [119:112] of the result are used for insertion. \n
				923	/// 1111: Bits [127:120] of the result are used for insertion.
				924	/// \returns A 128-bit integer vector containing the constructed values.
				925	#define _mm_insert_epi8(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	926	((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
				927	(int)(I), (int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	928
				929	/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
				930	/// the 128-bit integer vector parameter, and then inserting the 32-bit
				931	/// integer parameter \a I at the offset specified by the immediate value
				932	/// parameter \a N.
				933	///
				934	/// \headerfile <x86intrin.h>
				935	///
				936	/// \code
				937	/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
				938	/// \endcode
				939	///
				940	/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
				941	///
				942	/// \param X
				943	/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
				944	/// result and then one of the four elements in the result vector is
				945	/// replaced by \a I.
				946	/// \param I
				947	/// A 32-bit integer that is written to the result beginning at the offset
				948	/// specified by \a N.
				949	/// \param N
				950	/// An immediate value. Bits [1:0] specify the bit offset in the result at
				951	/// which the integer \a I is written. \n
				952	/// 00: Bits [31:0] of the result are used for insertion. \n
				953	/// 01: Bits [63:32] of the result are used for insertion. \n
				954	/// 10: Bits [95:64] of the result are used for insertion. \n
				955	/// 11: Bits [127:96] of the result are used for insertion.
				956	/// \returns A 128-bit integer vector containing the constructed values.
				957	#define _mm_insert_epi32(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	958	((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
				959	(int)(I), (int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	960
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	961	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	962	/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
				963	/// the 128-bit integer vector parameter, and then inserting the 64-bit
				964	/// integer parameter \a I, using the immediate value parameter \a N as an
				965	/// insertion location selector.
				966	///
				967	/// \headerfile <x86intrin.h>
				968	///
				969	/// \code
				970	/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
				971	/// \endcode
				972	///
				973	/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
				974	///
				975	/// \param X
				976	/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
				977	/// result and then one of the two elements in the result vector is replaced
				978	/// by \a I.
				979	/// \param I
				980	/// A 64-bit integer that is written to the result beginning at the offset
				981	/// specified by \a N.
				982	/// \param N
				983	/// An immediate value. Bit [0] specifies the bit offset in the result at
				984	/// which the integer \a I is written. \n
				985	/// 0: Bits [63:0] of the result are used for insertion. \n
				986	/// 1: Bits [127:64] of the result are used for insertion. \n
				987	/// \returns A 128-bit integer vector containing the constructed values.
				988	#define _mm_insert_epi64(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	989	((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
				990	(long long)(I), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	991	#endif /* __x86_64__ */
				992
				993	/* Extract int from packed integer array at index. This returns the element
				994	* as a zero extended value, so it is unsigned.
				995	*/
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	996	/// Extracts an 8-bit element from the 128-bit integer vector of
				997	/// [16 x i8], using the immediate value parameter \a N as a selector.
				998	///
				999	/// \headerfile <x86intrin.h>
				1000	///
				1001	/// \code
				1002	/// int _mm_extract_epi8(__m128i X, const int N);
				1003	/// \endcode
				1004	///
				1005	/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
				1006	///
				1007	/// \param X
				1008	/// A 128-bit integer vector.
				1009	/// \param N
				1010	/// An immediate value. Bits [3:0] specify which 8-bit vector element from
				1011	/// the argument \a X to extract and copy to the result. \n
				1012	/// 0000: Bits [7:0] of parameter \a X are extracted. \n
				1013	/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
				1014	/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
				1015	/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
				1016	/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
				1017	/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
				1018	/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
				1019	/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
				1020	/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
				1021	/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
				1022	/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
				1023	/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
				1024	/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
				1025	/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
				1026	/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
				1027	/// 1111: Bits [127:120] of the parameter \a X are extracted.
				1028	/// \returns An unsigned integer, whose lower 8 bits are selected from the
				1029	/// 128-bit integer vector parameter and the remaining bits are assigned
				1030	/// zeros.
				1031	#define _mm_extract_epi8(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1032	((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
				1033	(int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1034
				1035	/// Extracts a 32-bit element from the 128-bit integer vector of
				1036	/// [4 x i32], using the immediate value parameter \a N as a selector.
				1037	///
				1038	/// \headerfile <x86intrin.h>
				1039	///
				1040	/// \code
				1041	/// int _mm_extract_epi32(__m128i X, const int N);
				1042	/// \endcode
				1043	///
				1044	/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
				1045	///
				1046	/// \param X
				1047	/// A 128-bit integer vector.
				1048	/// \param N
				1049	/// An immediate value. Bits [1:0] specify which 32-bit vector element from
				1050	/// the argument \a X to extract and copy to the result. \n
				1051	/// 00: Bits [31:0] of the parameter \a X are extracted. \n
				1052	/// 01: Bits [63:32] of the parameter \a X are extracted. \n
				1053	/// 10: Bits [95:64] of the parameter \a X are extracted. \n
				1054	/// 11: Bits [127:96] of the parameter \a X are extracted.
				1055	/// \returns An integer, whose lower 32 bits are selected from the 128-bit
				1056	/// integer vector parameter and the remaining bits are assigned zeros.
				1057	#define _mm_extract_epi32(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1058	((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1059
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1060	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1061	/// Extracts a 64-bit element from the 128-bit integer vector of
				1062	/// [2 x i64], using the immediate value parameter \a N as a selector.
				1063	///
				1064	/// \headerfile <x86intrin.h>
				1065	///
				1066	/// \code
				1067	/// long long _mm_extract_epi64(__m128i X, const int N);
				1068	/// \endcode
				1069	///
				1070	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
				1071	///
				1072	/// \param X
				1073	/// A 128-bit integer vector.
				1074	/// \param N
				1075	/// An immediate value. Bit [0] specifies which 64-bit vector element from
				1076	/// the argument \a X to return. \n
				1077	/// 0: Bits [63:0] are returned. \n
				1078	/// 1: Bits [127:64] are returned. \n
				1079	/// \returns A 64-bit integer.
				1080	#define _mm_extract_epi64(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1081	((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1082	#endif /* __x86_64__ */
				1083
				1084	/* SSE4 128-bit Packed Integer Comparisons. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1085	/// Tests whether the specified bits in a 128-bit integer vector are all
				1086	/// zeros.
				1087	///
				1088	/// \headerfile <x86intrin.h>
				1089	///
				1090	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1091	///
				1092	/// \param __M
				1093	/// A 128-bit integer vector containing the bits to be tested.
				1094	/// \param __V
				1095	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
				1096	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1097	static __inline__ int __DEFAULT_FN_ATTRS
				1098	_mm_testz_si128(__m128i __M, __m128i __V)
				1099	{
				1100	return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
				1101	}
				1102
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1103	/// Tests whether the specified bits in a 128-bit integer vector are all
				1104	/// ones.
				1105	///
				1106	/// \headerfile <x86intrin.h>
				1107	///
				1108	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1109	///
				1110	/// \param __M
				1111	/// A 128-bit integer vector containing the bits to be tested.
				1112	/// \param __V
				1113	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
				1114	/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1115	static __inline__ int __DEFAULT_FN_ATTRS
				1116	_mm_testc_si128(__m128i __M, __m128i __V)
				1117	{
				1118	return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
				1119	}
				1120
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1121	/// Tests whether the specified bits in a 128-bit integer vector are
				1122	/// neither all zeros nor all ones.
				1123	///
				1124	/// \headerfile <x86intrin.h>
				1125	///
				1126	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1127	///
				1128	/// \param __M
				1129	/// A 128-bit integer vector containing the bits to be tested.
				1130	/// \param __V
				1131	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
				1132	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
				1133	/// FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1134	static __inline__ int __DEFAULT_FN_ATTRS
				1135	_mm_testnzc_si128(__m128i __M, __m128i __V)
				1136	{
				1137	return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
				1138	}
				1139
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1140	/// Tests whether the specified bits in a 128-bit integer vector are all
				1141	/// ones.
				1142	///
				1143	/// \headerfile <x86intrin.h>
				1144	///
				1145	/// \code
				1146	/// int _mm_test_all_ones(__m128i V);
				1147	/// \endcode
				1148	///
				1149	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1150	///
				1151	/// \param V
				1152	/// A 128-bit integer vector containing the bits to be tested.
				1153	/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
				1154	/// otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1155	#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1156
				1157	/// Tests whether the specified bits in a 128-bit integer vector are
				1158	/// neither all zeros nor all ones.
				1159	///
				1160	/// \headerfile <x86intrin.h>
				1161	///
				1162	/// \code
				1163	/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
				1164	/// \endcode
				1165	///
				1166	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1167	///
				1168	/// \param M
				1169	/// A 128-bit integer vector containing the bits to be tested.
				1170	/// \param V
				1171	/// A 128-bit integer vector selecting which bits to test in operand \a M.
				1172	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
				1173	/// FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1174	#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1175
				1176	/// Tests whether the specified bits in a 128-bit integer vector are all
				1177	/// zeros.
				1178	///
				1179	/// \headerfile <x86intrin.h>
				1180	///
				1181	/// \code
				1182	/// int _mm_test_all_zeros(__m128i M, __m128i V);
				1183	/// \endcode
				1184	///
				1185	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1186	///
				1187	/// \param M
				1188	/// A 128-bit integer vector containing the bits to be tested.
				1189	/// \param V
				1190	/// A 128-bit integer vector selecting which bits to test in operand \a M.
				1191	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1192	#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
				1193
				1194	/* SSE4 64-bit Packed Integer Comparisons. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1195	/// Compares each of the corresponding 64-bit values of the 128-bit
				1196	/// integer vectors for equality.
				1197	///
				1198	/// \headerfile <x86intrin.h>
				1199	///
				1200	/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
				1201	///
				1202	/// \param __V1
				1203	/// A 128-bit integer vector.
				1204	/// \param __V2
				1205	/// A 128-bit integer vector.
				1206	/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1207	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1208	_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
				1209	{
				1210	return (__m128i)((__v2di)__V1 == (__v2di)__V2);
				1211	}
				1212
				1213	/* SSE4 Packed Integer Sign-Extension. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1214	/// Sign-extends each of the lower eight 8-bit integer elements of a
				1215	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
				1216	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
				1217	/// are unused.
				1218	///
				1219	/// \headerfile <x86intrin.h>
				1220	///
				1221	/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
				1222	///
				1223	/// \param __V
				1224	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
				1225	/// extended to 16-bit values.
				1226	/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1227	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1228	_mm_cvtepi8_epi16(__m128i __V)
				1229	{
				1230	/* This function always performs a signed extension, but __v16qi is a char
				1231	which may be signed or unsigned, so use __v16qs. */
				1232	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
				1233	}
				1234
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1235	/// Sign-extends each of the lower four 8-bit integer elements of a
				1236	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
				1237	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
				1238	/// vector are unused.
				1239	///
				1240	/// \headerfile <x86intrin.h>
				1241	///
				1242	/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
				1243	///
				1244	/// \param __V
				1245	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
				1246	/// sign-extended to 32-bit values.
				1247	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1248	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1249	_mm_cvtepi8_epi32(__m128i __V)
				1250	{
				1251	/* This function always performs a signed extension, but __v16qi is a char
				1252	which may be signed or unsigned, so use __v16qs. */
				1253	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
				1254	}
				1255
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1256	/// Sign-extends each of the lower two 8-bit integer elements of a
				1257	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
				1258	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
				1259	/// vector are unused.
				1260	///
				1261	/// \headerfile <x86intrin.h>
				1262	///
				1263	/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
				1264	///
				1265	/// \param __V
				1266	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
				1267	/// sign-extended to 64-bit values.
				1268	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1269	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1270	_mm_cvtepi8_epi64(__m128i __V)
				1271	{
				1272	/* This function always performs a signed extension, but __v16qi is a char
				1273	which may be signed or unsigned, so use __v16qs. */
				1274	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
				1275	}
				1276
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1277	/// Sign-extends each of the lower four 16-bit integer elements of a
				1278	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
				1279	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
				1280	/// vector are unused.
				1281	///
				1282	/// \headerfile <x86intrin.h>
				1283	///
				1284	/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
				1285	///
				1286	/// \param __V
				1287	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
				1288	/// sign-extended to 32-bit values.
				1289	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1290	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1291	_mm_cvtepi16_epi32(__m128i __V)
				1292	{
				1293	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
				1294	}
				1295
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1296	/// Sign-extends each of the lower two 16-bit integer elements of a
				1297	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
				1298	/// a 128-bit vector of [2 x i64]. The upper six elements of the input
				1299	/// vector are unused.
				1300	///
				1301	/// \headerfile <x86intrin.h>
				1302	///
				1303	/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
				1304	///
				1305	/// \param __V
				1306	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
				1307	/// sign-extended to 64-bit values.
				1308	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1309	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1310	_mm_cvtepi16_epi64(__m128i __V)
				1311	{
				1312	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
				1313	}
				1314
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1315	/// Sign-extends each of the lower two 32-bit integer elements of a
				1316	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
				1317	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
				1318	/// are unused.
				1319	///
				1320	/// \headerfile <x86intrin.h>
				1321	///
				1322	/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
				1323	///
				1324	/// \param __V
				1325	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
				1326	/// sign-extended to 64-bit values.
				1327	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1328	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1329	_mm_cvtepi32_epi64(__m128i __V)
				1330	{
				1331	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
				1332	}
				1333
				1334	/* SSE4 Packed Integer Zero-Extension. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1335	/// Zero-extends each of the lower eight 8-bit integer elements of a
				1336	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
				1337	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
				1338	/// are unused.
				1339	///
				1340	/// \headerfile <x86intrin.h>
				1341	///
				1342	/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
				1343	///
				1344	/// \param __V
				1345	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
				1346	/// zero-extended to 16-bit values.
				1347	/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1348	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1349	_mm_cvtepu8_epi16(__m128i __V)
				1350	{
				1351	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
				1352	}
				1353
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1354	/// Zero-extends each of the lower four 8-bit integer elements of a
				1355	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
				1356	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
				1357	/// vector are unused.
				1358	///
				1359	/// \headerfile <x86intrin.h>
				1360	///
				1361	/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
				1362	///
				1363	/// \param __V
				1364	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
				1365	/// zero-extended to 32-bit values.
				1366	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1367	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1368	_mm_cvtepu8_epi32(__m128i __V)
				1369	{
				1370	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
				1371	}
				1372
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1373	/// Zero-extends each of the lower two 8-bit integer elements of a
				1374	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
				1375	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
				1376	/// vector are unused.
				1377	///
				1378	/// \headerfile <x86intrin.h>
				1379	///
				1380	/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
				1381	///
				1382	/// \param __V
				1383	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
				1384	/// zero-extended to 64-bit values.
				1385	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1386	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1387	_mm_cvtepu8_epi64(__m128i __V)
				1388	{
				1389	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
				1390	}
				1391
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1392	/// Zero-extends each of the lower four 16-bit integer elements of a
				1393	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
				1394	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
				1395	/// vector are unused.
				1396	///
				1397	/// \headerfile <x86intrin.h>
				1398	///
				1399	/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
				1400	///
				1401	/// \param __V
				1402	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
				1403	/// zero-extended to 32-bit values.
				1404	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1405	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1406	_mm_cvtepu16_epi32(__m128i __V)
				1407	{
				1408	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
				1409	}
				1410
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1411	/// Zero-extends each of the lower two 16-bit integer elements of a
				1412	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
				1413	/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
				1414	/// are unused.
				1415	///
				1416	/// \headerfile <x86intrin.h>
				1417	///
				1418	/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
				1419	///
				1420	/// \param __V
				1421	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
				1422	/// zero-extended to 64-bit values.
				1423	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1424	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1425	_mm_cvtepu16_epi64(__m128i __V)
				1426	{
				1427	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
				1428	}
				1429
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1430	/// Zero-extends each of the lower two 32-bit integer elements of a
				1431	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
				1432	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
				1433	/// are unused.
				1434	///
				1435	/// \headerfile <x86intrin.h>
				1436	///
				1437	/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
				1438	///
				1439	/// \param __V
				1440	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
				1441	/// zero-extended to 64-bit values.
				1442	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1443	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1444	_mm_cvtepu32_epi64(__m128i __V)
				1445	{
				1446	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
				1447	}
				1448
				1449	/* SSE4 Pack with Unsigned Saturation. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1450	/// Converts 32-bit signed integers from both 128-bit integer vector
				1451	/// operands into 16-bit unsigned integers, and returns the packed result.
				1452	/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
				1453	/// 0x0000 are saturated to 0x0000.
				1454	///
				1455	/// \headerfile <x86intrin.h>
				1456	///
				1457	/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
				1458	///
				1459	/// \param __V1
				1460	/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
				1461	/// signed integer and is converted to a 16-bit unsigned integer with
				1462	/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
				1463	/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
				1464	/// are written to the lower 64 bits of the result.
				1465	/// \param __V2
				1466	/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
				1467	/// signed integer and is converted to a 16-bit unsigned integer with
				1468	/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
				1469	/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
				1470	/// are written to the higher 64 bits of the result.
				1471	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1472	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1473	_mm_packus_epi32(__m128i __V1, __m128i __V2)
				1474	{
				1475	return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
				1476	}
				1477
				1478	/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sums of absolute differences between unsigned 8-bit integer
///    values of the two operands. Each sum combines four absolute differences,
///    and the elements that take part are selected by the bit fields of the
///    immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M)                                             \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X),                  \
                                      (__v16qi)(__m128i)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1517
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1518	/// Finds the minimum unsigned 16-bit element in the input 128-bit
				1519	/// vector of [8 x u16] and returns it and along with its index.
				1520	///
				1521	/// \headerfile <x86intrin.h>
				1522	///
				1523	/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
				1524	/// instruction.
				1525	///
				1526	/// \param __V
				1527	/// A 128-bit vector of [8 x u16].
				1528	/// \returns A 128-bit value where bits [15:0] contain the minimum value found
				1529	/// in parameter \a __V, bits [18:16] contain the index of the minimum value
				1530	/// and the remaining bits are set to 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1531	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1532	_mm_minpos_epu16(__m128i __V)
				1533	{
				1534	return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
				1535	}
				1536
				1537	/* Handle the sse4.2 definitions here. */
				1538
				1539	/* These definitions are normally in nmmintrin.h, but gcc puts them in here
				1540	so we'll do the same. */
				1541
				1542	#undef __DEFAULT_FN_ATTRS
				1543	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
				1544
				1545	/* These specify the type of data that we're comparing. */
				1546	#define _SIDD_UBYTE_OPS 0x00
				1547	#define _SIDD_UWORD_OPS 0x01
				1548	#define _SIDD_SBYTE_OPS 0x02
				1549	#define _SIDD_SWORD_OPS 0x03
				1550
				1551	/* These specify the type of comparison operation. */
				1552	#define _SIDD_CMP_EQUAL_ANY 0x00
				1553	#define _SIDD_CMP_RANGES 0x04
				1554	#define _SIDD_CMP_EQUAL_EACH 0x08
				1555	#define _SIDD_CMP_EQUAL_ORDERED 0x0c
				1556
				1557	/* These macros specify the polarity of the operation. */
				1558	#define _SIDD_POSITIVE_POLARITY 0x00
				1559	#define _SIDD_NEGATIVE_POLARITY 0x10
				1560	#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
				1561	#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
				1562
				1563	/* These macros are used in _mm_cmpXstri() to specify the return. */
				1564	#define _SIDD_LEAST_SIGNIFICANT 0x00
				1565	#define _SIDD_MOST_SIGNIFICANT 0x40
				1566
				1567	/* These macros are used in _mm_cmpXstri() to specify the return. */
				1568	#define _SIDD_BIT_MASK 0x00
				1569	#define _SIDD_UNIT_MASK 0x40
				1570
				1571	/* SSE4.2 Packed Comparison Intrinsics. */
/// Compares the string data in source operands \a A and \a B, whose
///    lengths are implicitly defined, in the manner selected by the immediate
///    operand \a M. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M)                                                 \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A),                \
                                        (__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1627
				1628	/// Uses the immediate operand \a M to perform a comparison of string
				1629	/// data with implicitly defined lengths that is contained in source operands
				1630	/// \a A and \a B. Returns an integer representing the result index of the
				1631	/// comparison.
				1632	///
				1633	/// \headerfile <x86intrin.h>
				1634	///
				1635	/// \code
				1636	/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
				1637	/// \endcode
				1638	///
				1639	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1640	/// instruction.
				1641	///
				1642	/// \param A
				1643	/// A 128-bit integer vector containing one of the source operands to be
				1644	/// compared.
				1645	/// \param B
				1646	/// A 128-bit integer vector containing one of the source operands to be
				1647	/// compared.
				1648	/// \param M
				1649	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1650	/// words, the type of comparison to perform, and the format of the return
				1651	/// value. \n
				1652	/// Bits [1:0]: Determine source data format. \n
				1653	/// 00: 16 unsigned bytes \n
				1654	/// 01: 8 unsigned words \n
				1655	/// 10: 16 signed bytes \n
				1656	/// 11: 8 signed words \n
				1657	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1658	/// 00: Subset: Each character in \a B is compared for equality with all
				1659	/// the characters in \a A. \n
				1660	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1661	/// basis is greater than or equal for even-indexed elements in \a A,
				1662	/// and less than or equal for odd-indexed elements in \a A. \n
				1663	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1664	/// \a B for equality. \n
				1665	/// 11: Substring: Search B for substring matches of \a A. \n
				1666	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1667	/// mask of the comparison results. \n
				1668	/// 00: No effect. \n
				1669	/// 01: Negate the bit mask. \n
				1670	/// 10: No effect. \n
				1671	/// 11: Negate the bit mask only for bits with an index less than or equal
				1672	/// to the size of \a A or \a B. \n
				1673	/// Bit [6]: Determines whether the index of the lowest set bit or the
				1674	/// highest set bit is returned. \n
				1675	/// 0: The index of the least significant set bit. \n
				1676	/// 1: The index of the most significant set bit. \n
				1677	/// \returns Returns an integer representing the result index of the comparison.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1678	#define _mm_cmpistri(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1679	((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
				1680	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1681
/// Compares the string data in source operands \a A and \a B, whose
///    lengths are explicitly given by \a LA and \a LB, in the manner selected
///    by the immediate operand \a M. Returns a 128-bit integer vector
///    representing the result mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M)                                         \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),     \
                                        (__v16qi)(__m128i)(B), (int)(LB),     \
                                        (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1742
				1743	/// Uses the immediate operand \a M to perform a comparison of string
				1744	/// data with explicitly defined lengths that is contained in source operands
				1745	/// \a A and \a B. Returns an integer representing the result index of the
				1746	/// comparison.
				1747	///
				1748	/// \headerfile <x86intrin.h>
				1749	///
				1750	/// \code
				1751	/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
				1752	/// \endcode
				1753	///
				1754	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				1755	/// instruction.
				1756	///
				1757	/// \param A
				1758	/// A 128-bit integer vector containing one of the source operands to be
				1759	/// compared.
				1760	/// \param LA
				1761	/// An integer that specifies the length of the string in \a A.
				1762	/// \param B
				1763	/// A 128-bit integer vector containing one of the source operands to be
				1764	/// compared.
				1765	/// \param LB
				1766	/// An integer that specifies the length of the string in \a B.
				1767	/// \param M
				1768	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1769	/// words, the type of comparison to perform, and the format of the return
				1770	/// value. \n
				1771	/// Bits [1:0]: Determine source data format. \n
				1772	/// 00: 16 unsigned bytes \n
				1773	/// 01: 8 unsigned words \n
				1774	/// 10: 16 signed bytes \n
				1775	/// 11: 8 signed words \n
				1776	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1777	/// 00: Subset: Each character in \a B is compared for equality with all
				1778	/// the characters in \a A. \n
				1779	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1780	/// basis is greater than or equal for even-indexed elements in \a A,
				1781	/// and less than or equal for odd-indexed elements in \a A. \n
				1782	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1783	/// \a B for equality. \n
				1784	/// 11: Substring: Search B for substring matches of \a A. \n
				1785	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1786	/// mask of the comparison results. \n
				1787	/// 00: No effect. \n
				1788	/// 01: Negate the bit mask. \n
				1789	/// 10: No effect. \n
				1790	/// 11: Negate the bit mask only for bits with an index less than or equal
				1791	/// to the size of \a A or \a B. \n
				1792	/// Bit [6]: Determines whether the index of the lowest set bit or the
				1793	/// highest set bit is returned. \n
				1794	/// 0: The index of the least significant set bit. \n
				1795	/// 1: The index of the most significant set bit. \n
				1796	/// \returns Returns an integer representing the result index of the comparison.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1797	#define _mm_cmpestri(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1798	((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
				1799	(__v16qi)(__m128i)(B), (int)(LB), \
				1800	(int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1801
				1802	/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Compares the string data in source operands \a A and \a B, whose
///    lengths are implicitly defined, in the manner selected by the immediate
///    operand \a M. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M)                                                 \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                   \
                                     (__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1852
				1853	/// Uses the immediate operand \a M to perform a comparison of string
				1854	/// data with implicitly defined lengths that is contained in source operands
				1855	/// \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
				1856	/// 0.
				1857	///
				1858	/// \headerfile <x86intrin.h>
				1859	///
				1860	/// \code
				1861	/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
				1862	/// \endcode
				1863	///
				1864	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1865	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpistric128 </c>.
				1866	///
				1867	/// \param A
				1868	/// A 128-bit integer vector containing one of the source operands to be
				1869	/// compared.
				1870	/// \param B
				1871	/// A 128-bit integer vector containing one of the source operands to be
				1872	/// compared.
				1873	/// \param M
				1874	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1875	/// words and the type of comparison to perform. \n
				1876	/// Bits [1:0]: Determine source data format. \n
				1877	/// 00: 16 unsigned bytes \n
				1878	/// 01: 8 unsigned words \n
				1879	/// 10: 16 signed bytes \n
				1880	/// 11: 8 signed words \n
				1881	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1882	/// 00: Subset: Each character in \a B is compared for equality with all
				1883	/// the characters in \a A. \n
				1884	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1885	/// basis is greater than or equal for even-indexed elements in \a A,
				1886	/// and less than or equal for odd-indexed elements in \a A. \n
				1887	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1888	/// \a B for equality. \n
				1889	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1890	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1891	/// mask of the comparison results. \n
				1892	/// 00: No effect. \n
				1893	/// 01: Negate the bit mask. \n
				1894	/// 10: No effect. \n
				1895	/// 11: Negate the bit mask only for bits with an index less than or equal
				1896	/// to the size of \a A or \a B.
				1897	/// \returns 1 if the bit mask is non-zero; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1898	#define _mm_cmpistrc(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1899	((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
				1900	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1901
				1902	/// Uses the immediate operand \a M to perform a comparison of string
				1903	/// data with implicitly defined lengths that is contained in source operands
				1904	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
				1905	///
				1906	/// \headerfile <x86intrin.h>
				1907	///
				1908	/// \code
				1909	/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
				1910	/// \endcode
				1911	///
				1912	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1913	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpistrio128 </c>.
				1914	///
				1915	/// \param A
				1916	/// A 128-bit integer vector containing one of the source operands to be
				1917	/// compared.
				1918	/// \param B
				1919	/// A 128-bit integer vector containing one of the source operands to be
				1920	/// compared.
				1921	/// \param M
				1922	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1923	/// words and the type of comparison to perform. \n
				1924	/// Bits [1:0]: Determine source data format. \n
				1925	/// 00: 16 unsigned bytes \n
				1926	/// 01: 8 unsigned words \n
				1927	/// 10: 16 signed bytes \n
				1928	/// 11: 8 signed words \n
				1929	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1930	/// 00: Subset: Each character in \a B is compared for equality with all
				1931	/// the characters in \a A. \n
				1932	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1933	/// basis is greater than or equal for even-indexed elements in \a A,
				1934	/// and less than or equal for odd-indexed elements in \a A. \n
				1935	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1936	/// \a B for equality. \n
				1937	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1938	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1939	/// mask of the comparison results. \n
				1940	/// 00: No effect. \n
				1941	/// 01: Negate the bit mask. \n
				1942	/// 10: No effect. \n
				1943	/// 11: Negate the bit mask only for bits with an index less than or equal
				1944	/// to the size of \a A or \a B. \n
				1945	/// \returns Bit 0 of the resulting bit mask.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1946	#define _mm_cmpistro(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1947	((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
				1948	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1949
				1950	/// Uses the immediate operand \a M to perform a comparison of string
				1951	/// data with implicitly defined lengths that is contained in source operands
				1952	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
				1953	/// the maximum; otherwise, returns 0.
				1954	///
				1955	/// \headerfile <x86intrin.h>
				1956	///
				1957	/// \code
				1958	/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
				1959	/// \endcode
				1960	///
				1961	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1962	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpistris128 </c>.
				1963	///
				1964	/// \param A
				1965	/// A 128-bit integer vector containing one of the source operands to be
				1966	/// compared.
				1967	/// \param B
				1968	/// A 128-bit integer vector containing one of the source operands to be
				1969	/// compared.
				1970	/// \param M
				1971	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1972	/// words and the type of comparison to perform. \n
				1973	/// Bits [1:0]: Determine source data format. \n
				1974	/// 00: 16 unsigned bytes \n
				1975	/// 01: 8 unsigned words \n
				1976	/// 10: 16 signed bytes \n
				1977	/// 11: 8 signed words \n
				1978	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1979	/// 00: Subset: Each character in \a B is compared for equality with all
				1980	/// the characters in \a A. \n
				1981	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1982	/// basis is greater than or equal for even-indexed elements in \a A,
				1983	/// and less than or equal for odd-indexed elements in \a A. \n
				1984	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1985	/// \a B for equality. \n
				1986	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1987	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1988	/// mask of the comparison results. \n
				1989	/// 00: No effect. \n
				1990	/// 01: Negate the bit mask. \n
				1991	/// 10: No effect. \n
				1992	/// 11: Negate the bit mask only for bits with an index less than or equal
				1993	/// to the size of \a A or \a B. \n
				1994	/// \returns 1 if the length of the string in \a A is less than the
				1995	/// maximum; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1996	#define _mm_cmpistrs(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1997	((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
				1998	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1999
				2000	/// Uses the immediate operand \a M to perform a comparison of string
				2001	/// data with implicitly defined lengths that is contained in source operands
				2002	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
				2003	/// the maximum; otherwise, returns 0.
				2004	///
				2005	/// \headerfile <x86intrin.h>
				2006	///
				2007	/// \code
				2008	/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
				2009	/// \endcode
				2010	///
				2011	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				2012	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpistriz128 </c>.
				2013	///
				2014	/// \param A
				2015	/// A 128-bit integer vector containing one of the source operands to be
				2016	/// compared.
				2017	/// \param B
				2018	/// A 128-bit integer vector containing one of the source operands to be
				2019	/// compared.
				2020	/// \param M
				2021	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2022	/// words and the type of comparison to perform. \n
				2023	/// Bits [1:0]: Determine source data format. \n
				2024	/// 00: 16 unsigned bytes \n
				2025	/// 01: 8 unsigned words \n
				2026	/// 10: 16 signed bytes \n
				2027	/// 11: 8 signed words \n
				2028	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2029	/// 00: Subset: Each character in \a B is compared for equality with all
				2030	/// the characters in \a A. \n
				2031	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2032	/// basis is greater than or equal for even-indexed elements in \a A,
				2033	/// and less than or equal for odd-indexed elements in \a A. \n
				2034	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2035	/// \a B for equality. \n
				2036	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2037	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2038	/// mask of the comparison results. \n
				2039	/// 00: No effect. \n
				2040	/// 01: Negate the bit mask. \n
				2041	/// 10: No effect. \n
				2042	/// 11: Negate the bit mask only for bits with an index less than or equal
				2043	/// to the size of \a A or \a B.
				2044	/// \returns 1 if the length of the string in \a B is less than the
				2045	/// maximum; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2046	#define _mm_cmpistrz(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2047	((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
				2048	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2049
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2050	/// Uses the immediate operand \a M to perform a comparison of string
				2051	/// data with explicitly defined lengths that is contained in source operands
				2052	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
				2053	/// string in \a B is the maximum; otherwise, returns 0.
				2054	///
				2055	/// \headerfile <x86intrin.h>
				2056	///
				2057	/// \code
				2058	/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
				2059	/// \endcode
				2060	///
				2061	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2062	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpestria128 </c>.
				2063	///
				2064	/// \param A
				2065	/// A 128-bit integer vector containing one of the source operands to be
				2066	/// compared.
				2067	/// \param LA
				2068	/// An integer that specifies the length of the string in \a A.
				2069	/// \param B
				2070	/// A 128-bit integer vector containing one of the source operands to be
				2071	/// compared.
				2072	/// \param LB
				2073	/// An integer that specifies the length of the string in \a B.
				2074	/// \param M
				2075	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2076	/// words and the type of comparison to perform. \n
				2077	/// Bits [1:0]: Determine source data format. \n
				2078	/// 00: 16 unsigned bytes \n
				2079	/// 01: 8 unsigned words \n
				2080	/// 10: 16 signed bytes \n
				2081	/// 11: 8 signed words \n
				2082	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2083	/// 00: Subset: Each character in \a B is compared for equality with all
				2084	/// the characters in \a A. \n
				2085	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2086	/// basis is greater than or equal for even-indexed elements in \a A,
				2087	/// and less than or equal for odd-indexed elements in \a A. \n
				2088	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2089	/// \a B for equality. \n
				2090	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2091	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2092	/// mask of the comparison results. \n
				2093	/// 00: No effect. \n
				2094	/// 01: Negate the bit mask. \n
				2095	/// 10: No effect. \n
				2096	/// 11: Negate the bit mask only for bits with an index less than or equal
				2097	/// to the size of \a A or \a B.
				2098	/// \returns 1 if the bit mask is zero and the length of the string in
				2099	/// \a B is the maximum; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2100	#define _mm_cmpestra(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2101	((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
				2102	(__v16qi)(__m128i)(B), (int)(LB), \
				2103	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2104
				2105	/// Uses the immediate operand \a M to perform a comparison of string
				2106	/// data with explicitly defined lengths that is contained in source operands
				2107	/// \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
				2108	/// returns 0.
				2109	///
				2110	/// \headerfile <x86intrin.h>
				2111	///
				2112	/// \code
				2113	/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
				2114	/// \endcode
				2115	///
				2116	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2117	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpestric128 </c>.
				2118	///
				2119	/// \param A
				2120	/// A 128-bit integer vector containing one of the source operands to be
				2121	/// compared.
				2122	/// \param LA
				2123	/// An integer that specifies the length of the string in \a A.
				2124	/// \param B
				2125	/// A 128-bit integer vector containing one of the source operands to be
				2126	/// compared.
				2127	/// \param LB
				2128	/// An integer that specifies the length of the string in \a B.
				2129	/// \param M
				2130	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2131	/// words and the type of comparison to perform. \n
				2132	/// Bits [1:0]: Determine source data format. \n
				2133	/// 00: 16 unsigned bytes \n
				2134	/// 01: 8 unsigned words \n
				2135	/// 10: 16 signed bytes \n
				2136	/// 11: 8 signed words \n
				2137	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2138	/// 00: Subset: Each character in \a B is compared for equality with all
				2139	/// the characters in \a A. \n
				2140	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2141	/// basis is greater than or equal for even-indexed elements in \a A,
				2142	/// and less than or equal for odd-indexed elements in \a A. \n
				2143	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2144	/// \a B for equality. \n
				2145	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2146	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2147	/// mask of the comparison results. \n
				2148	/// 00: No effect. \n
				2149	/// 01: Negate the bit mask. \n
				2150	/// 10: No effect. \n
				2151	/// 11: Negate the bit mask only for bits with an index less than or equal
				2152	/// to the size of \a A or \a B. \n
				2153	/// \returns 1 if the resulting mask is non-zero; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2154	#define _mm_cmpestrc(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2155	((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
				2156	(__v16qi)(__m128i)(B), (int)(LB), \
				2157	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2158
				2159	/// Uses the immediate operand \a M to perform a comparison of string
				2160	/// data with explicitly defined lengths that is contained in source operands
				2161	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
				2162	///
				2163	/// \headerfile <x86intrin.h>
				2164	///
				2165	/// \code
				2166	/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
				2167	/// \endcode
				2168	///
				2169	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2170	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpestrio128 </c>.
				2171	///
				2172	/// \param A
				2173	/// A 128-bit integer vector containing one of the source operands to be
				2174	/// compared.
				2175	/// \param LA
				2176	/// An integer that specifies the length of the string in \a A.
				2177	/// \param B
				2178	/// A 128-bit integer vector containing one of the source operands to be
				2179	/// compared.
				2180	/// \param LB
				2181	/// An integer that specifies the length of the string in \a B.
				2182	/// \param M
				2183	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2184	/// words and the type of comparison to perform. \n
				2185	/// Bits [1:0]: Determine source data format. \n
				2186	/// 00: 16 unsigned bytes \n
				2187	/// 01: 8 unsigned words \n
				2188	/// 10: 16 signed bytes \n
				2189	/// 11: 8 signed words \n
				2190	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2191	/// 00: Subset: Each character in \a B is compared for equality with all
				2192	/// the characters in \a A. \n
				2193	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2194	/// basis is greater than or equal for even-indexed elements in \a A,
				2195	/// and less than or equal for odd-indexed elements in \a A. \n
				2196	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2197	/// \a B for equality. \n
				2198	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2199	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2200	/// mask of the comparison results. \n
				2201	/// 00: No effect. \n
				2202	/// 01: Negate the bit mask. \n
				2203	/// 10: No effect. \n
				2204	/// 11: Negate the bit mask only for bits with an index less than or equal
				2205	/// to the size of \a A or \a B.
				2206	/// \returns Bit 0 of the resulting bit mask.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2207	#define _mm_cmpestro(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2208	((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
				2209	(__v16qi)(__m128i)(B), (int)(LB), \
				2210	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2211
				2212	/// Uses the immediate operand \a M to perform a comparison of string
				2213	/// data with explicitly defined lengths that is contained in source operands
				2214	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
				2215	/// the maximum; otherwise, returns 0.
				2216	///
				2217	/// \headerfile <x86intrin.h>
				2218	///
				2219	/// \code
				2220	/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
				2221	/// \endcode
				2222	///
				2223	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2224	/// instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpestris128 </c>.
				2225	///
				2226	/// \param A
				2227	/// A 128-bit integer vector containing one of the source operands to be
				2228	/// compared.
				2229	/// \param LA
				2230	/// An integer that specifies the length of the string in \a A.
				2231	/// \param B
				2232	/// A 128-bit integer vector containing one of the source operands to be
				2233	/// compared.
				2234	/// \param LB
				2235	/// An integer that specifies the length of the string in \a B.
				2236	/// \param M
				2237	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2238	/// words and the type of comparison to perform. \n
				2239	/// Bits [1:0]: Determine source data format. \n
				2240	/// 00: 16 unsigned bytes \n
				2241	/// 01: 8 unsigned words \n
				2242	/// 10: 16 signed bytes \n
				2243	/// 11: 8 signed words \n
				2244	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2245	/// 00: Subset: Each character in \a B is compared for equality with all
				2246	/// the characters in \a A. \n
				2247	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2248	/// basis is greater than or equal for even-indexed elements in \a A,
				2249	/// and less than or equal for odd-indexed elements in \a A. \n
				2250	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2251	/// \a B for equality. \n
				2252	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2253	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2254	/// mask of the comparison results. \n
				2255	/// 00: No effect. \n
				2256	/// 01: Negate the bit mask. \n
				2257	/// 10: No effect. \n
				2258	/// 11: Negate the bit mask only for bits with an index less than or equal
				2259	/// to the size of \a A or \a B. \n
				2260	/// \returns 1 if the length of the string in \a A is less than the
				2261	/// maximum; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2262	#define _mm_cmpestrs(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2263	((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
				2264	(__v16qi)(__m128i)(B), (int)(LB), \
				2265	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2266
				2267	/// Uses the immediate operand \a M to perform a comparison of string
				2268	/// data with explicitly defined lengths that is contained in source operands
				2269	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
				2270	/// the maximum; otherwise, returns 0.
				2271	///
				2272	/// \headerfile <x86intrin.h>
				2273	///
				2274	/// \code
				2275	/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
				2276	/// \endcode
				2277	///
				2278	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction. It is implemented as a macro that expands to a call to <c> __builtin_ia32_pcmpestriz128 </c>.
				2279	///
				2280	/// \param A
				2281	/// A 128-bit integer vector containing one of the source operands to be
				2282	/// compared.
				2283	/// \param LA
				2284	/// An integer that specifies the length of the string in \a A.
				2285	/// \param B
				2286	/// A 128-bit integer vector containing one of the source operands to be
				2287	/// compared.
				2288	/// \param LB
				2289	/// An integer that specifies the length of the string in \a B.
				2290	/// \param M
				2291	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2292	/// words and the type of comparison to perform. \n
				2293	/// Bits [1:0]: Determine source data format. \n
				2294	/// 00: 16 unsigned bytes \n
				2295	/// 01: 8 unsigned words \n
				2296	/// 10: 16 signed bytes \n
				2297	/// 11: 8 signed words \n
				2298	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2299	/// 00: Subset: Each character in \a B is compared for equality with all
				2300	/// the characters in \a A. \n
				2301	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2302	/// basis is greater than or equal for even-indexed elements in \a A,
				2303	/// and less than or equal for odd-indexed elements in \a A. \n
				2304	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2305	/// \a B for equality. \n
				2306	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2307	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2308	/// mask of the comparison results. \n
				2309	/// 00: No effect. \n
				2310	/// 01: Negate the bit mask. \n
				2311	/// 10: No effect. \n
				2312	/// 11: Negate the bit mask only for bits with an index less than or equal
				2313	/// to the size of \a A or \a B.
				2314	/// \returns 1 if the length of the string in \a B is less than the
				2315	/// maximum; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2316	#define _mm_cmpestrz(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2317	((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
				2318	(__v16qi)(__m128i)(B), (int)(LB), \
				2319	(int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2320
				2321	/* SSE4.2 Compare Packed Data -- Greater Than. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2322	/// Compares each of the corresponding 64-bit values of the 128-bit
				2323	/// integer vectors to determine if the values in the first operand are
				2324	/// greater than those in the second operand. Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
				2325	///
				2326	/// \headerfile <x86intrin.h>
				2327	///
				2328	/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
				2329	///
				2330	/// \param __V1
				2331	/// A 128-bit integer vector; the left-hand side of the comparison.
				2332	/// \param __V2
				2333	/// A 128-bit integer vector; the right-hand side of the comparison.
				2334	/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2335	static __inline__ __m128i __DEFAULT_FN_ATTRS
				2336	_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
				2337	{
				2338	return (__m128i)((__v2di)__V1 > (__v2di)__V2); /* element-wise compare on the __v2di view of both operands */
				2339	}
				2340
				2341	/* SSE4.2 Accumulate CRC32. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2342	/// Adds the unsigned integer operand to the CRC-32C checksum of the
				2343	/// unsigned char operand.
				2344	///
				2345	/// \headerfile <x86intrin.h>
				2346	///
				2347	/// This intrinsic corresponds to the <c> CRC32B </c> instruction. Both operands are forwarded unchanged to <c> __builtin_ia32_crc32qi </c>.
				2348	///
				2349	/// \param __C
				2350	/// An unsigned integer operand to add to the CRC-32C checksum of operand
				2351	/// \a __D.
				2352	/// \param __D
				2353	/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
				2354	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
				2355	/// operand \a __D.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2356	static __inline__ unsigned int __DEFAULT_FN_ATTRS
				2357	_mm_crc32_u8(unsigned int __C, unsigned char __D)
				2358	{
				2359	return __builtin_ia32_crc32qi(__C, __D);
				2360	}
				2361
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2362	/// Adds the unsigned integer operand to the CRC-32C checksum of the
				2363	/// unsigned short operand.
				2364	///
				2365	/// \headerfile <x86intrin.h>
				2366	///
				2367	/// This intrinsic corresponds to the <c> CRC32W </c> instruction. Both operands are forwarded unchanged to <c> __builtin_ia32_crc32hi </c>.
				2368	///
				2369	/// \param __C
				2370	/// An unsigned integer operand to add to the CRC-32C checksum of operand
				2371	/// \a __D.
				2372	/// \param __D
				2373	/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
				2374	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
				2375	/// operand \a __D.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2376	static __inline__ unsigned int __DEFAULT_FN_ATTRS
				2377	_mm_crc32_u16(unsigned int __C, unsigned short __D)
				2378	{
				2379	return __builtin_ia32_crc32hi(__C, __D);
				2380	}
				2381
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2382	/// Adds the first unsigned integer operand to the CRC-32C checksum of
				2383	/// the second unsigned integer operand.
				2384	///
				2385	/// \headerfile <x86intrin.h>
				2386	///
				2387	/// This intrinsic corresponds to the <c> CRC32L </c> instruction. Both operands are forwarded unchanged to <c> __builtin_ia32_crc32si </c>.
				2388	///
				2389	/// \param __C
				2390	/// An unsigned integer operand to add to the CRC-32C checksum of operand
				2391	/// \a __D.
				2392	/// \param __D
				2393	/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
				2394	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
				2395	/// operand \a __D.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2396	static __inline__ unsigned int __DEFAULT_FN_ATTRS
				2397	_mm_crc32_u32(unsigned int __C, unsigned int __D)
				2398	{
				2399	return __builtin_ia32_crc32si(__C, __D);
				2400	}
				2401
				2402	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2403	/// Adds the first unsigned 64-bit integer operand to the CRC-32C checksum of
				2404	/// the second unsigned 64-bit integer operand. Only defined when \c __x86_64__ is defined.
				2405	///
				2406	/// \headerfile <x86intrin.h>
				2407	///
				2408	/// This intrinsic corresponds to the <c> CRC32Q </c> instruction. Both operands are forwarded unchanged to <c> __builtin_ia32_crc32di </c>.
				2409	///
				2410	/// \param __C
				2411	/// An unsigned 64-bit integer operand to add to the CRC-32C checksum of
				2412	/// operand \a __D.
				2413	/// \param __D
				2414	/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
				2415	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
				2416	/// operand \a __D.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2417	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
				2418	_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
				2419	{
				2420	return __builtin_ia32_crc32di(__C, __D);
				2421	}
				2422	#endif /* __x86_64__ */
				2423
				2424	#undef __DEFAULT_FN_ATTRS
				2425
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2426	#include <popcntintrin.h>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2427
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2428	#endif /* __SMMINTRIN_H */