Blame - linux-x86/lib64/clang/14.0.2/include/smmintrin.h - platform/prebuilts/clang-tools

blob: 710e55aaa1203c77f8aba16de667a89b3953d248 [file] [log] [blame]

Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1	/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
				2	*
Logan Chien	df4f766	2019-09-04 16:45:23 -0700	[diff] [blame]	3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	* See https://llvm.org/LICENSE.txt for license information.
				5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	6	*
				7	*===-----------------------------------------------------------------------===
				8	*/
				9
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	10	#ifndef __SMMINTRIN_H
				11	#define __SMMINTRIN_H
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	12
Pirama Arumuga Nainar	ec8c89d	2022-02-23 09:26:16 -0800	[diff] [blame^]	13	#if !defined(__i386__) && !defined(__x86_64__)
				14	#error "This header is only meant to be used on x86 and x64 architecture"
				15	#endif
				16
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	17	#include <tmmintrin.h>
				18
				19	/* Default attributes for the functions defined in this file: __always_inline__, __nodebug__, the sse4.1 target feature, and a __min_vector_width__ of 128. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	20	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	21
				22	/* SSE4 rounding-control constants, passed as the M argument of the _mm_round_* macros. */
				23	#define _MM_FROUND_TO_NEAREST_INT 0x00 /* bits [1:0] = 00: round to nearest */
				24	#define _MM_FROUND_TO_NEG_INF 0x01 /* bits [1:0] = 01: round downward (toward negative infinity) */
				25	#define _MM_FROUND_TO_POS_INF 0x02 /* bits [1:0] = 10: round upward (toward positive infinity) */
				26	#define _MM_FROUND_TO_ZERO 0x03 /* bits [1:0] = 11: truncate */
				27	#define _MM_FROUND_CUR_DIRECTION 0x04 /* bit [2] set: use the current MXCSR setting instead of bits [1:0] */
				28
				29	#define _MM_FROUND_RAISE_EXC 0x00 /* bit [3] clear: a normal PE exception is used */
				30	#define _MM_FROUND_NO_EXC 0x08 /* bit [3] set: the PE field is not updated */
				31
				32	#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) /* Composite modes: an exception-control flag ORed with a rounding direction. This one: nearest, normal PE exception. */
				33	#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) /* toward negative infinity, normal PE exception */
				34	#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) /* toward positive infinity, normal PE exception */
				35	#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) /* truncate, normal PE exception */
				36	#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) /* current MXCSR direction, normal PE exception */
				37	#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) /* current MXCSR direction, PE field not updated */
				38
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	39	/// Rounds up each element of the 128-bit vector of [4 x float] to an
				40	/// integer and returns the rounded values in a 128-bit vector of
				41	/// [4 x float].
				42	///
				43	/// \headerfile <x86intrin.h>
				44	///
				45	/// \code
				46	/// __m128 _mm_ceil_ps(__m128 X);
				47	/// \endcode
				48	///
				49	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
				50	///
				51	/// \param X
				52	/// A 128-bit vector of [4 x float] values to be rounded up.
				53	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	54	#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) /* Alias: _mm_round_ps with M fixed to _MM_FROUND_CEIL; X is forwarded parenthesized. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	55
				56	/// Rounds up each element of the 128-bit vector of [2 x double] to an
				57	/// integer and returns the rounded values in a 128-bit vector of
				58	/// [2 x double].
				59	///
				60	/// \headerfile <x86intrin.h>
				61	///
				62	/// \code
				63	/// __m128d _mm_ceil_pd(__m128d X);
				64	/// \endcode
				65	///
				66	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
				67	///
				68	/// \param X
				69	/// A 128-bit vector of [2 x double] values to be rounded up.
				70	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	71	#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) /* Alias: _mm_round_pd with M fixed to _MM_FROUND_CEIL. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	72
				73	/// Copies three upper elements of the first 128-bit vector operand to
				74	/// the corresponding three upper elements of the 128-bit result vector of
				75	/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
				76	/// operand to an integer and copies it to the lowest element of the 128-bit
				77	/// result vector of [4 x float].
				78	///
				79	/// \headerfile <x86intrin.h>
				80	///
				81	/// \code
				82	/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
				83	/// \endcode
				84	///
				85	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
				86	///
				87	/// \param X
				88	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
				89	/// copied to the corresponding bits of the result.
				90	/// \param Y
				91	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
				92	/// rounded up to the nearest integer and copied to the corresponding bits
				93	/// of the result.
				94	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
				95	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	96	#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) /* Alias: _mm_round_ss with M fixed to _MM_FROUND_CEIL; X and Y are forwarded in the same order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	97
				98	/// Copies the upper element of the first 128-bit vector operand to the
				99	/// corresponding upper element of the 128-bit result vector of [2 x double].
				100	/// Rounds up the lower element of the second 128-bit vector operand to an
				101	/// integer and copies it to the lower element of the 128-bit result vector
				102	/// of [2 x double].
				103	///
				104	/// \headerfile <x86intrin.h>
				105	///
				106	/// \code
				107	/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
				108	/// \endcode
				109	///
				110	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
				111	///
				112	/// \param X
				113	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
				114	/// copied to the corresponding bits of the result.
				115	/// \param Y
				116	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
				117	/// rounded up to the nearest integer and copied to the corresponding bits
				118	/// of the result.
				119	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
				120	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	121	#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) /* Alias: _mm_round_sd with M fixed to _MM_FROUND_CEIL; X and Y are forwarded in the same order. */
				122
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	123	/// Rounds down each element of the 128-bit vector of [4 x float] to an
				124	/// integer and returns the rounded values in a 128-bit vector of
				125	/// [4 x float].
				126	///
				127	/// \headerfile <x86intrin.h>
				128	///
				129	/// \code
				130	/// __m128 _mm_floor_ps(__m128 X);
				131	/// \endcode
				132	///
				133	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
				134	///
				135	/// \param X
				136	/// A 128-bit vector of [4 x float] values to be rounded down.
				137	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	138	#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) /* Alias: _mm_round_ps with M fixed to _MM_FROUND_FLOOR; X is forwarded parenthesized. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	139
				140	/// Rounds down each element of the 128-bit vector of [2 x double] to an
				141	/// integer and returns the rounded values in a 128-bit vector of
				142	/// [2 x double].
				143	///
				144	/// \headerfile <x86intrin.h>
				145	///
				146	/// \code
				147	/// __m128d _mm_floor_pd(__m128d X);
				148	/// \endcode
				149	///
				150	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
				151	///
				152	/// \param X
				153	/// A 128-bit vector of [2 x double] values to be rounded down.
				154	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	155	#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) /* Alias: _mm_round_pd with M fixed to _MM_FROUND_FLOOR. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	156
				157	/// Copies three upper elements of the first 128-bit vector operand to
				158	/// the corresponding three upper elements of the 128-bit result vector of
				159	/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
				160	/// operand to an integer and copies it to the lowest element of the 128-bit
				161	/// result vector of [4 x float].
				162	///
				163	/// \headerfile <x86intrin.h>
				164	///
				165	/// \code
				166	/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
				167	/// \endcode
				168	///
				169	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
				170	///
				171	/// \param X
				172	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
				173	/// copied to the corresponding bits of the result.
				174	/// \param Y
				175	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
				176	/// rounded down to the nearest integer and copied to the corresponding bits
				177	/// of the result.
				178	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
				179	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	180	#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) /* Alias: _mm_round_ss with M fixed to _MM_FROUND_FLOOR; X and Y are forwarded in the same order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	181
				182	/// Copies the upper element of the first 128-bit vector operand to the
				183	/// corresponding upper element of the 128-bit result vector of [2 x double].
				184	/// Rounds down the lower element of the second 128-bit vector operand to an
				185	/// integer and copies it to the lower element of the 128-bit result vector
				186	/// of [2 x double].
				187	///
				188	/// \headerfile <x86intrin.h>
				189	///
				190	/// \code
				191	/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
				192	/// \endcode
				193	///
				194	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
				195	///
				196	/// \param X
				197	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
				198	/// copied to the corresponding bits of the result.
				199	/// \param Y
				200	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
				201	/// rounded down to the nearest integer and copied to the corresponding bits
				202	/// of the result.
				203	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
				204	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	205	#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) /* Alias: _mm_round_sd with M fixed to _MM_FROUND_FLOOR; X and Y are forwarded in the same order. */
				206
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	207	/// Rounds each element of the 128-bit vector of [4 x float] to an
				208	/// integer value according to the rounding control specified by the second
				209	/// argument and returns the rounded values in a 128-bit vector of
				210	/// [4 x float].
				211	///
				212	/// \headerfile <x86intrin.h>
				213	///
				214	/// \code
				215	/// __m128 _mm_round_ps(__m128 X, const int M);
				216	/// \endcode
				217	///
				218	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
				219	///
				220	/// \param X
				221	/// A 128-bit vector of [4 x float].
				222	/// \param M
				223	/// An integer value that specifies the rounding operation. \n
				224	/// Bits [7:4] are reserved. \n
				225	/// Bit [3] is a precision exception value: \n
				226	/// 0: A normal PE exception is used \n
				227	/// 1: The PE field is not updated \n
				228	/// Bit [2] is the rounding control source: \n
				229	/// 0: Use bits [1:0] of \a M \n
				230	/// 1: Use the current MXCSR setting \n
				231	/// Bits [1:0] contain the rounding control definition: \n
				232	/// 00: Nearest \n
				233	/// 01: Downward (toward negative infinity) \n
				234	/// 10: Upward (toward positive infinity) \n
				235	/// 11: Truncated
				236	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
				237	#define _mm_round_ps(X, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	238	((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))) /* X is cast to __m128 and then to __v4sf; M is forwarded unchanged. NOTE(review): presumably a macro rather than a function so that M stays a constant expression for the builtin - confirm. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	239
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	240	/// Copies three upper elements of the first 128-bit vector operand to
				241	/// the corresponding three upper elements of the 128-bit result vector of
				242	/// [4 x float]. Rounds the lowest element of the second 128-bit vector
				243	/// operand to an integer value according to the rounding control specified
				244	/// by the third argument and copies it to the lowest element of the 128-bit
				245	/// result vector of [4 x float].
				246	///
				247	/// \headerfile <x86intrin.h>
				248	///
				249	/// \code
				250	/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
				251	/// \endcode
				252	///
				253	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
				254	///
				255	/// \param X
				256	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
				257	/// copied to the corresponding bits of the result.
				258	/// \param Y
				259	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
				260	/// rounded to the nearest integer using the specified rounding control and
				261	/// copied to the corresponding bits of the result.
				262	/// \param M
				263	/// An integer value that specifies the rounding operation. \n
				264	/// Bits [7:4] are reserved. \n
				265	/// Bit [3] is a precision exception value: \n
				266	/// 0: A normal PE exception is used \n
				267	/// 1: The PE field is not updated \n
				268	/// Bit [2] is the rounding control source: \n
				269	/// 0: Use bits [1:0] of \a M \n
				270	/// 1: Use the current MXCSR setting \n
				271	/// Bits [1:0] contain the rounding control definition: \n
				272	/// 00: Nearest \n
				273	/// 01: Downward (toward negative infinity) \n
				274	/// 10: Upward (toward positive infinity) \n
				275	/// 11: Truncated
				276	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
				277	/// values.
				278	#define _mm_round_ss(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	279	((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
				280	(__v4sf)(__m128)(Y), (M))) /* X and Y are each cast to __m128 and then to __v4sf; M is forwarded unchanged and the result is cast back to __m128. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	281
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	282	/// Rounds each element of the 128-bit vector of [2 x double] to an
				283	/// integer value according to the rounding control specified by the second
				284	/// argument and returns the rounded values in a 128-bit vector of
				285	/// [2 x double].
				286	///
				287	/// \headerfile <x86intrin.h>
				288	///
				289	/// \code
				290	/// __m128d _mm_round_pd(__m128d X, const int M);
				291	/// \endcode
				292	///
				293	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
				294	///
				295	/// \param X
				296	/// A 128-bit vector of [2 x double].
				297	/// \param M
				298	/// An integer value that specifies the rounding operation. \n
				299	/// Bits [7:4] are reserved. \n
				300	/// Bit [3] is a precision exception value: \n
				301	/// 0: A normal PE exception is used \n
				302	/// 1: The PE field is not updated \n
				303	/// Bit [2] is the rounding control source: \n
				304	/// 0: Use bits [1:0] of \a M \n
				305	/// 1: Use the current MXCSR setting \n
				306	/// Bits [1:0] contain the rounding control definition: \n
				307	/// 00: Nearest \n
				308	/// 01: Downward (toward negative infinity) \n
				309	/// 10: Upward (toward positive infinity) \n
				310	/// 11: Truncated
				311	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
				312	#define _mm_round_pd(X, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	313	((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))) /* X is cast to __m128d and then to __v2df; M is forwarded unchanged and the result is cast back to __m128d. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	314
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	315	/// Copies the upper element of the first 128-bit vector operand to the
				316	/// corresponding upper element of the 128-bit result vector of [2 x double].
				317	/// Rounds the lower element of the second 128-bit vector operand to an
				318	/// integer value according to the rounding control specified by the third
				319	/// argument and copies it to the lower element of the 128-bit result vector
				320	/// of [2 x double].
				321	///
				322	/// \headerfile <x86intrin.h>
				323	///
				324	/// \code
				325	/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
				326	/// \endcode
				327	///
				328	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
				329	///
				330	/// \param X
				331	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
				332	/// copied to the corresponding bits of the result.
				333	/// \param Y
				334	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
				335	/// rounded to the nearest integer using the specified rounding control and
				336	/// copied to the corresponding bits of the result.
				337	/// \param M
				338	/// An integer value that specifies the rounding operation. \n
				339	/// Bits [7:4] are reserved. \n
				340	/// Bit [3] is a precision exception value: \n
				341	/// 0: A normal PE exception is used \n
				342	/// 1: The PE field is not updated \n
				343	/// Bit [2] is the rounding control source: \n
				344	/// 0: Use bits [1:0] of \a M \n
				345	/// 1: Use the current MXCSR setting \n
				346	/// Bits [1:0] contain the rounding control definition: \n
				347	/// 00: Nearest \n
				348	/// 01: Downward (toward negative infinity) \n
				349	/// 10: Upward (toward positive infinity) \n
				350	/// 11: Truncated
				351	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
				352	/// values.
				353	#define _mm_round_sd(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	354	((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
				355	(__v2df)(__m128d)(Y), (M))) /* X and Y are each cast to __m128d and then to __v2df; M is forwarded unchanged and the result is cast back to __m128d. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	356
				357	/* SSE4 Packed Blending Intrinsics. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	358	/// Returns a 128-bit vector of [2 x double] where the values are
				359	/// selected from either the first or second operand as specified by the
				360	/// third operand, the control mask.
				361	///
				362	/// \headerfile <x86intrin.h>
				363	///
				364	/// \code
				365	/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
				366	/// \endcode
				367	///
				368	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
				369	///
				370	/// \param V1
				371	/// A 128-bit vector of [2 x double].
				372	/// \param V2
				373	/// A 128-bit vector of [2 x double].
				374	/// \param M
				375	/// An immediate integer operand, with mask bits [1:0] specifying how the
				376	/// values are to be copied. The position of the mask bit corresponds to the
				377	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
				378	/// element in operand \a V1 is copied to the same position in the result.
				379	/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
				380	/// is copied to the same position in the result.
				381	/// \returns A 128-bit vector of [2 x double] containing the copied values.
				382	#define _mm_blend_pd(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	383	((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
				384	(__v2df)(__m128d)(V2), (int)(M))) /* Unlike the _mm_round_* macros, the mask M is explicitly cast to int before it reaches the builtin. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	385
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	386	/// Returns a 128-bit vector of [4 x float] where the values are selected
				387	/// from either the first or second operand as specified by the third
				388	/// operand, the control mask.
				389	///
				390	/// \headerfile <x86intrin.h>
				391	///
				392	/// \code
				393	/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
				394	/// \endcode
				395	///
				396	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
				397	///
				398	/// \param V1
				399	/// A 128-bit vector of [4 x float].
				400	/// \param V2
				401	/// A 128-bit vector of [4 x float].
				402	/// \param M
				403	/// An immediate integer operand, with mask bits [3:0] specifying how the
				404	/// values are to be copied. The position of the mask bit corresponds to the
				405	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
				406	/// element in operand \a V1 is copied to the same position in the result.
				407	/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
				408	/// is copied to the same position in the result.
				409	/// \returns A 128-bit vector of [4 x float] containing the copied values.
				410	#define _mm_blend_ps(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	411	((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
				412	(__v4sf)(__m128)(V2), (int)(M))) /* V1 and V2 are viewed as __v4sf; the mask M is explicitly cast to int before it reaches the builtin. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	413
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	414	/// Returns a 128-bit vector of [2 x double] where the values are
				415	/// selected from either the first or second operand as specified by the
				416	/// third operand, the control mask.
				417	///
				418	/// \headerfile <x86intrin.h>
				419	///
				420	/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
				421	///
				422	/// \param __V1
				423	/// A 128-bit vector of [2 x double].
				424	/// \param __V2
				425	/// A 128-bit vector of [2 x double].
				426	/// \param __M
				427	/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
				428	/// values are to be copied. The position of the mask bit corresponds to the
				429	/// most significant bit of a copied value. When a mask bit is 0, the
				430	/// corresponding 64-bit element in operand \a __V1 is copied to the same
				431	/// position in the result. When a mask bit is 1, the corresponding 64-bit
				432	/// element in operand \a __V2 is copied to the same position in the result.
				433	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	434	static __inline__ __m128d __DEFAULT_FN_ATTRS
				435	_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
				436	{ /* Variable blend: the mask __M is a vector operand, not an immediate as in _mm_blend_pd. */
				437	return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, /* all three operands are viewed as __v2df */
				438	(__v2df)__M);
				439	}
				440
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	441	/// Returns a 128-bit vector of [4 x float] where the values are
				442	/// selected from either the first or second operand as specified by the
				443	/// third operand, the control mask.
				444	///
				445	/// \headerfile <x86intrin.h>
				446	///
				447	/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
				448	///
				449	/// \param __V1
				450	/// A 128-bit vector of [4 x float].
				451	/// \param __V2
				452	/// A 128-bit vector of [4 x float].
				453	/// \param __M
				454	/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
				455	/// how the values are to be copied. The position of the mask bit corresponds
				456	/// to the most significant bit of a copied value. When a mask bit is 0, the
				457	/// corresponding 32-bit element in operand \a __V1 is copied to the same
				458	/// position in the result. When a mask bit is 1, the corresponding 32-bit
				459	/// element in operand \a __V2 is copied to the same position in the result.
				460	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	461	static __inline__ __m128 __DEFAULT_FN_ATTRS
				462	_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
				463	{ /* Variable blend: the mask __M is a vector operand, not an immediate as in _mm_blend_ps. */
				464	return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, /* all three operands are viewed as __v4sf */
				465	(__v4sf)__M);
				466	}
				467
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	468	/// Returns a 128-bit vector of [16 x i8] where the values are selected
				469	/// from either of the first or second operand as specified by the third
				470	/// operand, the control mask.
				471	///
				472	/// \headerfile <x86intrin.h>
				473	///
				474	/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
				475	///
				476	/// \param __V1
				477	/// A 128-bit vector of [16 x i8].
				478	/// \param __V2
				479	/// A 128-bit vector of [16 x i8].
				480	/// \param __M
				481	/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
				482	/// how the values are to be copied. The position of the mask bit corresponds
				483	/// to the most significant bit of a copied value. When a mask bit is 0, the
				484	/// corresponding 8-bit element in operand \a __V1 is copied to the same
				485	/// position in the result. When a mask bit is 1, the corresponding 8-bit
				486	/// element in operand \a __V2 is copied to the same position in the result.
				487	/// \returns A 128-bit vector of [16 x i8] containing the copied values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	488	static __inline__ __m128i __DEFAULT_FN_ATTRS
				489	_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
				490	{ /* Byte-granular variable blend: the mask __M is a vector operand. */
				491	return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, /* the generic __m128i operands are viewed as __v16qi for the builtin */
				492	(__v16qi)__M);
				493	}
				494
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	495	/// Returns a 128-bit vector of [8 x i16] where the values are selected
				496	/// from either of the first or second operand as specified by the third
				497	/// operand, the control mask.
				498	///
				499	/// \headerfile <x86intrin.h>
				500	///
				501	/// \code
				502	/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
				503	/// \endcode
				504	///
				505	/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
				506	///
				507	/// \param V1
				508	/// A 128-bit vector of [8 x i16].
				509	/// \param V2
				510	/// A 128-bit vector of [8 x i16].
				511	/// \param M
				512	/// An immediate integer operand, with mask bits [7:0] specifying how the
				513	/// values are to be copied. The position of the mask bit corresponds to the
				514	/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
				515	/// element in operand \a V1 is copied to the same position in the result.
				516	/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
				517	/// is copied to the same position in the result.
				518	/// \returns A 128-bit vector of [8 x i16] containing the copied values.
				519	#define _mm_blend_epi16(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	520	((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
				521	(__v8hi)(__m128i)(V2), (int)(M))) /* V1 and V2 are viewed as __v8hi; the mask M is explicitly cast to int before it reaches the builtin. */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	522
				523	/* SSE4 Dword Multiply Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	524	/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
				525	/// and returns the lower 32 bits of each product in a 128-bit vector of
				526	/// [4 x i32].
				527	///
				528	/// \headerfile <x86intrin.h>
				529	///
				530	/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
				531	///
				532	/// \param __V1
				533	/// A 128-bit integer vector.
				534	/// \param __V2
				535	/// A 128-bit integer vector.
				536	/// \returns A 128-bit integer vector containing the products of both operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	537	static __inline__ __m128i __DEFAULT_FN_ATTRS
				538	_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
				539	{ /* No builtin call here: the product is a plain element-wise vector multiply. */
				540	return (__m128i) ((__v4su)__V1 * (__v4su)__V2); /* NOTE(review): __v4su is presumably the unsigned 32-bit lane type, which would make per-lane wraparound well-defined - confirm against its typedef (outside this chunk). */
				541	}
				542
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	543	/// Multiplies corresponding even-indexed elements of two 128-bit
				544	/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
				545	/// containing the products.
				546	///
				547	/// \headerfile <x86intrin.h>
				548	///
				549	/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
				550	///
				551	/// \param __V1
				552	/// A 128-bit vector of [4 x i32].
				553	/// \param __V2
				554	/// A 128-bit vector of [4 x i32].
				555	/// \returns A 128-bit vector of [2 x i64] containing the products of both
				556	/// operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	557	static __inline__ __m128i __DEFAULT_FN_ATTRS
				558	_mm_mul_epi32 (__m128i __V1, __m128i __V2)
				559	{ /* Both operands are viewed as __v4si for the builtin; the result is cast back to the generic __m128i type. */
				560	return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
				561	}
				562
				563	/* SSE4 Floating Point Dot Product Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	564	/// Computes the dot product of the two 128-bit vectors of [4 x float]
				565	/// and returns it in the elements of the 128-bit result vector of
				566	/// [4 x float].
				567	///
				568	/// The immediate integer operand controls which input elements
				569	/// will contribute to the dot product, and where the final results are
				570	/// returned.
				571	///
				572	/// \headerfile <x86intrin.h>
				573	///
				574	/// \code
				575	/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
				576	/// \endcode
				577	///
				578	/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
				579	///
				580	/// \param X
				581	/// A 128-bit vector of [4 x float].
				582	/// \param Y
				583	/// A 128-bit vector of [4 x float].
				584	/// \param M
				585	/// An immediate integer operand. Mask bits [7:4] determine which elements
				586	/// of the input vectors are used, with bit [4] corresponding to the lowest
				587	/// element and bit [7] corresponding to the highest element of each [4 x
				588	/// float] vector. If a bit is set, the corresponding elements from the two
				589	/// input vectors are used as an input for dot product; otherwise that input
				590	/// is treated as zero. Bits [3:0] determine which elements of the result
				591	/// will receive a copy of the final dot product, with bit [0] corresponding
				592	/// to the lowest element and bit [3] corresponding to the highest element of
				593	/// each [4 x float] subvector. If a bit is set, the dot product is returned
				594	/// in the corresponding element; otherwise that element is set to zero.
				595	/// \returns A 128-bit vector of [4 x float] containing the dot product.
				596	#define _mm_dp_ps(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	597	((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
				598	(__v4sf)(__m128)(Y), (M))) /* X and Y are viewed as __v4sf; the control mask M is forwarded unchanged (no int cast, unlike the _mm_blend_* macros). */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	599
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	600	/// Computes the dot product of the two 128-bit vectors of [2 x double]
				601	/// and returns it in the elements of the 128-bit result vector of
				602	/// [2 x double].
				603	///
				604	/// The immediate integer operand controls which input
				605	/// elements will contribute to the dot product, and where the final results
				606	/// are returned.
				607	///
				608	/// \headerfile <x86intrin.h>
				609	///
				610	/// \code
				611	/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
				612	/// \endcode
				613	///
				614	/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
				615	///
				616	/// \param X
				617	/// A 128-bit vector of [2 x double].
				618	/// \param Y
				619	/// A 128-bit vector of [2 x double].
				620	/// \param M
				621	/// An immediate integer operand. Mask bits [5:4] determine which elements
				622	/// of the input vectors are used, with bit [4] corresponding to the lowest
				623	/// element and bit [5] corresponding to the highest element of each [2 x
				624	/// double] vector. If a bit is set, the corresponding elements from the two
				625	/// input vectors are used as an input for dot product; otherwise that input
				626	/// is treated as zero. Bits [1:0] determine which elements of the result
				627	/// will receive a copy of the final dot product, with bit [0] corresponding
				628	/// to the lowest element and bit [1] corresponding to the highest element of
				629	/// each [2 x double] vector. If a bit is set, the dot product is returned in
				630	/// the corresponding element; otherwise that element is set to zero.
				631	#define _mm_dp_pd(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	632	((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
				633	(__v2df)(__m128d)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	634
				635	/* SSE4 Streaming Load Hint Instruction. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	636	/// Loads integer values from a 128-bit aligned memory location to a
				637	/// 128-bit integer vector.
				638	///
				639	/// \headerfile <x86intrin.h>
				640	///
				641	/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
				642	///
				643	/// \param __V
				644	/// A pointer to a 128-bit aligned memory location that contains the integer
				645	/// values.
				646	/// \returns A 128-bit integer vector containing the data stored at the
				647	/// specified memory location.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	648	static __inline__ __m128i __DEFAULT_FN_ATTRS
				649	_mm_stream_load_si128 (__m128i const *__V)
				650	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	651	return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	652	}
				653
				654	/* SSE4 Packed Integer Min/Max Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	655	/// Compares the corresponding elements of two 128-bit vectors of
				656	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
				657	/// of the two values.
				658	///
				659	/// \headerfile <x86intrin.h>
				660	///
				661	/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
				662	///
				663	/// \param __V1
				664	/// A 128-bit vector of [16 x i8].
				665	/// \param __V2
				666	/// A 128-bit vector of [16 x i8].
				667	/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	668	static __inline__ __m128i __DEFAULT_FN_ATTRS
				669	_mm_min_epi8 (__m128i __V1, __m128i __V2)
				670	{
				671	return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
				672	}
				673
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	674	/// Compares the corresponding elements of two 128-bit vectors of
				675	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
				676	/// greater value of the two.
				677	///
				678	/// \headerfile <x86intrin.h>
				679	///
				680	/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
				681	///
				682	/// \param __V1
				683	/// A 128-bit vector of [16 x i8].
				684	/// \param __V2
				685	/// A 128-bit vector of [16 x i8].
				686	/// \returns A 128-bit vector of [16 x i8] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	687	static __inline__ __m128i __DEFAULT_FN_ATTRS
				688	_mm_max_epi8 (__m128i __V1, __m128i __V2)
				689	{
				690	return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
				691	}
				692
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	693	/// Compares the corresponding elements of two 128-bit vectors of
				694	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
				695	/// value of the two.
				696	///
				697	/// \headerfile <x86intrin.h>
				698	///
				699	/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
				700	///
				701	/// \param __V1
				702	/// A 128-bit vector of [8 x u16].
				703	/// \param __V2
				704	/// A 128-bit vector of [8 x u16].
				705	/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	706	static __inline__ __m128i __DEFAULT_FN_ATTRS
				707	_mm_min_epu16 (__m128i __V1, __m128i __V2)
				708	{
				709	return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
				710	}
				711
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	712	/// Compares the corresponding elements of two 128-bit vectors of
				713	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
				714	/// greater value of the two.
				715	///
				716	/// \headerfile <x86intrin.h>
				717	///
				718	/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
				719	///
				720	/// \param __V1
				721	/// A 128-bit vector of [8 x u16].
				722	/// \param __V2
				723	/// A 128-bit vector of [8 x u16].
				724	/// \returns A 128-bit vector of [8 x u16] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	725	static __inline__ __m128i __DEFAULT_FN_ATTRS
				726	_mm_max_epu16 (__m128i __V1, __m128i __V2)
				727	{
				728	return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
				729	}
				730
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	731	/// Compares the corresponding elements of two 128-bit vectors of
				732	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
				733	/// value of the two.
				734	///
				735	/// \headerfile <x86intrin.h>
				736	///
				737	/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
				738	///
				739	/// \param __V1
				740	/// A 128-bit vector of [4 x i32].
				741	/// \param __V2
				742	/// A 128-bit vector of [4 x i32].
				743	/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	744	static __inline__ __m128i __DEFAULT_FN_ATTRS
				745	_mm_min_epi32 (__m128i __V1, __m128i __V2)
				746	{
				747	return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
				748	}
				749
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	750	/// Compares the corresponding elements of two 128-bit vectors of
				751	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
				752	/// greater value of the two.
				753	///
				754	/// \headerfile <x86intrin.h>
				755	///
				756	/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
				757	///
				758	/// \param __V1
				759	/// A 128-bit vector of [4 x i32].
				760	/// \param __V2
				761	/// A 128-bit vector of [4 x i32].
				762	/// \returns A 128-bit vector of [4 x i32] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	763	static __inline__ __m128i __DEFAULT_FN_ATTRS
				764	_mm_max_epi32 (__m128i __V1, __m128i __V2)
				765	{
				766	return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
				767	}
				768
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	769	/// Compares the corresponding elements of two 128-bit vectors of
				770	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
				771	/// value of the two.
				772	///
				773	/// \headerfile <x86intrin.h>
				774	///
				775	/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
				776	///
				777	/// \param __V1
				778	/// A 128-bit vector of [4 x u32].
				779	/// \param __V2
				780	/// A 128-bit vector of [4 x u32].
				781	/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	782	static __inline__ __m128i __DEFAULT_FN_ATTRS
				783	_mm_min_epu32 (__m128i __V1, __m128i __V2)
				784	{
				785	return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
				786	}
				787
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	788	/// Compares the corresponding elements of two 128-bit vectors of
				789	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
				790	/// greater value of the two.
				791	///
				792	/// \headerfile <x86intrin.h>
				793	///
				794	/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
				795	///
				796	/// \param __V1
				797	/// A 128-bit vector of [4 x u32].
				798	/// \param __V2
				799	/// A 128-bit vector of [4 x u32].
				800	/// \returns A 128-bit vector of [4 x u32] containing the greater values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	801	static __inline__ __m128i __DEFAULT_FN_ATTRS
				802	_mm_max_epu32 (__m128i __V1, __m128i __V2)
				803	{
				804	return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
				805	}
				806
				807	/* SSE4 Insertion and Extraction from XMM Register Instructions. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	808	/// Takes the first argument \a X and inserts an element from the second
				809	/// argument \a Y as selected by the third argument \a N. That result then
				810	/// has elements zeroed out also as selected by the third argument \a N. The
				811	/// resulting 128-bit vector of [4 x float] is then returned.
				812	///
				813	/// \headerfile <x86intrin.h>
				814	///
				815	/// \code
				816	/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
				817	/// \endcode
				818	///
				819	/// This intrinsic corresponds to the <c> VINSERTPS / INSERTPS </c> instruction.
				820	///
				821	/// \param X
				822	/// A 128-bit vector source operand of [4 x float]. With the exception of
				823	/// those bits in the result copied from parameter \a Y and zeroed by bits
				824	/// [3:0] of \a N, all bits from this parameter are copied to the result.
				825	/// \param Y
				826	/// A 128-bit vector source operand of [4 x float]. One single-precision
				827	/// floating-point element from this source, as determined by the immediate
				828	/// parameter, is copied to the result.
				829	/// \param N
				830	/// Specifies which bits from operand \a Y will be copied, which bits in the
				831	/// result they will be copied to, and which bits in the result will be
				832	/// cleared. The following assignments are made: \n
				833	/// Bits [7:6] specify the bits to copy from operand \a Y: \n
				834	/// 00: Selects bits [31:0] from operand \a Y. \n
				835	/// 01: Selects bits [63:32] from operand \a Y. \n
				836	/// 10: Selects bits [95:64] from operand \a Y. \n
				837	/// 11: Selects bits [127:96] from operand \a Y. \n
				838	/// Bits [5:4] specify the bits in the result to which the selected bits
				839	/// from operand \a Y are copied: \n
				840	/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
				841	/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
				842	/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
				843	/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
				844	/// Bits [3:0]: If any of these bits are set, the corresponding result
				845	/// element is cleared.
				846	/// \returns A 128-bit vector of [4 x float] containing the copied
				847	/// single-precision floating point elements from the operands.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	848	#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	849
				850	/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
				851	/// returns it, using the immediate value parameter \a N as a selector.
				852	///
				853	/// \headerfile <x86intrin.h>
				854	///
				855	/// \code
				856	/// int _mm_extract_ps(__m128 X, const int N);
				857	/// \endcode
				858	///
				859	/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
				860	/// instruction.
				861	///
				862	/// \param X
				863	/// A 128-bit vector of [4 x float].
				864	/// \param N
				865	/// An immediate value. Bits [1:0] determine which bits from the argument
				866	/// \a X are extracted and returned: \n
				867	/// 00: Bits [31:0] of parameter \a X are returned. \n
				868	/// 01: Bits [63:32] of parameter \a X are returned. \n
				869	/// 10: Bits [95:64] of parameter \a X are returned. \n
				870	/// 11: Bits [127:96] of parameter \a X are returned.
				871	/// \returns A 32-bit integer containing the extracted 32 bits of float data.
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	872	#define _mm_extract_ps(X, N) \
				873	__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	874
				875	/* Miscellaneous insert and extract macros. */
				876	/* Extract a single-precision float from X at index N into D. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	877	#define _MM_EXTRACT_FLOAT(D, X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	878	do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	879
				880	/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
				881	an index suitable for _mm_insert_ps. */
				882	#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
				883
				884	/* Extract a float from X at index N into the first index of the return. */
				885	#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
				886	_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
				887
				888	/* Insert int into packed integer array at index. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	889	/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
				890	/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
				891	/// of an integer parameter \a I into an offset specified by the immediate
				892	/// value parameter \a N.
				893	///
				894	/// \headerfile <x86intrin.h>
				895	///
				896	/// \code
				897	/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
				898	/// \endcode
				899	///
				900	/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
				901	///
				902	/// \param X
				903	/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
				904	/// result and then one of the sixteen elements in the result vector is
				905	/// replaced by the lower 8 bits of \a I.
				906	/// \param I
				907	/// An integer. The lower 8 bits of this operand are written to the result
				908	/// beginning at the offset specified by \a N.
				909	/// \param N
				910	/// An immediate value. Bits [3:0] specify the bit offset in the result at
				911	/// which the lower 8 bits of \a I are written. \n
				912	/// 0000: Bits [7:0] of the result are used for insertion. \n
				913	/// 0001: Bits [15:8] of the result are used for insertion. \n
				914	/// 0010: Bits [23:16] of the result are used for insertion. \n
				915	/// 0011: Bits [31:24] of the result are used for insertion. \n
				916	/// 0100: Bits [39:32] of the result are used for insertion. \n
				917	/// 0101: Bits [47:40] of the result are used for insertion. \n
				918	/// 0110: Bits [55:48] of the result are used for insertion. \n
				919	/// 0111: Bits [63:56] of the result are used for insertion. \n
				920	/// 1000: Bits [71:64] of the result are used for insertion. \n
				921	/// 1001: Bits [79:72] of the result are used for insertion. \n
				922	/// 1010: Bits [87:80] of the result are used for insertion. \n
				923	/// 1011: Bits [95:88] of the result are used for insertion. \n
				924	/// 1100: Bits [103:96] of the result are used for insertion. \n
				925	/// 1101: Bits [111:104] of the result are used for insertion. \n
				926	/// 1110: Bits [119:112] of the result are used for insertion. \n
				927	/// 1111: Bits [127:120] of the result are used for insertion.
				928	/// \returns A 128-bit integer vector containing the constructed values.
				929	#define _mm_insert_epi8(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	930	((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
				931	(int)(I), (int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	932
				933	/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
				934	/// the 128-bit integer vector parameter, and then inserting the 32-bit
				935	/// integer parameter \a I at the offset specified by the immediate value
				936	/// parameter \a N.
				937	///
				938	/// \headerfile <x86intrin.h>
				939	///
				940	/// \code
				941	/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
				942	/// \endcode
				943	///
				944	/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
				945	///
				946	/// \param X
				947	/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
				948	/// result and then one of the four elements in the result vector is
				949	/// replaced by \a I.
				950	/// \param I
				951	/// A 32-bit integer that is written to the result beginning at the offset
				952	/// specified by \a N.
				953	/// \param N
				954	/// An immediate value. Bits [1:0] specify the bit offset in the result at
				955	/// which the integer \a I is written. \n
				956	/// 00: Bits [31:0] of the result are used for insertion. \n
				957	/// 01: Bits [63:32] of the result are used for insertion. \n
				958	/// 10: Bits [95:64] of the result are used for insertion. \n
				959	/// 11: Bits [127:96] of the result are used for insertion.
				960	/// \returns A 128-bit integer vector containing the constructed values.
				961	#define _mm_insert_epi32(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	962	((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
				963	(int)(I), (int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	964
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	965	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	966	/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
				967	/// the 128-bit integer vector parameter, and then inserting the 64-bit
				968	/// integer parameter \a I, using the immediate value parameter \a N as an
				969	/// insertion location selector.
				970	///
				971	/// \headerfile <x86intrin.h>
				972	///
				973	/// \code
				974	/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
				975	/// \endcode
				976	///
				977	/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
				978	///
				979	/// \param X
				980	/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
				981	/// result and then one of the two elements in the result vector is replaced
				982	/// by \a I.
				983	/// \param I
				984	/// A 64-bit integer that is written to the result beginning at the offset
				985	/// specified by \a N.
				986	/// \param N
				987	/// An immediate value. Bit [0] specifies the bit offset in the result at
				988	/// which the integer \a I is written. \n
				989	/// 0: Bits [63:0] of the result are used for insertion. \n
				990	/// 1: Bits [127:64] of the result are used for insertion.
				991	/// \returns A 128-bit integer vector containing the constructed values.
				992	#define _mm_insert_epi64(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	993	((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
				994	(long long)(I), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	995	#endif /* __x86_64__ */
				996
				997	/* Extract int from packed integer array at index. This returns the element
				998	* as a zero extended value, so it is unsigned.
				999	*/
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1000	/// Extracts an 8-bit element from the 128-bit integer vector of
				1001	/// [16 x i8], using the immediate value parameter \a N as a selector.
				1002	///
				1003	/// \headerfile <x86intrin.h>
				1004	///
				1005	/// \code
				1006	/// int _mm_extract_epi8(__m128i X, const int N);
				1007	/// \endcode
				1008	///
				1009	/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
				1010	///
				1011	/// \param X
				1012	/// A 128-bit integer vector.
				1013	/// \param N
				1014	/// An immediate value. Bits [3:0] specify which 8-bit vector element from
				1015	/// the argument \a X to extract and copy to the result. \n
				1016	/// 0000: Bits [7:0] of parameter \a X are extracted. \n
				1017	/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
				1018	/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
				1019	/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
				1020	/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
				1021	/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
				1022	/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
				1023	/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
				1024	/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
				1025	/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
				1026	/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
				1027	/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
				1028	/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
				1029	/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
				1030	/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
				1031	/// 1111: Bits [127:120] of the parameter \a X are extracted.
				1032	/// \returns An unsigned integer, whose lower 8 bits are selected from the
				1033	/// 128-bit integer vector parameter and the remaining bits are assigned
				1034	/// zeros.
				1035	#define _mm_extract_epi8(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1036	((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
				1037	(int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1038
				1039	/// Extracts a 32-bit element from the 128-bit integer vector of
				1040	/// [4 x i32], using the immediate value parameter \a N as a selector.
				1041	///
				1042	/// \headerfile <x86intrin.h>
				1043	///
				1044	/// \code
				1045	/// int _mm_extract_epi32(__m128i X, const int N);
				1046	/// \endcode
				1047	///
				1048	/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
				1049	///
				1050	/// \param X
				1051	/// A 128-bit integer vector.
				1052	/// \param N
				1053	/// An immediate value. Bits [1:0] specify which 32-bit vector element from
				1054	/// the argument \a X to extract and copy to the result. \n
				1055	/// 00: Bits [31:0] of the parameter \a X are extracted. \n
				1056	/// 01: Bits [63:32] of the parameter \a X are extracted. \n
				1057	/// 10: Bits [95:64] of the parameter \a X are extracted. \n
				1058	/// 11: Bits [127:96] of the parameter \a X are extracted.
				1059	/// \returns An integer, whose lower 32 bits are selected from the 128-bit
				1060	/// integer vector parameter and the remaining bits are assigned zeros.
				1061	#define _mm_extract_epi32(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1062	((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1063
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1064	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1065	/// Extracts a 64-bit element from the 128-bit integer vector of
				1066	/// [2 x i64], using the immediate value parameter \a N as a selector.
				1067	///
				1068	/// \headerfile <x86intrin.h>
				1069	///
				1070	/// \code
				1071	/// long long _mm_extract_epi64(__m128i X, const int N);
				1072	/// \endcode
				1073	///
				1074	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
				1075	///
				1076	/// \param X
				1077	/// A 128-bit integer vector.
				1078	/// \param N
				1079	/// An immediate value. Bit [0] specifies which 64-bit vector element from
				1080	/// the argument \a X to return. \n
				1081	/// 0: Bits [63:0] are returned. \n
				1082	/// 1: Bits [127:64] are returned.
				1083	/// \returns A 64-bit integer.
				1084	#define _mm_extract_epi64(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1085	((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1086	#endif /* __x86_64__ */
				1087
				1088	/* SSE4 128-bit Packed Integer Comparisons. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1089	/// Tests whether the specified bits in a 128-bit integer vector are all
				1090	/// zeros.
				1091	///
				1092	/// \headerfile <x86intrin.h>
				1093	///
				1094	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1095	///
				1096	/// \param __M
				1097	/// A 128-bit integer vector containing the bits to be tested.
				1098	/// \param __V
				1099	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
				1100	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1101	static __inline__ int __DEFAULT_FN_ATTRS
				1102	_mm_testz_si128(__m128i __M, __m128i __V)
				1103	{
				1104	return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
				1105	}
				1106
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1107	/// Tests whether the specified bits in a 128-bit integer vector are all
				1108	/// ones.
				1109	///
				1110	/// \headerfile <x86intrin.h>
				1111	///
				1112	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1113	///
				1114	/// \param __M
				1115	/// A 128-bit integer vector containing the bits to be tested.
				1116	/// \param __V
				1117	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
				1118	/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1119	static __inline__ int __DEFAULT_FN_ATTRS
				1120	_mm_testc_si128(__m128i __M, __m128i __V)
				1121	{
				1122	return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
				1123	}
				1124
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1125	/// Tests whether the specified bits in a 128-bit integer vector are
				1126	/// neither all zeros nor all ones.
				1127	///
				1128	/// \headerfile <x86intrin.h>
				1129	///
				1130	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1131	///
				1132	/// \param __M
				1133	/// A 128-bit integer vector containing the bits to be tested.
				1134	/// \param __V
				1135	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
				1136	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
				1137	/// FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1138	static __inline__ int __DEFAULT_FN_ATTRS
				1139	_mm_testnzc_si128(__m128i __M, __m128i __V)
				1140	{
				1141	return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
				1142	}
				1143
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1144	/// Tests whether the specified bits in a 128-bit integer vector are all
				1145	/// ones.
				1146	///
				1147	/// \headerfile <x86intrin.h>
				1148	///
				1149	/// \code
				1150	/// int _mm_test_all_ones(__m128i V);
				1151	/// \endcode
				1152	///
				1153	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1154	///
				1155	/// \param V
				1156	/// A 128-bit integer vector containing the bits to be tested.
				1157	/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
				1158	/// otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1159	#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1160
				1161	/// Tests whether the specified bits in a 128-bit integer vector are
				1162	/// neither all zeros nor all ones.
				1163	///
				1164	/// \headerfile <x86intrin.h>
				1165	///
				1166	/// \code
				1167	/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
				1168	/// \endcode
				1169	///
				1170	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1171	///
				1172	/// \param M
				1173	/// A 128-bit integer vector containing the bits to be tested.
				1174	/// \param V
				1175	/// A 128-bit integer vector selecting which bits to test in operand \a M.
				1176	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
				1177	/// FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1178	#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1179
				1180	/// Tests whether the specified bits in a 128-bit integer vector are all
				1181	/// zeros.
				1182	///
				1183	/// \headerfile <x86intrin.h>
				1184	///
				1185	/// \code
				1186	/// int _mm_test_all_zeros(__m128i M, __m128i V);
				1187	/// \endcode
				1188	///
				1189	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
				1190	///
				1191	/// \param M
				1192	/// A 128-bit integer vector containing the bits to be tested.
				1193	/// \param V
				1194	/// A 128-bit integer vector selecting which bits to test in operand \a M.
				1195	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1196	#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
				1197
				1198	/* SSE4 64-bit Packed Integer Comparisons. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1199	/// Compares each of the corresponding 64-bit values of the 128-bit
				1200	/// integer vectors for equality.
				1201	///
				1202	/// \headerfile <x86intrin.h>
				1203	///
				1204	/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
				1205	///
				1206	/// \param __V1
				1207	/// A 128-bit integer vector.
				1208	/// \param __V2
				1209	/// A 128-bit integer vector.
				1210	/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1211	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1212	_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
				1213	{
				1214	return (__m128i)((__v2di)__V1 == (__v2di)__V2);
				1215	}
				1216
				1217	/* SSE4 Packed Integer Sign-Extension. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1218	/// Sign-extends each of the lower eight 8-bit integer elements of a
				1219	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
				1220	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
				1221	/// are unused.
				1222	///
				1223	/// \headerfile <x86intrin.h>
				1224	///
				1225	/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
				1226	///
				1227	/// \param __V
				1228	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
				1229	/// extended to 16-bit values.
				1230	/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1231	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1232	_mm_cvtepi8_epi16(__m128i __V)
				1233	{
				1234	/* This function always performs a signed extension, but __v16qi is a char
				1235	which may be signed or unsigned, so use __v16qs. */
				1236	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
				1237	}
				1238
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1239	/// Sign-extends each of the lower four 8-bit integer elements of a
				1240	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
				1241	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
				1242	/// vector are unused.
				1243	///
				1244	/// \headerfile <x86intrin.h>
				1245	///
				1246	/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
				1247	///
				1248	/// \param __V
				1249	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
				1250	/// sign-extended to 32-bit values.
				1251	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1252	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1253	_mm_cvtepi8_epi32(__m128i __V)
				1254	{
				1255	/* This function always performs a signed extension, but __v16qi is a char
				1256	which may be signed or unsigned, so use __v16qs. */
				1257	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
				1258	}
				1259
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1260	/// Sign-extends each of the lower two 8-bit integer elements of a
				1261	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
				1262	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
				1263	/// vector are unused.
				1264	///
				1265	/// \headerfile <x86intrin.h>
				1266	///
				1267	/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
				1268	///
				1269	/// \param __V
				1270	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
				1271	/// sign-extended to 64-bit values.
				1272	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1273	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1274	_mm_cvtepi8_epi64(__m128i __V)
				1275	{
				1276	/* This function always performs a signed extension, but __v16qi is a char
				1277	which may be signed or unsigned, so use __v16qs. */
				1278	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
				1279	}
				1280
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1281	/// Sign-extends each of the lower four 16-bit integer elements of a
				1282	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
				1283	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
				1284	/// vector are unused.
				1285	///
				1286	/// \headerfile <x86intrin.h>
				1287	///
				1288	/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
				1289	///
				1290	/// \param __V
				1291	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
				1292	/// sign-extended to 32-bit values.
				1293	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1294	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1295	_mm_cvtepi16_epi32(__m128i __V)
				1296	{
				1297	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
				1298	}
				1299
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1300	/// Sign-extends each of the lower two 16-bit integer elements of a
				1301	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
				1302	/// a 128-bit vector of [2 x i64]. The upper six elements of the input
				1303	/// vector are unused.
				1304	///
				1305	/// \headerfile <x86intrin.h>
				1306	///
				1307	/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
				1308	///
				1309	/// \param __V
				1310	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
				1311	/// sign-extended to 64-bit values.
				1312	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1313	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1314	_mm_cvtepi16_epi64(__m128i __V)
				1315	{
				1316	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
				1317	}
				1318
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1319	/// Sign-extends each of the lower two 32-bit integer elements of a
				1320	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
				1321	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
				1322	/// are unused.
				1323	///
				1324	/// \headerfile <x86intrin.h>
				1325	///
				1326	/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
				1327	///
				1328	/// \param __V
				1329	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
				1330	/// sign-extended to 64-bit values.
				1331	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1332	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1333	_mm_cvtepi32_epi64(__m128i __V)
				1334	{
				1335	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
				1336	}
				1337
				1338	/* SSE4 Packed Integer Zero-Extension. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1339	/// Zero-extends each of the lower eight 8-bit integer elements of a
				1340	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
				1341	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
				1342	/// are unused.
				1343	///
				1344	/// \headerfile <x86intrin.h>
				1345	///
				1346	/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
				1347	///
				1348	/// \param __V
				1349	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
				1350	/// zero-extended to 16-bit values.
				1351	/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1352	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1353	_mm_cvtepu8_epi16(__m128i __V)
				1354	{
				1355	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
				1356	}
				1357
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1358	/// Zero-extends each of the lower four 8-bit integer elements of a
				1359	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
				1360	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
				1361	/// vector are unused.
				1362	///
				1363	/// \headerfile <x86intrin.h>
				1364	///
				1365	/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
				1366	///
				1367	/// \param __V
				1368	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
				1369	/// zero-extended to 32-bit values.
				1370	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1371	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1372	_mm_cvtepu8_epi32(__m128i __V)
				1373	{
				1374	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
				1375	}
				1376
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1377	/// Zero-extends each of the lower two 8-bit integer elements of a
				1378	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
				1379	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
				1380	/// vector are unused.
				1381	///
				1382	/// \headerfile <x86intrin.h>
				1383	///
				1384	/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
				1385	///
				1386	/// \param __V
				1387	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
				1388	/// zero-extended to 64-bit values.
				1389	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1390	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1391	_mm_cvtepu8_epi64(__m128i __V)
				1392	{
				1393	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
				1394	}
				1395
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1396	/// Zero-extends each of the lower four 16-bit integer elements of a
				1397	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
				1398	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
				1399	/// vector are unused.
				1400	///
				1401	/// \headerfile <x86intrin.h>
				1402	///
				1403	/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
				1404	///
				1405	/// \param __V
				1406	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
				1407	/// zero-extended to 32-bit values.
				1408	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1409	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1410	_mm_cvtepu16_epi32(__m128i __V)
				1411	{
				1412	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
				1413	}
				1414
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1415	/// Zero-extends each of the lower two 16-bit integer elements of a
				1416	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
				1417	/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
				1418	/// are unused.
				1419	///
				1420	/// \headerfile <x86intrin.h>
				1421	///
				1422	/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
				1423	///
				1424	/// \param __V
				1425	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
				1426	/// zero-extended to 64-bit values.
				1427	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1428	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1429	_mm_cvtepu16_epi64(__m128i __V)
				1430	{
				1431	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
				1432	}
				1433
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1434	/// Zero-extends each of the lower two 32-bit integer elements of a
				1435	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
				1436	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
				1437	/// are unused.
				1438	///
				1439	/// \headerfile <x86intrin.h>
				1440	///
				1441	/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
				1442	///
				1443	/// \param __V
				1444	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
				1445	/// zero-extended to 64-bit values.
				1446	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1447	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1448	_mm_cvtepu32_epi64(__m128i __V)
				1449	{
				1450	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
				1451	}
				1452
				1453	/* SSE4 Pack with Unsigned Saturation. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1454	/// Converts 32-bit signed integers from both 128-bit integer vector
				1455	/// operands into 16-bit unsigned integers, and returns the packed result.
				1456	/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
				1457	/// 0x0000 are saturated to 0x0000.
				1458	///
				1459	/// \headerfile <x86intrin.h>
				1460	///
				1461	/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
				1462	///
				1463	/// \param __V1
				1464	/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
				1465	/// signed integer and is converted to a 16-bit unsigned integer with
				1466	/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
				1467	/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
				1468	/// are written to the lower 64 bits of the result.
				1469	/// \param __V2
				1470	/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
				1471	/// signed integer and is converted to a 16-bit unsigned integer with
				1472	/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
				1473	/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
				1474	/// are written to the higher 64 bits of the result.
				1475	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1476	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1477	_mm_packus_epi32(__m128i __V1, __m128i __V2)
				1478	{
				1479	return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
				1480	}
				1481
				1482	/* SSE4 Multiple Packed Sums of Absolute Difference. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1483	/// Subtracts 8-bit unsigned integer values and computes the absolute
				1484	/// values of the differences. Each result is the sum of four such absolute
				1485	/// differences, with the starting offsets into the operands selected by the
				1486	/// bit fields in the immediate operand.
				1487	///
				1488	/// \headerfile <x86intrin.h>
				1489	///
				1490	/// \code
				1491	/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
				1492	/// \endcode
				1493	///
				1494	/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
				1495	///
				1496	/// \param X
				1497	/// A 128-bit vector of [16 x i8].
				1498	/// \param Y
				1499	/// A 128-bit vector of [16 x i8].
				1500	/// \param M
				1501	/// An 8-bit immediate operand specifying how the absolute differences are to
				1502	/// be calculated, according to the following algorithm:
				1503	/// \code
				1504	/// // M2 represents bit 2 of the immediate operand
				1505	/// // M10 represents bits [1:0] of the immediate operand
				1506	/// i = M2 * 4;
				1507	/// j = M10 * 4;
				1508	/// for (k = 0; k < 8; k = k + 1) {
				1509	/// d0 = abs(X[i + k + 0] - Y[j + 0]);
				1510	/// d1 = abs(X[i + k + 1] - Y[j + 1]);
				1511	/// d2 = abs(X[i + k + 2] - Y[j + 2]);
				1512	/// d3 = abs(X[i + k + 3] - Y[j + 3]);
				1513	/// r[k] = d0 + d1 + d2 + d3;
				1514	/// }
				1515	/// \endcode
				1516	/// \returns A 128-bit integer vector containing the sums of the sets of
				1517	/// absolute differences between both operands.
				1518	#define _mm_mpsadbw_epu8(X, Y, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1519	((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
				1520	(__v16qi)(__m128i)(Y), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1521
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1522	/// Finds the minimum unsigned 16-bit element in the input 128-bit
				1523	/// vector of [8 x u16] and returns it along with its index.
				1524	///
				1525	/// \headerfile <x86intrin.h>
				1526	///
				1527	/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
				1528	/// instruction.
				1529	///
				1530	/// \param __V
				1531	/// A 128-bit vector of [8 x u16].
				1532	/// \returns A 128-bit value where bits [15:0] contain the minimum value found
				1533	/// in parameter \a __V, bits [18:16] contain the index of the minimum value
				1534	/// and the remaining bits are set to 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1535	static __inline__ __m128i __DEFAULT_FN_ATTRS
				1536	_mm_minpos_epu16(__m128i __V)
				1537	{
				1538	return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
				1539	}
				1540
				1541	/* Handle the sse4.2 definitions here. */
				1542
				1543	/* These definitions are normally in nmmintrin.h, but gcc puts them in here
				1544	so we'll do the same. */
				1545
				1546	#undef __DEFAULT_FN_ATTRS
				1547	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
				1548
				1549	/* These specify the type of data that we're comparing. */
				1550	#define _SIDD_UBYTE_OPS 0x00
				1551	#define _SIDD_UWORD_OPS 0x01
				1552	#define _SIDD_SBYTE_OPS 0x02
				1553	#define _SIDD_SWORD_OPS 0x03
				1554
				1555	/* These specify the type of comparison operation. */
				1556	#define _SIDD_CMP_EQUAL_ANY 0x00
				1557	#define _SIDD_CMP_RANGES 0x04
				1558	#define _SIDD_CMP_EQUAL_EACH 0x08
				1559	#define _SIDD_CMP_EQUAL_ORDERED 0x0c
				1560
				1561	/* These macros specify the polarity of the operation. */
				1562	#define _SIDD_POSITIVE_POLARITY 0x00
				1563	#define _SIDD_NEGATIVE_POLARITY 0x10
				1564	#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
				1565	#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
				1566
				1567	/* These macros are used in _mm_cmpXstri() to specify the return. */
				1568	#define _SIDD_LEAST_SIGNIFICANT 0x00
				1569	#define _SIDD_MOST_SIGNIFICANT 0x40
				1570
				1571	/* These macros are used in _mm_cmpXstrm() to specify the return. */
				1572	#define _SIDD_BIT_MASK 0x00
				1573	#define _SIDD_UNIT_MASK 0x40
				1574
				1575	/* SSE4.2 Packed Comparison Intrinsics. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1576	/// Uses the immediate operand \a M to perform a comparison of string
				1577	/// data with implicitly defined lengths that is contained in source operands
				1578	/// \a A and \a B. Returns a 128-bit integer vector representing the result
				1579	/// mask of the comparison.
				1580	///
				1581	/// \headerfile <x86intrin.h>
				1582	///
				1583	/// \code
				1584	/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
				1585	/// \endcode
				1586	///
				1587	/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
				1588	/// instruction.
				1589	///
				1590	/// \param A
				1591	/// A 128-bit integer vector containing one of the source operands to be
				1592	/// compared.
				1593	/// \param B
				1594	/// A 128-bit integer vector containing one of the source operands to be
				1595	/// compared.
				1596	/// \param M
				1597	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1598	/// words, the type of comparison to perform, and the format of the return
				1599	/// value. \n
				1600	/// Bits [1:0]: Determine source data format. \n
				1601	/// 00: 16 unsigned bytes \n
				1602	/// 01: 8 unsigned words \n
				1603	/// 10: 16 signed bytes \n
				1604	/// 11: 8 signed words \n
				1605	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1606	/// 00: Subset: Each character in \a B is compared for equality with all
				1607	/// the characters in \a A. \n
				1608	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1609	/// basis is greater than or equal for even-indexed elements in \a A,
				1610	/// and less than or equal for odd-indexed elements in \a A. \n
				1611	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1612	/// \a B for equality. \n
				1613	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1614	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1615	/// mask of the comparison results. \n
				1616	/// 00: No effect. \n
				1617	/// 01: Negate the bit mask. \n
				1618	/// 10: No effect. \n
				1619	/// 11: Negate the bit mask only for bits with an index less than or equal
				1620	/// to the size of \a A or \a B. \n
				1621	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
				1622	/// bytes. \n
				1623	/// 0: The result is zero-extended to 16 bytes. \n
				1624	/// 1: The result is expanded to 16 bytes (this expansion is performed by
				1625	/// repeating each bit 8 or 16 times).
				1626	/// \returns Returns a 128-bit integer vector representing the result mask of
				1627	/// the comparison.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1628	#define _mm_cmpistrm(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1629	((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
				1630	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1631
				1632	/// Uses the immediate operand \a M to perform a comparison of string
				1633	/// data with implicitly defined lengths that is contained in source operands
				1634	/// \a A and \a B. Returns an integer representing the result index of the
				1635	/// comparison.
				1636	///
				1637	/// \headerfile <x86intrin.h>
				1638	///
				1639	/// \code
				1640	/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
				1641	/// \endcode
				1642	///
				1643	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1644	/// instruction.
				1645	///
				1646	/// \param A
				1647	/// A 128-bit integer vector containing one of the source operands to be
				1648	/// compared.
				1649	/// \param B
				1650	/// A 128-bit integer vector containing one of the source operands to be
				1651	/// compared.
				1652	/// \param M
				1653	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1654	/// words, the type of comparison to perform, and the format of the return
				1655	/// value. \n
				1656	/// Bits [1:0]: Determine source data format. \n
				1657	/// 00: 16 unsigned bytes \n
				1658	/// 01: 8 unsigned words \n
				1659	/// 10: 16 signed bytes \n
				1660	/// 11: 8 signed words \n
				1661	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1662	/// 00: Subset: Each character in \a B is compared for equality with all
				1663	/// the characters in \a A. \n
				1664	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1665	/// basis is greater than or equal for even-indexed elements in \a A,
				1666	/// and less than or equal for odd-indexed elements in \a A. \n
				1667	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1668	/// \a B for equality. \n
				1669	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1670	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1671	/// mask of the comparison results. \n
				1672	/// 00: No effect. \n
				1673	/// 01: Negate the bit mask. \n
				1674	/// 10: No effect. \n
				1675	/// 11: Negate the bit mask only for bits with an index less than or equal
				1676	/// to the size of \a A or \a B. \n
				1677	/// Bit [6]: Determines whether the index of the lowest set bit or the
				1678	/// highest set bit is returned. \n
				1679	/// 0: The index of the least significant set bit. \n
				1680	/// 1: The index of the most significant set bit. \n
				1681	/// \returns Returns an integer representing the result index of the comparison.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1682	#define _mm_cmpistri(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1683	((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
				1684	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1685
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1686	/// Uses the immediate operand \a M to perform a comparison of string
				1687	/// data with explicitly defined lengths that is contained in source operands
				1688	/// \a A and \a B. Returns a 128-bit integer vector representing the result
				1689	/// mask of the comparison.
				1690	///
				1691	/// \headerfile <x86intrin.h>
				1692	///
				1693	/// \code
				1694	/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
				1695	/// \endcode
				1696	///
				1697	/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
				1698	/// instruction.
				1699	///
				1700	/// \param A
				1701	/// A 128-bit integer vector containing one of the source operands to be
				1702	/// compared.
				1703	/// \param LA
				1704	/// An integer that specifies the length of the string in \a A.
				1705	/// \param B
				1706	/// A 128-bit integer vector containing one of the source operands to be
				1707	/// compared.
				1708	/// \param LB
				1709	/// An integer that specifies the length of the string in \a B.
				1710	/// \param M
				1711	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1712	/// words, the type of comparison to perform, and the format of the return
				1713	/// value. \n
				1714	/// Bits [1:0]: Determine source data format. \n
				1715	/// 00: 16 unsigned bytes \n
				1716	/// 01: 8 unsigned words \n
				1717	/// 10: 16 signed bytes \n
				1718	/// 11: 8 signed words \n
				1719	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1720	/// 00: Subset: Each character in \a B is compared for equality with all
				1721	/// the characters in \a A. \n
				1722	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1723	/// basis is greater than or equal for even-indexed elements in \a A,
				1724	/// and less than or equal for odd-indexed elements in \a A. \n
				1725	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1726	/// \a B for equality. \n
				1727	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1728	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1729	/// mask of the comparison results. \n
				1730	/// 00: No effect. \n
				1731	/// 01: Negate the bit mask. \n
				1732	/// 10: No effect. \n
				1733	/// 11: Negate the bit mask only for bits with an index less than or equal
				1734	/// to the size of \a A or \a B. \n
				1735	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
				1736	/// bytes. \n
				1737	/// 0: The result is zero-extended to 16 bytes. \n
				1738	/// 1: The result is expanded to 16 bytes (this expansion is performed by
				1739	/// repeating each bit 8 or 16 times). \n
				1740	/// \returns Returns a 128-bit integer vector representing the result mask of
				1741	/// the comparison.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1742	#define _mm_cmpestrm(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1743	((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
				1744	(__v16qi)(__m128i)(B), (int)(LB), \
				1745	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1746
				1747	/// Uses the immediate operand \a M to perform a comparison of string
				1748	/// data with explicitly defined lengths that is contained in source operands
				1749	/// \a A and \a B. Returns an integer representing the result index of the
				1750	/// comparison.
				1751	///
				1752	/// \headerfile <x86intrin.h>
				1753	///
				1754	/// \code
				1755	/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
				1756	/// \endcode
				1757	///
				1758	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				1759	/// instruction.
				1760	///
				1761	/// \param A
				1762	/// A 128-bit integer vector containing one of the source operands to be
				1763	/// compared.
				1764	/// \param LA
				1765	/// An integer that specifies the length of the string in \a A.
				1766	/// \param B
				1767	/// A 128-bit integer vector containing one of the source operands to be
				1768	/// compared.
				1769	/// \param LB
				1770	/// An integer that specifies the length of the string in \a B.
				1771	/// \param M
				1772	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1773	/// words, the type of comparison to perform, and the format of the return
				1774	/// value. \n
				1775	/// Bits [1:0]: Determine source data format. \n
				1776	/// 00: 16 unsigned bytes \n
				1777	/// 01: 8 unsigned words \n
				1778	/// 10: 16 signed bytes \n
				1779	/// 11: 8 signed words \n
				1780	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1781	/// 00: Subset: Each character in \a B is compared for equality with all
				1782	/// the characters in \a A. \n
				1783	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1784	/// basis is greater than or equal for even-indexed elements in \a A,
				1785	/// and less than or equal for odd-indexed elements in \a A. \n
				1786	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1787	/// \a B for equality. \n
				1788	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1789	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1790	/// mask of the comparison results. \n
				1791	/// 00: No effect. \n
				1792	/// 01: Negate the bit mask. \n
				1793	/// 10: No effect. \n
				1794	/// 11: Negate the bit mask only for bits with an index less than or equal
				1795	/// to the size of \a A or \a B. \n
				1796	/// Bit [6]: Determines whether the index of the lowest set bit or the
				1797	/// highest set bit is returned. \n
				1798	/// 0: The index of the least significant set bit. \n
				1799	/// 1: The index of the most significant set bit. \n
				1800	/// \returns Returns an integer representing the result index of the comparison.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1801	#define _mm_cmpestri(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1802	((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
				1803	(__v16qi)(__m128i)(B), (int)(LB), \
				1804	(int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1805
				1806	/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1807	/// Uses the immediate operand \a M to perform a comparison of string
				1808	/// data with implicitly defined lengths that is contained in source operands
				1809	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
				1810	/// string in \a B is the maximum; otherwise, returns 0.
				1811	///
				1812	/// \headerfile <x86intrin.h>
				1813	///
				1814	/// \code
				1815	/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
				1816	/// \endcode
				1817	///
				1818	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1819	/// instruction.
				1820	///
				1821	/// \param A
				1822	/// A 128-bit integer vector containing one of the source operands to be
				1823	/// compared.
				1824	/// \param B
				1825	/// A 128-bit integer vector containing one of the source operands to be
				1826	/// compared.
				1827	/// \param M
				1828	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1829	/// words and the type of comparison to perform. \n
				1830	/// Bits [1:0]: Determine source data format. \n
				1831	/// 00: 16 unsigned bytes \n
				1832	/// 01: 8 unsigned words \n
				1833	/// 10: 16 signed bytes \n
				1834	/// 11: 8 signed words \n
				1835	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1836	/// 00: Subset: Each character in \a B is compared for equality with all
				1837	/// the characters in \a A. \n
				1838	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1839	/// basis is greater than or equal for even-indexed elements in \a A,
				1840	/// and less than or equal for odd-indexed elements in \a A. \n
				1841	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1842	/// \a B for equality. \n
				1843	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1844	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1845	/// mask of the comparison results. \n
				1846	/// 00: No effect. \n
				1847	/// 01: Negate the bit mask. \n
				1848	/// 10: No effect. \n
				1849	/// 11: Negate the bit mask only for bits with an index less than or equal
				1850	/// to the size of \a A or \a B. \n
				1851	/// \returns Returns 1 if the bit mask is zero and the length of the string in
				1852	/// \a B is the maximum; otherwise, returns 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1853	#define _mm_cmpistra(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1854	((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
				1855	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1856
				1857	/// Uses the immediate operand \a M to perform a comparison of string
				1858	/// data with implicitly defined lengths that is contained in source operands
				1859	/// \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
				1860	/// 0.
				1861	///
				1862	/// \headerfile <x86intrin.h>
				1863	///
				1864	/// \code
				1865	/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
				1866	/// \endcode
				1867	///
				1868	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1869	/// instruction.
				1870	///
				1871	/// \param A
				1872	/// A 128-bit integer vector containing one of the source operands to be
				1873	/// compared.
				1874	/// \param B
				1875	/// A 128-bit integer vector containing one of the source operands to be
				1876	/// compared.
				1877	/// \param M
				1878	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1879	/// words and the type of comparison to perform. \n
				1880	/// Bits [1:0]: Determine source data format. \n
				1881	/// 00: 16 unsigned bytes \n
				1882	/// 01: 8 unsigned words \n
				1883	/// 10: 16 signed bytes \n
				1884	/// 11: 8 signed words \n
				1885	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1886	/// 00: Subset: Each character in \a B is compared for equality with all
				1887	/// the characters in \a A. \n
				1888	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1889	/// basis is greater than or equal for even-indexed elements in \a A,
				1890	/// and less than or equal for odd-indexed elements in \a A. \n
				1891	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1892	/// \a B for equality. \n
				1893	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1894	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1895	/// mask of the comparison results. \n
				1896	/// 00: No effect. \n
				1897	/// 01: Negate the bit mask. \n
				1898	/// 10: No effect. \n
				1899	/// 11: Negate the bit mask only for bits with an index less than or equal
				1900	/// to the size of \a A or \a B.
				1901	/// \returns 1 if the bit mask is non-zero; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1902	#define _mm_cmpistrc(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1903	((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
				1904	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1905
				1906	/// Uses the immediate operand \a M to perform a comparison of string
				1907	/// data with implicitly defined lengths that is contained in source operands
				1908	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
				1909	///
				1910	/// \headerfile <x86intrin.h>
				1911	///
				1912	/// \code
				1913	/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
				1914	/// \endcode
				1915	///
				1916	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1917	/// instruction.
				1918	///
				1919	/// \param A
				1920	/// A 128-bit integer vector containing one of the source operands to be
				1921	/// compared.
				1922	/// \param B
				1923	/// A 128-bit integer vector containing one of the source operands to be
				1924	/// compared.
				1925	/// \param M
				1926	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1927	/// words and the type of comparison to perform. \n
				1928	/// Bits [1:0]: Determine source data format. \n
				1929	/// 00: 16 unsigned bytes \n
				1930	/// 01: 8 unsigned words \n
				1931	/// 10: 16 signed bytes \n
				1932	/// 11: 8 signed words \n
				1933	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1934	/// 00: Subset: Each character in \a B is compared for equality with all
				1935	/// the characters in \a A. \n
				1936	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1937	/// basis is greater than or equal for even-indexed elements in \a A,
				1938	/// and less than or equal for odd-indexed elements in \a A. \n
				1939	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1940	/// \a B for equality. \n
				1941	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1942	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1943	/// mask of the comparison results. \n
				1944	/// 00: No effect. \n
				1945	/// 01: Negate the bit mask. \n
				1946	/// 10: No effect. \n
				1947	/// 11: Negate the bit mask only for bits with an index less than or equal
				1948	/// to the size of \a A or \a B. \n
				1949	/// \returns Bit 0 of the resulting bit mask.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1950	#define _mm_cmpistro(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1951	((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
				1952	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1953
				1954	/// Uses the immediate operand \a M to perform a comparison of string
				1955	/// data with implicitly defined lengths that is contained in source operands
				1956	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
				1957	/// the maximum; otherwise, returns 0.
				1958	///
				1959	/// \headerfile <x86intrin.h>
				1960	///
				1961	/// \code
				1962	/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
				1963	/// \endcode
				1964	///
				1965	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				1966	/// instruction.
				1967	///
				1968	/// \param A
				1969	/// A 128-bit integer vector containing one of the source operands to be
				1970	/// compared.
				1971	/// \param B
				1972	/// A 128-bit integer vector containing one of the source operands to be
				1973	/// compared.
				1974	/// \param M
				1975	/// An 8-bit immediate operand specifying whether the characters are bytes or
				1976	/// words and the type of comparison to perform. \n
				1977	/// Bits [1:0]: Determine source data format. \n
				1978	/// 00: 16 unsigned bytes \n
				1979	/// 01: 8 unsigned words \n
				1980	/// 10: 16 signed bytes \n
				1981	/// 11: 8 signed words \n
				1982	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				1983	/// 00: Subset: Each character in \a B is compared for equality with all
				1984	/// the characters in \a A. \n
				1985	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				1986	/// basis is greater than or equal for even-indexed elements in \a A,
				1987	/// and less than or equal for odd-indexed elements in \a A. \n
				1988	/// 10: Match: Compare each pair of corresponding characters in \a A and
				1989	/// \a B for equality. \n
				1990	/// 11: Substring: Search \a B for substring matches of \a A. \n
				1991	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				1992	/// mask of the comparison results. \n
				1993	/// 00: No effect. \n
				1994	/// 01: Negate the bit mask. \n
				1995	/// 10: No effect. \n
				1996	/// 11: Negate the bit mask only for bits with an index less than or equal
				1997	/// to the size of \a A or \a B. \n
				1998	/// \returns 1 if the length of the string in \a A is less than the
				1999	/// maximum (16 bytes or 8 words); otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2000	#define _mm_cmpistrs(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2001	((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
				2002	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2003
				2004	/// Uses the immediate operand \a M to perform a comparison of string
				2005	/// data with implicitly defined lengths that is contained in source operands
				2006	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
				2007	/// the maximum; otherwise, returns 0.
				2008	///
				2009	/// \headerfile <x86intrin.h>
				2010	///
				2011	/// \code
				2012	/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
				2013	/// \endcode
				2014	///
				2015	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
				2016	/// instruction.
				2017	///
				2018	/// \param A
				2019	/// A 128-bit integer vector containing one of the source operands to be
				2020	/// compared.
				2021	/// \param B
				2022	/// A 128-bit integer vector containing one of the source operands to be
				2023	/// compared.
				2024	/// \param M
				2025	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2026	/// words and the type of comparison to perform. \n
				2027	/// Bits [1:0]: Determine source data format. \n
				2028	/// 00: 16 unsigned bytes \n
				2029	/// 01: 8 unsigned words \n
				2030	/// 10: 16 signed bytes \n
				2031	/// 11: 8 signed words \n
				2032	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2033	/// 00: Subset: Each character in \a B is compared for equality with all
				2034	/// the characters in \a A. \n
				2035	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2036	/// basis is greater than or equal for even-indexed elements in \a A,
				2037	/// and less than or equal for odd-indexed elements in \a A. \n
				2038	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2039	/// \a B for equality. \n
				2040	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2041	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2042	/// mask of the comparison results. \n
				2043	/// 00: No effect. \n
				2044	/// 01: Negate the bit mask. \n
				2045	/// 10: No effect. \n
				2046	/// 11: Negate the bit mask only for bits with an index less than or equal
				2047	/// to the size of \a A or \a B.
				2048	/// \returns 1 if the length of the string in \a B is less than the
				2049	/// maximum (16 bytes or 8 words); otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2050	#define _mm_cmpistrz(A, B, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2051	((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
				2052	(__v16qi)(__m128i)(B), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2053
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2054	/// Uses the immediate operand \a M to perform a comparison of string
				2055	/// data with explicitly defined lengths that is contained in source operands
				2056	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
				2057	/// string in \a B is the maximum; otherwise, returns 0.
				2058	///
				2059	/// \headerfile <x86intrin.h>
				2060	///
				2061	/// \code
				2062	/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
				2063	/// \endcode
				2064	///
				2065	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2066	/// instruction.
				2067	///
				2068	/// \param A
				2069	/// A 128-bit integer vector containing one of the source operands to be
				2070	/// compared.
				2071	/// \param LA
				2072	/// An integer that specifies the length of the string in \a A.
				2073	/// \param B
				2074	/// A 128-bit integer vector containing one of the source operands to be
				2075	/// compared.
				2076	/// \param LB
				2077	/// An integer that specifies the length of the string in \a B.
				2078	/// \param M
				2079	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2080	/// words and the type of comparison to perform. \n
				2081	/// Bits [1:0]: Determine source data format. \n
				2082	/// 00: 16 unsigned bytes \n
				2083	/// 01: 8 unsigned words \n
				2084	/// 10: 16 signed bytes \n
				2085	/// 11: 8 signed words \n
				2086	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2087	/// 00: Subset: Each character in \a B is compared for equality with all
				2088	/// the characters in \a A. \n
				2089	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2090	/// basis is greater than or equal for even-indexed elements in \a A,
				2091	/// and less than or equal for odd-indexed elements in \a A. \n
				2092	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2093	/// \a B for equality. \n
				2094	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2095	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2096	/// mask of the comparison results. \n
				2097	/// 00: No effect. \n
				2098	/// 01: Negate the bit mask. \n
				2099	/// 10: No effect. \n
				2100	/// 11: Negate the bit mask only for bits with an index less than or equal
				2101	/// to the size of \a A or \a B.
				2102	/// \returns 1 if the bit mask is zero and the length of the string in
				2103	/// \a B is the maximum (16 bytes or 8 words); otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2104	#define _mm_cmpestra(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2105	((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
				2106	(__v16qi)(__m128i)(B), (int)(LB), \
				2107	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2108
				2109	/// Uses the immediate operand \a M to perform a comparison of string
				2110	/// data with explicitly defined lengths that is contained in source operands
				2111	/// \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
				2112	/// returns 0.
				2113	///
				2114	/// \headerfile <x86intrin.h>
				2115	///
				2116	/// \code
				2117	/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
				2118	/// \endcode
				2119	///
				2120	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2121	/// instruction.
				2122	///
				2123	/// \param A
				2124	/// A 128-bit integer vector containing one of the source operands to be
				2125	/// compared.
				2126	/// \param LA
				2127	/// An integer that specifies the length of the string in \a A.
				2128	/// \param B
				2129	/// A 128-bit integer vector containing one of the source operands to be
				2130	/// compared.
				2131	/// \param LB
				2132	/// An integer that specifies the length of the string in \a B.
				2133	/// \param M
				2134	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2135	/// words and the type of comparison to perform. \n
				2136	/// Bits [1:0]: Determine source data format. \n
				2137	/// 00: 16 unsigned bytes \n
				2138	/// 01: 8 unsigned words \n
				2139	/// 10: 16 signed bytes \n
				2140	/// 11: 8 signed words \n
				2141	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2142	/// 00: Subset: Each character in \a B is compared for equality with all
				2143	/// the characters in \a A. \n
				2144	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2145	/// basis is greater than or equal for even-indexed elements in \a A,
				2146	/// and less than or equal for odd-indexed elements in \a A. \n
				2147	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2148	/// \a B for equality. \n
				2149	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2150	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2151	/// mask of the comparison results. \n
				2152	/// 00: No effect. \n
				2153	/// 01: Negate the bit mask. \n
				2154	/// 10: No effect. \n
				2155	/// 11: Negate the bit mask only for bits with an index less than or equal
				2156	/// to the size of \a A or \a B. \n
				2157	/// \returns 1 if the resulting mask is non-zero; otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2158	#define _mm_cmpestrc(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2159	((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
				2160	(__v16qi)(__m128i)(B), (int)(LB), \
				2161	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2162
				2163	/// Uses the immediate operand \a M to perform a comparison of string
				2164	/// data with explicitly defined lengths that is contained in source operands
				2165	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
				2166	///
				2167	/// \headerfile <x86intrin.h>
				2168	///
				2169	/// \code
				2170	/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
				2171	/// \endcode
				2172	///
				2173	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2174	/// instruction.
				2175	///
				2176	/// \param A
				2177	/// A 128-bit integer vector containing one of the source operands to be
				2178	/// compared.
				2179	/// \param LA
				2180	/// An integer that specifies the length of the string in \a A.
				2181	/// \param B
				2182	/// A 128-bit integer vector containing one of the source operands to be
				2183	/// compared.
				2184	/// \param LB
				2185	/// An integer that specifies the length of the string in \a B.
				2186	/// \param M
				2187	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2188	/// words and the type of comparison to perform. \n
				2189	/// Bits [1:0]: Determine source data format. \n
				2190	/// 00: 16 unsigned bytes \n
				2191	/// 01: 8 unsigned words \n
				2192	/// 10: 16 signed bytes \n
				2193	/// 11: 8 signed words \n
				2194	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2195	/// 00: Subset: Each character in \a B is compared for equality with all
				2196	/// the characters in \a A. \n
				2197	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2198	/// basis is greater than or equal for even-indexed elements in \a A,
				2199	/// and less than or equal for odd-indexed elements in \a A. \n
				2200	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2201	/// \a B for equality. \n
				2202	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2203	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2204	/// mask of the comparison results. \n
				2205	/// 00: No effect. \n
				2206	/// 01: Negate the bit mask. \n
				2207	/// 10: No effect. \n
				2208	/// 11: Negate the bit mask only for bits with an index less than or equal
				2209	/// to the size of \a A or \a B.
				2210	/// \returns Bit 0 of the resulting bit mask.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2211	#define _mm_cmpestro(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2212	((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
				2213	(__v16qi)(__m128i)(B), (int)(LB), \
				2214	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2215
				2216	/// Uses the immediate operand \a M to perform a comparison of string
				2217	/// data with explicitly defined lengths that is contained in source operands
				2218	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
				2219	/// the maximum; otherwise, returns 0.
				2220	///
				2221	/// \headerfile <x86intrin.h>
				2222	///
				2223	/// \code
				2224	/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
				2225	/// \endcode
				2226	///
				2227	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
				2228	/// instruction.
				2229	///
				2230	/// \param A
				2231	/// A 128-bit integer vector containing one of the source operands to be
				2232	/// compared.
				2233	/// \param LA
				2234	/// An integer that specifies the length of the string in \a A.
				2235	/// \param B
				2236	/// A 128-bit integer vector containing one of the source operands to be
				2237	/// compared.
				2238	/// \param LB
				2239	/// An integer that specifies the length of the string in \a B.
				2240	/// \param M
				2241	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2242	/// words and the type of comparison to perform. \n
				2243	/// Bits [1:0]: Determine source data format. \n
				2244	/// 00: 16 unsigned bytes \n
				2245	/// 01: 8 unsigned words \n
				2246	/// 10: 16 signed bytes \n
				2247	/// 11: 8 signed words \n
				2248	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2249	/// 00: Subset: Each character in \a B is compared for equality with all
				2250	/// the characters in \a A. \n
				2251	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2252	/// basis is greater than or equal for even-indexed elements in \a A,
				2253	/// and less than or equal for odd-indexed elements in \a A. \n
				2254	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2255	/// \a B for equality. \n
				2256	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2257	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2258	/// mask of the comparison results. \n
				2259	/// 00: No effect. \n
				2260	/// 01: Negate the bit mask. \n
				2261	/// 10: No effect. \n
				2262	/// 11: Negate the bit mask only for bits with an index less than or equal
				2263	/// to the size of \a A or \a B. \n
				2264	/// \returns 1 if the length of the string in \a A is less than the
				2265	/// maximum (16 bytes or 8 words); otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2266	#define _mm_cmpestrs(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2267	((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
				2268	(__v16qi)(__m128i)(B), (int)(LB), \
				2269	(int)(M)))
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2270
				2271	/// Uses the immediate operand \a M to perform a comparison of string
				2272	/// data with explicitly defined lengths that is contained in source operands
				2273	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
				2274	/// the maximum; otherwise, returns 0.
				2275	///
				2276	/// \headerfile <x86intrin.h>
				2277	///
				2278	/// \code
				2279	/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
				2280	/// \endcode
				2281	///
				2282	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
				2283	///
				2284	/// \param A
				2285	/// A 128-bit integer vector containing one of the source operands to be
				2286	/// compared.
				2287	/// \param LA
				2288	/// An integer that specifies the length of the string in \a A.
				2289	/// \param B
				2290	/// A 128-bit integer vector containing one of the source operands to be
				2291	/// compared.
				2292	/// \param LB
				2293	/// An integer that specifies the length of the string in \a B.
				2294	/// \param M
				2295	/// An 8-bit immediate operand specifying whether the characters are bytes or
				2296	/// words and the type of comparison to perform. \n
				2297	/// Bits [1:0]: Determine source data format. \n
				2298	/// 00: 16 unsigned bytes \n
				2299	/// 01: 8 unsigned words \n
				2300	/// 10: 16 signed bytes \n
				2301	/// 11: 8 signed words \n
				2302	/// Bits [3:2]: Determine comparison type and aggregation method. \n
				2303	/// 00: Subset: Each character in \a B is compared for equality with all
				2304	/// the characters in \a A. \n
				2305	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
				2306	/// basis is greater than or equal for even-indexed elements in \a A,
				2307	/// and less than or equal for odd-indexed elements in \a A. \n
				2308	/// 10: Match: Compare each pair of corresponding characters in \a A and
				2309	/// \a B for equality. \n
				2310	/// 11: Substring: Search \a B for substring matches of \a A. \n
				2311	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
				2312	/// mask of the comparison results. \n
				2313	/// 00: No effect. \n
				2314	/// 01: Negate the bit mask. \n
				2315	/// 10: No effect. \n
				2316	/// 11: Negate the bit mask only for bits with an index less than or equal
				2317	/// to the size of \a A or \a B.
				2318	/// \returns 1 if the length of the string in \a B is less than the
				2319	/// maximum (16 bytes or 8 words); otherwise, 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2320	#define _mm_cmpestrz(A, LA, B, LB, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2321	((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
				2322	(__v16qi)(__m128i)(B), (int)(LB), \
				2323	(int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2324
				2325	/* SSE4.2 Compare Packed Data -- Greater Than. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2326	/// Compares each of the corresponding 64-bit values of the 128-bit
				2327	/// integer vectors to determine if the values in the first operand are
				2328	/// greater than those in the second operand.
				2329	///
				2330	/// \headerfile <x86intrin.h>
				2331	///
				2332	/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
				2333	///
				2334	/// \param __V1
				2335	/// A 128-bit integer vector; the left-hand operand of the comparison.
				2336	/// \param __V2
				2337	/// A 128-bit integer vector; the right-hand operand of the comparison.
				2338	/// \returns A 128-bit integer vector containing one comparison result per 64-bit element.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2339	static __inline__ __m128i __DEFAULT_FN_ATTRS
				2340	_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
				2341	{
				2342	return (__m128i)((__v2di)__V1 > (__v2di)__V2); /* NOTE(review): element-wise vector compare; signedness and the all-ones/zero result encoding follow __v2di and the compiler's vector-extension rules, both defined outside this header - confirm. */
				2343	}
				2344
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2345	#undef __DEFAULT_FN_ATTRS
				2346
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2347	#include <popcntintrin.h>
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2348
Pirama Arumuga Nainar	ec8c89d	2022-02-23 09:26:16 -0800	[diff] [blame^]	2349	#include <crc32intrin.h>
				2350
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2351	#endif /* __SMMINTRIN_H */