/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, compiled for the "sse4.1" target, with a minimum
 * vector width of 128. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))

/* SSE4 Rounding macros. */

/* Rounding-control selectors (bits [2:0] of the rounding immediate). */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

/* Precision-exception control (bit [3] of the rounding immediate). */
#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

/* Ready-made combinations of a rounding selector and an exception control. */
#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
                                     (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
                                    (__v4sf)(__m128)(V2), (int)(M)))

Logan Chien55afb0a2018-10-15 10:42:14 +0800410/// Returns a 128-bit vector of [2 x double] where the values are
411/// selected from either the first or second operand as specified by the
412/// third operand, the control mask.
413///
414/// \headerfile <x86intrin.h>
415///
416/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
417///
418/// \param __V1
419/// A 128-bit vector of [2 x double].
420/// \param __V2
421/// A 128-bit vector of [2 x double].
422/// \param __M
423/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
424/// values are to be copied. The position of the mask bit corresponds to the
425/// most significant bit of a copied value. When a mask bit is 0, the
426/// corresponding 64-bit element in operand \a __V1 is copied to the same
427/// position in the result. When a mask bit is 1, the corresponding 64-bit
428/// element in operand \a __V2 is copied to the same position in the result.
429/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800430static __inline__ __m128d __DEFAULT_FN_ATTRS
431_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
432{
433 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
434 (__v2df)__M);
435}
436
Logan Chien55afb0a2018-10-15 10:42:14 +0800437/// Returns a 128-bit vector of [4 x float] where the values are
438/// selected from either the first or second operand as specified by the
439/// third operand, the control mask.
440///
441/// \headerfile <x86intrin.h>
442///
443/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
444///
445/// \param __V1
446/// A 128-bit vector of [4 x float].
447/// \param __V2
448/// A 128-bit vector of [4 x float].
449/// \param __M
450/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
451/// how the values are to be copied. The position of the mask bit corresponds
452/// to the most significant bit of a copied value. When a mask bit is 0, the
453/// corresponding 32-bit element in operand \a __V1 is copied to the same
454/// position in the result. When a mask bit is 1, the corresponding 32-bit
455/// element in operand \a __V2 is copied to the same position in the result.
456/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800457static __inline__ __m128 __DEFAULT_FN_ATTRS
458_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
459{
460 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
461 (__v4sf)__M);
462}
463
Logan Chien55afb0a2018-10-15 10:42:14 +0800464/// Returns a 128-bit vector of [16 x i8] where the values are selected
465/// from either of the first or second operand as specified by the third
466/// operand, the control mask.
467///
468/// \headerfile <x86intrin.h>
469///
470/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
471///
472/// \param __V1
473/// A 128-bit vector of [16 x i8].
474/// \param __V2
475/// A 128-bit vector of [16 x i8].
476/// \param __M
477/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
478/// how the values are to be copied. The position of the mask bit corresponds
479/// to the most significant bit of a copied value. When a mask bit is 0, the
480/// corresponding 8-bit element in operand \a __V1 is copied to the same
481/// position in the result. When a mask bit is 1, the corresponding 8-bit
482/// element in operand \a __V2 is copied to the same position in the result.
483/// \returns A 128-bit vector of [16 x i8] containing the copied values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800484static __inline__ __m128i __DEFAULT_FN_ATTRS
485_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
486{
487 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
488 (__v16qi)__M);
489}
490
/// Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
                                        (__v8hi)(__m128i)(V2), (int)(M)))

519/* SSE4 Dword Multiply Instructions. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800520/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
521/// and returns the lower 32 bits of the each product in a 128-bit vector of
522/// [4 x i32].
523///
524/// \headerfile <x86intrin.h>
525///
526/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
527///
528/// \param __V1
529/// A 128-bit integer vector.
530/// \param __V2
531/// A 128-bit integer vector.
532/// \returns A 128-bit integer vector containing the products of both operands.
Logan Chien2833ffb2018-10-09 10:03:24 +0800533static __inline__ __m128i __DEFAULT_FN_ATTRS
534_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
535{
536 return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
537}
538
Logan Chien55afb0a2018-10-15 10:42:14 +0800539/// Multiplies corresponding even-indexed elements of two 128-bit
540/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
541/// containing the products.
542///
543/// \headerfile <x86intrin.h>
544///
545/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
546///
547/// \param __V1
548/// A 128-bit vector of [4 x i32].
549/// \param __V2
550/// A 128-bit vector of [4 x i32].
551/// \returns A 128-bit vector of [2 x i64] containing the products of both
552/// operands.
Logan Chien2833ffb2018-10-09 10:03:24 +0800553static __inline__ __m128i __DEFAULT_FN_ATTRS
554_mm_mul_epi32 (__m128i __V1, __m128i __V2)
555{
556 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
557}
558
/* SSE4 Floating Point Dot Product Instructions.  */
/// Computes the dot product of the two 128-bit vectors of [4 x float]
///    and returns it in the elements of the 128-bit result vector of
///    [4 x float].
///
///    The immediate integer operand controls which input elements
///    will contribute to the dot product, and where the final results are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    the [4 x float] result vector. If a bit is set, the dot product is
///    returned in the corresponding element; otherwise that element is set to
///    zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
                                (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    the [2 x double] result vector. If a bit is set, the dot product is
///    returned in the corresponding element; otherwise that element is set to
///    zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                 (__v2df)(__m128d)(Y), (M)))

631/* SSE4 Streaming Load Hint Instruction. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800632/// Loads integer values from a 128-bit aligned memory location to a
633/// 128-bit integer vector.
634///
635/// \headerfile <x86intrin.h>
636///
637/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
638///
639/// \param __V
640/// A pointer to a 128-bit aligned memory location that contains the integer
641/// values.
642/// \returns A 128-bit integer vector containing the data stored at the
643/// specified memory location.
Logan Chien2833ffb2018-10-09 10:03:24 +0800644static __inline__ __m128i __DEFAULT_FN_ATTRS
645_mm_stream_load_si128 (__m128i const *__V)
646{
Logan Chien55afb0a2018-10-15 10:42:14 +0800647 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
Logan Chien2833ffb2018-10-09 10:03:24 +0800648}
649
650/* SSE4 Packed Integer Min/Max Instructions. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800651/// Compares the corresponding elements of two 128-bit vectors of
652/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
653/// of the two values.
654///
655/// \headerfile <x86intrin.h>
656///
657/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
658///
659/// \param __V1
660/// A 128-bit vector of [16 x i8].
661/// \param __V2
662/// A 128-bit vector of [16 x i8]
663/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800664static __inline__ __m128i __DEFAULT_FN_ATTRS
665_mm_min_epi8 (__m128i __V1, __m128i __V2)
666{
667 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
668}
669
Logan Chien55afb0a2018-10-15 10:42:14 +0800670/// Compares the corresponding elements of two 128-bit vectors of
671/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
672/// greater value of the two.
673///
674/// \headerfile <x86intrin.h>
675///
676/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
677///
678/// \param __V1
679/// A 128-bit vector of [16 x i8].
680/// \param __V2
681/// A 128-bit vector of [16 x i8].
682/// \returns A 128-bit vector of [16 x i8] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800683static __inline__ __m128i __DEFAULT_FN_ATTRS
684_mm_max_epi8 (__m128i __V1, __m128i __V2)
685{
686 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
687}
688
Logan Chien55afb0a2018-10-15 10:42:14 +0800689/// Compares the corresponding elements of two 128-bit vectors of
690/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
691/// value of the two.
692///
693/// \headerfile <x86intrin.h>
694///
695/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
696///
697/// \param __V1
698/// A 128-bit vector of [8 x u16].
699/// \param __V2
700/// A 128-bit vector of [8 x u16].
701/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800702static __inline__ __m128i __DEFAULT_FN_ATTRS
703_mm_min_epu16 (__m128i __V1, __m128i __V2)
704{
705 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
706}
707
Logan Chien55afb0a2018-10-15 10:42:14 +0800708/// Compares the corresponding elements of two 128-bit vectors of
709/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
710/// greater value of the two.
711///
712/// \headerfile <x86intrin.h>
713///
714/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
715///
716/// \param __V1
717/// A 128-bit vector of [8 x u16].
718/// \param __V2
719/// A 128-bit vector of [8 x u16].
720/// \returns A 128-bit vector of [8 x u16] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800721static __inline__ __m128i __DEFAULT_FN_ATTRS
722_mm_max_epu16 (__m128i __V1, __m128i __V2)
723{
724 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
725}
726
Logan Chien55afb0a2018-10-15 10:42:14 +0800727/// Compares the corresponding elements of two 128-bit vectors of
728/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
729/// value of the two.
730///
731/// \headerfile <x86intrin.h>
732///
733/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
734///
735/// \param __V1
736/// A 128-bit vector of [4 x i32].
737/// \param __V2
738/// A 128-bit vector of [4 x i32].
739/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800740static __inline__ __m128i __DEFAULT_FN_ATTRS
741_mm_min_epi32 (__m128i __V1, __m128i __V2)
742{
743 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
744}
745
Logan Chien55afb0a2018-10-15 10:42:14 +0800746/// Compares the corresponding elements of two 128-bit vectors of
747/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
748/// greater value of the two.
749///
750/// \headerfile <x86intrin.h>
751///
752/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
753///
754/// \param __V1
755/// A 128-bit vector of [4 x i32].
756/// \param __V2
757/// A 128-bit vector of [4 x i32].
758/// \returns A 128-bit vector of [4 x i32] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800759static __inline__ __m128i __DEFAULT_FN_ATTRS
760_mm_max_epi32 (__m128i __V1, __m128i __V2)
761{
762 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
763}
764
Logan Chien55afb0a2018-10-15 10:42:14 +0800765/// Compares the corresponding elements of two 128-bit vectors of
766/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
767/// value of the two.
768///
769/// \headerfile <x86intrin.h>
770///
771/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
772///
773/// \param __V1
774/// A 128-bit vector of [4 x u32].
775/// \param __V2
776/// A 128-bit vector of [4 x u32].
777/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800778static __inline__ __m128i __DEFAULT_FN_ATTRS
779_mm_min_epu32 (__m128i __V1, __m128i __V2)
780{
781 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
782}
783
Logan Chien55afb0a2018-10-15 10:42:14 +0800784/// Compares the corresponding elements of two 128-bit vectors of
785/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
786/// greater value of the two.
787///
788/// \headerfile <x86intrin.h>
789///
790/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
791///
792/// \param __V1
793/// A 128-bit vector of [4 x u32].
794/// \param __V2
795/// A 128-bit vector of [4 x u32].
796/// \returns A 128-bit vector of [4 x u32] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800797static __inline__ __m128i __DEFAULT_FN_ATTRS
798_mm_max_epu32 (__m128i __V1, __m128i __V2)
799{
800 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
801}
802
803/* SSE4 Insertion and Extraction from XMM Register Instructions. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800804/// Takes the first argument \a X and inserts an element from the second
805/// argument \a Y as selected by the third argument \a N. That result then
806/// has elements zeroed out also as selected by the third argument \a N. The
807/// resulting 128-bit vector of [4 x float] is then returned.
808///
809/// \headerfile <x86intrin.h>
810///
811/// \code
812/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
813/// \endcode
814///
815/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
816///
817/// \param X
818/// A 128-bit vector source operand of [4 x float]. With the exception of
819/// those bits in the result copied from parameter \a Y and zeroed by bits
820/// [3:0] of \a N, all bits from this parameter are copied to the result.
821/// \param Y
822/// A 128-bit vector source operand of [4 x float]. One single-precision
823/// floating-point element from this source, as determined by the immediate
824/// parameter, is copied to the result.
825/// \param N
826/// Specifies which bits from operand \a Y will be copied, which bits in the
827/// result they will be copied to, and which bits in the result will be
828/// cleared. The following assignments are made: \n
829/// Bits [7:6] specify the bits to copy from operand \a Y: \n
830/// 00: Selects bits [31:0] from operand \a Y. \n
831/// 01: Selects bits [63:32] from operand \a Y. \n
832/// 10: Selects bits [95:64] from operand \a Y. \n
833/// 11: Selects bits [127:96] from operand \a Y. \n
834/// Bits [5:4] specify the bits in the result to which the selected bits
835/// from operand \a Y are copied: \n
836/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
837/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
838/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
839/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
840/// Bits[3:0]: If any of these bits are set, the corresponding result
841/// element is cleared.
842/// \returns A 128-bit vector of [4 x float] containing the copied
843/// single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N)                                                 \
  __builtin_ia32_insertps128((X), (Y), (N))
Logan Chien55afb0a2018-10-15 10:42:14 +0800845
846/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
847/// returns it, using the immediate value parameter \a N as a selector.
848///
849/// \headerfile <x86intrin.h>
850///
851/// \code
852/// int _mm_extract_ps(__m128 X, const int N);
853/// \endcode
854///
855/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
856/// instruction.
857///
858/// \param X
859/// A 128-bit vector of [4 x float].
860/// \param N
861/// An immediate value. Bits [1:0] determine which bits from the argument
862/// \a X are extracted and returned: \n
863/// 00: Bits [31:0] of parameter \a X are returned. \n
864/// 01: Bits [63:32] of parameter \a X are returned. \n
865/// 10: Bits [95:64] of parameter \a X are returned. \n
866/// 11: Bits [127:96] of parameter \a X are returned.
867/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N)                                                   \
  __builtin_bit_cast(                                                          \
      int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800870
871/* Miscellaneous insert and extract macros. */
/* Assign to D the single-precision element of X selected by index N. */
#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
  do {                                                                         \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
  } while (0)
Logan Chien2833ffb2018-10-09 10:03:24 +0800875
/* Build an immediate suitable for _mm_insert_ps: index X is shifted to bit 6,
   index Y is shifted to bit 4, and the zeroing bits Z are OR-ed in unshifted. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z)                                           \
  (((X) << 6) | ((Y) << 4) | (Z))
879
/* Return a vector holding element N of X in its first element. The base
   vector is all zeros and the zeroing bits 0x0e are passed for the rest. */
#define _MM_PICK_OUT_PS(X, N)                                                  \
  _mm_insert_ps (_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
883
884/* Insert int into packed integer array at index. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800885/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
886/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
887/// of an integer parameter \a I into an offset specified by the immediate
888/// value parameter \a N.
889///
890/// \headerfile <x86intrin.h>
891///
892/// \code
893/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
894/// \endcode
895///
896/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
897///
898/// \param X
899/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
900/// result and then one of the sixteen elements in the result vector is
901/// replaced by the lower 8 bits of \a I.
902/// \param I
903/// An integer. The lower 8 bits of this operand are written to the result
904/// beginning at the offset specified by \a N.
905/// \param N
906/// An immediate value. Bits [3:0] specify the bit offset in the result at
907/// which the lower 8 bits of \a I are written. \n
908/// 0000: Bits [7:0] of the result are used for insertion. \n
909/// 0001: Bits [15:8] of the result are used for insertion. \n
910/// 0010: Bits [23:16] of the result are used for insertion. \n
911/// 0011: Bits [31:24] of the result are used for insertion. \n
912/// 0100: Bits [39:32] of the result are used for insertion. \n
913/// 0101: Bits [47:40] of the result are used for insertion. \n
914/// 0110: Bits [55:48] of the result are used for insertion. \n
915/// 0111: Bits [63:56] of the result are used for insertion. \n
916/// 1000: Bits [71:64] of the result are used for insertion. \n
917/// 1001: Bits [79:72] of the result are used for insertion. \n
918/// 1010: Bits [87:80] of the result are used for insertion. \n
919/// 1011: Bits [95:88] of the result are used for insertion. \n
920/// 1100: Bits [103:96] of the result are used for insertion. \n
921/// 1101: Bits [111:104] of the result are used for insertion. \n
922/// 1110: Bits [119:112] of the result are used for insertion. \n
923/// 1111: Bits [127:120] of the result are used for insertion.
924/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N)                                               \
  ((__m128i)__builtin_ia32_vec_set_v16qi(                                      \
      (__v16qi)(__m128i)(X), (int)(I), (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +0800928
929/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
930/// the 128-bit integer vector parameter, and then inserting the 32-bit
931/// integer parameter \a I at the offset specified by the immediate value
932/// parameter \a N.
933///
934/// \headerfile <x86intrin.h>
935///
936/// \code
937/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
938/// \endcode
939///
940/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
941///
942/// \param X
943/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
944/// result and then one of the four elements in the result vector is
945/// replaced by \a I.
946/// \param I
947/// A 32-bit integer that is written to the result beginning at the offset
948/// specified by \a N.
949/// \param N
950/// An immediate value. Bits [1:0] specify the bit offset in the result at
951/// which the integer \a I is written. \n
952/// 00: Bits [31:0] of the result are used for insertion. \n
953/// 01: Bits [63:32] of the result are used for insertion. \n
954/// 10: Bits [95:64] of the result are used for insertion. \n
955/// 11: Bits [127:96] of the result are used for insertion.
956/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v4si(                                       \
      (__v4si)(__m128i)(X), (int)(I), (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +0800960
Logan Chien2833ffb2018-10-09 10:03:24 +0800961#ifdef __x86_64__
Logan Chien55afb0a2018-10-15 10:42:14 +0800962/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
963/// the 128-bit integer vector parameter, and then inserting the 64-bit
964/// integer parameter \a I, using the immediate value parameter \a N as an
965/// insertion location selector.
966///
967/// \headerfile <x86intrin.h>
968///
969/// \code
970/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
971/// \endcode
972///
973/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
974///
975/// \param X
976/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
977/// result and then one of the two elements in the result vector is replaced
978/// by \a I.
979/// \param I
980/// A 64-bit integer that is written to the result beginning at the offset
981/// specified by \a N.
982/// \param N
983/// An immediate value. Bit [0] specifies the bit offset in the result at
984/// which the integer \a I is written. \n
985/// 0: Bits [63:0] of the result are used for insertion. \n
986/// 1: Bits [127:64] of the result are used for insertion. \n
987/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v2di(                                       \
      (__v2di)(__m128i)(X), (long long)(I), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800991#endif /* __x86_64__ */
992
993/* Extract int from packed integer array at index. This returns the element
994 * as a zero extended value, so it is unsigned.
995 */
Logan Chien55afb0a2018-10-15 10:42:14 +0800996/// Extracts an 8-bit element from the 128-bit integer vector of
997/// [16 x i8], using the immediate value parameter \a N as a selector.
998///
999/// \headerfile <x86intrin.h>
1000///
1001/// \code
1002/// int _mm_extract_epi8(__m128i X, const int N);
1003/// \endcode
1004///
1005/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
1006///
1007/// \param X
1008/// A 128-bit integer vector.
1009/// \param N
1010/// An immediate value. Bits [3:0] specify which 8-bit vector element from
1011/// the argument \a X to extract and copy to the result. \n
1012/// 0000: Bits [7:0] of parameter \a X are extracted. \n
1013/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
1014/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
1015/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
1016/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
1017/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
1018/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
1019/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
1020/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
1021/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
1022/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
1023/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
1024/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
1025/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
1026/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
1027/// 1111: Bits [127:120] of the parameter \a X are extracted.
1028/// \returns An unsigned integer, whose lower 8 bits are selected from the
1029/// 128-bit integer vector parameter and the remaining bits are assigned
1030/// zeros.
#define _mm_extract_epi8(X, N)                                                 \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi(                           \
      (__v16qi)(__m128i)(X), (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +08001034
1035/// Extracts a 32-bit element from the 128-bit integer vector of
1036/// [4 x i32], using the immediate value parameter \a N as a selector.
1037///
1038/// \headerfile <x86intrin.h>
1039///
1040/// \code
1041/// int _mm_extract_epi32(__m128i X, const int N);
1042/// \endcode
1043///
1044/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
1045///
1046/// \param X
1047/// A 128-bit integer vector.
1048/// \param N
1049/// An immediate value. Bits [1:0] specify which 32-bit vector element from
1050/// the argument \a X to extract and copy to the result. \n
1051/// 00: Bits [31:0] of the parameter \a X are extracted. \n
1052/// 01: Bits [63:32] of the parameter \a X are extracted. \n
1053/// 10: Bits [95:64] of the parameter \a X are extracted. \n
1054/// 11: Bits [127:96] of the parameter \a X are extracted.
1055/// \returns An integer, whose lower 32 bits are selected from the 128-bit
1056/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N)                                                \
  ((int)__builtin_ia32_vec_ext_v4si(                                           \
      (__v4si)(__m128i)(X), (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +08001059
Logan Chien2833ffb2018-10-09 10:03:24 +08001060#ifdef __x86_64__
Logan Chien55afb0a2018-10-15 10:42:14 +08001061/// Extracts a 64-bit element from the 128-bit integer vector of
1062/// [2 x i64], using the immediate value parameter \a N as a selector.
1063///
1064/// \headerfile <x86intrin.h>
1065///
1066/// \code
1067/// long long _mm_extract_epi64(__m128i X, const int N);
1068/// \endcode
1069///
1070/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1071///
1072/// \param X
1073/// A 128-bit integer vector.
1074/// \param N
1075/// An immediate value. Bit [0] specifies which 64-bit vector element from
1076/// the argument \a X to return. \n
1077/// 0: Bits [63:0] are returned. \n
1078/// 1: Bits [127:64] are returned. \n
1079/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N)                                                \
  ((long long)__builtin_ia32_vec_ext_v2di(                                     \
      (__v2di)(__m128i)(X), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001082#endif /* __x86_64__ */
1083
1084/* SSE4 128-bit Packed Integer Comparisons. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001085/// Tests whether the specified bits in a 128-bit integer vector are all
1086/// zeros.
1087///
1088/// \headerfile <x86intrin.h>
1089///
1090/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1091///
1092/// \param __M
1093/// A 128-bit integer vector containing the bits to be tested.
1094/// \param __V
1095/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1096/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
Logan Chien2833ffb2018-10-09 10:03:24 +08001097static __inline__ int __DEFAULT_FN_ATTRS
1098_mm_testz_si128(__m128i __M, __m128i __V)
1099{
1100 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1101}
1102
Logan Chien55afb0a2018-10-15 10:42:14 +08001103/// Tests whether the specified bits in a 128-bit integer vector are all
1104/// ones.
1105///
1106/// \headerfile <x86intrin.h>
1107///
1108/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1109///
1110/// \param __M
1111/// A 128-bit integer vector containing the bits to be tested.
1112/// \param __V
1113/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1114/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
Logan Chien2833ffb2018-10-09 10:03:24 +08001115static __inline__ int __DEFAULT_FN_ATTRS
1116_mm_testc_si128(__m128i __M, __m128i __V)
1117{
1118 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1119}
1120
Logan Chien55afb0a2018-10-15 10:42:14 +08001121/// Tests whether the specified bits in a 128-bit integer vector are
1122/// neither all zeros nor all ones.
1123///
1124/// \headerfile <x86intrin.h>
1125///
1126/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1127///
1128/// \param __M
1129/// A 128-bit integer vector containing the bits to be tested.
1130/// \param __V
1131/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1132/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1133/// FALSE otherwise.
Logan Chien2833ffb2018-10-09 10:03:24 +08001134static __inline__ int __DEFAULT_FN_ATTRS
1135_mm_testnzc_si128(__m128i __M, __m128i __V)
1136{
1137 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1138}
1139
Logan Chien55afb0a2018-10-15 10:42:14 +08001140/// Tests whether the specified bits in a 128-bit integer vector are all
1141/// ones.
1142///
1143/// \headerfile <x86intrin.h>
1144///
1145/// \code
1146/// int _mm_test_all_ones(__m128i V);
1147/// \endcode
1148///
1149/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1150///
1151/// \param V
1152/// A 128-bit integer vector containing the bits to be tested.
1153/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
1154/// otherwise.
#define _mm_test_all_ones(V)                                                   \
  /* Test against a constant all-ones mask so that V is expanded (and   */     \
  /* therefore evaluated) exactly once; comparing V with itself for     */     \
  /* equality produced the same all-ones mask but expanded V three times. */   \
  _mm_testc_si128((V), _mm_set1_epi32(-1))
Logan Chien55afb0a2018-10-15 10:42:14 +08001156
1157/// Tests whether the specified bits in a 128-bit integer vector are
1158/// neither all zeros nor all ones.
1159///
1160/// \headerfile <x86intrin.h>
1161///
1162/// \code
1163/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
1164/// \endcode
1165///
1166/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1167///
1168/// \param M
1169/// A 128-bit integer vector containing the bits to be tested.
1170/// \param V
1171/// A 128-bit integer vector selecting which bits to test in operand \a M.
1172/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1173/// FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V)                                          \
  _mm_testnzc_si128((M), (V))
Logan Chien55afb0a2018-10-15 10:42:14 +08001175
1176/// Tests whether the specified bits in a 128-bit integer vector are all
1177/// zeros.
1178///
1179/// \headerfile <x86intrin.h>
1180///
1181/// \code
1182/// int _mm_test_all_zeros(__m128i M, __m128i V);
1183/// \endcode
1184///
1185/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1186///
1187/// \param M
1188/// A 128-bit integer vector containing the bits to be tested.
1189/// \param V
1190/// A 128-bit integer vector selecting which bits to test in operand \a M.
1191/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V)                                               \
  _mm_testz_si128 ((M), (V))
1193
1194/* SSE4 64-bit Packed Integer Comparisons. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001195/// Compares each of the corresponding 64-bit values of the 128-bit
1196/// integer vectors for equality.
1197///
1198/// \headerfile <x86intrin.h>
1199///
1200/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1201///
1202/// \param __V1
1203/// A 128-bit integer vector.
1204/// \param __V2
1205/// A 128-bit integer vector.
1206/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien2833ffb2018-10-09 10:03:24 +08001207static __inline__ __m128i __DEFAULT_FN_ATTRS
1208_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
1209{
1210 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1211}
1212
1213/* SSE4 Packed Integer Sign-Extension. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001214/// Sign-extends each of the lower eight 8-bit integer elements of a
1215/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1216/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1217/// are unused.
1218///
1219/// \headerfile <x86intrin.h>
1220///
1221/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1222///
1223/// \param __V
1224/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
1225/// extended to 16-bit values.
1226/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001227static __inline__ __m128i __DEFAULT_FN_ATTRS
1228_mm_cvtepi8_epi16(__m128i __V)
1229{
1230 /* This function always performs a signed extension, but __v16qi is a char
1231 which may be signed or unsigned, so use __v16qs. */
1232 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1233}
1234
Logan Chien55afb0a2018-10-15 10:42:14 +08001235/// Sign-extends each of the lower four 8-bit integer elements of a
1236/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1237/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1238/// vector are unused.
1239///
1240/// \headerfile <x86intrin.h>
1241///
1242/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1243///
1244/// \param __V
1245/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1246/// sign-extended to 32-bit values.
1247/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001248static __inline__ __m128i __DEFAULT_FN_ATTRS
1249_mm_cvtepi8_epi32(__m128i __V)
1250{
1251 /* This function always performs a signed extension, but __v16qi is a char
1252 which may be signed or unsigned, so use __v16qs. */
1253 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1254}
1255
Logan Chien55afb0a2018-10-15 10:42:14 +08001256/// Sign-extends each of the lower two 8-bit integer elements of a
1257/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1258/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1259/// vector are unused.
1260///
1261/// \headerfile <x86intrin.h>
1262///
1263/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1264///
1265/// \param __V
1266/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1267/// sign-extended to 64-bit values.
1268/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001269static __inline__ __m128i __DEFAULT_FN_ATTRS
1270_mm_cvtepi8_epi64(__m128i __V)
1271{
1272 /* This function always performs a signed extension, but __v16qi is a char
1273 which may be signed or unsigned, so use __v16qs. */
1274 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1275}
1276
Logan Chien55afb0a2018-10-15 10:42:14 +08001277/// Sign-extends each of the lower four 16-bit integer elements of a
1278/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1279/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1280/// vector are unused.
1281///
1282/// \headerfile <x86intrin.h>
1283///
1284/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1285///
1286/// \param __V
1287/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1288/// sign-extended to 32-bit values.
1289/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001290static __inline__ __m128i __DEFAULT_FN_ATTRS
1291_mm_cvtepi16_epi32(__m128i __V)
1292{
1293 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1294}
1295
Logan Chien55afb0a2018-10-15 10:42:14 +08001296/// Sign-extends each of the lower two 16-bit integer elements of a
1297/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1298/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1299/// vector are unused.
1300///
1301/// \headerfile <x86intrin.h>
1302///
1303/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1304///
1305/// \param __V
1306/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1307/// sign-extended to 64-bit values.
1308/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001309static __inline__ __m128i __DEFAULT_FN_ATTRS
1310_mm_cvtepi16_epi64(__m128i __V)
1311{
1312 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1313}
1314
Logan Chien55afb0a2018-10-15 10:42:14 +08001315/// Sign-extends each of the lower two 32-bit integer elements of a
1316/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1317/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1318/// are unused.
1319///
1320/// \headerfile <x86intrin.h>
1321///
1322/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1323///
1324/// \param __V
1325/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1326/// sign-extended to 64-bit values.
1327/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001328static __inline__ __m128i __DEFAULT_FN_ATTRS
1329_mm_cvtepi32_epi64(__m128i __V)
1330{
1331 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1332}
1333
1334/* SSE4 Packed Integer Zero-Extension. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001335/// Zero-extends each of the lower eight 8-bit integer elements of a
1336/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1337/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1338/// are unused.
1339///
1340/// \headerfile <x86intrin.h>
1341///
1342/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1343///
1344/// \param __V
1345/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1346/// zero-extended to 16-bit values.
1347/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001348static __inline__ __m128i __DEFAULT_FN_ATTRS
1349_mm_cvtepu8_epi16(__m128i __V)
1350{
1351 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1352}
1353
Logan Chien55afb0a2018-10-15 10:42:14 +08001354/// Zero-extends each of the lower four 8-bit integer elements of a
1355/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1356/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1357/// vector are unused.
1358///
1359/// \headerfile <x86intrin.h>
1360///
1361/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1362///
1363/// \param __V
1364/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1365/// zero-extended to 32-bit values.
1366/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001367static __inline__ __m128i __DEFAULT_FN_ATTRS
1368_mm_cvtepu8_epi32(__m128i __V)
1369{
1370 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1371}
1372
Logan Chien55afb0a2018-10-15 10:42:14 +08001373/// Zero-extends each of the lower two 8-bit integer elements of a
1374/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1375/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1376/// vector are unused.
1377///
1378/// \headerfile <x86intrin.h>
1379///
1380/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1381///
1382/// \param __V
1383/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1384/// zero-extended to 64-bit values.
1385/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001386static __inline__ __m128i __DEFAULT_FN_ATTRS
1387_mm_cvtepu8_epi64(__m128i __V)
1388{
1389 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1390}
1391
Logan Chien55afb0a2018-10-15 10:42:14 +08001392/// Zero-extends each of the lower four 16-bit integer elements of a
1393/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1394/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1395/// vector are unused.
1396///
1397/// \headerfile <x86intrin.h>
1398///
1399/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1400///
1401/// \param __V
1402/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1403/// zero-extended to 32-bit values.
1404/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001405static __inline__ __m128i __DEFAULT_FN_ATTRS
1406_mm_cvtepu16_epi32(__m128i __V)
1407{
1408 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1409}
1410
Logan Chien55afb0a2018-10-15 10:42:14 +08001411/// Zero-extends each of the lower two 16-bit integer elements of a
1412/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1413/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1414/// are unused.
1415///
1416/// \headerfile <x86intrin.h>
1417///
1418/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1419///
1420/// \param __V
1421/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1422/// zero-extended to 64-bit values.
1423/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001424static __inline__ __m128i __DEFAULT_FN_ATTRS
1425_mm_cvtepu16_epi64(__m128i __V)
1426{
1427 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1428}
1429
Logan Chien55afb0a2018-10-15 10:42:14 +08001430/// Zero-extends each of the lower two 32-bit integer elements of a
1431/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1432/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1433/// are unused.
1434///
1435/// \headerfile <x86intrin.h>
1436///
1437/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1438///
1439/// \param __V
1440/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1441/// zero-extended to 64-bit values.
1442/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001443static __inline__ __m128i __DEFAULT_FN_ATTRS
1444_mm_cvtepu32_epi64(__m128i __V)
1445{
1446 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1447}
1448
1449/* SSE4 Pack with Unsigned Saturation. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001450/// Converts 32-bit signed integers from both 128-bit integer vector
1451/// operands into 16-bit unsigned integers, and returns the packed result.
1452/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1453/// 0x0000 are saturated to 0x0000.
1454///
1455/// \headerfile <x86intrin.h>
1456///
1457/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1458///
1459/// \param __V1
1460/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1461/// signed integer and is converted to a 16-bit unsigned integer with
1462/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1463/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1464/// are written to the lower 64 bits of the result.
1465/// \param __V2
1466/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1467/// signed integer and is converted to a 16-bit unsigned integer with
1468/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1469/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1470/// are written to the higher 64 bits of the result.
1471/// \returns A 128-bit vector of [8 x i16] containing the converted values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001472static __inline__ __m128i __DEFAULT_FN_ATTRS
1473_mm_packus_epi32(__m128i __V1, __m128i __V2)
1474{
1475 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1476}
1477
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Subtracts 8-bit unsigned integer values and computes the absolute
///    values of the differences. Sums of the absolute differences are then
///    returned according to the bit fields in the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                      (__v16qi)(__m128i)(Y), (M)))

Logan Chien55afb0a2018-10-15 10:42:14 +08001518/// Finds the minimum unsigned 16-bit element in the input 128-bit
1519/// vector of [8 x u16] and returns it and along with its index.
1520///
1521/// \headerfile <x86intrin.h>
1522///
1523/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1524/// instruction.
1525///
1526/// \param __V
1527/// A 128-bit vector of [8 x u16].
1528/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1529/// in parameter \a __V, bits [18:16] contain the index of the minimum value
1530/// and the remaining bits are set to 0.
Logan Chien2833ffb2018-10-09 10:03:24 +08001531static __inline__ __m128i __DEFAULT_FN_ATTRS
1532_mm_minpos_epu16(__m128i __V)
1533{
1534 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
1535}
1536
/* Handle the sse4.2 definitions here. */

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same. */

/* From here on the default function attributes target "sse4.2" instead of
   "sse4.1". */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))

/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS                 0x00
#define _SIDD_UWORD_OPS                 0x01
#define _SIDD_SBYTE_OPS                 0x02
#define _SIDD_SWORD_OPS                 0x03

/* These specify the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY             0x00
#define _SIDD_CMP_RANGES                0x04
#define _SIDD_CMP_EQUAL_EACH            0x08
#define _SIDD_CMP_EQUAL_ORDERED         0x0c

/* These macros specify the polarity of the operation. */
#define _SIDD_POSITIVE_POLARITY         0x00
#define _SIDD_NEGATIVE_POLARITY         0x10
#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30

/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_LEAST_SIGNIFICANT         0x00
#define _SIDD_MOST_SIGNIFICANT          0x40

/* These macros are used in _mm_cmpXstrm() to specify the return. They share
   bit 6 with the _mm_cmpXstri() selectors above. */
#define _SIDD_BIT_MASK                  0x00
#define _SIDD_UNIT_MASK                 0x40

/* SSE4.2 Packed Comparison Intrinsics. */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Compares the string data with explicitly defined lengths held in source
///    operands \a A and \a B, as directed by the immediate operand \a M.
///    Returns 1 if the bit mask is zero and the length of the string in \a B
///    is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128( \
      (__v16qi)(__m128i)(A), (int)(LA), \
      (__v16qi)(__m128i)(B), (int)(LB), \
      (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002104
/// Compares the string data with explicitly defined lengths held in source
///    operands \a A and \a B, as directed by the immediate operand \a M.
///    Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128( \
      (__v16qi)(__m128i)(A), (int)(LA), \
      (__v16qi)(__m128i)(B), (int)(LB), \
      (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002158
/// Compares the string data with explicitly defined lengths held in source
///    operands \a A and \a B, as directed by the immediate operand \a M.
///    Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128( \
      (__v16qi)(__m128i)(A), (int)(LA), \
      (__v16qi)(__m128i)(B), (int)(LB), \
      (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002211
/// Compares the string data with explicitly defined lengths held in source
///    operands \a A and \a B, as directed by the immediate operand \a M.
///    Returns 1 if the length of the string in \a A is less than the maximum,
///    otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128( \
      (__v16qi)(__m128i)(A), (int)(LA), \
      (__v16qi)(__m128i)(B), (int)(LB), \
      (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002266
/// Compares the string data with explicitly defined lengths held in source
///    operands \a A and \a B, as directed by the immediate operand \a M.
///    Returns 1 if the length of the string in \a B is less than the maximum,
///    otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128( \
      (__v16qi)(__m128i)(A), (int)(LA), \
      (__v16qi)(__m128i)(B), (int)(LB), \
      (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002320
2321/* SSE4.2 Compare Packed Data -- Greater Than. */
Logan Chien55afb0a2018-10-15 10:42:14 +08002322/// Compares each of the corresponding 64-bit values of the 128-bit
2323/// integer vectors to determine if the values in the first operand are
2324/// greater than those in the second operand.
2325///
2326/// \headerfile <x86intrin.h>
2327///
2328/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2329///
2330/// \param __V1
2331/// A 128-bit integer vector.
2332/// \param __V2
2333/// A 128-bit integer vector.
2334/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien2833ffb2018-10-09 10:03:24 +08002335static __inline__ __m128i __DEFAULT_FN_ATTRS
2336_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
2337{
2338 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2339}
2340
2341/* SSE4.2 Accumulate CRC32. */
Logan Chien55afb0a2018-10-15 10:42:14 +08002342/// Adds the unsigned integer operand to the CRC-32C checksum of the
2343/// unsigned char operand.
2344///
2345/// \headerfile <x86intrin.h>
2346///
2347/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
2348///
2349/// \param __C
2350/// An unsigned integer operand to add to the CRC-32C checksum of operand
2351/// \a __D.
2352/// \param __D
2353/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
2354/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2355/// operand \a __D.
Logan Chien2833ffb2018-10-09 10:03:24 +08002356static __inline__ unsigned int __DEFAULT_FN_ATTRS
2357_mm_crc32_u8(unsigned int __C, unsigned char __D)
2358{
2359 return __builtin_ia32_crc32qi(__C, __D);
2360}
2361
Logan Chien55afb0a2018-10-15 10:42:14 +08002362/// Adds the unsigned integer operand to the CRC-32C checksum of the
2363/// unsigned short operand.
2364///
2365/// \headerfile <x86intrin.h>
2366///
2367/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
2368///
2369/// \param __C
2370/// An unsigned integer operand to add to the CRC-32C checksum of operand
2371/// \a __D.
2372/// \param __D
2373/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
2374/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2375/// operand \a __D.
Logan Chien2833ffb2018-10-09 10:03:24 +08002376static __inline__ unsigned int __DEFAULT_FN_ATTRS
2377_mm_crc32_u16(unsigned int __C, unsigned short __D)
2378{
2379 return __builtin_ia32_crc32hi(__C, __D);
2380}
2381
Logan Chien55afb0a2018-10-15 10:42:14 +08002382/// Adds the first unsigned integer operand to the CRC-32C checksum of
2383/// the second unsigned integer operand.
2384///
2385/// \headerfile <x86intrin.h>
2386///
2387/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
2388///
2389/// \param __C
2390/// An unsigned integer operand to add to the CRC-32C checksum of operand
2391/// \a __D.
2392/// \param __D
2393/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
2394/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2395/// operand \a __D.
Logan Chien2833ffb2018-10-09 10:03:24 +08002396static __inline__ unsigned int __DEFAULT_FN_ATTRS
2397_mm_crc32_u32(unsigned int __C, unsigned int __D)
2398{
2399 return __builtin_ia32_crc32si(__C, __D);
2400}
2401
2402#ifdef __x86_64__
Logan Chien55afb0a2018-10-15 10:42:14 +08002403/// Adds the unsigned integer operand to the CRC-32C checksum of the
2404/// unsigned 64-bit integer operand.
2405///
2406/// \headerfile <x86intrin.h>
2407///
2408/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
2409///
2410/// \param __C
2411/// An unsigned integer operand to add to the CRC-32C checksum of operand
2412/// \a __D.
2413/// \param __D
2414/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
2415/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2416/// operand \a __D.
Logan Chien2833ffb2018-10-09 10:03:24 +08002417static __inline__ unsigned long long __DEFAULT_FN_ATTRS
2418_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
2419{
2420 return __builtin_ia32_crc32di(__C, __D);
2421}
2422#endif /* __x86_64__ */
2423
2424#undef __DEFAULT_FN_ATTRS
2425
Logan Chien2833ffb2018-10-09 10:03:24 +08002426#include <popcntintrin.h>
Logan Chien2833ffb2018-10-09 10:03:24 +08002427
Logan Chien55afb0a2018-10-15 10:42:14 +08002428#endif /* __SMMINTRIN_H */