blob: 710e55aaa1203c77f8aba16de667a89b3953d248 [file] [log] [blame]
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
Logan Chien55afb0a2018-10-15 10:42:14 +080010#ifndef __SMMINTRIN_H
11#define __SMMINTRIN_H
Logan Chien2833ffb2018-10-09 10:03:24 +080012
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -080013#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
Logan Chien2833ffb2018-10-09 10:03:24 +080017#include <tmmintrin.h>
18
/* Define the default attributes for the functions in this file: every inline
 * wrapper is force-inlined, carries the "sse4.1" target feature, and declares
 * a minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),        \
                 __min_vector_width__(128)))
Logan Chien2833ffb2018-10-09 10:03:24 +080021
/* SSE4 Rounding macros. */

/* Bits [1:0] of the rounding immediate: explicit rounding direction.
 * Bit [2]: take the rounding direction from the current MXCSR setting. */
#define _MM_FROUND_TO_NEAREST_INT    0x00
#define _MM_FROUND_TO_NEG_INF        0x01
#define _MM_FROUND_TO_POS_INF        0x02
#define _MM_FROUND_TO_ZERO           0x03
#define _MM_FROUND_CUR_DIRECTION     0x04

/* Bit [3] of the rounding immediate: when set, the precision exception (PE)
 * field is not updated. */
#define _MM_FROUND_RAISE_EXC         0x00
#define _MM_FROUND_NO_EXC            0x08

/* Ready-made combinations of an exception policy and a rounding direction. */
#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
38
/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
Logan Chien55afb0a2018-10-15 10:42:14 +080055
/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
Logan Chien55afb0a2018-10-15 10:42:14 +080072
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to an integer and copied to the corresponding bits of the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
Logan Chien55afb0a2018-10-15 10:42:14 +080097
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to an integer and copied to the corresponding bits of the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
122
/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
Logan Chien55afb0a2018-10-15 10:42:14 +0800139
/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
Logan Chien55afb0a2018-10-15 10:42:14 +0800156
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to an integer and copied to the corresponding bits of the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
Logan Chien55afb0a2018-10-15 10:42:14 +0800181
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to an integer and copied to the corresponding bits of the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
206
/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800239
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to an integer using the specified rounding control and copied
///    to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800281
/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800314
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to an integer using the specified rounding control and copied
///    to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800356
/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
                                     (__v2df)(__m128d)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800385
/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
                                    (__v4sf)(__m128)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800413
Logan Chien55afb0a2018-10-15 10:42:14 +0800414/// Returns a 128-bit vector of [2 x double] where the values are
415/// selected from either the first or second operand as specified by the
416/// third operand, the control mask.
417///
418/// \headerfile <x86intrin.h>
419///
420/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
421///
422/// \param __V1
423/// A 128-bit vector of [2 x double].
424/// \param __V2
425/// A 128-bit vector of [2 x double].
426/// \param __M
427/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
428/// values are to be copied. The position of the mask bit corresponds to the
429/// most significant bit of a copied value. When a mask bit is 0, the
430/// corresponding 64-bit element in operand \a __V1 is copied to the same
431/// position in the result. When a mask bit is 1, the corresponding 64-bit
432/// element in operand \a __V2 is copied to the same position in the result.
433/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800434static __inline__ __m128d __DEFAULT_FN_ATTRS
435_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
436{
437 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
438 (__v2df)__M);
439}
440
Logan Chien55afb0a2018-10-15 10:42:14 +0800441/// Returns a 128-bit vector of [4 x float] where the values are
442/// selected from either the first or second operand as specified by the
443/// third operand, the control mask.
444///
445/// \headerfile <x86intrin.h>
446///
447/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
448///
449/// \param __V1
450/// A 128-bit vector of [4 x float].
451/// \param __V2
452/// A 128-bit vector of [4 x float].
453/// \param __M
454/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
455/// how the values are to be copied. The position of the mask bit corresponds
456/// to the most significant bit of a copied value. When a mask bit is 0, the
457/// corresponding 32-bit element in operand \a __V1 is copied to the same
458/// position in the result. When a mask bit is 1, the corresponding 32-bit
459/// element in operand \a __V2 is copied to the same position in the result.
460/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800461static __inline__ __m128 __DEFAULT_FN_ATTRS
462_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
463{
464 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
465 (__v4sf)__M);
466}
467
Logan Chien55afb0a2018-10-15 10:42:14 +0800468/// Returns a 128-bit vector of [16 x i8] where the values are selected
469/// from either of the first or second operand as specified by the third
470/// operand, the control mask.
471///
472/// \headerfile <x86intrin.h>
473///
474/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
475///
476/// \param __V1
477/// A 128-bit vector of [16 x i8].
478/// \param __V2
479/// A 128-bit vector of [16 x i8].
480/// \param __M
481/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
482/// how the values are to be copied. The position of the mask bit corresponds
483/// to the most significant bit of a copied value. When a mask bit is 0, the
484/// corresponding 8-bit element in operand \a __V1 is copied to the same
485/// position in the result. When a mask bit is 1, the corresponding 8-bit
486/// element in operand \a __V2 is copied to the same position in the result.
487/// \returns A 128-bit vector of [16 x i8] containing the copied values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800488static __inline__ __m128i __DEFAULT_FN_ATTRS
489_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
490{
491 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
492 (__v16qi)__M);
493}
494
/// Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
                                        (__v8hi)(__m128i)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800522
523/* SSE4 Dword Multiply Instructions. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800524/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
525/// and returns the lower 32 bits of the each product in a 128-bit vector of
526/// [4 x i32].
527///
528/// \headerfile <x86intrin.h>
529///
530/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
531///
532/// \param __V1
533/// A 128-bit integer vector.
534/// \param __V2
535/// A 128-bit integer vector.
536/// \returns A 128-bit integer vector containing the products of both operands.
Logan Chien2833ffb2018-10-09 10:03:24 +0800537static __inline__ __m128i __DEFAULT_FN_ATTRS
538_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
539{
540 return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
541}
542
Logan Chien55afb0a2018-10-15 10:42:14 +0800543/// Multiplies corresponding even-indexed elements of two 128-bit
544/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
545/// containing the products.
546///
547/// \headerfile <x86intrin.h>
548///
549/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
550///
551/// \param __V1
552/// A 128-bit vector of [4 x i32].
553/// \param __V2
554/// A 128-bit vector of [4 x i32].
555/// \returns A 128-bit vector of [2 x i64] containing the products of both
556/// operands.
Logan Chien2833ffb2018-10-09 10:03:24 +0800557static __inline__ __m128i __DEFAULT_FN_ATTRS
558_mm_mul_epi32 (__m128i __V1, __m128i __V2)
559{
560 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
561}
562
/* SSE4 Floating Point Dot Product Instructions.  */
/// Computes the dot product of the two 128-bit vectors of [4 x float]
///    and returns it in the elements of the 128-bit result vector of
///    [4 x float].
///
///    The immediate integer operand controls which input elements
///    will contribute to the dot product, and where the final results are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    the [4 x float] result vector. If a bit is set, the dot product is
///    returned in the corresponding element; otherwise that element is set to
///    zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
                                (__v4sf)(__m128)(Y), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800599
/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    the [2 x double] result vector. If a bit is set, the dot product is
///    returned in the corresponding element; otherwise that element is set to
///    zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                 (__v2df)(__m128d)(Y), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800634
635/* SSE4 Streaming Load Hint Instruction. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800636/// Loads integer values from a 128-bit aligned memory location to a
637/// 128-bit integer vector.
638///
639/// \headerfile <x86intrin.h>
640///
641/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
642///
643/// \param __V
644/// A pointer to a 128-bit aligned memory location that contains the integer
645/// values.
646/// \returns A 128-bit integer vector containing the data stored at the
647/// specified memory location.
Logan Chien2833ffb2018-10-09 10:03:24 +0800648static __inline__ __m128i __DEFAULT_FN_ATTRS
649_mm_stream_load_si128 (__m128i const *__V)
650{
Logan Chien55afb0a2018-10-15 10:42:14 +0800651 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
Logan Chien2833ffb2018-10-09 10:03:24 +0800652}
653
654/* SSE4 Packed Integer Min/Max Instructions. */
Logan Chien55afb0a2018-10-15 10:42:14 +0800655/// Compares the corresponding elements of two 128-bit vectors of
656/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
657/// of the two values.
658///
659/// \headerfile <x86intrin.h>
660///
661/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
662///
663/// \param __V1
664/// A 128-bit vector of [16 x i8].
665/// \param __V2
666/// A 128-bit vector of [16 x i8]
667/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800668static __inline__ __m128i __DEFAULT_FN_ATTRS
669_mm_min_epi8 (__m128i __V1, __m128i __V2)
670{
671 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
672}
673
Logan Chien55afb0a2018-10-15 10:42:14 +0800674/// Compares the corresponding elements of two 128-bit vectors of
675/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
676/// greater value of the two.
677///
678/// \headerfile <x86intrin.h>
679///
680/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
681///
682/// \param __V1
683/// A 128-bit vector of [16 x i8].
684/// \param __V2
685/// A 128-bit vector of [16 x i8].
686/// \returns A 128-bit vector of [16 x i8] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800687static __inline__ __m128i __DEFAULT_FN_ATTRS
688_mm_max_epi8 (__m128i __V1, __m128i __V2)
689{
690 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
691}
692
Logan Chien55afb0a2018-10-15 10:42:14 +0800693/// Compares the corresponding elements of two 128-bit vectors of
694/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
695/// value of the two.
696///
697/// \headerfile <x86intrin.h>
698///
699/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
700///
701/// \param __V1
702/// A 128-bit vector of [8 x u16].
703/// \param __V2
704/// A 128-bit vector of [8 x u16].
705/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800706static __inline__ __m128i __DEFAULT_FN_ATTRS
707_mm_min_epu16 (__m128i __V1, __m128i __V2)
708{
709 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
710}
711
Logan Chien55afb0a2018-10-15 10:42:14 +0800712/// Compares the corresponding elements of two 128-bit vectors of
713/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
714/// greater value of the two.
715///
716/// \headerfile <x86intrin.h>
717///
718/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
719///
720/// \param __V1
721/// A 128-bit vector of [8 x u16].
722/// \param __V2
723/// A 128-bit vector of [8 x u16].
724/// \returns A 128-bit vector of [8 x u16] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800725static __inline__ __m128i __DEFAULT_FN_ATTRS
726_mm_max_epu16 (__m128i __V1, __m128i __V2)
727{
728 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
729}
730
Logan Chien55afb0a2018-10-15 10:42:14 +0800731/// Compares the corresponding elements of two 128-bit vectors of
732/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
733/// value of the two.
734///
735/// \headerfile <x86intrin.h>
736///
737/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
738///
739/// \param __V1
740/// A 128-bit vector of [4 x i32].
741/// \param __V2
742/// A 128-bit vector of [4 x i32].
743/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800744static __inline__ __m128i __DEFAULT_FN_ATTRS
745_mm_min_epi32 (__m128i __V1, __m128i __V2)
746{
747 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
748}
749
Logan Chien55afb0a2018-10-15 10:42:14 +0800750/// Compares the corresponding elements of two 128-bit vectors of
751/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
752/// greater value of the two.
753///
754/// \headerfile <x86intrin.h>
755///
756/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
757///
758/// \param __V1
759/// A 128-bit vector of [4 x i32].
760/// \param __V2
761/// A 128-bit vector of [4 x i32].
762/// \returns A 128-bit vector of [4 x i32] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800763static __inline__ __m128i __DEFAULT_FN_ATTRS
764_mm_max_epi32 (__m128i __V1, __m128i __V2)
765{
766 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
767}
768
Logan Chien55afb0a2018-10-15 10:42:14 +0800769/// Compares the corresponding elements of two 128-bit vectors of
770/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
771/// value of the two.
772///
773/// \headerfile <x86intrin.h>
774///
775/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
776///
777/// \param __V1
778/// A 128-bit vector of [4 x u32].
779/// \param __V2
780/// A 128-bit vector of [4 x u32].
781/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800782static __inline__ __m128i __DEFAULT_FN_ATTRS
783_mm_min_epu32 (__m128i __V1, __m128i __V2)
784{
785 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
786}
787
Logan Chien55afb0a2018-10-15 10:42:14 +0800788/// Compares the corresponding elements of two 128-bit vectors of
789/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
790/// greater value of the two.
791///
792/// \headerfile <x86intrin.h>
793///
794/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
795///
796/// \param __V1
797/// A 128-bit vector of [4 x u32].
798/// \param __V2
799/// A 128-bit vector of [4 x u32].
800/// \returns A 128-bit vector of [4 x u32] containing the greater values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800801static __inline__ __m128i __DEFAULT_FN_ATTRS
802_mm_max_epu32 (__m128i __V1, __m128i __V2)
803{
804 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
805}
806
807/* SSE4 Insertion and Extraction from XMM Register Instructions. */
/// Builds a 128-bit vector of [4 x float] from \a X by inserting one element
///    taken from \a Y and then clearing selected result elements, both under
///    control of the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
///
/// \param X
///    A 128-bit vector source operand of [4 x float]. Every bit of this
///    operand is copied to the result, except for the element replaced from
///    \a Y and the elements zeroed by bits [3:0] of \a N.
/// \param Y
///    A 128-bit vector source operand of [4 x float]. One single-precision
///    floating-point element of this operand, chosen by the immediate
///    parameter, is copied to the result.
/// \param N
///    An immediate selecting the source element, the destination element and
///    the elements to clear: \n
///    Bits [7:6] choose the element of \a Y to copy: \n
///      00: Bits [31:0] of \a Y. \n
///      01: Bits [63:32] of \a Y. \n
///      10: Bits [95:64] of \a Y. \n
///      11: Bits [127:96] of \a Y. \n
///    Bits [5:4] choose where in the result the selected element is placed: \n
///      00: Result bits [31:0]. \n
///      01: Result bits [63:32]. \n
///      10: Result bits [95:64]. \n
///      11: Result bits [127:96]. \n
///    Bits [3:0]: each bit that is set clears the corresponding result
///      element.
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) \
  __builtin_ia32_insertps128((X), (Y), (N))
Logan Chien55afb0a2018-10-15 10:42:14 +0800849
/// Returns, as a 32-bit integer, the bit pattern of one element of a 128-bit
///    vector of [4 x float]; the element is chosen by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] choose which bits of \a X are extracted
///    and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) \
  __builtin_bit_cast(int, \
                     __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                                 (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800874
875/* Miscellaneous insert and extract macros. */
/* Store into D the single-precision float found at index N of X. The
   do/while (0) wrapper lets the macro be used as a single statement. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
Logan Chien2833ffb2018-10-09 10:03:24 +0800879
/* Build an index suitable for _mm_insert_ps: X is shifted into bits [7:6],
   Y into bits [5:4], and the zeroing bits Z are OR-ed in unshifted. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) \
  (((X) << 6) | ((Y) << 4) | (Z))
883
/* Copy the float at index N of X into the first element of an otherwise
   zero vector (the 0x0e zeroing bits cover the other three elements). */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
887
888/* Insert int into packed integer array at index. */
/// Returns a copy of a 128-bit vector of [16 x i8] in which one element has
///    been replaced by the lower 8 bits of the integer \a I; the element is
///    chosen by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. It is copied to the result,
///    after which one of the sixteen result elements is replaced by the
///    lower 8 bits of \a I.
/// \param I
///    An integer. Its lower 8 bits are written to the result at the position
///    selected by \a N.
/// \param N
///    An immediate value. Bits [3:0] give the index of the result byte that
///    receives the lower 8 bits of \a I: index 0000 selects result bits
///    [7:0], 0001 selects bits [15:8], and so on in steps of 8 bits up to
///    1111, which selects bits [127:120].
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \
                                         (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +0800932
/// Returns a copy of a 128-bit vector of [4 x i32] in which one element has
///    been replaced by the 32-bit integer \a I; the element is chosen by the
///    immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. It is copied to the result,
///    after which one of the four result elements is replaced by \a I.
/// \param I
///    A 32-bit integer written to the result at the position selected by
///    \a N.
/// \param N
///    An immediate value. Bits [1:0] choose where in the result the integer
///    \a I is written: \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \
                                        (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +0800964
#ifdef __x86_64__
/// Returns a copy of a 128-bit vector of [2 x i64] in which one element has
///    been replaced by the 64-bit integer \a I; the element is chosen by the
///    immediate \a N. Only defined when \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. It is copied to the result,
///    after which one of the two result elements is replaced by \a I.
/// \param I
///    A 64-bit integer written to the result at the position selected by
///    \a N.
/// \param N
///    An immediate value. Bit [0] chooses where in the result the integer
///    \a I is written: \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
                                        (long long)(I), (int)(N)))
#endif /* __x86_64__ */
996
997/* Extract int from packed integer array at index. This returns the element
998 * as a zero extended value, so it is unsigned.
999 */
/// Reads one 8-bit element out of a 128-bit integer vector of [16 x i8];
///    the element is chosen by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] give the index of the 8-bit element of
///    \a X to extract: index 0000 selects bits [7:0], 0001 selects bits
///    [15:8], and so on in steps of 8 bits up to 1111, which selects bits
///    [127:120].
/// \returns An unsigned integer whose lower 8 bits are the selected element
///    and whose remaining bits are zero (the element is converted through
///    unsigned char before being widened to int).
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi( \
      (__v16qi)(__m128i)(X), (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +08001038
/// Reads one 32-bit element out of a 128-bit integer vector of [4 x i32];
///    the element is chosen by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] choose which 32-bit element of \a X is
///    extracted and copied to the result: \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns An integer holding the 32 bits selected from the 128-bit integer
///    vector parameter.
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), \
                                    (int)(N)))
Logan Chien55afb0a2018-10-15 10:42:14 +08001063
#ifdef __x86_64__
/// Reads one 64-bit element out of a 128-bit integer vector of [2 x i64];
///    the element is chosen by the immediate \a N. Only defined when
///    \c __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] chooses which 64-bit element of \a X is
///    returned: \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), \
                                          (int)(N)))
#endif /* __x86_64__ */
1087
1088/* SSE4 128-bit Packed Integer Comparisons. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001089/// Tests whether the specified bits in a 128-bit integer vector are all
1090/// zeros.
1091///
1092/// \headerfile <x86intrin.h>
1093///
1094/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1095///
1096/// \param __M
1097/// A 128-bit integer vector containing the bits to be tested.
1098/// \param __V
1099/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1100/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
Logan Chien2833ffb2018-10-09 10:03:24 +08001101static __inline__ int __DEFAULT_FN_ATTRS
1102_mm_testz_si128(__m128i __M, __m128i __V)
1103{
1104 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1105}
1106
Logan Chien55afb0a2018-10-15 10:42:14 +08001107/// Tests whether the specified bits in a 128-bit integer vector are all
1108/// ones.
1109///
1110/// \headerfile <x86intrin.h>
1111///
1112/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1113///
1114/// \param __M
1115/// A 128-bit integer vector containing the bits to be tested.
1116/// \param __V
1117/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1118/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
Logan Chien2833ffb2018-10-09 10:03:24 +08001119static __inline__ int __DEFAULT_FN_ATTRS
1120_mm_testc_si128(__m128i __M, __m128i __V)
1121{
1122 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1123}
1124
Logan Chien55afb0a2018-10-15 10:42:14 +08001125/// Tests whether the specified bits in a 128-bit integer vector are
1126/// neither all zeros nor all ones.
1127///
1128/// \headerfile <x86intrin.h>
1129///
1130/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1131///
1132/// \param __M
1133/// A 128-bit integer vector containing the bits to be tested.
1134/// \param __V
1135/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1136/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1137/// FALSE otherwise.
Logan Chien2833ffb2018-10-09 10:03:24 +08001138static __inline__ int __DEFAULT_FN_ATTRS
1139_mm_testnzc_si128(__m128i __M, __m128i __V)
1140{
1141 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1142}
1143
/// Tests whether all bits of a 128-bit integer vector are set to 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector containing the bits to be tested. The
///    argument expression is evaluated exactly once.
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
/* The all-ones selector is built with _mm_set1_epi32(-1) rather than by
   comparing V with itself, so V appears only once in the expansion and an
   argument with side effects is not evaluated repeatedly. */
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
Logan Chien55afb0a2018-10-15 10:42:14 +08001160
/// Checks whether the selected bits of a 128-bit integer vector are a mix
///    of zeros and ones, i.e. neither all zeros nor all ones. This is an
///    alias that forwards to _mm_testnzc_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M to test.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))
Logan Chien55afb0a2018-10-15 10:42:14 +08001179
/// Checks whether the selected bits of a 128-bit integer vector are all
///    zeros. This is an alias that forwards to _mm_testz_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M to test.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))
1197
1198/* SSE4 64-bit Packed Integer Comparisons. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001199/// Compares each of the corresponding 64-bit values of the 128-bit
1200/// integer vectors for equality.
1201///
1202/// \headerfile <x86intrin.h>
1203///
1204/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1205///
1206/// \param __V1
1207/// A 128-bit integer vector.
1208/// \param __V2
1209/// A 128-bit integer vector.
1210/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien2833ffb2018-10-09 10:03:24 +08001211static __inline__ __m128i __DEFAULT_FN_ATTRS
1212_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
1213{
1214 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1215}
1216
1217/* SSE4 Packed Integer Sign-Extension. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001218/// Sign-extends each of the lower eight 8-bit integer elements of a
1219/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1220/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1221/// are unused.
1222///
1223/// \headerfile <x86intrin.h>
1224///
1225/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1226///
1227/// \param __V
1228/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
1229/// extended to 16-bit values.
1230/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001231static __inline__ __m128i __DEFAULT_FN_ATTRS
1232_mm_cvtepi8_epi16(__m128i __V)
1233{
1234 /* This function always performs a signed extension, but __v16qi is a char
1235 which may be signed or unsigned, so use __v16qs. */
1236 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1237}
1238
Logan Chien55afb0a2018-10-15 10:42:14 +08001239/// Sign-extends each of the lower four 8-bit integer elements of a
1240/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1241/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1242/// vector are unused.
1243///
1244/// \headerfile <x86intrin.h>
1245///
1246/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1247///
1248/// \param __V
1249/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1250/// sign-extended to 32-bit values.
1251/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001252static __inline__ __m128i __DEFAULT_FN_ATTRS
1253_mm_cvtepi8_epi32(__m128i __V)
1254{
1255 /* This function always performs a signed extension, but __v16qi is a char
1256 which may be signed or unsigned, so use __v16qs. */
1257 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1258}
1259
Logan Chien55afb0a2018-10-15 10:42:14 +08001260/// Sign-extends each of the lower two 8-bit integer elements of a
1261/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1262/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1263/// vector are unused.
1264///
1265/// \headerfile <x86intrin.h>
1266///
1267/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1268///
1269/// \param __V
1270/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1271/// sign-extended to 64-bit values.
1272/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001273static __inline__ __m128i __DEFAULT_FN_ATTRS
1274_mm_cvtepi8_epi64(__m128i __V)
1275{
1276 /* This function always performs a signed extension, but __v16qi is a char
1277 which may be signed or unsigned, so use __v16qs. */
1278 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1279}
1280
Logan Chien55afb0a2018-10-15 10:42:14 +08001281/// Sign-extends each of the lower four 16-bit integer elements of a
1282/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1283/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1284/// vector are unused.
1285///
1286/// \headerfile <x86intrin.h>
1287///
1288/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1289///
1290/// \param __V
1291/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1292/// sign-extended to 32-bit values.
1293/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001294static __inline__ __m128i __DEFAULT_FN_ATTRS
1295_mm_cvtepi16_epi32(__m128i __V)
1296{
1297 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1298}
1299
Logan Chien55afb0a2018-10-15 10:42:14 +08001300/// Sign-extends each of the lower two 16-bit integer elements of a
1301/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1302/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1303/// vector are unused.
1304///
1305/// \headerfile <x86intrin.h>
1306///
1307/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1308///
1309/// \param __V
1310/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1311/// sign-extended to 64-bit values.
1312/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001313static __inline__ __m128i __DEFAULT_FN_ATTRS
1314_mm_cvtepi16_epi64(__m128i __V)
1315{
1316 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1317}
1318
Logan Chien55afb0a2018-10-15 10:42:14 +08001319/// Sign-extends each of the lower two 32-bit integer elements of a
1320/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1321/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1322/// are unused.
1323///
1324/// \headerfile <x86intrin.h>
1325///
1326/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1327///
1328/// \param __V
1329/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1330/// sign-extended to 64-bit values.
1331/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001332static __inline__ __m128i __DEFAULT_FN_ATTRS
1333_mm_cvtepi32_epi64(__m128i __V)
1334{
1335 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1336}
1337
1338/* SSE4 Packed Integer Zero-Extension. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001339/// Zero-extends each of the lower eight 8-bit integer elements of a
1340/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1341/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1342/// are unused.
1343///
1344/// \headerfile <x86intrin.h>
1345///
1346/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1347///
1348/// \param __V
1349/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1350/// zero-extended to 16-bit values.
1351/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001352static __inline__ __m128i __DEFAULT_FN_ATTRS
1353_mm_cvtepu8_epi16(__m128i __V)
1354{
1355 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1356}
1357
Logan Chien55afb0a2018-10-15 10:42:14 +08001358/// Zero-extends each of the lower four 8-bit integer elements of a
1359/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1360/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1361/// vector are unused.
1362///
1363/// \headerfile <x86intrin.h>
1364///
1365/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1366///
1367/// \param __V
1368/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1369/// zero-extended to 32-bit values.
1370/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001371static __inline__ __m128i __DEFAULT_FN_ATTRS
1372_mm_cvtepu8_epi32(__m128i __V)
1373{
1374 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1375}
1376
Logan Chien55afb0a2018-10-15 10:42:14 +08001377/// Zero-extends each of the lower two 8-bit integer elements of a
1378/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1379/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1380/// vector are unused.
1381///
1382/// \headerfile <x86intrin.h>
1383///
1384/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1385///
1386/// \param __V
1387/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1388/// zero-extended to 64-bit values.
1389/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001390static __inline__ __m128i __DEFAULT_FN_ATTRS
1391_mm_cvtepu8_epi64(__m128i __V)
1392{
1393 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1394}
1395
Logan Chien55afb0a2018-10-15 10:42:14 +08001396/// Zero-extends each of the lower four 16-bit integer elements of a
1397/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1398/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1399/// vector are unused.
1400///
1401/// \headerfile <x86intrin.h>
1402///
1403/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1404///
1405/// \param __V
1406/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1407/// zero-extended to 32-bit values.
1408/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001409static __inline__ __m128i __DEFAULT_FN_ATTRS
1410_mm_cvtepu16_epi32(__m128i __V)
1411{
1412 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1413}
1414
Logan Chien55afb0a2018-10-15 10:42:14 +08001415/// Zero-extends each of the lower two 16-bit integer elements of a
1416/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1417/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1418/// are unused.
1419///
1420/// \headerfile <x86intrin.h>
1421///
1422/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1423///
1424/// \param __V
1425/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1426/// zero-extended to 64-bit values.
1427/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001428static __inline__ __m128i __DEFAULT_FN_ATTRS
1429_mm_cvtepu16_epi64(__m128i __V)
1430{
1431 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1432}
1433
Logan Chien55afb0a2018-10-15 10:42:14 +08001434/// Zero-extends each of the lower two 32-bit integer elements of a
1435/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1436/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1437/// are unused.
1438///
1439/// \headerfile <x86intrin.h>
1440///
1441/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1442///
1443/// \param __V
1444/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1445/// zero-extended to 64-bit values.
1446/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001447static __inline__ __m128i __DEFAULT_FN_ATTRS
1448_mm_cvtepu32_epi64(__m128i __V)
1449{
1450 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1451}
1452
1453/* SSE4 Pack with Unsigned Saturation. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001454/// Converts 32-bit signed integers from both 128-bit integer vector
1455/// operands into 16-bit unsigned integers, and returns the packed result.
1456/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1457/// 0x0000 are saturated to 0x0000.
1458///
1459/// \headerfile <x86intrin.h>
1460///
1461/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1462///
1463/// \param __V1
1464/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1465/// signed integer and is converted to a 16-bit unsigned integer with
1466/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1467/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1468/// are written to the lower 64 bits of the result.
1469/// \param __V2
1470/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1471/// signed integer and is converted to a 16-bit unsigned integer with
1472/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1473/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1474/// are written to the higher 64 bits of the result.
1475/// \returns A 128-bit vector of [8 x i16] containing the converted values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001476static __inline__ __m128i __DEFAULT_FN_ATTRS
1477_mm_packus_epi32(__m128i __V1, __m128i __V2)
1478{
1479 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1480}
1481
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sums of absolute differences between groups of four unsigned
///    8-bit integers selected from the two source operands. The groups are
///    chosen by the bit fields of the immediate operand, and eight sums are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                      (__v16qi)(__m128i)(Y), (M)))

Logan Chien55afb0a2018-10-15 10:42:14 +08001522/// Finds the minimum unsigned 16-bit element in the input 128-bit
1523/// vector of [8 x u16] and returns it and along with its index.
1524///
1525/// \headerfile <x86intrin.h>
1526///
1527/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1528/// instruction.
1529///
1530/// \param __V
1531/// A 128-bit vector of [8 x u16].
1532/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1533/// in parameter \a __V, bits [18:16] contain the index of the minimum value
1534/// and the remaining bits are set to 0.
Logan Chien2833ffb2018-10-09 10:03:24 +08001535static __inline__ __m128i __DEFAULT_FN_ATTRS
1536_mm_minpos_epu16(__m128i __V)
1537{
1538 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
1539}
1540
/* Handle the sse4.2 definitions here. */

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same. */

/* Replace the default function attributes so that the definitions that follow
   target "sse4.2". */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))

/* These specify the type of data that we're comparing
   (bits [1:0] of the immediate operand). */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03

/* These specify the type of comparison operation
   (bits [3:2] of the immediate operand). */
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c

/* These macros specify the polarity of the operation
   (bits [5:4] of the immediate operand). */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30

/* These macros are used in _mm_cmpXstri() to specify which set-bit index is
   returned (bit 6 of the immediate operand). */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* These macros are used in _mm_cmpXstrm() to specify the format of the
   returned mask (bit 6 of the immediate operand). */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40

/* SSE4.2 Packed Comparison Intrinsics. */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit.
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit.
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002108
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002162
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002215
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))
Logan Chien55afb0a2018-10-15 10:42:14 +08002270
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002324
2325/* SSE4.2 Compare Packed Data -- Greater Than. */
Logan Chien55afb0a2018-10-15 10:42:14 +08002326/// Compares each of the corresponding 64-bit values of the 128-bit
2327/// integer vectors to determine if the values in the first operand are
2328/// greater than those in the second operand.
2329///
2330/// \headerfile <x86intrin.h>
2331///
2332/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2333///
2334/// \param __V1
2335/// A 128-bit integer vector.
2336/// \param __V2
2337/// A 128-bit integer vector.
2338/// \returns A 128-bit integer vector containing the comparison results.
Logan Chien2833ffb2018-10-09 10:03:24 +08002339static __inline__ __m128i __DEFAULT_FN_ATTRS
2340_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
2341{
2342 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2343}
2344
Logan Chien2833ffb2018-10-09 10:03:24 +08002345#undef __DEFAULT_FN_ATTRS
2346
Logan Chien2833ffb2018-10-09 10:03:24 +08002347#include <popcntintrin.h>
Logan Chien2833ffb2018-10-09 10:03:24 +08002348
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08002349#include <crc32intrin.h>
2350
Logan Chien55afb0a2018-10-15 10:42:14 +08002351#endif /* __SMMINTRIN_H */