blob: 620453c97783cd9d7e0013e90f90759635fb7080 [file] [log] [blame]
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
13#include <mmintrin.h>
14
/* Element-typed 128-bit vectors used for casts inside this header. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* 128-bit vector of [4 x float], declared with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same 128-bit vector of [4 x float], declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
23
24/* This header should only be included in a hosted environment as it depends on
25 * a standard library to provide allocation routines. */
26#if __STDC_HOSTED__
27#include <mm_malloc.h>
28#endif
29
/* Define the default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS targets "sse" with a minimum vector width of 128;
 * __DEFAULT_FN_ATTRS_MMX targets "mmx,sse" with a minimum vector width of 64.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"),        \
                 __min_vector_width__(64)))
Logan Chien2833ffb2018-10-09 10:03:24 +080033
Logan Chien55afb0a2018-10-15 10:42:14 +080034/// Adds the 32-bit float values in the low-order bits of the operands.
Logan Chien2833ffb2018-10-09 10:03:24 +080035///
36/// \headerfile <x86intrin.h>
37///
Logan Chien55afb0a2018-10-15 10:42:14 +080038/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +080039///
40/// \param __a
41/// A 128-bit vector of [4 x float] containing one of the source operands.
42/// The lower 32 bits of this operand are used in the calculation.
43/// \param __b
44/// A 128-bit vector of [4 x float] containing one of the source operands.
45/// The lower 32 bits of this operand are used in the calculation.
46/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
47/// of the lower 32 bits of both operands. The upper 96 bits are copied from
48/// the upper 96 bits of the first source operand.
49static __inline__ __m128 __DEFAULT_FN_ATTRS
50_mm_add_ss(__m128 __a, __m128 __b)
51{
52 __a[0] += __b[0];
53 return __a;
54}
55
Logan Chien55afb0a2018-10-15 10:42:14 +080056/// Adds two 128-bit vectors of [4 x float], and returns the results of
Logan Chien2833ffb2018-10-09 10:03:24 +080057/// the addition.
58///
59/// \headerfile <x86intrin.h>
60///
Logan Chien55afb0a2018-10-15 10:42:14 +080061/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +080062///
63/// \param __a
64/// A 128-bit vector of [4 x float] containing one of the source operands.
65/// \param __b
66/// A 128-bit vector of [4 x float] containing one of the source operands.
67/// \returns A 128-bit vector of [4 x float] containing the sums of both
68/// operands.
69static __inline__ __m128 __DEFAULT_FN_ATTRS
70_mm_add_ps(__m128 __a, __m128 __b)
71{
72 return (__m128)((__v4sf)__a + (__v4sf)__b);
73}
74
Logan Chien55afb0a2018-10-15 10:42:14 +080075/// Subtracts the 32-bit float value in the low-order bits of the second
Logan Chien2833ffb2018-10-09 10:03:24 +080076/// operand from the corresponding value in the first operand.
77///
78/// \headerfile <x86intrin.h>
79///
Logan Chien55afb0a2018-10-15 10:42:14 +080080/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +080081///
82/// \param __a
83/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
84/// of this operand are used in the calculation.
85/// \param __b
86/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
87/// bits of this operand are used in the calculation.
88/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
89/// difference of the lower 32 bits of both operands. The upper 96 bits are
90/// copied from the upper 96 bits of the first source operand.
91static __inline__ __m128 __DEFAULT_FN_ATTRS
92_mm_sub_ss(__m128 __a, __m128 __b)
93{
94 __a[0] -= __b[0];
95 return __a;
96}
97
Logan Chien55afb0a2018-10-15 10:42:14 +080098/// Subtracts each of the values of the second operand from the first
Logan Chien2833ffb2018-10-09 10:03:24 +080099/// operand, both of which are 128-bit vectors of [4 x float] and returns
100/// the results of the subtraction.
101///
102/// \headerfile <x86intrin.h>
103///
Logan Chien55afb0a2018-10-15 10:42:14 +0800104/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800105///
106/// \param __a
107/// A 128-bit vector of [4 x float] containing the minuend.
108/// \param __b
109/// A 128-bit vector of [4 x float] containing the subtrahend.
110/// \returns A 128-bit vector of [4 x float] containing the differences between
111/// both operands.
112static __inline__ __m128 __DEFAULT_FN_ATTRS
113_mm_sub_ps(__m128 __a, __m128 __b)
114{
115 return (__m128)((__v4sf)__a - (__v4sf)__b);
116}
117
Logan Chien55afb0a2018-10-15 10:42:14 +0800118/// Multiplies two 32-bit float values in the low-order bits of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800119/// operands.
120///
121/// \headerfile <x86intrin.h>
122///
Logan Chien55afb0a2018-10-15 10:42:14 +0800123/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800124///
125/// \param __a
126/// A 128-bit vector of [4 x float] containing one of the source operands.
127/// The lower 32 bits of this operand are used in the calculation.
128/// \param __b
129/// A 128-bit vector of [4 x float] containing one of the source operands.
130/// The lower 32 bits of this operand are used in the calculation.
131/// \returns A 128-bit vector of [4 x float] containing the product of the lower
132/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
133/// bits of the first source operand.
134static __inline__ __m128 __DEFAULT_FN_ATTRS
135_mm_mul_ss(__m128 __a, __m128 __b)
136{
137 __a[0] *= __b[0];
138 return __a;
139}
140
Logan Chien55afb0a2018-10-15 10:42:14 +0800141/// Multiplies two 128-bit vectors of [4 x float] and returns the
Logan Chien2833ffb2018-10-09 10:03:24 +0800142/// results of the multiplication.
143///
144/// \headerfile <x86intrin.h>
145///
Logan Chien55afb0a2018-10-15 10:42:14 +0800146/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800147///
148/// \param __a
149/// A 128-bit vector of [4 x float] containing one of the source operands.
150/// \param __b
151/// A 128-bit vector of [4 x float] containing one of the source operands.
152/// \returns A 128-bit vector of [4 x float] containing the products of both
153/// operands.
154static __inline__ __m128 __DEFAULT_FN_ATTRS
155_mm_mul_ps(__m128 __a, __m128 __b)
156{
157 return (__m128)((__v4sf)__a * (__v4sf)__b);
158}
159
Logan Chien55afb0a2018-10-15 10:42:14 +0800160/// Divides the value in the low-order 32 bits of the first operand by
Logan Chien2833ffb2018-10-09 10:03:24 +0800161/// the corresponding value in the second operand.
162///
163/// \headerfile <x86intrin.h>
164///
Logan Chien55afb0a2018-10-15 10:42:14 +0800165/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800166///
167/// \param __a
168/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
169/// bits of this operand are used in the calculation.
170/// \param __b
171/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
172/// of this operand are used in the calculation.
173/// \returns A 128-bit vector of [4 x float] containing the quotients of the
174/// lower 32 bits of both operands. The upper 96 bits are copied from the
175/// upper 96 bits of the first source operand.
176static __inline__ __m128 __DEFAULT_FN_ATTRS
177_mm_div_ss(__m128 __a, __m128 __b)
178{
179 __a[0] /= __b[0];
180 return __a;
181}
182
Logan Chien55afb0a2018-10-15 10:42:14 +0800183/// Divides two 128-bit vectors of [4 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800184///
185/// \headerfile <x86intrin.h>
186///
Logan Chien55afb0a2018-10-15 10:42:14 +0800187/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800188///
189/// \param __a
190/// A 128-bit vector of [4 x float] containing the dividend.
191/// \param __b
192/// A 128-bit vector of [4 x float] containing the divisor.
193/// \returns A 128-bit vector of [4 x float] containing the quotients of both
194/// operands.
195static __inline__ __m128 __DEFAULT_FN_ATTRS
196_mm_div_ps(__m128 __a, __m128 __b)
197{
198 return (__m128)((__v4sf)__a / (__v4sf)__b);
199}
200
Logan Chien55afb0a2018-10-15 10:42:14 +0800201/// Calculates the square root of the value stored in the low-order bits
Logan Chien2833ffb2018-10-09 10:03:24 +0800202/// of a 128-bit vector of [4 x float].
203///
204/// \headerfile <x86intrin.h>
205///
Logan Chien55afb0a2018-10-15 10:42:14 +0800206/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800207///
208/// \param __a
209/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
210/// used in the calculation.
211/// \returns A 128-bit vector of [4 x float] containing the square root of the
212/// value in the low-order bits of the operand.
213static __inline__ __m128 __DEFAULT_FN_ATTRS
214_mm_sqrt_ss(__m128 __a)
215{
Logan Chien55afb0a2018-10-15 10:42:14 +0800216 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +0800217}
218
Logan Chien55afb0a2018-10-15 10:42:14 +0800219/// Calculates the square roots of the values stored in a 128-bit vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800220/// of [4 x float].
221///
222/// \headerfile <x86intrin.h>
223///
Logan Chien55afb0a2018-10-15 10:42:14 +0800224/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800225///
226/// \param __a
227/// A 128-bit vector of [4 x float].
228/// \returns A 128-bit vector of [4 x float] containing the square roots of the
229/// values in the operand.
230static __inline__ __m128 __DEFAULT_FN_ATTRS
231_mm_sqrt_ps(__m128 __a)
232{
233 return __builtin_ia32_sqrtps((__v4sf)__a);
234}
235
Logan Chien55afb0a2018-10-15 10:42:14 +0800236/// Calculates the approximate reciprocal of the value stored in the
Logan Chien2833ffb2018-10-09 10:03:24 +0800237/// low-order bits of a 128-bit vector of [4 x float].
238///
239/// \headerfile <x86intrin.h>
240///
Logan Chien55afb0a2018-10-15 10:42:14 +0800241/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800242///
243/// \param __a
244/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
245/// used in the calculation.
246/// \returns A 128-bit vector of [4 x float] containing the approximate
247/// reciprocal of the value in the low-order bits of the operand.
248static __inline__ __m128 __DEFAULT_FN_ATTRS
249_mm_rcp_ss(__m128 __a)
250{
Logan Chien55afb0a2018-10-15 10:42:14 +0800251 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +0800252}
253
Logan Chien55afb0a2018-10-15 10:42:14 +0800254/// Calculates the approximate reciprocals of the values stored in a
Logan Chien2833ffb2018-10-09 10:03:24 +0800255/// 128-bit vector of [4 x float].
256///
257/// \headerfile <x86intrin.h>
258///
Logan Chien55afb0a2018-10-15 10:42:14 +0800259/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800260///
261/// \param __a
262/// A 128-bit vector of [4 x float].
263/// \returns A 128-bit vector of [4 x float] containing the approximate
264/// reciprocals of the values in the operand.
265static __inline__ __m128 __DEFAULT_FN_ATTRS
266_mm_rcp_ps(__m128 __a)
267{
Logan Chien55afb0a2018-10-15 10:42:14 +0800268 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +0800269}
270
Logan Chien55afb0a2018-10-15 10:42:14 +0800271/// Calculates the approximate reciprocal of the square root of the value
Logan Chien2833ffb2018-10-09 10:03:24 +0800272/// stored in the low-order bits of a 128-bit vector of [4 x float].
273///
274/// \headerfile <x86intrin.h>
275///
Logan Chien55afb0a2018-10-15 10:42:14 +0800276/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800277///
278/// \param __a
279/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
280/// used in the calculation.
281/// \returns A 128-bit vector of [4 x float] containing the approximate
282/// reciprocal of the square root of the value in the low-order bits of the
283/// operand.
284static __inline__ __m128 __DEFAULT_FN_ATTRS
285_mm_rsqrt_ss(__m128 __a)
286{
Logan Chien55afb0a2018-10-15 10:42:14 +0800287 return __builtin_ia32_rsqrtss((__v4sf)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +0800288}
289
Logan Chien55afb0a2018-10-15 10:42:14 +0800290/// Calculates the approximate reciprocals of the square roots of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800291/// values stored in a 128-bit vector of [4 x float].
292///
293/// \headerfile <x86intrin.h>
294///
Logan Chien55afb0a2018-10-15 10:42:14 +0800295/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800296///
297/// \param __a
298/// A 128-bit vector of [4 x float].
299/// \returns A 128-bit vector of [4 x float] containing the approximate
300/// reciprocals of the square roots of the values in the operand.
301static __inline__ __m128 __DEFAULT_FN_ATTRS
302_mm_rsqrt_ps(__m128 __a)
303{
304 return __builtin_ia32_rsqrtps((__v4sf)__a);
305}
306
Logan Chien55afb0a2018-10-15 10:42:14 +0800307/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800308/// operands and returns the lesser value in the low-order bits of the
309/// vector of [4 x float].
310///
311/// \headerfile <x86intrin.h>
312///
Logan Chien55afb0a2018-10-15 10:42:14 +0800313/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800314///
315/// \param __a
316/// A 128-bit vector of [4 x float] containing one of the operands. The lower
317/// 32 bits of this operand are used in the comparison.
318/// \param __b
319/// A 128-bit vector of [4 x float] containing one of the operands. The lower
320/// 32 bits of this operand are used in the comparison.
321/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
322/// minimum value between both operands. The upper 96 bits are copied from
323/// the upper 96 bits of the first source operand.
324static __inline__ __m128 __DEFAULT_FN_ATTRS
325_mm_min_ss(__m128 __a, __m128 __b)
326{
327 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
328}
329
Logan Chien55afb0a2018-10-15 10:42:14 +0800330/// Compares two 128-bit vectors of [4 x float] and returns the lesser
331/// of each pair of values.
Logan Chien2833ffb2018-10-09 10:03:24 +0800332///
333/// \headerfile <x86intrin.h>
334///
Logan Chien55afb0a2018-10-15 10:42:14 +0800335/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800336///
337/// \param __a
338/// A 128-bit vector of [4 x float] containing one of the operands.
339/// \param __b
340/// A 128-bit vector of [4 x float] containing one of the operands.
341/// \returns A 128-bit vector of [4 x float] containing the minimum values
342/// between both operands.
343static __inline__ __m128 __DEFAULT_FN_ATTRS
344_mm_min_ps(__m128 __a, __m128 __b)
345{
346 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
347}
348
Logan Chien55afb0a2018-10-15 10:42:14 +0800349/// Compares two 32-bit float values in the low-order bits of both
350/// operands and returns the greater value in the low-order bits of a 128-bit
351/// vector of [4 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800352///
353/// \headerfile <x86intrin.h>
354///
Logan Chien55afb0a2018-10-15 10:42:14 +0800355/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800356///
357/// \param __a
358/// A 128-bit vector of [4 x float] containing one of the operands. The lower
359/// 32 bits of this operand are used in the comparison.
360/// \param __b
361/// A 128-bit vector of [4 x float] containing one of the operands. The lower
362/// 32 bits of this operand are used in the comparison.
363/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
364/// maximum value between both operands. The upper 96 bits are copied from
365/// the upper 96 bits of the first source operand.
366static __inline__ __m128 __DEFAULT_FN_ATTRS
367_mm_max_ss(__m128 __a, __m128 __b)
368{
369 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
370}
371
Logan Chien55afb0a2018-10-15 10:42:14 +0800372/// Compares two 128-bit vectors of [4 x float] and returns the greater
Logan Chien2833ffb2018-10-09 10:03:24 +0800373/// of each pair of values.
374///
375/// \headerfile <x86intrin.h>
376///
Logan Chien55afb0a2018-10-15 10:42:14 +0800377/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800378///
379/// \param __a
380/// A 128-bit vector of [4 x float] containing one of the operands.
381/// \param __b
382/// A 128-bit vector of [4 x float] containing one of the operands.
383/// \returns A 128-bit vector of [4 x float] containing the maximum values
384/// between both operands.
385static __inline__ __m128 __DEFAULT_FN_ATTRS
386_mm_max_ps(__m128 __a, __m128 __b)
387{
388 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
389}
390
Logan Chien55afb0a2018-10-15 10:42:14 +0800391/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800392///
393/// \headerfile <x86intrin.h>
394///
Logan Chien55afb0a2018-10-15 10:42:14 +0800395/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800396///
397/// \param __a
398/// A 128-bit vector containing one of the source operands.
399/// \param __b
400/// A 128-bit vector containing one of the source operands.
401/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
402/// values between both operands.
403static __inline__ __m128 __DEFAULT_FN_ATTRS
404_mm_and_ps(__m128 __a, __m128 __b)
405{
406 return (__m128)((__v4su)__a & (__v4su)__b);
407}
408
Logan Chien55afb0a2018-10-15 10:42:14 +0800409/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
Logan Chien2833ffb2018-10-09 10:03:24 +0800410/// the one's complement of the values contained in the first source
411/// operand.
412///
413/// \headerfile <x86intrin.h>
414///
Logan Chien55afb0a2018-10-15 10:42:14 +0800415/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800416///
417/// \param __a
418/// A 128-bit vector of [4 x float] containing the first source operand. The
419/// one's complement of this value is used in the bitwise AND.
420/// \param __b
421/// A 128-bit vector of [4 x float] containing the second source operand.
422/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
423/// one's complement of the first operand and the values in the second
424/// operand.
425static __inline__ __m128 __DEFAULT_FN_ATTRS
426_mm_andnot_ps(__m128 __a, __m128 __b)
427{
428 return (__m128)(~(__v4su)__a & (__v4su)__b);
429}
430
Logan Chien55afb0a2018-10-15 10:42:14 +0800431/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800432///
433/// \headerfile <x86intrin.h>
434///
Logan Chien55afb0a2018-10-15 10:42:14 +0800435/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800436///
437/// \param __a
438/// A 128-bit vector of [4 x float] containing one of the source operands.
439/// \param __b
440/// A 128-bit vector of [4 x float] containing one of the source operands.
441/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
442/// values between both operands.
443static __inline__ __m128 __DEFAULT_FN_ATTRS
444_mm_or_ps(__m128 __a, __m128 __b)
445{
446 return (__m128)((__v4su)__a | (__v4su)__b);
447}
448
Logan Chien55afb0a2018-10-15 10:42:14 +0800449/// Performs a bitwise exclusive OR of two 128-bit vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +0800450/// [4 x float].
451///
452/// \headerfile <x86intrin.h>
453///
Logan Chien55afb0a2018-10-15 10:42:14 +0800454/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800455///
456/// \param __a
457/// A 128-bit vector of [4 x float] containing one of the source operands.
458/// \param __b
459/// A 128-bit vector of [4 x float] containing one of the source operands.
460/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
461/// of the values between both operands.
462static __inline__ __m128 __DEFAULT_FN_ATTRS
463_mm_xor_ps(__m128 __a, __m128 __b)
464{
465 return (__m128)((__v4su)__a ^ (__v4su)__b);
466}
467
Logan Chien55afb0a2018-10-15 10:42:14 +0800468/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800469/// operands for equality and returns the result of the comparison in the
470/// low-order bits of a vector [4 x float].
471///
472/// \headerfile <x86intrin.h>
473///
Logan Chien55afb0a2018-10-15 10:42:14 +0800474/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800475///
476/// \param __a
477/// A 128-bit vector of [4 x float] containing one of the operands. The lower
478/// 32 bits of this operand are used in the comparison.
479/// \param __b
480/// A 128-bit vector of [4 x float] containing one of the operands. The lower
481/// 32 bits of this operand are used in the comparison.
482/// \returns A 128-bit vector of [4 x float] containing the comparison results
483/// in the low-order bits.
484static __inline__ __m128 __DEFAULT_FN_ATTRS
485_mm_cmpeq_ss(__m128 __a, __m128 __b)
486{
487 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
488}
489
Logan Chien55afb0a2018-10-15 10:42:14 +0800490/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800491/// 128-bit vectors of [4 x float] for equality.
492///
493/// \headerfile <x86intrin.h>
494///
Logan Chien55afb0a2018-10-15 10:42:14 +0800495/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800496///
497/// \param __a
498/// A 128-bit vector of [4 x float].
499/// \param __b
500/// A 128-bit vector of [4 x float].
501/// \returns A 128-bit vector of [4 x float] containing the comparison results.
502static __inline__ __m128 __DEFAULT_FN_ATTRS
503_mm_cmpeq_ps(__m128 __a, __m128 __b)
504{
505 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
506}
507
Logan Chien55afb0a2018-10-15 10:42:14 +0800508/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800509/// operands to determine if the value in the first operand is less than the
510/// corresponding value in the second operand and returns the result of the
511/// comparison in the low-order bits of a vector of [4 x float].
512///
513/// \headerfile <x86intrin.h>
514///
Logan Chien55afb0a2018-10-15 10:42:14 +0800515/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800516///
517/// \param __a
518/// A 128-bit vector of [4 x float] containing one of the operands. The lower
519/// 32 bits of this operand are used in the comparison.
520/// \param __b
521/// A 128-bit vector of [4 x float] containing one of the operands. The lower
522/// 32 bits of this operand are used in the comparison.
523/// \returns A 128-bit vector of [4 x float] containing the comparison results
524/// in the low-order bits.
525static __inline__ __m128 __DEFAULT_FN_ATTRS
526_mm_cmplt_ss(__m128 __a, __m128 __b)
527{
528 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
529}
530
Logan Chien55afb0a2018-10-15 10:42:14 +0800531/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800532/// 128-bit vectors of [4 x float] to determine if the values in the first
533/// operand are less than those in the second operand.
534///
535/// \headerfile <x86intrin.h>
536///
Logan Chien55afb0a2018-10-15 10:42:14 +0800537/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800538///
539/// \param __a
540/// A 128-bit vector of [4 x float].
541/// \param __b
542/// A 128-bit vector of [4 x float].
543/// \returns A 128-bit vector of [4 x float] containing the comparison results.
544static __inline__ __m128 __DEFAULT_FN_ATTRS
545_mm_cmplt_ps(__m128 __a, __m128 __b)
546{
547 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
548}
549
Logan Chien55afb0a2018-10-15 10:42:14 +0800550/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800551/// operands to determine if the value in the first operand is less than or
552/// equal to the corresponding value in the second operand and returns the
553/// result of the comparison in the low-order bits of a vector of
554/// [4 x float].
555///
556/// \headerfile <x86intrin.h>
557///
Logan Chien55afb0a2018-10-15 10:42:14 +0800558/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800559///
560/// \param __a
561/// A 128-bit vector of [4 x float] containing one of the operands. The lower
562/// 32 bits of this operand are used in the comparison.
563/// \param __b
564/// A 128-bit vector of [4 x float] containing one of the operands. The lower
565/// 32 bits of this operand are used in the comparison.
566/// \returns A 128-bit vector of [4 x float] containing the comparison results
567/// in the low-order bits.
568static __inline__ __m128 __DEFAULT_FN_ATTRS
569_mm_cmple_ss(__m128 __a, __m128 __b)
570{
571 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
572}
573
Logan Chien55afb0a2018-10-15 10:42:14 +0800574/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800575/// 128-bit vectors of [4 x float] to determine if the values in the first
576/// operand are less than or equal to those in the second operand.
577///
578/// \headerfile <x86intrin.h>
579///
Logan Chien55afb0a2018-10-15 10:42:14 +0800580/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800581///
582/// \param __a
583/// A 128-bit vector of [4 x float].
584/// \param __b
585/// A 128-bit vector of [4 x float].
586/// \returns A 128-bit vector of [4 x float] containing the comparison results.
587static __inline__ __m128 __DEFAULT_FN_ATTRS
588_mm_cmple_ps(__m128 __a, __m128 __b)
589{
590 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
591}
592
Logan Chien55afb0a2018-10-15 10:42:14 +0800593/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800594/// operands to determine if the value in the first operand is greater than
595/// the corresponding value in the second operand and returns the result of
596/// the comparison in the low-order bits of a vector of [4 x float].
597///
598/// \headerfile <x86intrin.h>
599///
Logan Chien55afb0a2018-10-15 10:42:14 +0800600/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800601///
602/// \param __a
603/// A 128-bit vector of [4 x float] containing one of the operands. The lower
604/// 32 bits of this operand are used in the comparison.
605/// \param __b
606/// A 128-bit vector of [4 x float] containing one of the operands. The lower
607/// 32 bits of this operand are used in the comparison.
608/// \returns A 128-bit vector of [4 x float] containing the comparison results
609/// in the low-order bits.
610static __inline__ __m128 __DEFAULT_FN_ATTRS
611_mm_cmpgt_ss(__m128 __a, __m128 __b)
612{
613 return (__m128)__builtin_shufflevector((__v4sf)__a,
614 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
615 4, 1, 2, 3);
616}
617
Logan Chien55afb0a2018-10-15 10:42:14 +0800618/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800619/// 128-bit vectors of [4 x float] to determine if the values in the first
620/// operand are greater than those in the second operand.
621///
622/// \headerfile <x86intrin.h>
623///
Logan Chien55afb0a2018-10-15 10:42:14 +0800624/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800625///
626/// \param __a
627/// A 128-bit vector of [4 x float].
628/// \param __b
629/// A 128-bit vector of [4 x float].
630/// \returns A 128-bit vector of [4 x float] containing the comparison results.
631static __inline__ __m128 __DEFAULT_FN_ATTRS
632_mm_cmpgt_ps(__m128 __a, __m128 __b)
633{
634 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
635}
636
Logan Chien55afb0a2018-10-15 10:42:14 +0800637/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800638/// operands to determine if the value in the first operand is greater than
639/// or equal to the corresponding value in the second operand and returns
640/// the result of the comparison in the low-order bits of a vector of
641/// [4 x float].
642///
643/// \headerfile <x86intrin.h>
644///
Logan Chien55afb0a2018-10-15 10:42:14 +0800645/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800646///
647/// \param __a
648/// A 128-bit vector of [4 x float] containing one of the operands. The lower
649/// 32 bits of this operand are used in the comparison.
650/// \param __b
651/// A 128-bit vector of [4 x float] containing one of the operands. The lower
652/// 32 bits of this operand are used in the comparison.
653/// \returns A 128-bit vector of [4 x float] containing the comparison results
654/// in the low-order bits.
655static __inline__ __m128 __DEFAULT_FN_ATTRS
656_mm_cmpge_ss(__m128 __a, __m128 __b)
657{
658 return (__m128)__builtin_shufflevector((__v4sf)__a,
659 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
660 4, 1, 2, 3);
661}
662
Logan Chien55afb0a2018-10-15 10:42:14 +0800663/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800664/// 128-bit vectors of [4 x float] to determine if the values in the first
665/// operand are greater than or equal to those in the second operand.
666///
667/// \headerfile <x86intrin.h>
668///
Logan Chien55afb0a2018-10-15 10:42:14 +0800669/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800670///
671/// \param __a
672/// A 128-bit vector of [4 x float].
673/// \param __b
674/// A 128-bit vector of [4 x float].
675/// \returns A 128-bit vector of [4 x float] containing the comparison results.
676static __inline__ __m128 __DEFAULT_FN_ATTRS
677_mm_cmpge_ps(__m128 __a, __m128 __b)
678{
679 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
680}
681
Logan Chien55afb0a2018-10-15 10:42:14 +0800682/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800683/// operands for inequality and returns the result of the comparison in the
684/// low-order bits of a vector of [4 x float].
685///
686/// \headerfile <x86intrin.h>
687///
Logan Chien55afb0a2018-10-15 10:42:14 +0800688/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
689/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800690///
691/// \param __a
692/// A 128-bit vector of [4 x float] containing one of the operands. The lower
693/// 32 bits of this operand are used in the comparison.
694/// \param __b
695/// A 128-bit vector of [4 x float] containing one of the operands. The lower
696/// 32 bits of this operand are used in the comparison.
697/// \returns A 128-bit vector of [4 x float] containing the comparison results
698/// in the low-order bits.
699static __inline__ __m128 __DEFAULT_FN_ATTRS
700_mm_cmpneq_ss(__m128 __a, __m128 __b)
701{
702 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
703}
704
Logan Chien55afb0a2018-10-15 10:42:14 +0800705/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800706/// 128-bit vectors of [4 x float] for inequality.
707///
708/// \headerfile <x86intrin.h>
709///
Logan Chien55afb0a2018-10-15 10:42:14 +0800710/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
711/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800712///
713/// \param __a
714/// A 128-bit vector of [4 x float].
715/// \param __b
716/// A 128-bit vector of [4 x float].
717/// \returns A 128-bit vector of [4 x float] containing the comparison results.
718static __inline__ __m128 __DEFAULT_FN_ATTRS
719_mm_cmpneq_ps(__m128 __a, __m128 __b)
720{
721 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
722}
723
Logan Chien55afb0a2018-10-15 10:42:14 +0800724/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800725/// operands to determine if the value in the first operand is not less than
726/// the corresponding value in the second operand and returns the result of
727/// the comparison in the low-order bits of a vector of [4 x float].
728///
729/// \headerfile <x86intrin.h>
730///
Logan Chien55afb0a2018-10-15 10:42:14 +0800731/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
732/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800733///
734/// \param __a
735/// A 128-bit vector of [4 x float] containing one of the operands. The lower
736/// 32 bits of this operand are used in the comparison.
737/// \param __b
738/// A 128-bit vector of [4 x float] containing one of the operands. The lower
739/// 32 bits of this operand are used in the comparison.
740/// \returns A 128-bit vector of [4 x float] containing the comparison results
741/// in the low-order bits.
742static __inline__ __m128 __DEFAULT_FN_ATTRS
743_mm_cmpnlt_ss(__m128 __a, __m128 __b)
744{
745 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
746}
747
Logan Chien55afb0a2018-10-15 10:42:14 +0800748/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800749/// 128-bit vectors of [4 x float] to determine if the values in the first
750/// operand are not less than those in the second operand.
751///
752/// \headerfile <x86intrin.h>
753///
Logan Chien55afb0a2018-10-15 10:42:14 +0800754/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
755/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800756///
757/// \param __a
758/// A 128-bit vector of [4 x float].
759/// \param __b
760/// A 128-bit vector of [4 x float].
761/// \returns A 128-bit vector of [4 x float] containing the comparison results.
762static __inline__ __m128 __DEFAULT_FN_ATTRS
763_mm_cmpnlt_ps(__m128 __a, __m128 __b)
764{
765 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
766}
767
Logan Chien55afb0a2018-10-15 10:42:14 +0800768/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800769/// operands to determine if the value in the first operand is not less than
770/// or equal to the corresponding value in the second operand and returns
771/// the result of the comparison in the low-order bits of a vector of
772/// [4 x float].
773///
774/// \headerfile <x86intrin.h>
775///
Logan Chien55afb0a2018-10-15 10:42:14 +0800776/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
777/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800778///
779/// \param __a
780/// A 128-bit vector of [4 x float] containing one of the operands. The lower
781/// 32 bits of this operand are used in the comparison.
782/// \param __b
783/// A 128-bit vector of [4 x float] containing one of the operands. The lower
784/// 32 bits of this operand are used in the comparison.
785/// \returns A 128-bit vector of [4 x float] containing the comparison results
786/// in the low-order bits.
787static __inline__ __m128 __DEFAULT_FN_ATTRS
788_mm_cmpnle_ss(__m128 __a, __m128 __b)
789{
790 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
791}
792
Logan Chien55afb0a2018-10-15 10:42:14 +0800793/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800794/// 128-bit vectors of [4 x float] to determine if the values in the first
795/// operand are not less than or equal to those in the second operand.
796///
797/// \headerfile <x86intrin.h>
798///
Logan Chien55afb0a2018-10-15 10:42:14 +0800799/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
800/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800801///
802/// \param __a
803/// A 128-bit vector of [4 x float].
804/// \param __b
805/// A 128-bit vector of [4 x float].
806/// \returns A 128-bit vector of [4 x float] containing the comparison results.
807static __inline__ __m128 __DEFAULT_FN_ATTRS
808_mm_cmpnle_ps(__m128 __a, __m128 __b)
809{
810 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
811}
812
Logan Chien55afb0a2018-10-15 10:42:14 +0800813/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800814/// operands to determine if the value in the first operand is not greater
815/// than the corresponding value in the second operand and returns the
816/// result of the comparison in the low-order bits of a vector of
817/// [4 x float].
818///
819/// \headerfile <x86intrin.h>
820///
Logan Chien55afb0a2018-10-15 10:42:14 +0800821/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
822/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800823///
824/// \param __a
825/// A 128-bit vector of [4 x float] containing one of the operands. The lower
826/// 32 bits of this operand are used in the comparison.
827/// \param __b
828/// A 128-bit vector of [4 x float] containing one of the operands. The lower
829/// 32 bits of this operand are used in the comparison.
830/// \returns A 128-bit vector of [4 x float] containing the comparison results
831/// in the low-order bits.
832static __inline__ __m128 __DEFAULT_FN_ATTRS
833_mm_cmpngt_ss(__m128 __a, __m128 __b)
834{
835 return (__m128)__builtin_shufflevector((__v4sf)__a,
836 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
837 4, 1, 2, 3);
838}
839
Logan Chien55afb0a2018-10-15 10:42:14 +0800840/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800841/// 128-bit vectors of [4 x float] to determine if the values in the first
842/// operand are not greater than those in the second operand.
843///
844/// \headerfile <x86intrin.h>
845///
Logan Chien55afb0a2018-10-15 10:42:14 +0800846/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
847/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800848///
849/// \param __a
850/// A 128-bit vector of [4 x float].
851/// \param __b
852/// A 128-bit vector of [4 x float].
853/// \returns A 128-bit vector of [4 x float] containing the comparison results.
854static __inline__ __m128 __DEFAULT_FN_ATTRS
855_mm_cmpngt_ps(__m128 __a, __m128 __b)
856{
857 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
858}
859
Logan Chien55afb0a2018-10-15 10:42:14 +0800860/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800861/// operands to determine if the value in the first operand is not greater
862/// than or equal to the corresponding value in the second operand and
863/// returns the result of the comparison in the low-order bits of a vector
864/// of [4 x float].
865///
866/// \headerfile <x86intrin.h>
867///
Logan Chien55afb0a2018-10-15 10:42:14 +0800868/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
869/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800870///
871/// \param __a
872/// A 128-bit vector of [4 x float] containing one of the operands. The lower
873/// 32 bits of this operand are used in the comparison.
874/// \param __b
875/// A 128-bit vector of [4 x float] containing one of the operands. The lower
876/// 32 bits of this operand are used in the comparison.
877/// \returns A 128-bit vector of [4 x float] containing the comparison results
878/// in the low-order bits.
879static __inline__ __m128 __DEFAULT_FN_ATTRS
880_mm_cmpnge_ss(__m128 __a, __m128 __b)
881{
882 return (__m128)__builtin_shufflevector((__v4sf)__a,
883 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
884 4, 1, 2, 3);
885}
886
Logan Chien55afb0a2018-10-15 10:42:14 +0800887/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800888/// 128-bit vectors of [4 x float] to determine if the values in the first
889/// operand are not greater than or equal to those in the second operand.
890///
891/// \headerfile <x86intrin.h>
892///
Logan Chien55afb0a2018-10-15 10:42:14 +0800893/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
894/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800895///
896/// \param __a
897/// A 128-bit vector of [4 x float].
898/// \param __b
899/// A 128-bit vector of [4 x float].
900/// \returns A 128-bit vector of [4 x float] containing the comparison results.
901static __inline__ __m128 __DEFAULT_FN_ATTRS
902_mm_cmpnge_ps(__m128 __a, __m128 __b)
903{
904 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
905}
906
Logan Chien55afb0a2018-10-15 10:42:14 +0800907/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800908/// operands to determine if the value in the first operand is ordered with
909/// respect to the corresponding value in the second operand and returns the
910/// result of the comparison in the low-order bits of a vector of
911/// [4 x float].
912///
913/// \headerfile <x86intrin.h>
914///
Logan Chien55afb0a2018-10-15 10:42:14 +0800915/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
916/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800917///
918/// \param __a
919/// A 128-bit vector of [4 x float] containing one of the operands. The lower
920/// 32 bits of this operand are used in the comparison.
921/// \param __b
922/// A 128-bit vector of [4 x float] containing one of the operands. The lower
923/// 32 bits of this operand are used in the comparison.
924/// \returns A 128-bit vector of [4 x float] containing the comparison results
925/// in the low-order bits.
926static __inline__ __m128 __DEFAULT_FN_ATTRS
927_mm_cmpord_ss(__m128 __a, __m128 __b)
928{
929 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
930}
931
Logan Chien55afb0a2018-10-15 10:42:14 +0800932/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800933/// 128-bit vectors of [4 x float] to determine if the values in the first
934/// operand are ordered with respect to those in the second operand.
935///
936/// \headerfile <x86intrin.h>
937///
Logan Chien55afb0a2018-10-15 10:42:14 +0800938/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
939/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800940///
941/// \param __a
942/// A 128-bit vector of [4 x float].
943/// \param __b
944/// A 128-bit vector of [4 x float].
945/// \returns A 128-bit vector of [4 x float] containing the comparison results.
946static __inline__ __m128 __DEFAULT_FN_ATTRS
947_mm_cmpord_ps(__m128 __a, __m128 __b)
948{
949 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
950}
951
Logan Chien55afb0a2018-10-15 10:42:14 +0800952/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800953/// operands to determine if the value in the first operand is unordered
954/// with respect to the corresponding value in the second operand and
955/// returns the result of the comparison in the low-order bits of a vector
956/// of [4 x float].
957///
958/// \headerfile <x86intrin.h>
959///
Logan Chien55afb0a2018-10-15 10:42:14 +0800960/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
961/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800962///
963/// \param __a
964/// A 128-bit vector of [4 x float] containing one of the operands. The lower
965/// 32 bits of this operand are used in the comparison.
966/// \param __b
967/// A 128-bit vector of [4 x float] containing one of the operands. The lower
968/// 32 bits of this operand are used in the comparison.
969/// \returns A 128-bit vector of [4 x float] containing the comparison results
970/// in the low-order bits.
971static __inline__ __m128 __DEFAULT_FN_ATTRS
972_mm_cmpunord_ss(__m128 __a, __m128 __b)
973{
974 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
975}
976
Logan Chien55afb0a2018-10-15 10:42:14 +0800977/// Compares each of the corresponding 32-bit float values of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800978/// 128-bit vectors of [4 x float] to determine if the values in the first
979/// operand are unordered with respect to those in the second operand.
980///
981/// \headerfile <x86intrin.h>
982///
Logan Chien55afb0a2018-10-15 10:42:14 +0800983/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
984/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +0800985///
986/// \param __a
987/// A 128-bit vector of [4 x float].
988/// \param __b
989/// A 128-bit vector of [4 x float].
990/// \returns A 128-bit vector of [4 x float] containing the comparison results.
991static __inline__ __m128 __DEFAULT_FN_ATTRS
992_mm_cmpunord_ps(__m128 __a, __m128 __b)
993{
994 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
995}
996
Logan Chien55afb0a2018-10-15 10:42:14 +0800997/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +0800998/// operands for equality and returns the result of the comparison.
999///
Logan Chien55afb0a2018-10-15 10:42:14 +08001000/// If either of the two lower 32-bit values is NaN, 0 is returned.
1001///
Logan Chien2833ffb2018-10-09 10:03:24 +08001002/// \headerfile <x86intrin.h>
1003///
Logan Chien55afb0a2018-10-15 10:42:14 +08001004/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1005/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001006///
1007/// \param __a
1008/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009/// used in the comparison.
1010/// \param __b
1011/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1012/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001013/// \returns An integer containing the comparison results. If either of the
1014/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001015static __inline__ int __DEFAULT_FN_ATTRS
1016_mm_comieq_ss(__m128 __a, __m128 __b)
1017{
1018 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1019}
1020
Logan Chien55afb0a2018-10-15 10:42:14 +08001021/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +08001022/// operands to determine if the first operand is less than the second
1023/// operand and returns the result of the comparison.
1024///
Logan Chien55afb0a2018-10-15 10:42:14 +08001025/// If either of the two lower 32-bit values is NaN, 0 is returned.
1026///
Logan Chien2833ffb2018-10-09 10:03:24 +08001027/// \headerfile <x86intrin.h>
1028///
Logan Chien55afb0a2018-10-15 10:42:14 +08001029/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1030/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001031///
1032/// \param __a
1033/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1034/// used in the comparison.
1035/// \param __b
1036/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1037/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001038/// \returns An integer containing the comparison results. If either of the two
1039/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001040static __inline__ int __DEFAULT_FN_ATTRS
1041_mm_comilt_ss(__m128 __a, __m128 __b)
1042{
1043 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1044}
1045
Logan Chien55afb0a2018-10-15 10:42:14 +08001046/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +08001047/// operands to determine if the first operand is less than or equal to the
1048/// second operand and returns the result of the comparison.
1049///
Logan Chien55afb0a2018-10-15 10:42:14 +08001050/// If either of the two lower 32-bit values is NaN, 0 is returned.
1051///
Logan Chien2833ffb2018-10-09 10:03:24 +08001052/// \headerfile <x86intrin.h>
1053///
Logan Chien55afb0a2018-10-15 10:42:14 +08001054/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001055///
1056/// \param __a
1057/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1058/// used in the comparison.
1059/// \param __b
1060/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1061/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001062/// \returns An integer containing the comparison results. If either of the two
1063/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001064static __inline__ int __DEFAULT_FN_ATTRS
1065_mm_comile_ss(__m128 __a, __m128 __b)
1066{
1067 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1068}
1069
Logan Chien55afb0a2018-10-15 10:42:14 +08001070/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +08001071/// operands to determine if the first operand is greater than the second
1072/// operand and returns the result of the comparison.
1073///
Logan Chien55afb0a2018-10-15 10:42:14 +08001074/// If either of the two lower 32-bit values is NaN, 0 is returned.
1075///
Logan Chien2833ffb2018-10-09 10:03:24 +08001076/// \headerfile <x86intrin.h>
1077///
Logan Chien55afb0a2018-10-15 10:42:14 +08001078/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001079///
1080/// \param __a
1081/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1082/// used in the comparison.
1083/// \param __b
1084/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001086/// \returns An integer containing the comparison results. If either of the
1087/// two lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001088static __inline__ int __DEFAULT_FN_ATTRS
1089_mm_comigt_ss(__m128 __a, __m128 __b)
1090{
1091 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1092}
1093
Logan Chien55afb0a2018-10-15 10:42:14 +08001094/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +08001095/// operands to determine if the first operand is greater than or equal to
1096/// the second operand and returns the result of the comparison.
1097///
Logan Chien55afb0a2018-10-15 10:42:14 +08001098/// If either of the two lower 32-bit values is NaN, 0 is returned.
1099///
Logan Chien2833ffb2018-10-09 10:03:24 +08001100/// \headerfile <x86intrin.h>
1101///
Logan Chien55afb0a2018-10-15 10:42:14 +08001102/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001103///
1104/// \param __a
1105/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106/// used in the comparison.
1107/// \param __b
1108/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001110/// \returns An integer containing the comparison results. If either of the two
1111/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001112static __inline__ int __DEFAULT_FN_ATTRS
1113_mm_comige_ss(__m128 __a, __m128 __b)
1114{
1115 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1116}
1117
Logan Chien55afb0a2018-10-15 10:42:14 +08001118/// Compares two 32-bit float values in the low-order bits of both
Logan Chien2833ffb2018-10-09 10:03:24 +08001119/// operands to determine if the first operand is not equal to the second
1120/// operand and returns the result of the comparison.
1121///
Logan Chien55afb0a2018-10-15 10:42:14 +08001122/// If either of the two lower 32-bit values is NaN, 1 is returned.
1123///
Logan Chien2833ffb2018-10-09 10:03:24 +08001124/// \headerfile <x86intrin.h>
1125///
Logan Chien55afb0a2018-10-15 10:42:14 +08001126/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001127///
1128/// \param __a
1129/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130/// used in the comparison.
1131/// \param __b
1132/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1133/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001134/// \returns An integer containing the comparison results. If either of the
1135/// two lower 32-bit values is NaN, 1 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001136static __inline__ int __DEFAULT_FN_ATTRS
1137_mm_comineq_ss(__m128 __a, __m128 __b)
1138{
1139 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1140}
1141
Logan Chien55afb0a2018-10-15 10:42:14 +08001142/// Performs an unordered comparison of two 32-bit float values using
Logan Chien2833ffb2018-10-09 10:03:24 +08001143/// the low-order bits of both operands to determine equality and returns
1144/// the result of the comparison.
1145///
Logan Chien55afb0a2018-10-15 10:42:14 +08001146/// If either of the two lower 32-bit values is NaN, 0 is returned.
1147///
Logan Chien2833ffb2018-10-09 10:03:24 +08001148/// \headerfile <x86intrin.h>
1149///
Logan Chien55afb0a2018-10-15 10:42:14 +08001150/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001151///
1152/// \param __a
1153/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154/// used in the comparison.
1155/// \param __b
1156/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1157/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001158/// \returns An integer containing the comparison results. If either of the two
1159/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001160static __inline__ int __DEFAULT_FN_ATTRS
1161_mm_ucomieq_ss(__m128 __a, __m128 __b)
1162{
1163 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1164}
1165
Logan Chien55afb0a2018-10-15 10:42:14 +08001166/// Performs an unordered comparison of two 32-bit float values using
Logan Chien2833ffb2018-10-09 10:03:24 +08001167/// the low-order bits of both operands to determine if the first operand is
1168/// less than the second operand and returns the result of the comparison.
1169///
Logan Chien55afb0a2018-10-15 10:42:14 +08001170/// If either of the two lower 32-bit values is NaN, 0 is returned.
1171///
Logan Chien2833ffb2018-10-09 10:03:24 +08001172/// \headerfile <x86intrin.h>
1173///
Logan Chien55afb0a2018-10-15 10:42:14 +08001174/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001175///
1176/// \param __a
1177/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178/// used in the comparison.
1179/// \param __b
1180/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1181/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001182/// \returns An integer containing the comparison results. If either of the two
1183/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001184static __inline__ int __DEFAULT_FN_ATTRS
1185_mm_ucomilt_ss(__m128 __a, __m128 __b)
1186{
1187 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1188}
1189
Logan Chien55afb0a2018-10-15 10:42:14 +08001190/// Performs an unordered comparison of two 32-bit float values using
1191/// the low-order bits of both operands to determine if the first operand is
1192/// less than or equal to the second operand and returns the result of the
1193/// comparison.
1194///
1195/// If either of the two lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001196///
1197/// \headerfile <x86intrin.h>
1198///
Logan Chien55afb0a2018-10-15 10:42:14 +08001199/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001200///
1201/// \param __a
1202/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203/// used in the comparison.
1204/// \param __b
1205/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001207/// \returns An integer containing the comparison results. If either of the two
1208/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001209static __inline__ int __DEFAULT_FN_ATTRS
1210_mm_ucomile_ss(__m128 __a, __m128 __b)
1211{
1212 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1213}
1214
Logan Chien55afb0a2018-10-15 10:42:14 +08001215/// Performs an unordered comparison of two 32-bit float values using
1216/// the low-order bits of both operands to determine if the first operand is
1217/// greater than the second operand and returns the result of the
Logan Chien2833ffb2018-10-09 10:03:24 +08001218/// comparison.
1219///
Logan Chien55afb0a2018-10-15 10:42:14 +08001220/// If either of the two lower 32-bit values is NaN, 0 is returned.
1221///
Logan Chien2833ffb2018-10-09 10:03:24 +08001222/// \headerfile <x86intrin.h>
1223///
Logan Chien55afb0a2018-10-15 10:42:14 +08001224/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001225///
1226/// \param __a
1227/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1228/// used in the comparison.
1229/// \param __b
1230/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001232/// \returns An integer containing the comparison results. If either of the two
1233/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001234static __inline__ int __DEFAULT_FN_ATTRS
1235_mm_ucomigt_ss(__m128 __a, __m128 __b)
1236{
1237 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1238}
1239
Logan Chien55afb0a2018-10-15 10:42:14 +08001240/// Performs an unordered comparison of two 32-bit float values using
Logan Chien2833ffb2018-10-09 10:03:24 +08001241/// the low-order bits of both operands to determine if the first operand is
1242/// greater than or equal to the second operand and returns the result of
1243/// the comparison.
1244///
Logan Chien55afb0a2018-10-15 10:42:14 +08001245/// If either of the two lower 32-bit values is NaN, 0 is returned.
1246///
Logan Chien2833ffb2018-10-09 10:03:24 +08001247/// \headerfile <x86intrin.h>
1248///
Logan Chien55afb0a2018-10-15 10:42:14 +08001249/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001250///
1251/// \param __a
1252/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253/// used in the comparison.
1254/// \param __b
1255/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001257/// \returns An integer containing the comparison results. If either of the two
1258/// lower 32-bit values is NaN, 0 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001259static __inline__ int __DEFAULT_FN_ATTRS
1260_mm_ucomige_ss(__m128 __a, __m128 __b)
1261{
1262 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1263}
1264
Logan Chien55afb0a2018-10-15 10:42:14 +08001265/// Performs an unordered comparison of two 32-bit float values using
Logan Chien2833ffb2018-10-09 10:03:24 +08001266/// the low-order bits of both operands to determine inequality and returns
1267/// the result of the comparison.
1268///
Logan Chien55afb0a2018-10-15 10:42:14 +08001269/// If either of the two lower 32-bit values is NaN, 1 is returned.
1270///
Logan Chien2833ffb2018-10-09 10:03:24 +08001271/// \headerfile <x86intrin.h>
1272///
Logan Chien55afb0a2018-10-15 10:42:14 +08001273/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001274///
1275/// \param __a
1276/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277/// used in the comparison.
1278/// \param __b
1279/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280/// used in the comparison.
Logan Chien55afb0a2018-10-15 10:42:14 +08001281/// \returns An integer containing the comparison results. If either of the two
1282/// lower 32-bit values is NaN, 1 is returned.
Logan Chien2833ffb2018-10-09 10:03:24 +08001283static __inline__ int __DEFAULT_FN_ATTRS
1284_mm_ucomineq_ss(__m128 __a, __m128 __b)
1285{
1286 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1287}
1288
Logan Chien55afb0a2018-10-15 10:42:14 +08001289/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001290/// [4 x float] into a 32-bit integer.
1291///
1292/// \headerfile <x86intrin.h>
1293///
Logan Chien55afb0a2018-10-15 10:42:14 +08001294/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1295/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001296///
1297/// \param __a
1298/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299/// used in the conversion.
1300/// \returns A 32-bit integer containing the converted value.
1301static __inline__ int __DEFAULT_FN_ATTRS
1302_mm_cvtss_si32(__m128 __a)
1303{
1304 return __builtin_ia32_cvtss2si((__v4sf)__a);
1305}
1306
Logan Chien55afb0a2018-10-15 10:42:14 +08001307/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001308/// [4 x float] into a 32-bit integer.
1309///
1310/// \headerfile <x86intrin.h>
1311///
Logan Chien55afb0a2018-10-15 10:42:14 +08001312/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1313/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001314///
1315/// \param __a
1316/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1317/// used in the conversion.
1318/// \returns A 32-bit integer containing the converted value.
1319static __inline__ int __DEFAULT_FN_ATTRS
1320_mm_cvt_ss2si(__m128 __a)
1321{
1322 return _mm_cvtss_si32(__a);
1323}
1324
1325#ifdef __x86_64__
1326
Logan Chien55afb0a2018-10-15 10:42:14 +08001327/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001328/// [4 x float] into a 64-bit integer.
1329///
1330/// \headerfile <x86intrin.h>
1331///
Logan Chien55afb0a2018-10-15 10:42:14 +08001332/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1333/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001334///
1335/// \param __a
1336/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1337/// used in the conversion.
1338/// \returns A 64-bit integer containing the converted value.
1339static __inline__ long long __DEFAULT_FN_ATTRS
1340_mm_cvtss_si64(__m128 __a)
1341{
1342 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1343}
1344
1345#endif
1346
Logan Chien55afb0a2018-10-15 10:42:14 +08001347/// Converts two low-order float values in a 128-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001348/// [4 x float] into a 64-bit vector of [2 x i32].
1349///
1350/// \headerfile <x86intrin.h>
1351///
Logan Chien55afb0a2018-10-15 10:42:14 +08001352/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001353///
1354/// \param __a
1355/// A 128-bit vector of [4 x float].
1356/// \returns A 64-bit integer vector containing the converted values.
Logan Chien55afb0a2018-10-15 10:42:14 +08001357static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08001358_mm_cvtps_pi32(__m128 __a)
1359{
1360 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1361}
1362
Logan Chien55afb0a2018-10-15 10:42:14 +08001363/// Converts two low-order float values in a 128-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001364/// [4 x float] into a 64-bit vector of [2 x i32].
1365///
1366/// \headerfile <x86intrin.h>
1367///
Logan Chien55afb0a2018-10-15 10:42:14 +08001368/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001369///
1370/// \param __a
1371/// A 128-bit vector of [4 x float].
1372/// \returns A 64-bit integer vector containing the converted values.
Logan Chien55afb0a2018-10-15 10:42:14 +08001373static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08001374_mm_cvt_ps2pi(__m128 __a)
1375{
1376 return _mm_cvtps_pi32(__a);
1377}
1378
Logan Chien55afb0a2018-10-15 10:42:14 +08001379/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001380/// [4 x float] into a 32-bit integer, truncating the result when it is
1381/// inexact.
1382///
1383/// \headerfile <x86intrin.h>
1384///
Logan Chien55afb0a2018-10-15 10:42:14 +08001385/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1386/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001387///
1388/// \param __a
1389/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390/// used in the conversion.
1391/// \returns A 32-bit integer containing the converted value.
1392static __inline__ int __DEFAULT_FN_ATTRS
1393_mm_cvttss_si32(__m128 __a)
1394{
Logan Chien55afb0a2018-10-15 10:42:14 +08001395 return __builtin_ia32_cvttss2si((__v4sf)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +08001396}
1397
Logan Chien55afb0a2018-10-15 10:42:14 +08001398/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001399/// [4 x float] into a 32-bit integer, truncating the result when it is
1400/// inexact.
1401///
1402/// \headerfile <x86intrin.h>
1403///
Logan Chien55afb0a2018-10-15 10:42:14 +08001404/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1405/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001406///
1407/// \param __a
1408/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409/// used in the conversion.
1410/// \returns A 32-bit integer containing the converted value.
1411static __inline__ int __DEFAULT_FN_ATTRS
1412_mm_cvtt_ss2si(__m128 __a)
1413{
1414 return _mm_cvttss_si32(__a);
1415}
1416
Logan Chien55afb0a2018-10-15 10:42:14 +08001417#ifdef __x86_64__
1418/// Converts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001419/// [4 x float] into a 64-bit integer, truncating the result when it is
1420/// inexact.
1421///
1422/// \headerfile <x86intrin.h>
1423///
Logan Chien55afb0a2018-10-15 10:42:14 +08001424/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1425/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001426///
1427/// \param __a
1428/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1429/// used in the conversion.
1430/// \returns A 64-bit integer containing the converted value.
1431static __inline__ long long __DEFAULT_FN_ATTRS
1432_mm_cvttss_si64(__m128 __a)
1433{
Logan Chien55afb0a2018-10-15 10:42:14 +08001434 return __builtin_ia32_cvttss2si64((__v4sf)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +08001435}
Logan Chien55afb0a2018-10-15 10:42:14 +08001436#endif
Logan Chien2833ffb2018-10-09 10:03:24 +08001437
Logan Chien55afb0a2018-10-15 10:42:14 +08001438/// Converts two low-order float values in a 128-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001439/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1440/// when it is inexact.
1441///
1442/// \headerfile <x86intrin.h>
1443///
Logan Chien55afb0a2018-10-15 10:42:14 +08001444/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1445/// instructions.
Logan Chien2833ffb2018-10-09 10:03:24 +08001446///
1447/// \param __a
1448/// A 128-bit vector of [4 x float].
1449/// \returns A 64-bit integer vector containing the converted values.
Logan Chien55afb0a2018-10-15 10:42:14 +08001450static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08001451_mm_cvttps_pi32(__m128 __a)
1452{
1453 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1454}
1455
Logan Chien55afb0a2018-10-15 10:42:14 +08001456/// Converts two low-order float values in a 128-bit vector of [4 x
Logan Chien2833ffb2018-10-09 10:03:24 +08001457/// float] into a 64-bit vector of [2 x i32], truncating the result when it
1458/// is inexact.
1459///
1460/// \headerfile <x86intrin.h>
1461///
Logan Chien55afb0a2018-10-15 10:42:14 +08001462/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001463///
1464/// \param __a
1465/// A 128-bit vector of [4 x float].
1466/// \returns A 64-bit integer vector containing the converted values.
Logan Chien55afb0a2018-10-15 10:42:14 +08001467static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08001468_mm_cvtt_ps2pi(__m128 __a)
1469{
1470 return _mm_cvttps_pi32(__a);
1471}
1472
Logan Chien55afb0a2018-10-15 10:42:14 +08001473/// Converts a 32-bit signed integer value into a floating point value
Logan Chien2833ffb2018-10-09 10:03:24 +08001474/// and writes it to the lower 32 bits of the destination. The remaining
1475/// higher order elements of the destination vector are copied from the
1476/// corresponding elements in the first operand.
1477///
1478/// \headerfile <x86intrin.h>
1479///
Logan Chien55afb0a2018-10-15 10:42:14 +08001480/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001481///
1482/// \param __a
1483/// A 128-bit vector of [4 x float].
1484/// \param __b
1485/// A 32-bit signed integer operand containing the value to be converted.
1486/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487/// converted value of the second operand. The upper 96 bits are copied from
1488/// the upper 96 bits of the first operand.
1489static __inline__ __m128 __DEFAULT_FN_ATTRS
1490_mm_cvtsi32_ss(__m128 __a, int __b)
1491{
1492 __a[0] = __b;
1493 return __a;
1494}
1495
Logan Chien55afb0a2018-10-15 10:42:14 +08001496/// Converts a 32-bit signed integer value into a floating point value
Logan Chien2833ffb2018-10-09 10:03:24 +08001497/// and writes it to the lower 32 bits of the destination. The remaining
1498/// higher order elements of the destination are copied from the
1499/// corresponding elements in the first operand.
1500///
1501/// \headerfile <x86intrin.h>
1502///
Logan Chien55afb0a2018-10-15 10:42:14 +08001503/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001504///
1505/// \param __a
1506/// A 128-bit vector of [4 x float].
1507/// \param __b
1508/// A 32-bit signed integer operand containing the value to be converted.
1509/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1510/// converted value of the second operand. The upper 96 bits are copied from
1511/// the upper 96 bits of the first operand.
1512static __inline__ __m128 __DEFAULT_FN_ATTRS
1513_mm_cvt_si2ss(__m128 __a, int __b)
1514{
1515 return _mm_cvtsi32_ss(__a, __b);
1516}
1517
1518#ifdef __x86_64__
1519
Logan Chien55afb0a2018-10-15 10:42:14 +08001520/// Converts a 64-bit signed integer value into a floating point value
Logan Chien2833ffb2018-10-09 10:03:24 +08001521/// and writes it to the lower 32 bits of the destination. The remaining
1522/// higher order elements of the destination are copied from the
1523/// corresponding elements in the first operand.
1524///
1525/// \headerfile <x86intrin.h>
1526///
Logan Chien55afb0a2018-10-15 10:42:14 +08001527/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001528///
1529/// \param __a
1530/// A 128-bit vector of [4 x float].
1531/// \param __b
1532/// A 64-bit signed integer operand containing the value to be converted.
1533/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534/// converted value of the second operand. The upper 96 bits are copied from
1535/// the upper 96 bits of the first operand.
1536static __inline__ __m128 __DEFAULT_FN_ATTRS
1537_mm_cvtsi64_ss(__m128 __a, long long __b)
1538{
1539 __a[0] = __b;
1540 return __a;
1541}
1542
1543#endif
1544
Logan Chien55afb0a2018-10-15 10:42:14 +08001545/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien2833ffb2018-10-09 10:03:24 +08001546/// floating point values and writes them to the lower 64-bits of the
1547/// destination. The remaining higher order elements of the destination are
1548/// copied from the corresponding elements in the first operand.
1549///
1550/// \headerfile <x86intrin.h>
1551///
Logan Chien55afb0a2018-10-15 10:42:14 +08001552/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001553///
1554/// \param __a
1555/// A 128-bit vector of [4 x float].
1556/// \param __b
1557/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558/// and written to the corresponding low-order elements in the destination.
1559/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560/// converted value of the second operand. The upper 64 bits are copied from
1561/// the upper 64 bits of the first operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08001562static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08001563_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1564{
1565 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1566}
1567
Logan Chien55afb0a2018-10-15 10:42:14 +08001568/// Converts two elements of a 64-bit vector of [2 x i32] into two
Logan Chien2833ffb2018-10-09 10:03:24 +08001569/// floating point values and writes them to the lower 64-bits of the
1570/// destination. The remaining higher order elements of the destination are
1571/// copied from the corresponding elements in the first operand.
1572///
1573/// \headerfile <x86intrin.h>
1574///
Logan Chien55afb0a2018-10-15 10:42:14 +08001575/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001576///
1577/// \param __a
1578/// A 128-bit vector of [4 x float].
1579/// \param __b
1580/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581/// and written to the corresponding low-order elements in the destination.
1582/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583/// converted value from the second operand. The upper 64 bits are copied
1584/// from the upper 64 bits of the first operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08001585static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08001586_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1587{
1588 return _mm_cvtpi32_ps(__a, __b);
1589}
1590
Logan Chien55afb0a2018-10-15 10:42:14 +08001591/// Extracts a float value contained in the lower 32 bits of a vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08001592/// [4 x float].
1593///
1594/// \headerfile <x86intrin.h>
1595///
Logan Chien55afb0a2018-10-15 10:42:14 +08001596/// This intrinsic has no corresponding instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001597///
1598/// \param __a
1599/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600/// used in the extraction.
1601/// \returns A 32-bit float containing the extracted value.
1602static __inline__ float __DEFAULT_FN_ATTRS
1603_mm_cvtss_f32(__m128 __a)
1604{
1605 return __a[0];
1606}
1607
Logan Chien55afb0a2018-10-15 10:42:14 +08001608/// Loads two packed float values from the address \a __p into the
Logan Chien2833ffb2018-10-09 10:03:24 +08001609/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1610/// are copied from the low-order bits of the first operand.
1611///
1612/// \headerfile <x86intrin.h>
1613///
Logan Chien55afb0a2018-10-15 10:42:14 +08001614/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001615///
1616/// \param __a
1617/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1618/// of the destination.
1619/// \param __p
1620/// A pointer to two packed float values. Bits [63:0] are written to bits
1621/// [127:64] of the destination.
1622/// \returns A 128-bit vector of [4 x float] containing the moved values.
1623static __inline__ __m128 __DEFAULT_FN_ATTRS
1624_mm_loadh_pi(__m128 __a, const __m64 *__p)
1625{
1626 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1627 struct __mm_loadh_pi_struct {
1628 __mm_loadh_pi_v2f32 __u;
1629 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07001630 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
Logan Chien2833ffb2018-10-09 10:03:24 +08001631 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1632 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1633}
1634
Logan Chien55afb0a2018-10-15 10:42:14 +08001635/// Loads two packed float values from the address \a __p into the
1636/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1637/// are copied from the high-order bits of the first operand.
Logan Chien2833ffb2018-10-09 10:03:24 +08001638///
1639/// \headerfile <x86intrin.h>
1640///
Logan Chien55afb0a2018-10-15 10:42:14 +08001641/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001642///
1643/// \param __a
1644/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1645/// [127:64] of the destination.
1646/// \param __p
1647/// A pointer to two packed float values. Bits [63:0] are written to bits
1648/// [63:0] of the destination.
1649/// \returns A 128-bit vector of [4 x float] containing the moved values.
1650static __inline__ __m128 __DEFAULT_FN_ATTRS
1651_mm_loadl_pi(__m128 __a, const __m64 *__p)
1652{
1653 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1654 struct __mm_loadl_pi_struct {
1655 __mm_loadl_pi_v2f32 __u;
1656 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07001657 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
Logan Chien2833ffb2018-10-09 10:03:24 +08001658 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1659 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1660}
1661
Logan Chien55afb0a2018-10-15 10:42:14 +08001662/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien2833ffb2018-10-09 10:03:24 +08001663/// 32 bits of the vector are initialized with the single-precision
1664/// floating-point value loaded from a specified memory location. The upper
1665/// 96 bits are set to zero.
1666///
1667/// \headerfile <x86intrin.h>
1668///
Logan Chien55afb0a2018-10-15 10:42:14 +08001669/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001670///
1671/// \param __p
1672/// A pointer to a 32-bit memory location containing a single-precision
1673/// floating-point value.
1674/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675/// lower 32 bits contain the value loaded from the memory location. The
1676/// upper 96 bits are set to zero.
1677static __inline__ __m128 __DEFAULT_FN_ATTRS
1678_mm_load_ss(const float *__p)
1679{
1680 struct __mm_load_ss_struct {
1681 float __u;
1682 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07001683 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
Logan Chien55afb0a2018-10-15 10:42:14 +08001684 return __extension__ (__m128){ __u, 0, 0, 0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08001685}
1686
Logan Chien55afb0a2018-10-15 10:42:14 +08001687/// Loads a 32-bit float value and duplicates it to all four vector
Logan Chien2833ffb2018-10-09 10:03:24 +08001688/// elements of a 128-bit vector of [4 x float].
1689///
1690/// \headerfile <x86intrin.h>
1691///
Logan Chien55afb0a2018-10-15 10:42:14 +08001692/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
Logan Chien2833ffb2018-10-09 10:03:24 +08001693/// instruction.
1694///
1695/// \param __p
1696/// A pointer to a float value to be loaded and duplicated.
Logan Chien55afb0a2018-10-15 10:42:14 +08001697/// \returns A 128-bit vector of [4 x float] containing the loaded and
1698/// duplicated values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001699static __inline__ __m128 __DEFAULT_FN_ATTRS
1700_mm_load1_ps(const float *__p)
1701{
1702 struct __mm_load1_ps_struct {
1703 float __u;
1704 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07001705 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
Logan Chien55afb0a2018-10-15 10:42:14 +08001706 return __extension__ (__m128){ __u, __u, __u, __u };
Logan Chien2833ffb2018-10-09 10:03:24 +08001707}
1708
/* Alternate spelling of _mm_load1_ps(). */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1710
Logan Chien55afb0a2018-10-15 10:42:14 +08001711/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
Logan Chien2833ffb2018-10-09 10:03:24 +08001712/// memory location.
1713///
1714/// \headerfile <x86intrin.h>
1715///
Logan Chien55afb0a2018-10-15 10:42:14 +08001716/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001717///
1718/// \param __p
1719/// A pointer to a 128-bit memory location. The address of the memory
1720/// location has to be 128-bit aligned.
Logan Chien55afb0a2018-10-15 10:42:14 +08001721/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001722static __inline__ __m128 __DEFAULT_FN_ATTRS
1723_mm_load_ps(const float *__p)
1724{
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07001725 return *(const __m128*)__p;
Logan Chien2833ffb2018-10-09 10:03:24 +08001726}
1727
Logan Chien55afb0a2018-10-15 10:42:14 +08001728/// Loads a 128-bit floating-point vector of [4 x float] from an
Logan Chien2833ffb2018-10-09 10:03:24 +08001729/// unaligned memory location.
1730///
1731/// \headerfile <x86intrin.h>
1732///
Logan Chien55afb0a2018-10-15 10:42:14 +08001733/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001734///
1735/// \param __p
1736/// A pointer to a 128-bit memory location. The address of the memory
1737/// location does not have to be aligned.
1738/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1739static __inline__ __m128 __DEFAULT_FN_ATTRS
1740_mm_loadu_ps(const float *__p)
1741{
1742 struct __loadu_ps {
Logan Chiendbcf4122019-03-21 10:50:25 +08001743 __m128_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08001744 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07001745 return ((const struct __loadu_ps*)__p)->__v;
Logan Chien2833ffb2018-10-09 10:03:24 +08001746}
1747
Logan Chien55afb0a2018-10-15 10:42:14 +08001748/// Loads four packed float values, in reverse order, from an aligned
Logan Chien2833ffb2018-10-09 10:03:24 +08001749/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1750///
1751/// \headerfile <x86intrin.h>
1752///
Logan Chien55afb0a2018-10-15 10:42:14 +08001753/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien2833ffb2018-10-09 10:03:24 +08001754/// instruction.
1755///
1756/// \param __p
1757/// A pointer to a 128-bit memory location. The address of the memory
1758/// location has to be 128-bit aligned.
1759/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760/// in reverse order.
1761static __inline__ __m128 __DEFAULT_FN_ATTRS
1762_mm_loadr_ps(const float *__p)
1763{
1764 __m128 __a = _mm_load_ps(__p);
1765 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1766}
1767
Logan Chien55afb0a2018-10-15 10:42:14 +08001768/// Create a 128-bit vector of [4 x float] with undefined values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001769///
1770/// \headerfile <x86intrin.h>
1771///
1772/// This intrinsic has no corresponding instruction.
1773///
1774/// \returns A 128-bit vector of [4 x float] containing undefined values.
Logan Chien2833ffb2018-10-09 10:03:24 +08001775static __inline__ __m128 __DEFAULT_FN_ATTRS
1776_mm_undefined_ps(void)
1777{
1778 return (__m128)__builtin_ia32_undef128();
1779}
1780
Logan Chien55afb0a2018-10-15 10:42:14 +08001781/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien2833ffb2018-10-09 10:03:24 +08001782/// 32 bits of the vector are initialized with the specified single-precision
1783/// floating-point value. The upper 96 bits are set to zero.
1784///
1785/// \headerfile <x86intrin.h>
1786///
Logan Chien55afb0a2018-10-15 10:42:14 +08001787/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001788///
1789/// \param __w
1790/// A single-precision floating-point value used to initialize the lower 32
1791/// bits of the result.
1792/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793/// lower 32 bits contain the value provided in the source operand. The
1794/// upper 96 bits are set to zero.
1795static __inline__ __m128 __DEFAULT_FN_ATTRS
1796_mm_set_ss(float __w)
1797{
Logan Chien55afb0a2018-10-15 10:42:14 +08001798 return __extension__ (__m128){ __w, 0, 0, 0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08001799}
1800
Logan Chien55afb0a2018-10-15 10:42:14 +08001801/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien2833ffb2018-10-09 10:03:24 +08001802/// of the four single-precision floating-point vector elements set to the
1803/// specified single-precision floating-point value.
1804///
1805/// \headerfile <x86intrin.h>
1806///
Logan Chien55afb0a2018-10-15 10:42:14 +08001807/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001808///
1809/// \param __w
1810/// A single-precision floating-point value used to initialize each vector
1811/// element of the result.
1812/// \returns An initialized 128-bit floating-point vector of [4 x float].
1813static __inline__ __m128 __DEFAULT_FN_ATTRS
1814_mm_set1_ps(float __w)
1815{
Logan Chien55afb0a2018-10-15 10:42:14 +08001816 return __extension__ (__m128){ __w, __w, __w, __w };
Logan Chien2833ffb2018-10-09 10:03:24 +08001817}
1818
1819/* Microsoft specific. */
Logan Chien55afb0a2018-10-15 10:42:14 +08001820/// Constructs a 128-bit floating-point vector of [4 x float], with each
Logan Chien2833ffb2018-10-09 10:03:24 +08001821/// of the four single-precision floating-point vector elements set to the
1822/// specified single-precision floating-point value.
1823///
1824/// \headerfile <x86intrin.h>
1825///
Logan Chien55afb0a2018-10-15 10:42:14 +08001826/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001827///
1828/// \param __w
1829/// A single-precision floating-point value used to initialize each vector
1830/// element of the result.
1831/// \returns An initialized 128-bit floating-point vector of [4 x float].
1832static __inline__ __m128 __DEFAULT_FN_ATTRS
1833_mm_set_ps1(float __w)
1834{
1835 return _mm_set1_ps(__w);
1836}
1837
Logan Chien55afb0a2018-10-15 10:42:14 +08001838/// Constructs a 128-bit floating-point vector of [4 x float]
Logan Chien2833ffb2018-10-09 10:03:24 +08001839/// initialized with the specified single-precision floating-point values.
1840///
1841/// \headerfile <x86intrin.h>
1842///
1843/// This intrinsic is a utility function and does not correspond to a specific
1844/// instruction.
1845///
1846/// \param __z
1847/// A single-precision floating-point value used to initialize bits [127:96]
1848/// of the result.
1849/// \param __y
1850/// A single-precision floating-point value used to initialize bits [95:64]
1851/// of the result.
1852/// \param __x
1853/// A single-precision floating-point value used to initialize bits [63:32]
1854/// of the result.
1855/// \param __w
1856/// A single-precision floating-point value used to initialize bits [31:0]
1857/// of the result.
1858/// \returns An initialized 128-bit floating-point vector of [4 x float].
1859static __inline__ __m128 __DEFAULT_FN_ATTRS
1860_mm_set_ps(float __z, float __y, float __x, float __w)
1861{
Logan Chien55afb0a2018-10-15 10:42:14 +08001862 return __extension__ (__m128){ __w, __x, __y, __z };
Logan Chien2833ffb2018-10-09 10:03:24 +08001863}
1864
Logan Chien55afb0a2018-10-15 10:42:14 +08001865/// Constructs a 128-bit floating-point vector of [4 x float],
Logan Chien2833ffb2018-10-09 10:03:24 +08001866/// initialized in reverse order with the specified 32-bit single-precision
1867/// float-point values.
1868///
1869/// \headerfile <x86intrin.h>
1870///
1871/// This intrinsic is a utility function and does not correspond to a specific
1872/// instruction.
1873///
1874/// \param __z
1875/// A single-precision floating-point value used to initialize bits [31:0]
1876/// of the result.
1877/// \param __y
1878/// A single-precision floating-point value used to initialize bits [63:32]
1879/// of the result.
1880/// \param __x
1881/// A single-precision floating-point value used to initialize bits [95:64]
1882/// of the result.
1883/// \param __w
1884/// A single-precision floating-point value used to initialize bits [127:96]
1885/// of the result.
1886/// \returns An initialized 128-bit floating-point vector of [4 x float].
1887static __inline__ __m128 __DEFAULT_FN_ATTRS
1888_mm_setr_ps(float __z, float __y, float __x, float __w)
1889{
Logan Chien55afb0a2018-10-15 10:42:14 +08001890 return __extension__ (__m128){ __z, __y, __x, __w };
Logan Chien2833ffb2018-10-09 10:03:24 +08001891}
1892
Logan Chien55afb0a2018-10-15 10:42:14 +08001893/// Constructs a 128-bit floating-point vector of [4 x float] initialized
Logan Chien2833ffb2018-10-09 10:03:24 +08001894/// to zero.
1895///
1896/// \headerfile <x86intrin.h>
1897///
Logan Chien55afb0a2018-10-15 10:42:14 +08001898/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001899///
1900/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901/// all elements set to zero.
1902static __inline__ __m128 __DEFAULT_FN_ATTRS
1903_mm_setzero_ps(void)
1904{
Logan Chien55afb0a2018-10-15 10:42:14 +08001905 return __extension__ (__m128){ 0, 0, 0, 0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08001906}
1907
Logan Chien55afb0a2018-10-15 10:42:14 +08001908/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien2833ffb2018-10-09 10:03:24 +08001909/// memory location.
1910///
1911/// \headerfile <x86intrin.h>
1912///
Logan Chien55afb0a2018-10-15 10:42:14 +08001913/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001914///
1915/// \param __p
1916/// A pointer to a 64-bit memory location.
1917/// \param __a
1918/// A 128-bit vector of [4 x float] containing the values to be stored.
1919static __inline__ void __DEFAULT_FN_ATTRS
1920_mm_storeh_pi(__m64 *__p, __m128 __a)
1921{
Logan Chienbedbf4f2020-01-06 19:35:19 -08001922 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1923 struct __mm_storeh_pi_struct {
1924 __mm_storeh_pi_v2f32 __u;
1925 } __attribute__((__packed__, __may_alias__));
1926 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
Logan Chien2833ffb2018-10-09 10:03:24 +08001927}
1928
Logan Chien55afb0a2018-10-15 10:42:14 +08001929/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
Logan Chien2833ffb2018-10-09 10:03:24 +08001930/// memory location.
1931///
1932/// \headerfile <x86intrin.h>
1933///
Logan Chien55afb0a2018-10-15 10:42:14 +08001934/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001935///
1936/// \param __p
1937/// A pointer to a memory location that will receive the float values.
1938/// \param __a
1939/// A 128-bit vector of [4 x float] containing the values to be stored.
1940static __inline__ void __DEFAULT_FN_ATTRS
1941_mm_storel_pi(__m64 *__p, __m128 __a)
1942{
Logan Chienbedbf4f2020-01-06 19:35:19 -08001943 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1944 struct __mm_storeh_pi_struct {
1945 __mm_storeh_pi_v2f32 __u;
1946 } __attribute__((__packed__, __may_alias__));
1947 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
Logan Chien2833ffb2018-10-09 10:03:24 +08001948}
1949
Logan Chien55afb0a2018-10-15 10:42:14 +08001950/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
Logan Chien2833ffb2018-10-09 10:03:24 +08001951/// memory location.
1952///
1953/// \headerfile <x86intrin.h>
1954///
Logan Chien55afb0a2018-10-15 10:42:14 +08001955/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001956///
1957/// \param __p
1958/// A pointer to a 32-bit memory location.
1959/// \param __a
1960/// A 128-bit vector of [4 x float] containing the value to be stored.
1961static __inline__ void __DEFAULT_FN_ATTRS
1962_mm_store_ss(float *__p, __m128 __a)
1963{
1964 struct __mm_store_ss_struct {
1965 float __u;
1966 } __attribute__((__packed__, __may_alias__));
1967 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1968}
1969
Logan Chien55afb0a2018-10-15 10:42:14 +08001970/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1971/// location.
Logan Chien2833ffb2018-10-09 10:03:24 +08001972///
1973/// \headerfile <x86intrin.h>
1974///
Logan Chien55afb0a2018-10-15 10:42:14 +08001975/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001976///
1977/// \param __p
1978/// A pointer to a 128-bit memory location. The address of the memory
1979/// location does not have to be aligned.
1980/// \param __a
1981/// A 128-bit vector of [4 x float] containing the values to be stored.
1982static __inline__ void __DEFAULT_FN_ATTRS
1983_mm_storeu_ps(float *__p, __m128 __a)
1984{
1985 struct __storeu_ps {
Logan Chiendbcf4122019-03-21 10:50:25 +08001986 __m128_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08001987 } __attribute__((__packed__, __may_alias__));
1988 ((struct __storeu_ps*)__p)->__v = __a;
1989}
1990
Logan Chien55afb0a2018-10-15 10:42:14 +08001991/// Stores a 128-bit vector of [4 x float] into an aligned memory
1992/// location.
Logan Chien2833ffb2018-10-09 10:03:24 +08001993///
1994/// \headerfile <x86intrin.h>
1995///
Logan Chien55afb0a2018-10-15 10:42:14 +08001996/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001997///
1998/// \param __p
Logan Chien55afb0a2018-10-15 10:42:14 +08001999/// A pointer to a 128-bit memory location. The address of the memory
2000/// location has to be 16-byte aligned.
Logan Chien2833ffb2018-10-09 10:03:24 +08002001/// \param __a
Logan Chien55afb0a2018-10-15 10:42:14 +08002002/// A 128-bit vector of [4 x float] containing the values to be stored.
Logan Chien2833ffb2018-10-09 10:03:24 +08002003static __inline__ void __DEFAULT_FN_ATTRS
2004_mm_store_ps(float *__p, __m128 __a)
2005{
2006 *(__m128*)__p = __a;
2007}
2008
Logan Chien55afb0a2018-10-15 10:42:14 +08002009/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
Logan Chien2833ffb2018-10-09 10:03:24 +08002010/// four contiguous elements in an aligned memory location.
2011///
2012/// \headerfile <x86intrin.h>
2013///
Logan Chien55afb0a2018-10-15 10:42:14 +08002014/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien2833ffb2018-10-09 10:03:24 +08002015/// instruction.
2016///
2017/// \param __p
2018/// A pointer to a 128-bit memory location.
2019/// \param __a
2020/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
Logan Chien55afb0a2018-10-15 10:42:14 +08002021/// of the four contiguous elements pointed by \a __p.
Logan Chien2833ffb2018-10-09 10:03:24 +08002022static __inline__ void __DEFAULT_FN_ATTRS
2023_mm_store1_ps(float *__p, __m128 __a)
2024{
2025 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2026 _mm_store_ps(__p, __a);
2027}
2028
Logan Chien55afb0a2018-10-15 10:42:14 +08002029/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2030/// four contiguous elements in an aligned memory location.
Logan Chien2833ffb2018-10-09 10:03:24 +08002031///
2032/// \headerfile <x86intrin.h>
2033///
Logan Chien55afb0a2018-10-15 10:42:14 +08002034/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2035/// instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002036///
2037/// \param __p
Logan Chien55afb0a2018-10-15 10:42:14 +08002038/// A pointer to a 128-bit memory location.
Logan Chien2833ffb2018-10-09 10:03:24 +08002039/// \param __a
Logan Chien55afb0a2018-10-15 10:42:14 +08002040/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2041/// of the four contiguous elements pointed by \a __p.
Logan Chien2833ffb2018-10-09 10:03:24 +08002042static __inline__ void __DEFAULT_FN_ATTRS
2043_mm_store_ps1(float *__p, __m128 __a)
2044{
Logan Chien55afb0a2018-10-15 10:42:14 +08002045 _mm_store1_ps(__p, __a);
Logan Chien2833ffb2018-10-09 10:03:24 +08002046}
2047
Logan Chien55afb0a2018-10-15 10:42:14 +08002048/// Stores float values from a 128-bit vector of [4 x float] to an
Logan Chien2833ffb2018-10-09 10:03:24 +08002049/// aligned memory location in reverse order.
2050///
2051/// \headerfile <x86intrin.h>
2052///
Logan Chien55afb0a2018-10-15 10:42:14 +08002053/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
Logan Chien2833ffb2018-10-09 10:03:24 +08002054/// instruction.
2055///
2056/// \param __p
2057/// A pointer to a 128-bit memory location. The address of the memory
2058/// location has to be 128-bit aligned.
2059/// \param __a
2060/// A 128-bit vector of [4 x float] containing the values to be stored.
2061static __inline__ void __DEFAULT_FN_ATTRS
2062_mm_storer_ps(float *__p, __m128 __a)
2063{
2064 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2065 _mm_store_ps(__p, __a);
2066}
2067
/* Prefetch hint encodings, listed in ascending order. As decoded by the
 * _mm_prefetch macro, bit 2 supplies the read/write argument and bits [1:0]
 * supply the locality argument of __builtin_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2074
2075#ifndef _MSC_VER
2076/* FIXME: We have to #define this because "sel" must be a constant integer, and
2077 Sema doesn't do any form of constant propagation yet. */
2078
Logan Chien55afb0a2018-10-15 10:42:14 +08002079/// Loads one cache line of data from the specified address to a location
Logan Chien2833ffb2018-10-09 10:03:24 +08002080/// closer to the processor.
2081///
2082/// \headerfile <x86intrin.h>
2083///
2084/// \code
2085/// void _mm_prefetch(const void * a, const int sel);
2086/// \endcode
2087///
Logan Chien55afb0a2018-10-15 10:42:14 +08002088/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002089///
2090/// \param a
2091/// A pointer to a memory location containing a cache line of data.
2092/// \param sel
Logan Chien55afb0a2018-10-15 10:42:14 +08002093/// A predefined integer constant specifying the type of prefetch
2094/// operation: \n
2095/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2096/// PREFETCHNTA instruction will be generated. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002097/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
Logan Chien55afb0a2018-10-15 10:42:14 +08002098/// be generated. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002099/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
Logan Chien55afb0a2018-10-15 10:42:14 +08002100/// be generated. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002101/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2102/// be generated.
/* Bit 2 of sel becomes the rw argument of __builtin_prefetch and bits [1:0]
 * become its locality argument. */
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
Logan Chien2833ffb2018-10-09 10:03:24 +08002105#endif
2106
Logan Chien55afb0a2018-10-15 10:42:14 +08002107/// Stores a 64-bit integer in the specified aligned memory location. To
Logan Chien2833ffb2018-10-09 10:03:24 +08002108/// minimize caching, the data is flagged as non-temporal (unlikely to be
2109/// used again soon).
2110///
2111/// \headerfile <x86intrin.h>
2112///
Logan Chien55afb0a2018-10-15 10:42:14 +08002113/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002114///
2115/// \param __p
2116/// A pointer to an aligned memory location used to store the register value.
2117/// \param __a
2118/// A 64-bit integer containing the value to be stored.
Logan Chien55afb0a2018-10-15 10:42:14 +08002119static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002120_mm_stream_pi(__m64 *__p, __m64 __a)
2121{
2122 __builtin_ia32_movntq(__p, __a);
2123}
2124
Logan Chien55afb0a2018-10-15 10:42:14 +08002125/// Moves packed float values from a 128-bit vector of [4 x float] to a
Logan Chien2833ffb2018-10-09 10:03:24 +08002126/// 128-bit aligned memory location. To minimize caching, the data is flagged
2127/// as non-temporal (unlikely to be used again soon).
2128///
2129/// \headerfile <x86intrin.h>
2130///
Logan Chien55afb0a2018-10-15 10:42:14 +08002131/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002132///
2133/// \param __p
2134/// A pointer to a 128-bit aligned memory location that will receive the
Logan Chien55afb0a2018-10-15 10:42:14 +08002135/// single-precision floating-point values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002136/// \param __a
2137/// A 128-bit vector of [4 x float] containing the values to be moved.
2138static __inline__ void __DEFAULT_FN_ATTRS
2139_mm_stream_ps(float *__p, __m128 __a)
2140{
2141 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2142}
2143
Logan Chien55afb0a2018-10-15 10:42:14 +08002144#if defined(__cplusplus)
2145extern "C" {
2146#endif
2147
2148/// Forces strong memory ordering (serialization) between store
Logan Chien2833ffb2018-10-09 10:03:24 +08002149/// instructions preceding this instruction and store instructions following
2150/// this instruction, ensuring the system completes all previous stores
2151/// before executing subsequent stores.
2152///
2153/// \headerfile <x86intrin.h>
2154///
Logan Chien55afb0a2018-10-15 10:42:14 +08002155/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002156///
extern void _mm_sfence(void);
Logan Chien2833ffb2018-10-09 10:03:24 +08002158
Logan Chien55afb0a2018-10-15 10:42:14 +08002159#if defined(__cplusplus)
2160} // extern "C"
2161#endif
2162
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
///
/// Both macro arguments may be arbitrary expressions: each is parenthesized
/// before being cast, so the cast applies to the whole argument.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
/// A 64-bit vector of [4 x i16].
/// \param n
/// An immediate integer operand that determines which bits are extracted: \n
/// 0: Bits [15:0] are copied to the destination. \n
/// 1: Bits [31:16] are copied to the destination. \n
/// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002185
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
/// specified by the immediate operand \a n.
///
/// All macro arguments may be arbitrary expressions: each is parenthesized
/// before being cast, so the cast applies to the whole argument.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
/// A 64-bit vector of [4 x i16].
/// \param d
/// An integer. The lower 16-bit value from this operand is written to the
/// destination at the offset specified by operand \a n.
/// \param n
/// An immediate integer operand that determines which bits are to be used
/// in the destination. \n
/// 0: Bits [15:0] are copied to the destination. \n
/// 1: Bits [31:16] are copied to the destination. \n
/// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination. \n
/// The remaining bits in the destination are copied from the corresponding
/// bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002216
Logan Chien55afb0a2018-10-15 10:42:14 +08002217/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien2833ffb2018-10-09 10:03:24 +08002218/// the 64-bit integer vectors, and writes the greater value to the
2219/// corresponding bits in the destination.
2220///
2221/// \headerfile <x86intrin.h>
2222///
Logan Chien55afb0a2018-10-15 10:42:14 +08002223/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002224///
2225/// \param __a
2226/// A 64-bit integer vector containing one of the source operands.
2227/// \param __b
2228/// A 64-bit integer vector containing one of the source operands.
2229/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien55afb0a2018-10-15 10:42:14 +08002230static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002231_mm_max_pi16(__m64 __a, __m64 __b)
2232{
2233 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2234}
2235
Logan Chien55afb0a2018-10-15 10:42:14 +08002236/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien2833ffb2018-10-09 10:03:24 +08002237/// values of the 64-bit integer vectors, and writes the greater value to the
2238/// corresponding bits in the destination.
2239///
2240/// \headerfile <x86intrin.h>
2241///
Logan Chien55afb0a2018-10-15 10:42:14 +08002242/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002243///
2244/// \param __a
2245/// A 64-bit integer vector containing one of the source operands.
2246/// \param __b
2247/// A 64-bit integer vector containing one of the source operands.
2248/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien55afb0a2018-10-15 10:42:14 +08002249static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002250_mm_max_pu8(__m64 __a, __m64 __b)
2251{
2252 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2253}
2254
Logan Chien55afb0a2018-10-15 10:42:14 +08002255/// Compares each of the corresponding packed 16-bit integer values of
Logan Chien2833ffb2018-10-09 10:03:24 +08002256/// the 64-bit integer vectors, and writes the lesser value to the
2257/// corresponding bits in the destination.
2258///
2259/// \headerfile <x86intrin.h>
2260///
Logan Chien55afb0a2018-10-15 10:42:14 +08002261/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002262///
2263/// \param __a
2264/// A 64-bit integer vector containing one of the source operands.
2265/// \param __b
2266/// A 64-bit integer vector containing one of the source operands.
2267/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien55afb0a2018-10-15 10:42:14 +08002268static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002269_mm_min_pi16(__m64 __a, __m64 __b)
2270{
2271 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2272}
2273
Logan Chien55afb0a2018-10-15 10:42:14 +08002274/// Compares each of the corresponding packed 8-bit unsigned integer
Logan Chien2833ffb2018-10-09 10:03:24 +08002275/// values of the 64-bit integer vectors, and writes the lesser value to the
2276/// corresponding bits in the destination.
2277///
2278/// \headerfile <x86intrin.h>
2279///
Logan Chien55afb0a2018-10-15 10:42:14 +08002280/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002281///
2282/// \param __a
2283/// A 64-bit integer vector containing one of the source operands.
2284/// \param __b
2285/// A 64-bit integer vector containing one of the source operands.
2286/// \returns A 64-bit integer vector containing the comparison results.
Logan Chien55afb0a2018-10-15 10:42:14 +08002287static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002288_mm_min_pu8(__m64 __a, __m64 __b)
2289{
2290 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2291}
2292
Logan Chien55afb0a2018-10-15 10:42:14 +08002293/// Takes the most significant bit from each 8-bit element in a 64-bit
2294/// integer vector to create an 8-bit mask value. Zero-extends the value to
Logan Chien2833ffb2018-10-09 10:03:24 +08002295/// 32-bit integer and writes it to the destination.
2296///
2297/// \headerfile <x86intrin.h>
2298///
Logan Chien55afb0a2018-10-15 10:42:14 +08002299/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002300///
2301/// \param __a
2302/// A 64-bit integer vector containing the values with bits to be extracted.
Logan Chien55afb0a2018-10-15 10:42:14 +08002303/// \returns The most significant bit from each 8-bit element in \a __a,
2304/// written to bits [7:0].
2305static __inline__ int __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002306_mm_movemask_pi8(__m64 __a)
2307{
2308 return __builtin_ia32_pmovmskb((__v8qi)__a);
2309}
2310
Logan Chien55afb0a2018-10-15 10:42:14 +08002311/// Multiplies packed 16-bit unsigned integer values and writes the
Logan Chien2833ffb2018-10-09 10:03:24 +08002312/// high-order 16 bits of each 32-bit product to the corresponding bits in
2313/// the destination.
2314///
2315/// \headerfile <x86intrin.h>
2316///
Logan Chien55afb0a2018-10-15 10:42:14 +08002317/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002318///
2319/// \param __a
2320/// A 64-bit integer vector containing one of the source operands.
2321/// \param __b
2322/// A 64-bit integer vector containing one of the source operands.
2323/// \returns A 64-bit integer vector containing the products of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +08002324static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002325_mm_mulhi_pu16(__m64 __a, __m64 __b)
2326{
2327 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2328}
2329
Logan Chien55afb0a2018-10-15 10:42:14 +08002330/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
Logan Chien2833ffb2018-10-09 10:03:24 +08002331/// destination, as specified by the immediate value operand.
2332///
2333/// \headerfile <x86intrin.h>
2334///
Logan Chien2833ffb2018-10-09 10:03:24 +08002335/// \code
2336/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2337/// \endcode
2338///
Logan Chien55afb0a2018-10-15 10:42:14 +08002339/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2340///
Logan Chien2833ffb2018-10-09 10:03:24 +08002341/// \param a
2342/// A 64-bit integer vector containing the values to be shuffled.
2343/// \param n
2344/// An immediate value containing an 8-bit value specifying which elements to
Logan Chien55afb0a2018-10-15 10:42:14 +08002345/// copy from \a a. The destinations within the 64-bit destination are
2346/// assigned values as follows: \n
2347/// Bits [1:0] are used to assign values to bits [15:0] in the
2348/// destination. \n
2349/// Bits [3:2] are used to assign values to bits [31:16] in the
2350/// destination. \n
2351/// Bits [5:4] are used to assign values to bits [47:32] in the
2352/// destination. \n
2353/// Bits [7:6] are used to assign values to bits [63:48] in the
2354/// destination. \n
2355/// Bit value assignments: \n
2356/// 00: assigned from bits [15:0] of \a a. \n
2357/// 01: assigned from bits [31:16] of \a a. \n
2358/// 10: assigned from bits [47:32] of \a a. \n
2359/// 11: assigned from bits [63:48] of \a a.
Logan Chien2833ffb2018-10-09 10:03:24 +08002360/// \returns A 64-bit integer vector containing the shuffled values.
/* n is forwarded to the builtin unchanged; a is first converted to __m64 and
 * then reinterpreted as __v4hi. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002363
Logan Chien55afb0a2018-10-15 10:42:14 +08002364/// Conditionally copies the values from each 8-bit element in the first
Logan Chien2833ffb2018-10-09 10:03:24 +08002365/// 64-bit integer vector operand to the specified memory location, as
2366/// specified by the most significant bit in the corresponding element in the
Logan Chien55afb0a2018-10-15 10:42:14 +08002367/// second 64-bit integer vector operand.
2368///
2369/// To minimize caching, the data is flagged as non-temporal
2370/// (unlikely to be used again soon).
Logan Chien2833ffb2018-10-09 10:03:24 +08002371///
2372/// \headerfile <x86intrin.h>
2373///
Logan Chien55afb0a2018-10-15 10:42:14 +08002374/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002375///
2376/// \param __d
2377/// A 64-bit integer vector containing the values with elements to be copied.
2378/// \param __n
2379/// A 64-bit integer vector operand. The most significant bit from each 8-bit
Logan Chien55afb0a2018-10-15 10:42:14 +08002380/// element determines whether the corresponding element in operand \a __d
2381/// is copied. If the most significant bit of a given element is 1, the
2382/// corresponding element in operand \a __d is copied.
Logan Chien2833ffb2018-10-09 10:03:24 +08002383/// \param __p
2384/// A pointer to a 64-bit memory location that will receive the conditionally
2385/// copied integer values. The address of the memory location does not have
2386/// to be aligned.
Logan Chien55afb0a2018-10-15 10:42:14 +08002387static __inline__ void __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002388_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2389{
2390 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2391}
2392
Logan Chien55afb0a2018-10-15 10:42:14 +08002393/// Computes the rounded averages of the packed unsigned 8-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +08002394/// values and writes the averages to the corresponding bits in the
2395/// destination.
2396///
2397/// \headerfile <x86intrin.h>
2398///
Logan Chien55afb0a2018-10-15 10:42:14 +08002399/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002400///
2401/// \param __a
2402/// A 64-bit integer vector containing one of the source operands.
2403/// \param __b
2404/// A 64-bit integer vector containing one of the source operands.
2405/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +08002406static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002407_mm_avg_pu8(__m64 __a, __m64 __b)
2408{
2409 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2410}
2411
Logan Chien55afb0a2018-10-15 10:42:14 +08002412/// Computes the rounded averages of the packed unsigned 16-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +08002413/// values and writes the averages to the corresponding bits in the
2414/// destination.
2415///
2416/// \headerfile <x86intrin.h>
2417///
Logan Chien55afb0a2018-10-15 10:42:14 +08002418/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002419///
2420/// \param __a
2421/// A 64-bit integer vector containing one of the source operands.
2422/// \param __b
2423/// A 64-bit integer vector containing one of the source operands.
2424/// \returns A 64-bit integer vector containing the averages of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +08002425static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002426_mm_avg_pu16(__m64 __a, __m64 __b)
2427{
2428 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2429}
2430
Logan Chien55afb0a2018-10-15 10:42:14 +08002431/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value of each
/// difference. Then the sum of the 8 absolute differences is written to the
2434/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2435///
2436/// \headerfile <x86intrin.h>
2437///
Logan Chien55afb0a2018-10-15 10:42:14 +08002438/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002439///
2440/// \param __a
2441/// A 64-bit integer vector containing one of the source operands.
2442/// \param __b
2443/// A 64-bit integer vector containing one of the source operands.
2444/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2445/// sets of absolute differences between both operands. The upper bits are
2446/// cleared.
Logan Chien55afb0a2018-10-15 10:42:14 +08002447static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002448_mm_sad_pu8(__m64 __a, __m64 __b)
2449{
2450 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2451}
2452
Logan Chien55afb0a2018-10-15 10:42:14 +08002453#if defined(__cplusplus)
2454extern "C" {
2455#endif
2456
2457/// Returns the contents of the MXCSR register as a 32-bit unsigned
2458/// integer value.
2459///
2460/// There are several groups of macros associated with this
Logan Chien2833ffb2018-10-09 10:03:24 +08002461/// intrinsic, including:
Logan Chien55afb0a2018-10-15 10:42:14 +08002462/// <ul>
2463/// <li>
2464/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
Logan Chien2833ffb2018-10-09 10:03:24 +08002465/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2466/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2467/// _MM_GET_EXCEPTION_STATE().
Logan Chien55afb0a2018-10-15 10:42:14 +08002468/// </li>
2469/// <li>
2470/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
Logan Chien2833ffb2018-10-09 10:03:24 +08002471/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2472/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
Logan Chien55afb0a2018-10-15 10:42:14 +08002473/// </li>
2474/// <li>
2475/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
Logan Chien2833ffb2018-10-09 10:03:24 +08002476/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
Logan Chien55afb0a2018-10-15 10:42:14 +08002477/// _MM_GET_ROUNDING_MODE().
2478/// </li>
2479/// <li>
2480/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
Logan Chien2833ffb2018-10-09 10:03:24 +08002481/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
Logan Chien55afb0a2018-10-15 10:42:14 +08002482/// </li>
2483/// <li>
2484/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
Logan Chien2833ffb2018-10-09 10:03:24 +08002485/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2486/// _MM_GET_DENORMALS_ZERO_MODE().
Logan Chien55afb0a2018-10-15 10:42:14 +08002487/// </li>
2488/// </ul>
Logan Chien2833ffb2018-10-09 10:03:24 +08002489///
Logan Chien55afb0a2018-10-15 10:42:14 +08002490/// For example, the following expression checks if an overflow exception has
Logan Chien2833ffb2018-10-09 10:03:24 +08002491/// occurred:
Logan Chien55afb0a2018-10-15 10:42:14 +08002492/// \code
Logan Chien2833ffb2018-10-09 10:03:24 +08002493/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
Logan Chien55afb0a2018-10-15 10:42:14 +08002494/// \endcode
Logan Chien2833ffb2018-10-09 10:03:24 +08002495///
Logan Chien55afb0a2018-10-15 10:42:14 +08002496/// The following expression gets the current rounding mode:
2497/// \code
Logan Chien2833ffb2018-10-09 10:03:24 +08002498/// _MM_GET_ROUNDING_MODE()
Logan Chien55afb0a2018-10-15 10:42:14 +08002499/// \endcode
Logan Chien2833ffb2018-10-09 10:03:24 +08002500///
2501/// \headerfile <x86intrin.h>
2502///
Logan Chien55afb0a2018-10-15 10:42:14 +08002503/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002504///
2505/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2506/// register.
extern unsigned int _mm_getcsr(void);
Logan Chien2833ffb2018-10-09 10:03:24 +08002508
Logan Chien55afb0a2018-10-15 10:42:14 +08002509/// Sets the MXCSR register with the 32-bit unsigned integer value.
2510///
2511/// There are several groups of macros associated with this intrinsic,
2512/// including:
2513/// <ul>
2514/// <li>
2515/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
Logan Chien2833ffb2018-10-09 10:03:24 +08002516/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2517/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2518/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
Logan Chien55afb0a2018-10-15 10:42:14 +08002519/// </li>
2520/// <li>
2521/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
Logan Chien2833ffb2018-10-09 10:03:24 +08002522/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2523/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2524/// of these macros.
Logan Chien55afb0a2018-10-15 10:42:14 +08002525/// </li>
2526/// <li>
2527/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
Logan Chien2833ffb2018-10-09 10:03:24 +08002528/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2529/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
Logan Chien55afb0a2018-10-15 10:42:14 +08002530/// </li>
2531/// <li>
2532/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
Logan Chien2833ffb2018-10-09 10:03:24 +08002533/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2534/// one of these macros.
Logan Chien55afb0a2018-10-15 10:42:14 +08002535/// </li>
2536/// <li>
2537/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
Logan Chien2833ffb2018-10-09 10:03:24 +08002538/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2539/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
Logan Chien55afb0a2018-10-15 10:42:14 +08002540/// </li>
2541/// </ul>
Logan Chien2833ffb2018-10-09 10:03:24 +08002542///
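/// For example, the following expression causes subsequent floating-point
/// operations to round up (assuming the rounding-control bits are currently
/// clear, since the value is OR-ed into the existing register contents):
/// \code
/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
/// \endcode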
2546///
2547/// The following example sets the DAZ and FTZ flags:
Logan Chien55afb0a2018-10-15 10:42:14 +08002548/// \code
2549/// void setFlags() {
2550/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2551/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2552/// }
2553/// \endcode
Logan Chien2833ffb2018-10-09 10:03:24 +08002554///
2555/// \headerfile <x86intrin.h>
2556///
Logan Chien55afb0a2018-10-15 10:42:14 +08002557/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002558///
2559/// \param __i
2560/// A 32-bit unsigned integer value to be written to the MXCSR register.
Logan Chien55afb0a2018-10-15 10:42:14 +08002561void _mm_setcsr(unsigned int __i);
Logan Chien2833ffb2018-10-09 10:03:24 +08002562
Logan Chien55afb0a2018-10-15 10:42:14 +08002563#if defined(__cplusplus)
2564} // extern "C"
2565#endif
2566
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// The two lower elements of the result are taken from \a a and the two
///    upper elements from \a b. The mask is usually built with
///    _MM_SHUFFLE(z, y, x, w); for example _MM_SHUFFLE(3, 2, 1, 0) (0xE4)
///    produces { a[0], a[1], b[2], b[3] }.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///      destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///      destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///      destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///      destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002606
Logan Chien55afb0a2018-10-15 10:42:14 +08002607/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2608/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08002609///
2610/// \headerfile <x86intrin.h>
2611///
Logan Chien55afb0a2018-10-15 10:42:14 +08002612/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002613///
2614/// \param __a
Logan Chien55afb0a2018-10-15 10:42:14 +08002615/// A 128-bit vector of [4 x float]. \n
2616/// Bits [95:64] are written to bits [31:0] of the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002617/// Bits [127:96] are written to bits [95:64] of the destination.
2618/// \param __b
2619/// A 128-bit vector of [4 x float].
Logan Chien55afb0a2018-10-15 10:42:14 +08002620/// Bits [95:64] are written to bits [63:32] of the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002621/// Bits [127:96] are written to bits [127:96] of the destination.
2622/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2623static __inline__ __m128 __DEFAULT_FN_ATTRS
2624_mm_unpackhi_ps(__m128 __a, __m128 __b)
2625{
2626 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2627}
2628
Logan Chien55afb0a2018-10-15 10:42:14 +08002629/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2630/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08002631///
2632/// \headerfile <x86intrin.h>
2633///
Logan Chien55afb0a2018-10-15 10:42:14 +08002634/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002635///
2636/// \param __a
Logan Chien55afb0a2018-10-15 10:42:14 +08002637/// A 128-bit vector of [4 x float]. \n
2638/// Bits [31:0] are written to bits [31:0] of the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002639/// Bits [63:32] are written to bits [95:64] of the destination.
2640/// \param __b
Logan Chien55afb0a2018-10-15 10:42:14 +08002641/// A 128-bit vector of [4 x float]. \n
2642/// Bits [31:0] are written to bits [63:32] of the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +08002643/// Bits [63:32] are written to bits [127:96] of the destination.
2644/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2645static __inline__ __m128 __DEFAULT_FN_ATTRS
2646_mm_unpacklo_ps(__m128 __a, __m128 __b)
2647{
2648 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2649}
2650
Logan Chien55afb0a2018-10-15 10:42:14 +08002651/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien2833ffb2018-10-09 10:03:24 +08002652/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2653/// 96 bits are set to the upper 96 bits of the first parameter.
2654///
2655/// \headerfile <x86intrin.h>
2656///
Logan Chien55afb0a2018-10-15 10:42:14 +08002657/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2658/// instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002659///
2660/// \param __a
2661/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2662/// written to the upper 96 bits of the result.
2663/// \param __b
2664/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2665/// written to the lower 32 bits of the result.
2666/// \returns A 128-bit floating-point vector of [4 x float].
2667static __inline__ __m128 __DEFAULT_FN_ATTRS
2668_mm_move_ss(__m128 __a, __m128 __b)
2669{
Logan Chien55afb0a2018-10-15 10:42:14 +08002670 __a[0] = __b[0];
2671 return __a;
Logan Chien2833ffb2018-10-09 10:03:24 +08002672}
2673
Logan Chien55afb0a2018-10-15 10:42:14 +08002674/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien2833ffb2018-10-09 10:03:24 +08002675/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2676/// 64 bits are set to the upper 64 bits of the first parameter.
2677///
2678/// \headerfile <x86intrin.h>
2679///
Logan Chien55afb0a2018-10-15 10:42:14 +08002680/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002681///
2682/// \param __a
2683/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2684/// written to the upper 64 bits of the result.
2685/// \param __b
2686/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2687/// written to the lower 64 bits of the result.
2688/// \returns A 128-bit floating-point vector of [4 x float].
2689static __inline__ __m128 __DEFAULT_FN_ATTRS
2690_mm_movehl_ps(__m128 __a, __m128 __b)
2691{
2692 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2693}
2694
Logan Chien55afb0a2018-10-15 10:42:14 +08002695/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
Logan Chien2833ffb2018-10-09 10:03:24 +08002696/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2697/// 64 bits are set to the lower 64 bits of the second parameter.
2698///
2699/// \headerfile <x86intrin.h>
2700///
Logan Chien55afb0a2018-10-15 10:42:14 +08002701/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002702///
2703/// \param __a
2704/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2705/// written to the lower 64 bits of the result.
2706/// \param __b
2707/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2708/// written to the upper 64 bits of the result.
2709/// \returns A 128-bit floating-point vector of [4 x float].
2710static __inline__ __m128 __DEFAULT_FN_ATTRS
2711_mm_movelh_ps(__m128 __a, __m128 __b)
2712{
2713 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2714}
2715
Logan Chien55afb0a2018-10-15 10:42:14 +08002716/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
Logan Chien2833ffb2018-10-09 10:03:24 +08002717/// float].
2718///
2719/// \headerfile <x86intrin.h>
2720///
Logan Chien55afb0a2018-10-15 10:42:14 +08002721/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002722///
2723/// \param __a
2724/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2725/// from the corresponding elements in this operand.
2726/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2727/// values from the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08002728static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002729_mm_cvtpi16_ps(__m64 __a)
2730{
2731 __m64 __b, __c;
2732 __m128 __r;
2733
2734 __b = _mm_setzero_si64();
2735 __b = _mm_cmpgt_pi16(__b, __a);
2736 __c = _mm_unpackhi_pi16(__a, __b);
2737 __r = _mm_setzero_ps();
2738 __r = _mm_cvtpi32_ps(__r, __c);
2739 __r = _mm_movelh_ps(__r, __r);
2740 __c = _mm_unpacklo_pi16(__a, __b);
2741 __r = _mm_cvtpi32_ps(__r, __c);
2742
2743 return __r;
2744}
2745
Logan Chien55afb0a2018-10-15 10:42:14 +08002746/// Converts a 64-bit vector of 16-bit unsigned integer values into a
Logan Chien2833ffb2018-10-09 10:03:24 +08002747/// 128-bit vector of [4 x float].
2748///
2749/// \headerfile <x86intrin.h>
2750///
Logan Chien55afb0a2018-10-15 10:42:14 +08002751/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002752///
2753/// \param __a
2754/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2755/// destination are copied from the corresponding elements in this operand.
2756/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2757/// values from the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08002758static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002759_mm_cvtpu16_ps(__m64 __a)
2760{
2761 __m64 __b, __c;
2762 __m128 __r;
2763
2764 __b = _mm_setzero_si64();
2765 __c = _mm_unpackhi_pi16(__a, __b);
2766 __r = _mm_setzero_ps();
2767 __r = _mm_cvtpi32_ps(__r, __c);
2768 __r = _mm_movelh_ps(__r, __r);
2769 __c = _mm_unpacklo_pi16(__a, __b);
2770 __r = _mm_cvtpi32_ps(__r, __c);
2771
2772 return __r;
2773}
2774
Logan Chien55afb0a2018-10-15 10:42:14 +08002775/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
Logan Chien2833ffb2018-10-09 10:03:24 +08002776/// into a 128-bit vector of [4 x float].
2777///
2778/// \headerfile <x86intrin.h>
2779///
Logan Chien55afb0a2018-10-15 10:42:14 +08002780/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002781///
2782/// \param __a
2783/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2784/// from the corresponding lower 4 elements in this operand.
2785/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2786/// values from the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08002787static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002788_mm_cvtpi8_ps(__m64 __a)
2789{
2790 __m64 __b;
2791
2792 __b = _mm_setzero_si64();
2793 __b = _mm_cmpgt_pi8(__b, __a);
2794 __b = _mm_unpacklo_pi8(__a, __b);
2795
2796 return _mm_cvtpi16_ps(__b);
2797}
2798
Logan Chien55afb0a2018-10-15 10:42:14 +08002799/// Converts the lower four unsigned 8-bit integer values from a 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +08002800/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2801///
2802/// \headerfile <x86intrin.h>
2803///
Logan Chien55afb0a2018-10-15 10:42:14 +08002804/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002805///
2806/// \param __a
2807/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2808/// destination are copied from the corresponding lower 4 elements in this
2809/// operand.
2810/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2811/// values from the source operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08002812static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002813_mm_cvtpu8_ps(__m64 __a)
2814{
2815 __m64 __b;
2816
2817 __b = _mm_setzero_si64();
2818 __b = _mm_unpacklo_pi8(__a, __b);
2819
2820 return _mm_cvtpi16_ps(__b);
2821}
2822
Logan Chien55afb0a2018-10-15 10:42:14 +08002823/// Converts the two 32-bit signed integer values from each 64-bit vector
Logan Chien2833ffb2018-10-09 10:03:24 +08002824/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2825///
2826/// \headerfile <x86intrin.h>
2827///
Logan Chien55afb0a2018-10-15 10:42:14 +08002828/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002829///
2830/// \param __a
2831/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2832/// copied from the elements in this operand.
2833/// \param __b
2834/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2835/// copied from the elements in this operand.
2836/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2837/// copied and converted values from the first operand. The upper 64 bits
2838/// contain the copied and converted values from the second operand.
Logan Chien55afb0a2018-10-15 10:42:14 +08002839static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002840_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2841{
2842 __m128 __c;
2843
2844 __c = _mm_setzero_ps();
2845 __c = _mm_cvtpi32_ps(__c, __b);
2846 __c = _mm_movelh_ps(__c, __c);
2847
2848 return _mm_cvtpi32_ps(__c, __a);
2849}
2850
Logan Chien55afb0a2018-10-15 10:42:14 +08002851/// Converts each single-precision floating-point element of a 128-bit
Logan Chien2833ffb2018-10-09 10:03:24 +08002852/// floating-point vector of [4 x float] into a 16-bit signed integer, and
Logan Chien55afb0a2018-10-15 10:42:14 +08002853/// packs the results into a 64-bit integer vector of [4 x i16].
2854///
2855/// If the floating-point element is NaN or infinity, or if the
2856/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2857/// it is converted to 0x8000. Otherwise if the floating-point element is
2858/// greater than 0x7FFF, it is converted to 0x7FFF.
Logan Chien2833ffb2018-10-09 10:03:24 +08002859///
2860/// \headerfile <x86intrin.h>
2861///
Logan Chien55afb0a2018-10-15 10:42:14 +08002862/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002863///
2864/// \param __a
2865/// A 128-bit floating-point vector of [4 x float].
2866/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2867/// values.
Logan Chien55afb0a2018-10-15 10:42:14 +08002868static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002869_mm_cvtps_pi16(__m128 __a)
2870{
2871 __m64 __b, __c;
2872
2873 __b = _mm_cvtps_pi32(__a);
2874 __a = _mm_movehl_ps(__a, __a);
2875 __c = _mm_cvtps_pi32(__a);
2876
2877 return _mm_packs_pi32(__b, __c);
2878}
2879
Logan Chien55afb0a2018-10-15 10:42:14 +08002880/// Converts each single-precision floating-point element of a 128-bit
Logan Chien2833ffb2018-10-09 10:03:24 +08002881/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2882/// packs the results into the lower 32 bits of a 64-bit integer vector of
Logan Chien55afb0a2018-10-15 10:42:14 +08002883/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2884///
2885/// If the floating-point element is NaN or infinity, or if the
2886/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2887/// is converted to 0x80. Otherwise if the floating-point element is greater
Logan Chien2833ffb2018-10-09 10:03:24 +08002888/// than 0x7F, it is converted to 0x7F.
2889///
2890/// \headerfile <x86intrin.h>
2891///
Logan Chien55afb0a2018-10-15 10:42:14 +08002892/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002893///
2894/// \param __a
2895/// 128-bit floating-point vector of [4 x float].
2896/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2897/// converted values and the uppper 32 bits are set to zero.
Logan Chien55afb0a2018-10-15 10:42:14 +08002898static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08002899_mm_cvtps_pi8(__m128 __a)
2900{
2901 __m64 __b, __c;
2902
2903 __b = _mm_cvtps_pi16(__a);
2904 __c = _mm_setzero_si64();
2905
2906 return _mm_packs_pi16(__b, __c);
2907}
2908
Logan Chien55afb0a2018-10-15 10:42:14 +08002909/// Extracts the sign bits from each single-precision floating-point
Logan Chien2833ffb2018-10-09 10:03:24 +08002910/// element of a 128-bit floating-point vector of [4 x float] and returns the
2911/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2912/// to zero.
2913///
2914/// \headerfile <x86intrin.h>
2915///
Logan Chien55afb0a2018-10-15 10:42:14 +08002916/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002917///
2918/// \param __a
2919/// A 128-bit floating-point vector of [4 x float].
2920/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2921/// single-precision floating-point element of the parameter. Bits [31:4] are
2922/// set to zero.
2923static __inline__ int __DEFAULT_FN_ATTRS
2924_mm_movemask_ps(__m128 __a)
2925{
2926 return __builtin_ia32_movmskps((__v4sf)__a);
2927}
2928
2929
/* Expands to an attribute requesting 16-byte alignment for the declaration it
 * is attached to. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit immediate: (w) occupies
 * bits [1:0], (x) bits [3:2], (y) bits [5:4] and (z) bits [7:6]. This is the
 * layout described for the mask operand of _mm_shuffle_ps. The arguments are
 * neither range-checked nor masked. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2933
/* MXCSR exception state flags, for use with _MM_SET_EXCEPTION_STATE() and
 * _MM_GET_EXCEPTION_STATE(). */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
/* All exception state flags combined (0x003f). */
#define _MM_EXCEPT_MASK \
  (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO | \
   _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)

/* MXCSR exception mask bits, for use with _MM_SET_EXCEPTION_MASK() and
 * _MM_GET_EXCEPTION_MASK(). */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
/* All exception mask bits combined (0x1f80). */
#define _MM_MASK_MASK \
  (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | \
   _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)

/* MXCSR rounding modes, for use with _MM_SET_ROUNDING_MODE() and
 * _MM_GET_ROUNDING_MODE(). The field is two bits wide, so the modes are
 * values of the field rather than independent flags. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero mode, for use with _MM_SET_FLUSH_ZERO_MODE() and
 * _MM_GET_FLUSH_ZERO_MODE(). */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)
Logan Chien2833ffb2018-10-09 10:03:24 +08002959
/* Read one field of the MXCSR register: each macro calls _mm_getcsr() and
 * masks the result with the matching *_MASK constant, so the value can be
 * compared directly against the corresponding _MM_* constants. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Update one field of the MXCSR register: each macro reads the register with
 * _mm_getcsr(), clears the field selected by the matching *_MASK constant,
 * ORs in (x) and writes the result back with _mm_setcsr(). (x) itself is not
 * masked, so any bits it sets outside the field are written as well. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2969
/// Transposes, in place, the 4x4 matrix of floats whose rows are the four
///    128-bit vectors of [4 x float] passed as arguments.
///
/// On completion \a row0 holds element 0 of each original row, \a row1 holds
///    element 1 of each original row, and so on.
///
/// The temporaries use reserved (double-underscore) names so that they cannot
///    shadow the caller's own variables; a call such as
///    _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) therefore works as expected.
///
/// \param row0
///    A modifiable lvalue of type __m128. It is read twice and written once.
/// \param row1
///    A modifiable lvalue of type __m128. It is read twice and written once.
/// \param row2
///    A modifiable lvalue of type __m128. It is read twice and written once.
/// \param row3
///    A modifiable lvalue of type __m128. It is read twice and written once.
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2982
/* Aliases for compatibility: the older _m_* spellings expand to the
 * corresponding _mm_* intrinsic names. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Kept for backwards compatibility; it was previously defined twice with the
 * same replacement text, so a single definition is equivalent. */
#define _m_ _mm_
2999
3000#undef __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08003001#undef __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +08003002
3003/* Ugly hack for backwards-compatibility (compatible with gcc) */
3004#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3005#include <emmintrin.h>
3006#endif
3007
3008#endif /* __XMMINTRIN_H */