blob: 3110e8babf9463af40870cf8286efa18a350e314 [file] [log] [blame]
Ying Wanga6720142011-12-20 14:43:20 -08001/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -070023
Ying Wanga6720142011-12-20 14:43:20 -080024#ifndef __XMMINTRIN_H
25#define __XMMINTRIN_H
Ying Wanga6720142011-12-20 14:43:20 -080026
27#include <mmintrin.h>
28
/* Internal element-typed views of a 16-byte vector, used for casts inside the
 * intrinsics below: int elements (__v4si) and float elements (__v4sf). */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public SSE vector type: a 16-byte vector of float, declared identically to
 * __v4sf. */
typedef float __m128 __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
35
Stephen Hines990d2fc2014-07-23 10:40:48 -070036/* This header should only be included in a hosted environment as it depends on
37 * a standard library to provide allocation routines. */
Ying Wanga6720142011-12-20 14:43:20 -080038#if __STDC_HOSTED__
39#include <mm_malloc.h>
40#endif
41
/* Define the default attributes for the functions in this file: every
 * intrinsic is marked __always_inline__ and __nodebug__, and is compiled with
 * the "sse" target feature via __target__. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
44
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -070045/// \brief Adds the 32-bit float values in the low-order bits of the operands.
46///
47/// \headerfile <x86intrin.h>
48///
49/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
50///
51/// \param __a
52/// A 128-bit vector of [4 x float] containing one of the source operands.
53/// The lower 32 bits of this operand are used in the calculation.
54/// \param __b
55/// A 128-bit vector of [4 x float] containing one of the source operands.
56/// The lower 32 bits of this operand are used in the calculation.
57/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
58/// of the lower 32 bits of both operands. The upper 96 bits are copied from
59/// the upper 96 bits of the first source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -070060static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -070061_mm_add_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -080062{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -070063 __a[0] += __b[0];
64 return __a;
Ying Wanga6720142011-12-20 14:43:20 -080065}
66
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -070067/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
68/// the addition.
69///
70/// \headerfile <x86intrin.h>
71///
72/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
73///
74/// \param __a
75/// A 128-bit vector of [4 x float] containing one of the source operands.
76/// \param __b
77/// A 128-bit vector of [4 x float] containing one of the source operands.
78/// \returns A 128-bit vector of [4 x float] containing the sums of both
79/// operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -070080static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -070081_mm_add_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -080082{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -070083 return (__m128)((__v4sf)__a + (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -080084}
85
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -070086/// \brief Subtracts the 32-bit float value in the low-order bits of the second
87/// operand from the corresponding value in the first operand.
88///
89/// \headerfile <x86intrin.h>
90///
91/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
92///
93/// \param __a
94/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
95/// of this operand are used in the calculation.
96/// \param __b
97/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
98/// bits of this operand are used in the calculation.
99/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
100/// difference of the lower 32 bits of both operands. The upper 96 bits are
101/// copied from the upper 96 bits of the first source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700102static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700103_mm_sub_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800104{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700105 __a[0] -= __b[0];
106 return __a;
Ying Wanga6720142011-12-20 14:43:20 -0800107}
108
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700109/// \brief Subtracts each of the values of the second operand from the first
110/// operand, both of which are 128-bit vectors of [4 x float] and returns
111/// the results of the subtraction.
112///
113/// \headerfile <x86intrin.h>
114///
115/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
116///
117/// \param __a
118/// A 128-bit vector of [4 x float] containing the minuend.
119/// \param __b
120/// A 128-bit vector of [4 x float] containing the subtrahend.
121/// \returns A 128-bit vector of [4 x float] containing the differences between
122/// both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700123static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700124_mm_sub_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800125{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700126 return (__m128)((__v4sf)__a - (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800127}
128
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700129/// \brief Multiplies two 32-bit float values in the low-order bits of the
130/// operands.
131///
132/// \headerfile <x86intrin.h>
133///
134/// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
135///
136/// \param __a
137/// A 128-bit vector of [4 x float] containing one of the source operands.
138/// The lower 32 bits of this operand are used in the calculation.
139/// \param __b
140/// A 128-bit vector of [4 x float] containing one of the source operands.
141/// The lower 32 bits of this operand are used in the calculation.
142/// \returns A 128-bit vector of [4 x float] containing the product of the lower
143/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
144/// bits of the first source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700145static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700146_mm_mul_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800147{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700148 __a[0] *= __b[0];
149 return __a;
Ying Wanga6720142011-12-20 14:43:20 -0800150}
151
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700152/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
153/// results of the multiplication.
154///
155/// \headerfile <x86intrin.h>
156///
157/// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
158///
159/// \param __a
160/// A 128-bit vector of [4 x float] containing one of the source operands.
161/// \param __b
162/// A 128-bit vector of [4 x float] containing one of the source operands.
163/// \returns A 128-bit vector of [4 x float] containing the products of both
164/// operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700165static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700166_mm_mul_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800167{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700168 return (__m128)((__v4sf)__a * (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800169}
170
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700171/// \brief Divides the value in the low-order 32 bits of the first operand by
172/// the corresponding value in the second operand.
173///
174/// \headerfile <x86intrin.h>
175///
176/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
177///
178/// \param __a
179/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
180/// bits of this operand are used in the calculation.
181/// \param __b
182/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
183/// of this operand are used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the quotient of the
///    lower 32 bits of both operands. The upper 96 bits are copied from the
///    upper 96 bits of the first source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700187static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700188_mm_div_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800189{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700190 __a[0] /= __b[0];
191 return __a;
Ying Wanga6720142011-12-20 14:43:20 -0800192}
193
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700194/// \brief Divides two 128-bit vectors of [4 x float].
195///
196/// \headerfile <x86intrin.h>
197///
198/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
199///
200/// \param __a
201/// A 128-bit vector of [4 x float] containing the dividend.
202/// \param __b
203/// A 128-bit vector of [4 x float] containing the divisor.
204/// \returns A 128-bit vector of [4 x float] containing the quotients of both
205/// operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700206static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700207_mm_div_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800208{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700209 return (__m128)((__v4sf)__a / (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800210}
211
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700212/// \brief Calculates the square root of the value stored in the low-order bits
213/// of a 128-bit vector of [4 x float].
214///
215/// \headerfile <x86intrin.h>
216///
217/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
218///
219/// \param __a
220/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
221/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    square root of the value in the low-order bits of the operand. The
///    upper 96 bits are copied from the upper 96 bits of the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700224static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700225_mm_sqrt_ss(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -0800226{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700227 __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700228 return (__m128) { __c[0], __a[1], __a[2], __a[3] };
Ying Wanga6720142011-12-20 14:43:20 -0800229}
230
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700231/// \brief Calculates the square roots of the values stored in a 128-bit vector
232/// of [4 x float].
233///
234/// \headerfile <x86intrin.h>
235///
236/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
237///
238/// \param __a
239/// A 128-bit vector of [4 x float].
240/// \returns A 128-bit vector of [4 x float] containing the square roots of the
241/// values in the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700242static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700243_mm_sqrt_ps(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -0800244{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700245 return __builtin_ia32_sqrtps((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800246}
247
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700248/// \brief Calculates the approximate reciprocal of the value stored in the
249/// low-order bits of a 128-bit vector of [4 x float].
250///
251/// \headerfile <x86intrin.h>
252///
253/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
254///
255/// \param __a
256/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
257/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    approximate reciprocal of the value in the low-order bits of the
///    operand. The upper 96 bits are copied from the upper 96 bits of the
///    operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700260static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700261_mm_rcp_ss(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -0800262{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700263 __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700264 return (__m128) { __c[0], __a[1], __a[2], __a[3] };
Ying Wanga6720142011-12-20 14:43:20 -0800265}
266
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700267/// \brief Calculates the approximate reciprocals of the values stored in a
268/// 128-bit vector of [4 x float].
269///
270/// \headerfile <x86intrin.h>
271///
272/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
273///
274/// \param __a
275/// A 128-bit vector of [4 x float].
276/// \returns A 128-bit vector of [4 x float] containing the approximate
277/// reciprocals of the values in the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700278static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700279_mm_rcp_ps(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -0800280{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700281 return __builtin_ia32_rcpps((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800282}
283
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700284/// \brief Calculates the approximate reciprocal of the square root of the value
285/// stored in the low-order bits of a 128-bit vector of [4 x float].
286///
287/// \headerfile <x86intrin.h>
288///
289/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
290///
291/// \param __a
292/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
293/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    approximate reciprocal of the square root of the value in the low-order
///    bits of the operand. The upper 96 bits are copied from the upper 96
///    bits of the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700297static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700298_mm_rsqrt_ss(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -0800299{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700300 __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700301 return (__m128) { __c[0], __a[1], __a[2], __a[3] };
Ying Wanga6720142011-12-20 14:43:20 -0800302}
303
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700304/// \brief Calculates the approximate reciprocals of the square roots of the
305/// values stored in a 128-bit vector of [4 x float].
306///
307/// \headerfile <x86intrin.h>
308///
309/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
310///
311/// \param __a
312/// A 128-bit vector of [4 x float].
313/// \returns A 128-bit vector of [4 x float] containing the approximate
314/// reciprocals of the square roots of the values in the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700315static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700316_mm_rsqrt_ps(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -0800317{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700318 return __builtin_ia32_rsqrtps((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800319}
320
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700321/// \brief Compares two 32-bit float values in the low-order bits of both
322/// operands and returns the lesser value in the low-order bits of the
323/// vector of [4 x float].
324///
325/// \headerfile <x86intrin.h>
326///
327/// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
328///
329/// \param __a
330/// A 128-bit vector of [4 x float] containing one of the operands. The lower
331/// 32 bits of this operand are used in the comparison.
332/// \param __b
333/// A 128-bit vector of [4 x float] containing one of the operands. The lower
334/// 32 bits of this operand are used in the comparison.
335/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
336/// minimum value between both operands. The upper 96 bits are copied from
337/// the upper 96 bits of the first source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700338static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700339_mm_min_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800340{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700341 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800342}
343
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700344/// \brief Compares two 128-bit vectors of [4 x float] and returns the
345/// lesser of each pair of values.
346///
347/// \headerfile <x86intrin.h>
348///
349/// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
350///
351/// \param __a
352/// A 128-bit vector of [4 x float] containing one of the operands.
353/// \param __b
354/// A 128-bit vector of [4 x float] containing one of the operands.
355/// \returns A 128-bit vector of [4 x float] containing the minimum values
356/// between both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700357static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700358_mm_min_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800359{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700360 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800361}
362
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700363/// \brief Compares two 32-bit float values in the low-order bits of both
///    operands and returns the greater value in the low-order bits of
///    a vector of [4 x float].
366///
367/// \headerfile <x86intrin.h>
368///
369/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
370///
371/// \param __a
372/// A 128-bit vector of [4 x float] containing one of the operands. The lower
373/// 32 bits of this operand are used in the comparison.
374/// \param __b
375/// A 128-bit vector of [4 x float] containing one of the operands. The lower
376/// 32 bits of this operand are used in the comparison.
377/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378/// maximum value between both operands. The upper 96 bits are copied from
379/// the upper 96 bits of the first source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700380static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700381_mm_max_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800382{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700383 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800384}
385
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700386/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
387/// of each pair of values.
388///
389/// \headerfile <x86intrin.h>
390///
391/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
392///
393/// \param __a
394/// A 128-bit vector of [4 x float] containing one of the operands.
395/// \param __b
396/// A 128-bit vector of [4 x float] containing one of the operands.
397/// \returns A 128-bit vector of [4 x float] containing the maximum values
398/// between both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700399static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700400_mm_max_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800401{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700402 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800403}
404
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700405/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
406///
407/// \headerfile <x86intrin.h>
408///
409/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
410///
411/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
415/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
416/// values between both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700417static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700418_mm_and_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800419{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700420 return (__m128)((__v4su)__a & (__v4su)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800421}
422
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700423/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
424/// the one's complement of the values contained in the first source
425/// operand.
426///
427/// \headerfile <x86intrin.h>
428///
429/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
430///
431/// \param __a
432/// A 128-bit vector of [4 x float] containing the first source operand. The
433/// one's complement of this value is used in the bitwise AND.
434/// \param __b
435/// A 128-bit vector of [4 x float] containing the second source operand.
436/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
437/// one's complement of the first operand and the values in the second
438/// operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700439static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700440_mm_andnot_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800441{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700442 return (__m128)(~(__v4su)__a & (__v4su)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800443}
444
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700445/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
446///
447/// \headerfile <x86intrin.h>
448///
449/// This intrinsic corresponds to the \c VORPS / ORPS instructions.
450///
451/// \param __a
452/// A 128-bit vector of [4 x float] containing one of the source operands.
453/// \param __b
454/// A 128-bit vector of [4 x float] containing one of the source operands.
455/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
456/// values between both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700457static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700458_mm_or_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800459{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700460 return (__m128)((__v4su)__a | (__v4su)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800461}
462
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700463/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
464/// [4 x float].
465///
466/// \headerfile <x86intrin.h>
467///
468/// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
469///
470/// \param __a
471/// A 128-bit vector of [4 x float] containing one of the source operands.
472/// \param __b
473/// A 128-bit vector of [4 x float] containing one of the source operands.
474/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
475/// of the values between both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700476static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700477_mm_xor_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800478{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700479 return (__m128)((__v4su)__a ^ (__v4su)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800480}
481
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700482/// \brief Compares two 32-bit float values in the low-order bits of both
483/// operands for equality and returns the result of the comparison in the
///    low-order bits of a vector of [4 x float].
485///
486/// \headerfile <x86intrin.h>
487///
488/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
489///
490/// \param __a
491/// A 128-bit vector of [4 x float] containing one of the operands. The lower
492/// 32 bits of this operand are used in the comparison.
493/// \param __b
494/// A 128-bit vector of [4 x float] containing one of the operands. The lower
495/// 32 bits of this operand are used in the comparison.
496/// \returns A 128-bit vector of [4 x float] containing the comparison results
497/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700498static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700499_mm_cmpeq_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800500{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700501 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800502}
503
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700504/// \brief Compares each of the corresponding 32-bit float values of the
505/// 128-bit vectors of [4 x float] for equality.
506///
507/// \headerfile <x86intrin.h>
508///
509/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
510///
511/// \param __a
512/// A 128-bit vector of [4 x float].
513/// \param __b
514/// A 128-bit vector of [4 x float].
515/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700516static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700517_mm_cmpeq_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800518{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700519 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800520}
521
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700522/// \brief Compares two 32-bit float values in the low-order bits of both
523/// operands to determine if the value in the first operand is less than the
524/// corresponding value in the second operand and returns the result of the
525/// comparison in the low-order bits of a vector of [4 x float].
526///
527/// \headerfile <x86intrin.h>
528///
529/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
530///
531/// \param __a
532/// A 128-bit vector of [4 x float] containing one of the operands. The lower
533/// 32 bits of this operand are used in the comparison.
534/// \param __b
535/// A 128-bit vector of [4 x float] containing one of the operands. The lower
536/// 32 bits of this operand are used in the comparison.
537/// \returns A 128-bit vector of [4 x float] containing the comparison results
538/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700539static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700540_mm_cmplt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800541{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700542 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800543}
544
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700545/// \brief Compares each of the corresponding 32-bit float values of the
546/// 128-bit vectors of [4 x float] to determine if the values in the first
547/// operand are less than those in the second operand.
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
552///
553/// \param __a
554/// A 128-bit vector of [4 x float].
555/// \param __b
556/// A 128-bit vector of [4 x float].
557/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700558static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700559_mm_cmplt_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800560{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700561 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800562}
563
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700564/// \brief Compares two 32-bit float values in the low-order bits of both
565/// operands to determine if the value in the first operand is less than or
566/// equal to the corresponding value in the second operand and returns the
567/// result of the comparison in the low-order bits of a vector of
568/// [4 x float].
569///
570/// \headerfile <x86intrin.h>
571///
572/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
573///
574/// \param __a
575/// A 128-bit vector of [4 x float] containing one of the operands. The lower
576/// 32 bits of this operand are used in the comparison.
577/// \param __b
578/// A 128-bit vector of [4 x float] containing one of the operands. The lower
579/// 32 bits of this operand are used in the comparison.
580/// \returns A 128-bit vector of [4 x float] containing the comparison results
581/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700582static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700583_mm_cmple_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800584{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700585 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800586}
587
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700588/// \brief Compares each of the corresponding 32-bit float values of the
589/// 128-bit vectors of [4 x float] to determine if the values in the first
590/// operand are less than or equal to those in the second operand.
591///
592/// \headerfile <x86intrin.h>
593///
594/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
595///
596/// \param __a
597/// A 128-bit vector of [4 x float].
598/// \param __b
599/// A 128-bit vector of [4 x float].
600/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700601static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700602_mm_cmple_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800603{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700604 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800605}
606
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700607/// \brief Compares two 32-bit float values in the low-order bits of both
608/// operands to determine if the value in the first operand is greater than
609/// the corresponding value in the second operand and returns the result of
610/// the comparison in the low-order bits of a vector of [4 x float].
611///
612/// \headerfile <x86intrin.h>
613///
614/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
615///
616/// \param __a
617/// A 128-bit vector of [4 x float] containing one of the operands. The lower
618/// 32 bits of this operand are used in the comparison.
619/// \param __b
620/// A 128-bit vector of [4 x float] containing one of the operands. The lower
621/// 32 bits of this operand are used in the comparison.
622/// \returns A 128-bit vector of [4 x float] containing the comparison results
623/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700624static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700625_mm_cmpgt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800626{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700627 return (__m128)__builtin_shufflevector((__v4sf)__a,
628 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
Stephen Hines996e4dc2013-08-13 01:04:14 -0700629 4, 1, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -0800630}
631
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700632/// \brief Compares each of the corresponding 32-bit float values of the
633/// 128-bit vectors of [4 x float] to determine if the values in the first
634/// operand are greater than those in the second operand.
635///
636/// \headerfile <x86intrin.h>
637///
638/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
639///
640/// \param __a
641/// A 128-bit vector of [4 x float].
642/// \param __b
643/// A 128-bit vector of [4 x float].
644/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700645static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700646_mm_cmpgt_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800647{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700648 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800649}
650
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700651/// \brief Compares two 32-bit float values in the low-order bits of both
652/// operands to determine if the value in the first operand is greater than
653/// or equal to the corresponding value in the second operand and returns
654/// the result of the comparison in the low-order bits of a vector of
655/// [4 x float].
656///
657/// \headerfile <x86intrin.h>
658///
659/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
660///
661/// \param __a
662/// A 128-bit vector of [4 x float] containing one of the operands. The lower
663/// 32 bits of this operand are used in the comparison.
664/// \param __b
665/// A 128-bit vector of [4 x float] containing one of the operands. The lower
666/// 32 bits of this operand are used in the comparison.
667/// \returns A 128-bit vector of [4 x float] containing the comparison results
668/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700669static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700670_mm_cmpge_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800671{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700672 return (__m128)__builtin_shufflevector((__v4sf)__a,
673 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
Stephen Hines996e4dc2013-08-13 01:04:14 -0700674 4, 1, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -0800675}
676
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700677/// \brief Compares each of the corresponding 32-bit float values of the
678/// 128-bit vectors of [4 x float] to determine if the values in the first
679/// operand are greater than or equal to those in the second operand.
680///
681/// \headerfile <x86intrin.h>
682///
683/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
684///
685/// \param __a
686/// A 128-bit vector of [4 x float].
687/// \param __b
688/// A 128-bit vector of [4 x float].
689/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700690static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700691_mm_cmpge_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800692{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700693 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800694}
695
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700696/// \brief Compares two 32-bit float values in the low-order bits of both
697/// operands for inequality and returns the result of the comparison in the
698/// low-order bits of a vector of [4 x float].
699///
700/// \headerfile <x86intrin.h>
701///
702/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
703///
704/// \param __a
705/// A 128-bit vector of [4 x float] containing one of the operands. The lower
706/// 32 bits of this operand are used in the comparison.
707/// \param __b
708/// A 128-bit vector of [4 x float] containing one of the operands. The lower
709/// 32 bits of this operand are used in the comparison.
710/// \returns A 128-bit vector of [4 x float] containing the comparison results
711/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700712static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700713_mm_cmpneq_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800714{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700715 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800716}
717
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700718/// \brief Compares each of the corresponding 32-bit float values of the
719/// 128-bit vectors of [4 x float] for inequality.
720///
721/// \headerfile <x86intrin.h>
722///
723/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
724///
725/// \param __a
726/// A 128-bit vector of [4 x float].
727/// \param __b
728/// A 128-bit vector of [4 x float].
729/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700730static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700731_mm_cmpneq_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800732{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700733 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800734}
735
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700736/// \brief Compares two 32-bit float values in the low-order bits of both
737/// operands to determine if the value in the first operand is not less than
738/// the corresponding value in the second operand and returns the result of
739/// the comparison in the low-order bits of a vector of [4 x float].
740///
741/// \headerfile <x86intrin.h>
742///
743/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
744///
745/// \param __a
746/// A 128-bit vector of [4 x float] containing one of the operands. The lower
747/// 32 bits of this operand are used in the comparison.
748/// \param __b
749/// A 128-bit vector of [4 x float] containing one of the operands. The lower
750/// 32 bits of this operand are used in the comparison.
751/// \returns A 128-bit vector of [4 x float] containing the comparison results
752/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700753static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700754_mm_cmpnlt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800755{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700756 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800757}
758
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700759/// \brief Compares each of the corresponding 32-bit float values of the
760/// 128-bit vectors of [4 x float] to determine if the values in the first
761/// operand are not less than those in the second operand.
762///
763/// \headerfile <x86intrin.h>
764///
765/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
766///
767/// \param __a
768/// A 128-bit vector of [4 x float].
769/// \param __b
770/// A 128-bit vector of [4 x float].
771/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700772static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700773_mm_cmpnlt_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800774{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700775 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800776}
777
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700778/// \brief Compares two 32-bit float values in the low-order bits of both
779/// operands to determine if the value in the first operand is not less than
780/// or equal to the corresponding value in the second operand and returns
781/// the result of the comparison in the low-order bits of a vector of
782/// [4 x float].
783///
784/// \headerfile <x86intrin.h>
785///
786/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
787///
788/// \param __a
789/// A 128-bit vector of [4 x float] containing one of the operands. The lower
790/// 32 bits of this operand are used in the comparison.
791/// \param __b
792/// A 128-bit vector of [4 x float] containing one of the operands. The lower
793/// 32 bits of this operand are used in the comparison.
794/// \returns A 128-bit vector of [4 x float] containing the comparison results
795/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700796static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700797_mm_cmpnle_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800798{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700799 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800800}
801
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700802/// \brief Compares each of the corresponding 32-bit float values of the
803/// 128-bit vectors of [4 x float] to determine if the values in the first
804/// operand are not less than or equal to those in the second operand.
805///
806/// \headerfile <x86intrin.h>
807///
808/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
809///
810/// \param __a
811/// A 128-bit vector of [4 x float].
812/// \param __b
813/// A 128-bit vector of [4 x float].
814/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700815static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700816_mm_cmpnle_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800817{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700818 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800819}
820
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700821/// \brief Compares two 32-bit float values in the low-order bits of both
822/// operands to determine if the value in the first operand is not greater
823/// than the corresponding value in the second operand and returns the
824/// result of the comparison in the low-order bits of a vector of
825/// [4 x float].
826///
827/// \headerfile <x86intrin.h>
828///
829/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
830///
831/// \param __a
832/// A 128-bit vector of [4 x float] containing one of the operands. The lower
833/// 32 bits of this operand are used in the comparison.
834/// \param __b
835/// A 128-bit vector of [4 x float] containing one of the operands. The lower
836/// 32 bits of this operand are used in the comparison.
837/// \returns A 128-bit vector of [4 x float] containing the comparison results
838/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700839static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700840_mm_cmpngt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800841{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700842 return (__m128)__builtin_shufflevector((__v4sf)__a,
843 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
Stephen Hines996e4dc2013-08-13 01:04:14 -0700844 4, 1, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -0800845}
846
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700847/// \brief Compares each of the corresponding 32-bit float values of the
848/// 128-bit vectors of [4 x float] to determine if the values in the first
849/// operand are not greater than those in the second operand.
850///
851/// \headerfile <x86intrin.h>
852///
853/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
854///
855/// \param __a
856/// A 128-bit vector of [4 x float].
857/// \param __b
858/// A 128-bit vector of [4 x float].
859/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700860static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700861_mm_cmpngt_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800862{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700863 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800864}
865
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700866/// \brief Compares two 32-bit float values in the low-order bits of both
867/// operands to determine if the value in the first operand is not greater
868/// than or equal to the corresponding value in the second operand and
869/// returns the result of the comparison in the low-order bits of a vector
870/// of [4 x float].
871///
872/// \headerfile <x86intrin.h>
873///
874/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
875///
876/// \param __a
877/// A 128-bit vector of [4 x float] containing one of the operands. The lower
878/// 32 bits of this operand are used in the comparison.
879/// \param __b
880/// A 128-bit vector of [4 x float] containing one of the operands. The lower
881/// 32 bits of this operand are used in the comparison.
882/// \returns A 128-bit vector of [4 x float] containing the comparison results
883/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700884static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700885_mm_cmpnge_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800886{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700887 return (__m128)__builtin_shufflevector((__v4sf)__a,
888 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
Stephen Hines996e4dc2013-08-13 01:04:14 -0700889 4, 1, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -0800890}
891
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700892/// \brief Compares each of the corresponding 32-bit float values of the
893/// 128-bit vectors of [4 x float] to determine if the values in the first
894/// operand are not greater than or equal to those in the second operand.
895///
896/// \headerfile <x86intrin.h>
897///
898/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
899///
900/// \param __a
901/// A 128-bit vector of [4 x float].
902/// \param __b
903/// A 128-bit vector of [4 x float].
904/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700905static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700906_mm_cmpnge_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800907{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700908 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -0800909}
910
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700911/// \brief Compares two 32-bit float values in the low-order bits of both
912/// operands to determine if the value in the first operand is ordered with
913/// respect to the corresponding value in the second operand and returns the
914/// result of the comparison in the low-order bits of a vector of
915/// [4 x float].
916///
917/// \headerfile <x86intrin.h>
918///
919/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
920///
921/// \param __a
922/// A 128-bit vector of [4 x float] containing one of the operands. The lower
923/// 32 bits of this operand are used in the comparison.
924/// \param __b
925/// A 128-bit vector of [4 x float] containing one of the operands. The lower
926/// 32 bits of this operand are used in the comparison.
927/// \returns A 128-bit vector of [4 x float] containing the comparison results
928/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700929static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700930_mm_cmpord_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800931{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700932 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800933}
934
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700935/// \brief Compares each of the corresponding 32-bit float values of the
936/// 128-bit vectors of [4 x float] to determine if the values in the first
937/// operand are ordered with respect to those in the second operand.
938///
939/// \headerfile <x86intrin.h>
940///
941/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
942///
943/// \param __a
944/// A 128-bit vector of [4 x float].
945/// \param __b
946/// A 128-bit vector of [4 x float].
947/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700948static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700949_mm_cmpord_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800950{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700951 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800952}
953
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700954/// \brief Compares two 32-bit float values in the low-order bits of both
955/// operands to determine if the value in the first operand is unordered
956/// with respect to the corresponding value in the second operand and
957/// returns the result of the comparison in the low-order bits of a vector
958/// of [4 x float].
959///
960/// \headerfile <x86intrin.h>
961///
962/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
963///
964/// \param __a
965/// A 128-bit vector of [4 x float] containing one of the operands. The lower
966/// 32 bits of this operand are used in the comparison.
967/// \param __b
968/// A 128-bit vector of [4 x float] containing one of the operands. The lower
969/// 32 bits of this operand are used in the comparison.
970/// \returns A 128-bit vector of [4 x float] containing the comparison results
971/// in the low-order bits.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700972static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700973_mm_cmpunord_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800974{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700975 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800976}
977
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700978/// \brief Compares each of the corresponding 32-bit float values of the
979/// 128-bit vectors of [4 x float] to determine if the values in the first
980/// operand are unordered with respect to those in the second operand.
981///
982/// \headerfile <x86intrin.h>
983///
984/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
985///
986/// \param __a
987/// A 128-bit vector of [4 x float].
988/// \param __b
989/// A 128-bit vector of [4 x float].
990/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -0700991static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -0700992_mm_cmpunord_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -0800993{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700994 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -0800995}
996
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -0700997/// \brief Compares two 32-bit float values in the low-order bits of both
998/// operands for equality and returns the result of the comparison.
999///
1000/// \headerfile <x86intrin.h>
1001///
1002/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1003///
1004/// \param __a
1005/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1006/// used in the comparison.
1007/// \param __b
1008/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009/// used in the comparison.
1010/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001011static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001012_mm_comieq_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001013{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001014 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001015}
1016
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001017/// \brief Compares two 32-bit float values in the low-order bits of both
1018/// operands to determine if the first operand is less than the second
1019/// operand and returns the result of the comparison.
1020///
1021/// \headerfile <x86intrin.h>
1022///
1023/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1024///
1025/// \param __a
1026/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1027/// used in the comparison.
1028/// \param __b
1029/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1030/// used in the comparison.
1031/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001032static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001033_mm_comilt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001034{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001035 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001036}
1037
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001038/// \brief Compares two 32-bit float values in the low-order bits of both
1039/// operands to determine if the first operand is less than or equal to the
1040/// second operand and returns the result of the comparison.
1041///
1042/// \headerfile <x86intrin.h>
1043///
1044/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1045///
1046/// \param __a
1047/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1048/// used in the comparison.
1049/// \param __b
1050/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1051/// used in the comparison.
1052/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001053static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001054_mm_comile_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001055{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001056 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001057}
1058
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001059/// \brief Compares two 32-bit float values in the low-order bits of both
1060/// operands to determine if the first operand is greater than the second
1061/// operand and returns the result of the comparison.
1062///
1063/// \headerfile <x86intrin.h>
1064///
1065/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1066///
1067/// \param __a
1068/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1069/// used in the comparison.
1070/// \param __b
1071/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1072/// used in the comparison.
1073/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001074static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001075_mm_comigt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001076{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001077 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001078}
1079
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001080/// \brief Compares two 32-bit float values in the low-order bits of both
1081/// operands to determine if the first operand is greater than or equal to
1082/// the second operand and returns the result of the comparison.
1083///
1084/// \headerfile <x86intrin.h>
1085///
1086/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1087///
1088/// \param __a
1089/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1090/// used in the comparison.
1091/// \param __b
1092/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093/// used in the comparison.
1094/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001095static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001096_mm_comige_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001097{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001098 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001099}
1100
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001101/// \brief Compares two 32-bit float values in the low-order bits of both
1102/// operands to determine if the first operand is not equal to the second
1103/// operand and returns the result of the comparison.
1104///
1105/// \headerfile <x86intrin.h>
1106///
1107/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1108///
1109/// \param __a
1110/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1111/// used in the comparison.
1112/// \param __b
1113/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114/// used in the comparison.
1115/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001116static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001117_mm_comineq_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001118{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001119 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001120}
1121
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001122/// \brief Performs an unordered comparison of two 32-bit float values using
1123/// the low-order bits of both operands to determine equality and returns
1124/// the result of the comparison.
1125///
1126/// \headerfile <x86intrin.h>
1127///
1128/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1129///
1130/// \param __a
1131/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1132/// used in the comparison.
1133/// \param __b
1134/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135/// used in the comparison.
1136/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001137static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001138_mm_ucomieq_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001139{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001140 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001141}
1142
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001143/// \brief Performs an unordered comparison of two 32-bit float values using
1144/// the low-order bits of both operands to determine if the first operand is
1145/// less than the second operand and returns the result of the comparison.
1146///
1147/// \headerfile <x86intrin.h>
1148///
1149/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1150///
1151/// \param __a
1152/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1153/// used in the comparison.
1154/// \param __b
1155/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1156/// used in the comparison.
1157/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001158static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001159_mm_ucomilt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001160{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001161 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001162}
1163
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001164/// \brief Performs an unordered comparison of two 32-bit float values using
1165/// the low-order bits of both operands to determine if the first operand
1166/// is less than or equal to the second operand and returns the result of
1167/// the comparison.
1168///
1169/// \headerfile <x86intrin.h>
1170///
1171/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1172///
1173/// \param __a
1174/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1175/// used in the comparison.
1176/// \param __b
1177/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178/// used in the comparison.
1179/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001180static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001181_mm_ucomile_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001182{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001183 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001184}
1185
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001186/// \brief Performs an unordered comparison of two 32-bit float values using
1187/// the low-order bits of both operands to determine if the first operand
1188/// is greater than the second operand and returns the result of the
1189/// comparison.
1190///
1191/// \headerfile <x86intrin.h>
1192///
1193/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1194///
1195/// \param __a
1196/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1197/// used in the comparison.
1198/// \param __b
1199/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1200/// used in the comparison.
1201/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001202static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001203_mm_ucomigt_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001204{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001205 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001206}
1207
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001208/// \brief Performs an unordered comparison of two 32-bit float values using
1209/// the low-order bits of both operands to determine if the first operand is
1210/// greater than or equal to the second operand and returns the result of
1211/// the comparison.
1212///
1213/// \headerfile <x86intrin.h>
1214///
1215/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1216///
1217/// \param __a
1218/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1219/// used in the comparison.
1220/// \param __b
1221/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1222/// used in the comparison.
1223/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001224static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001225_mm_ucomige_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001226{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001227 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001228}
1229
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001230/// \brief Performs an unordered comparison of two 32-bit float values using
1231/// the low-order bits of both operands to determine inequality and returns
1232/// the result of the comparison.
1233///
1234/// \headerfile <x86intrin.h>
1235///
1236/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1237///
1238/// \param __a
1239/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240/// used in the comparison.
1241/// \param __b
1242/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1243/// used in the comparison.
1244/// \returns An integer containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001245static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001246_mm_ucomineq_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001247{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001248 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001249}
1250
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001251/// \brief Converts a float value contained in the lower 32 bits of a vector of
1252/// [4 x float] into a 32-bit integer.
1253///
1254/// \headerfile <x86intrin.h>
1255///
1256/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1257///
1258/// \param __a
1259/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1260/// used in the conversion.
1261/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001262static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001263_mm_cvtss_si32(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001264{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001265 return __builtin_ia32_cvtss2si((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08001266}
1267
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001268/// \brief Converts a float value contained in the lower 32 bits of a vector of
1269/// [4 x float] into a 32-bit integer.
1270///
1271/// \headerfile <x86intrin.h>
1272///
1273/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1274///
1275/// \param __a
1276/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277/// used in the conversion.
1278/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001279static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001280_mm_cvt_ss2si(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001281{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001282 return _mm_cvtss_si32(__a);
Ying Wanga6720142011-12-20 14:43:20 -08001283}
1284
1285#ifdef __x86_64__
1286
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001287/// \brief Converts a float value contained in the lower 32 bits of a vector of
1288/// [4 x float] into a 64-bit integer.
1289///
1290/// \headerfile <x86intrin.h>
1291///
1292/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1293///
1294/// \param __a
1295/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1296/// used in the conversion.
1297/// \returns A 64-bit integer containing the converted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001298static __inline__ long long __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001299_mm_cvtss_si64(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001300{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001301 return __builtin_ia32_cvtss2si64((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08001302}
1303
1304#endif
1305
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001306/// \brief Converts two low-order float values in a 128-bit vector of
1307/// [4 x float] into a 64-bit vector of [2 x i32].
1308///
1309/// \headerfile <x86intrin.h>
1310///
1311/// This intrinsic corresponds to the \c CVTPS2PI instruction.
1312///
1313/// \param __a
1314/// A 128-bit vector of [4 x float].
1315/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001316static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001317_mm_cvtps_pi32(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001318{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001319 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08001320}
1321
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001322/// \brief Converts two low-order float values in a 128-bit vector of
1323/// [4 x float] into a 64-bit vector of [2 x i32].
1324///
1325/// \headerfile <x86intrin.h>
1326///
1327/// This intrinsic corresponds to the \c CVTPS2PI instruction.
1328///
1329/// \param __a
1330/// A 128-bit vector of [4 x float].
1331/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001332static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001333_mm_cvt_ps2pi(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001334{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001335 return _mm_cvtps_pi32(__a);
Ying Wanga6720142011-12-20 14:43:20 -08001336}
1337
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001338/// \brief Converts a float value contained in the lower 32 bits of a vector of
1339/// [4 x float] into a 32-bit integer, truncating the result when it is
1340/// inexact.
1341///
1342/// \headerfile <x86intrin.h>
1343///
1344/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1345///
1346/// \param __a
1347/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1348/// used in the conversion.
1349/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001350static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001351_mm_cvttss_si32(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001352{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001353 return __a[0];
Ying Wanga6720142011-12-20 14:43:20 -08001354}
1355
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001356/// \brief Converts a float value contained in the lower 32 bits of a vector of
1357/// [4 x float] into a 32-bit integer, truncating the result when it is
1358/// inexact.
1359///
1360/// \headerfile <x86intrin.h>
1361///
1362/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1363///
1364/// \param __a
1365/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1366/// used in the conversion.
1367/// \returns A 32-bit integer containing the converted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001368static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001369_mm_cvtt_ss2si(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001370{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001371 return _mm_cvttss_si32(__a);
Ying Wanga6720142011-12-20 14:43:20 -08001372}
1373
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001374/// \brief Converts a float value contained in the lower 32 bits of a vector of
1375/// [4 x float] into a 64-bit integer, truncating the result when it is
1376/// inexact.
1377///
1378/// \headerfile <x86intrin.h>
1379///
1380/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1381///
1382/// \param __a
1383/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1384/// used in the conversion.
1385/// \returns A 64-bit integer containing the converted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001386static __inline__ long long __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001387_mm_cvttss_si64(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001388{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001389 return __a[0];
Ying Wanga6720142011-12-20 14:43:20 -08001390}
1391
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001392/// \brief Converts two low-order float values in a 128-bit vector of
1393/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1394/// when it is inexact.
1395///
1396/// \headerfile <x86intrin.h>
1397///
1398/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions.
1399///
1400/// \param __a
1401/// A 128-bit vector of [4 x float].
1402/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001403static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001404_mm_cvttps_pi32(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001405{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001406 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08001407}
1408
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001409/// \brief Converts two low-order float values in a 128-bit vector of [4 x
1410/// float] into a 64-bit vector of [2 x i32], truncating the result when it
1411/// is inexact.
1412///
1413/// \headerfile <x86intrin.h>
1414///
1415/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
1416///
1417/// \param __a
1418/// A 128-bit vector of [4 x float].
1419/// \returns A 64-bit integer vector containing the converted values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001420static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001421_mm_cvtt_ps2pi(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001422{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001423 return _mm_cvttps_pi32(__a);
Ying Wanga6720142011-12-20 14:43:20 -08001424}
1425
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001426/// \brief Converts a 32-bit signed integer value into a floating point value
1427/// and writes it to the lower 32 bits of the destination. The remaining
1428/// higher order elements of the destination vector are copied from the
1429/// corresponding elements in the first operand.
1430///
1431/// \headerfile <x86intrin.h>
1432///
1433/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1434///
1435/// \param __a
1436/// A 128-bit vector of [4 x float].
1437/// \param __b
1438/// A 32-bit signed integer operand containing the value to be converted.
1439/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1440/// converted value of the second operand. The upper 96 bits are copied from
1441/// the upper 96 bits of the first operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001442static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001443_mm_cvtsi32_ss(__m128 __a, int __b)
Ying Wanga6720142011-12-20 14:43:20 -08001444{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001445 __a[0] = __b;
1446 return __a;
Ying Wanga6720142011-12-20 14:43:20 -08001447}
1448
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001449/// \brief Converts a 32-bit signed integer value into a floating point value
1450/// and writes it to the lower 32 bits of the destination. The remaining
1451/// higher order elements of the destination are copied from the
1452/// corresponding elements in the first operand.
1453///
1454/// \headerfile <x86intrin.h>
1455///
1456/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1457///
1458/// \param __a
1459/// A 128-bit vector of [4 x float].
1460/// \param __b
1461/// A 32-bit signed integer operand containing the value to be converted.
1462/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1463/// converted value of the second operand. The upper 96 bits are copied from
1464/// the upper 96 bits of the first operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001465static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001466_mm_cvt_si2ss(__m128 __a, int __b)
Ying Wanga6720142011-12-20 14:43:20 -08001467{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001468 return _mm_cvtsi32_ss(__a, __b);
Ying Wanga6720142011-12-20 14:43:20 -08001469}
1470
1471#ifdef __x86_64__
1472
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001473/// \brief Converts a 64-bit signed integer value into a floating point value
1474/// and writes it to the lower 32 bits of the destination. The remaining
1475/// higher order elements of the destination are copied from the
1476/// corresponding elements in the first operand.
1477///
1478/// \headerfile <x86intrin.h>
1479///
1480/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instruction.
1481///
1482/// \param __a
1483/// A 128-bit vector of [4 x float].
1484/// \param __b
1485/// A 64-bit signed integer operand containing the value to be converted.
1486/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487/// converted value of the second operand. The upper 96 bits are copied from
1488/// the upper 96 bits of the first operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001489static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001490_mm_cvtsi64_ss(__m128 __a, long long __b)
Ying Wanga6720142011-12-20 14:43:20 -08001491{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001492 __a[0] = __b;
1493 return __a;
Ying Wanga6720142011-12-20 14:43:20 -08001494}
1495
1496#endif
1497
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001498/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1499/// floating point values and writes them to the lower 64-bits of the
1500/// destination. The remaining higher order elements of the destination are
1501/// copied from the corresponding elements in the first operand.
1502///
1503/// \headerfile <x86intrin.h>
1504///
1505/// This intrinsic corresponds to the \c CVTPI2PS instruction.
1506///
1507/// \param __a
1508/// A 128-bit vector of [4 x float].
1509/// \param __b
1510/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1511/// and written to the corresponding low-order elements in the destination.
1512/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1513/// converted value of the second operand. The upper 64 bits are copied from
1514/// the upper 64 bits of the first operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001515static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001516_mm_cvtpi32_ps(__m128 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001517{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001518 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
Ying Wanga6720142011-12-20 14:43:20 -08001519}
1520
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001521/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1522/// floating point values and writes them to the lower 64-bits of the
1523/// destination. The remaining higher order elements of the destination are
1524/// copied from the corresponding elements in the first operand.
1525///
1526/// \headerfile <x86intrin.h>
1527///
1528/// This intrinsic corresponds to the \c CVTPI2PS instruction.
1529///
1530/// \param __a
1531/// A 128-bit vector of [4 x float].
1532/// \param __b
1533/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1534/// and written to the corresponding low-order elements in the destination.
1535/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1536/// converted value from the second operand. The upper 64 bits are copied
1537/// from the upper 64 bits of the first operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001538static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001539_mm_cvt_pi2ps(__m128 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08001540{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001541 return _mm_cvtpi32_ps(__a, __b);
Ying Wanga6720142011-12-20 14:43:20 -08001542}
1543
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001544/// \brief Extracts a float value contained in the lower 32 bits of a vector of
1545/// [4 x float].
1546///
1547/// \headerfile <x86intrin.h>
1548///
1549/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1550///
1551/// \param __a
1552/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553/// used in the extraction.
1554/// \returns A 32-bit float containing the extracted value.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001555static __inline__ float __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001556_mm_cvtss_f32(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001557{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001558 return __a[0];
Ying Wanga6720142011-12-20 14:43:20 -08001559}
1560
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001561/// \brief Loads two packed float values from the address __p into the
1562/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1563/// are copied from the low-order bits of the first operand.
1564///
1565/// \headerfile <x86intrin.h>
1566///
1567/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
1568///
1569/// \param __a
1570/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1571/// of the destination.
1572/// \param __p
1573/// A pointer to two packed float values. Bits [63:0] are written to bits
1574/// [127:64] of the destination.
1575/// \returns A 128-bit vector of [4 x float] containing the moved values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001576static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001577_mm_loadh_pi(__m128 __a, const __m64 *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001578{
1579 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1580 struct __mm_loadh_pi_struct {
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001581 __mm_loadh_pi_v2f32 __u;
Ying Wanga6720142011-12-20 14:43:20 -08001582 } __attribute__((__packed__, __may_alias__));
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001583 __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1584 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1585 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
Ying Wanga6720142011-12-20 14:43:20 -08001586}
1587
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001588/// \brief Loads two packed float values from the address __p into the low-order
1589/// bits of a 128-bit vector of [4 x float]. The high-order bits are copied
1590/// from the high-order bits of the first operand.
1591///
1592/// \headerfile <x86intrin.h>
1593///
1594/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
1595///
1596/// \param __a
1597/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1598/// [127:64] of the destination.
1599/// \param __p
1600/// A pointer to two packed float values. Bits [63:0] are written to bits
1601/// [63:0] of the destination.
1602/// \returns A 128-bit vector of [4 x float] containing the moved values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001603static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001604_mm_loadl_pi(__m128 __a, const __m64 *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001605{
1606 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1607 struct __mm_loadl_pi_struct {
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001608 __mm_loadl_pi_v2f32 __u;
Ying Wanga6720142011-12-20 14:43:20 -08001609 } __attribute__((__packed__, __may_alias__));
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001610 __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1611 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1612 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -08001613}
1614
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001615/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1616/// 32 bits of the vector are initialized with the single-precision
1617/// floating-point value loaded from a specified memory location. The upper
1618/// 96 bits are set to zero.
1619///
1620/// \headerfile <x86intrin.h>
1621///
1622/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1623///
1624/// \param __p
1625/// A pointer to a 32-bit memory location containing a single-precision
1626/// floating-point value.
1627/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1628/// lower 32 bits contain the value loaded from the memory location. The
1629/// upper 96 bits are set to zero.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001630static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001631_mm_load_ss(const float *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001632{
1633 struct __mm_load_ss_struct {
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001634 float __u;
Ying Wanga6720142011-12-20 14:43:20 -08001635 } __attribute__((__packed__, __may_alias__));
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001636 float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1637 return (__m128){ __u, 0, 0, 0 };
Ying Wanga6720142011-12-20 14:43:20 -08001638}
1639
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001640/// \brief Loads a 32-bit float value and duplicates it to all four vector
1641/// elements of a 128-bit vector of [4 x float].
1642///
1643/// \headerfile <x86intrin.h>
1644///
1645/// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
1646/// instruction.
1647///
1648/// \param __p
1649/// A pointer to a float value to be loaded and duplicated.
1650/// \returns A 128-bit vector of [4 x float] containing the loaded
1651/// and duplicated values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001652static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001653_mm_load1_ps(const float *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001654{
1655 struct __mm_load1_ps_struct {
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001656 float __u;
Ying Wanga6720142011-12-20 14:43:20 -08001657 } __attribute__((__packed__, __may_alias__));
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001658 float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1659 return (__m128){ __u, __u, __u, __u };
Ying Wanga6720142011-12-20 14:43:20 -08001660}
1661
/* Alternative spelling of _mm_load1_ps(). */
#define _mm_load_ps1(p) _mm_load1_ps((p))
1663
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001664/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
1665/// memory location.
1666///
1667/// \headerfile <x86intrin.h>
1668///
1669/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1670///
1671/// \param __p
1672/// A pointer to a 128-bit memory location. The address of the memory
1673/// location has to be 128-bit aligned.
1674/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001675static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001676_mm_load_ps(const float *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001677{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001678 return *(__m128*)__p;
Ying Wanga6720142011-12-20 14:43:20 -08001679}
1680
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001681/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
1682/// unaligned memory location.
1683///
1684/// \headerfile <x86intrin.h>
1685///
1686/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1687///
1688/// \param __p
1689/// A pointer to a 128-bit memory location. The address of the memory
1690/// location does not have to be aligned.
1691/// \returns A 128-bit vector of [4 x float] containing the loaded values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001692static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001693_mm_loadu_ps(const float *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001694{
1695 struct __loadu_ps {
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001696 __m128 __v;
Ying Wanga6720142011-12-20 14:43:20 -08001697 } __attribute__((__packed__, __may_alias__));
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001698 return ((struct __loadu_ps*)__p)->__v;
Ying Wanga6720142011-12-20 14:43:20 -08001699}
1700
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001701/// \brief Loads four packed float values, in reverse order, from an aligned
1702/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1703///
1704/// \headerfile <x86intrin.h>
1705///
1706/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
1707/// instruction.
1708///
1709/// \param __p
1710/// A pointer to a 128-bit memory location. The address of the memory
1711/// location has to be 128-bit aligned.
1712/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1713/// in reverse order.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001714static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001715_mm_loadr_ps(const float *__p)
Ying Wanga6720142011-12-20 14:43:20 -08001716{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001717 __m128 __a = _mm_load_ps(__p);
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001718 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
Ying Wanga6720142011-12-20 14:43:20 -08001719}
1720
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001721/// \brief Create a 128-bit vector of [4 x float] with undefined values.
1722///
1723/// \headerfile <x86intrin.h>
1724///
1725/// This intrinsic has no corresponding instruction.
1726///
1727/// \returns A 128-bit vector of [4 x float] containing undefined values.
1728
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001729static __inline__ __m128 __DEFAULT_FN_ATTRS
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001730_mm_undefined_ps(void)
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001731{
1732 return (__m128)__builtin_ia32_undef128();
1733}
1734
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001735/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1736/// 32 bits of the vector are initialized with the specified single-precision
1737/// floating-point value. The upper 96 bits are set to zero.
1738///
1739/// \headerfile <x86intrin.h>
1740///
1741/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1742///
1743/// \param __w
1744/// A single-precision floating-point value used to initialize the lower 32
1745/// bits of the result.
1746/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1747/// lower 32 bits contain the value provided in the source operand. The
1748/// upper 96 bits are set to zero.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001749static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001750_mm_set_ss(float __w)
Ying Wanga6720142011-12-20 14:43:20 -08001751{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001752 return (__m128){ __w, 0, 0, 0 };
Ying Wanga6720142011-12-20 14:43:20 -08001753}
1754
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001755/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1756/// of the four single-precision floating-point vector elements set to the
1757/// specified single-precision floating-point value.
1758///
1759/// \headerfile <x86intrin.h>
1760///
1761/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1762///
1763/// \param __w
1764/// A single-precision floating-point value used to initialize each vector
1765/// element of the result.
1766/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001767static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001768_mm_set1_ps(float __w)
Ying Wanga6720142011-12-20 14:43:20 -08001769{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001770 return (__m128){ __w, __w, __w, __w };
Ying Wanga6720142011-12-20 14:43:20 -08001771}
1772
Stephen Hines990d2fc2014-07-23 10:40:48 -07001773/* Microsoft specific. */
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001774/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1775/// of the four single-precision floating-point vector elements set to the
1776/// specified single-precision floating-point value.
1777///
1778/// \headerfile <x86intrin.h>
1779///
1780/// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1781///
1782/// \param __w
1783/// A single-precision floating-point value used to initialize each vector
1784/// element of the result.
1785/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001786static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001787_mm_set_ps1(float __w)
Ying Wanga6720142011-12-20 14:43:20 -08001788{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001789 return _mm_set1_ps(__w);
Ying Wanga6720142011-12-20 14:43:20 -08001790}
1791
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001792/// \brief Constructs a 128-bit floating-point vector of [4 x float]
1793/// initialized with the specified single-precision floating-point values.
1794///
1795/// \headerfile <x86intrin.h>
1796///
1797/// This intrinsic is a utility function and does not correspond to a specific
1798/// instruction.
1799///
1800/// \param __z
1801/// A single-precision floating-point value used to initialize bits [127:96]
1802/// of the result.
1803/// \param __y
1804/// A single-precision floating-point value used to initialize bits [95:64]
1805/// of the result.
1806/// \param __x
1807/// A single-precision floating-point value used to initialize bits [63:32]
1808/// of the result.
1809/// \param __w
1810/// A single-precision floating-point value used to initialize bits [31:0]
1811/// of the result.
1812/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001813static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001814_mm_set_ps(float __z, float __y, float __x, float __w)
Ying Wanga6720142011-12-20 14:43:20 -08001815{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001816 return (__m128){ __w, __x, __y, __z };
Ying Wanga6720142011-12-20 14:43:20 -08001817}
1818
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001819/// \brief Constructs a 128-bit floating-point vector of [4 x float],
1820/// initialized in reverse order with the specified 32-bit single-precision
1821/// float-point values.
1822///
1823/// \headerfile <x86intrin.h>
1824///
1825/// This intrinsic is a utility function and does not correspond to a specific
1826/// instruction.
1827///
1828/// \param __z
1829/// A single-precision floating-point value used to initialize bits [31:0]
1830/// of the result.
1831/// \param __y
1832/// A single-precision floating-point value used to initialize bits [63:32]
1833/// of the result.
1834/// \param __x
1835/// A single-precision floating-point value used to initialize bits [95:64]
1836/// of the result.
1837/// \param __w
1838/// A single-precision floating-point value used to initialize bits [127:96]
1839/// of the result.
1840/// \returns An initialized 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001841static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001842_mm_setr_ps(float __z, float __y, float __x, float __w)
Ying Wanga6720142011-12-20 14:43:20 -08001843{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001844 return (__m128){ __z, __y, __x, __w };
Ying Wanga6720142011-12-20 14:43:20 -08001845}
1846
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001847/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
1848/// to zero.
1849///
1850/// \headerfile <x86intrin.h>
1851///
1852/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
1853///
1854/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1855/// all elements set to zero.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001856static __inline__ __m128 __DEFAULT_FN_ATTRS
Ying Wanga6720142011-12-20 14:43:20 -08001857_mm_setzero_ps(void)
1858{
1859 return (__m128){ 0, 0, 0, 0 };
1860}
1861
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001862/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1863/// memory location.
1864///
1865/// \headerfile <x86intrin.h>
1866///
1867/// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
1868///
1869/// \param __p
1870/// A pointer to a 64-bit memory location.
1871/// \param __a
1872/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001873static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001874_mm_storeh_pi(__m64 *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001875{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001876 __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08001877}
1878
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001879/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1880/// memory location.
1881///
1882/// \headerfile <x86intrin.h>
1883///
1884/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
1885///
1886/// \param __p
1887/// A pointer to a memory location that will receive the float values.
1888/// \param __a
1889/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001890static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001891_mm_storel_pi(__m64 *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001892{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001893 __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08001894}
1895
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001896/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1897/// memory location.
1898///
1899/// \headerfile <x86intrin.h>
1900///
1901/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1902///
1903/// \param __p
1904/// A pointer to a 32-bit memory location.
1905/// \param __a
1906/// A 128-bit vector of [4 x float] containing the value to be stored.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001907static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001908_mm_store_ss(float *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001909{
1910 struct __mm_store_ss_struct {
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001911 float __u;
Ying Wanga6720142011-12-20 14:43:20 -08001912 } __attribute__((__packed__, __may_alias__));
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001913 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
Ying Wanga6720142011-12-20 14:43:20 -08001914}
1915
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001916/// \brief Stores float values from a 128-bit vector of [4 x float] to an
1917/// unaligned memory location.
1918///
1919/// \headerfile <x86intrin.h>
1920///
1921/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1922///
1923/// \param __p
1924/// A pointer to a 128-bit memory location. The address of the memory
1925/// location does not have to be aligned.
1926/// \param __a
1927/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001928static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001929_mm_storeu_ps(float *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001930{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001931 struct __storeu_ps {
1932 __m128 __v;
1933 } __attribute__((__packed__, __may_alias__));
1934 ((struct __storeu_ps*)__p)->__v = __a;
Ying Wanga6720142011-12-20 14:43:20 -08001935}
1936
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001937/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1938/// four contiguous elements in an aligned memory location.
1939///
1940/// \headerfile <x86intrin.h>
1941///
1942/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1943/// instruction.
1944///
1945/// \param __p
1946/// A pointer to a 128-bit memory location.
1947/// \param __a
1948/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1949/// of the four contiguous elements pointed by __p.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07001950static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07001951_mm_store_ps(float *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08001952{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001953 *(__m128*)__p = __a;
Ying Wanga6720142011-12-20 14:43:20 -08001954}
1955
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07001956/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1957/// four contiguous elements in an aligned memory location.
1958///
1959/// \headerfile <x86intrin.h>
1960///
1961/// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1962/// instruction.
1963///
1964/// \param __p
1965/// A pointer to a 128-bit memory location.
1966/// \param __a
1967/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1968/// of the four contiguous elements pointed by __p.
1969static __inline__ void __DEFAULT_FN_ATTRS
1970_mm_store1_ps(float *__p, __m128 __a)
1971{
1972 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1973 _mm_store_ps(__p, __a);
1974}
1975
1976/// \brief Stores float values from a 128-bit vector of [4 x float] to an
1977/// aligned memory location.
1978///
1979/// \headerfile <x86intrin.h>
1980///
1981/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1982///
1983/// \param __p
1984/// A pointer to a 128-bit memory location. The address of the memory
1985/// location has to be 128-bit aligned.
1986/// \param __a
1987/// A 128-bit vector of [4 x float] containing the values to be stored.
1988static __inline__ void __DEFAULT_FN_ATTRS
1989_mm_store_ps1(float *__p, __m128 __a)
1990{
1991 return _mm_store1_ps(__p, __a);
1992}
1993
1994/// \brief Stores float values from a 128-bit vector of [4 x float] to an
1995/// aligned memory location in reverse order.
1996///
1997/// \headerfile <x86intrin.h>
1998///
1999/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
2000/// instruction.
2001///
2002/// \param __p
2003/// A pointer to a 128-bit memory location. The address of the memory
2004/// location has to be 128-bit aligned.
2005/// \param __a
2006/// A 128-bit vector of [4 x float] containing the values to be stored.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002007static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002008_mm_storer_ps(float *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002009{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002010 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002011 _mm_store_ps(__p, __a);
Ying Wanga6720142011-12-20 14:43:20 -08002012}
2013
/* Hint values accepted as the second argument of _mm_prefetch(). */
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// \brief Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the \c PREFETCHNTA, \c PREFETCHT0,
///    \c PREFETCHT1 or \c PREFETCHT2 instruction, as selected by \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointer may be const-qualified; the data is only read.
/// \param sel
///    A predefined integer constant specifying the type of prefetch operation:
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
///    The PREFETCHNTA instruction will be generated.
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated.
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated.
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), 0, (sel)))
#endif
Ying Wanga6720142011-12-20 14:43:20 -08002048
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002049/// \brief Stores a 64-bit integer in the specified aligned memory location. To
2050/// minimize caching, the data is flagged as non-temporal (unlikely to be
2051/// used again soon).
2052///
2053/// \headerfile <x86intrin.h>
2054///
2055/// This intrinsic corresponds to the \c MOVNTQ instruction.
2056///
2057/// \param __p
2058/// A pointer to an aligned memory location used to store the register value.
2059/// \param __a
2060/// A 64-bit integer containing the value to be stored.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002061static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002062_mm_stream_pi(__m64 *__p, __m64 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002063{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002064 __builtin_ia32_movntq(__p, __a);
Ying Wanga6720142011-12-20 14:43:20 -08002065}
2066
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002067/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
2068/// 128-bit aligned memory location. To minimize caching, the data is flagged
2069/// as non-temporal (unlikely to be used again soon).
2070///
2071/// \headerfile <x86intrin.h>
2072///
2073/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
2074///
2075/// \param __p
2076/// A pointer to a 128-bit aligned memory location that will receive the
2077/// integer values.
2078/// \param __a
2079/// A 128-bit vector of [4 x float] containing the values to be moved.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002080static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002081_mm_stream_ps(float *__p, __m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002082{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002083 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
Ying Wanga6720142011-12-20 14:43:20 -08002084}
2085
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002086/// \brief Forces strong memory ordering (serialization) between store
2087/// instructions preceding this instruction and store instructions following
2088/// this instruction, ensuring the system completes all previous stores
2089/// before executing subsequent stores.
2090///
2091/// \headerfile <x86intrin.h>
2092///
2093/// This intrinsic corresponds to the \c SFENCE instruction.
2094///
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002095static __inline__ void __DEFAULT_FN_ATTRS
Ying Wanga6720142011-12-20 14:43:20 -08002096_mm_sfence(void)
2097{
2098 __builtin_ia32_sfence();
2099}
2100
/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Each macro argument is parenthesized so that the cast applies to the whole
   argument expression (e.g. a conditional expression), not just to its first
   operand. */
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
Ying Wanga6720142011-12-20 14:43:20 -08002119
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the inserted value:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Each macro argument is parenthesized so that the cast applies to the whole
   argument expression, not just to its first operand. */
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
Ying Wanga6720142011-12-20 14:43:20 -08002146
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002147/// \brief Compares each of the corresponding packed 16-bit integer values of
2148/// the 64-bit integer vectors, and writes the greater value to the
2149/// corresponding bits in the destination.
2150///
2151/// \headerfile <x86intrin.h>
2152///
2153/// This intrinsic corresponds to the \c PMAXSW instruction.
2154///
2155/// \param __a
2156/// A 64-bit integer vector containing one of the source operands.
2157/// \param __b
2158/// A 64-bit integer vector containing one of the source operands.
2159/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002160static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002161_mm_max_pi16(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002162{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002163 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002164}
2165
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002166/// \brief Compares each of the corresponding packed 8-bit unsigned integer
2167/// values of the 64-bit integer vectors, and writes the greater value to the
2168/// corresponding bits in the destination.
2169///
2170/// \headerfile <x86intrin.h>
2171///
2172/// This intrinsic corresponds to the \c PMAXUB instruction.
2173///
2174/// \param __a
2175/// A 64-bit integer vector containing one of the source operands.
2176/// \param __b
2177/// A 64-bit integer vector containing one of the source operands.
2178/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002179static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002180_mm_max_pu8(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002181{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002182 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002183}
2184
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002185/// \brief Compares each of the corresponding packed 16-bit integer values of
2186/// the 64-bit integer vectors, and writes the lesser value to the
2187/// corresponding bits in the destination.
2188///
2189/// \headerfile <x86intrin.h>
2190///
2191/// This intrinsic corresponds to the \c PMINSW instruction.
2192///
2193/// \param __a
2194/// A 64-bit integer vector containing one of the source operands.
2195/// \param __b
2196/// A 64-bit integer vector containing one of the source operands.
2197/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002198static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002199_mm_min_pi16(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002200{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002201 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002202}
2203
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002204/// \brief Compares each of the corresponding packed 8-bit unsigned integer
2205/// values of the 64-bit integer vectors, and writes the lesser value to the
2206/// corresponding bits in the destination.
2207///
2208/// \headerfile <x86intrin.h>
2209///
2210/// This intrinsic corresponds to the \c PMINUB instruction.
2211///
2212/// \param __a
2213/// A 64-bit integer vector containing one of the source operands.
2214/// \param __b
2215/// A 64-bit integer vector containing one of the source operands.
2216/// \returns A 64-bit integer vector containing the comparison results.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002217static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002218_mm_min_pu8(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002219{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002220 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002221}
2222
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002223/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
2224/// integer vector to create a 16-bit mask value. Zero-extends the value to
2225/// 32-bit integer and writes it to the destination.
2226///
2227/// \headerfile <x86intrin.h>
2228///
2229/// This intrinsic corresponds to the \c PMOVMSKB instruction.
2230///
2231/// \param __a
2232/// A 64-bit integer vector containing the values with bits to be extracted.
2233/// \returns The most significant bit from each 8-bit element in the operand,
2234/// written to bits [15:0].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002235static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002236_mm_movemask_pi8(__m64 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002237{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002238 return __builtin_ia32_pmovmskb((__v8qi)__a);
Ying Wanga6720142011-12-20 14:43:20 -08002239}
2240
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002241/// \brief Multiplies packed 16-bit unsigned integer values and writes the
2242/// high-order 16 bits of each 32-bit product to the corresponding bits in
2243/// the destination.
2244///
2245/// \headerfile <x86intrin.h>
2246///
2247/// This intrinsic corresponds to the \c PMULHUW instruction.
2248///
2249/// \param __a
2250/// A 64-bit integer vector containing one of the source operands.
2251/// \param __b
2252/// A 64-bit integer vector containing one of the source operands.
2253/// \returns A 64-bit integer vector containing the products of both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002254static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002255_mm_mulhi_pu16(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002256{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002257 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002258}
2259
/// \brief Rearranges the four 16-bit integers of a 64-bit integer vector into
///    the destination, as directed by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSHUFW instruction.
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a. The destinations within the 64-bit destination are assigned
///    values as follows:
///    Bits [1:0] are used to assign values to bits [15:0] in the destination.
///    Bits [3:2] are used to assign values to bits [31:16] in the destination.
///    Bits [5:4] are used to assign values to bits [47:32] in the destination.
///    Bits [7:6] are used to assign values to bits [63:48] in the destination.
///    Bit value assignments:
///    00: assigned from bits [15:0] of a.
///    01: assigned from bits [31:16] of a.
///    10: assigned from bits [47:32] of a.
///    11: assigned from bits [63:48] of a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
Ying Wanga6720142011-12-20 14:43:20 -08002289
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002290/// \brief Conditionally copies the values from each 8-bit element in the first
2291/// 64-bit integer vector operand to the specified memory location, as
2292/// specified by the most significant bit in the corresponding element in the
2293/// second 64-bit integer vector operand. To minimize caching, the data is
2294/// flagged as non-temporal (unlikely to be used again soon).
2295///
2296/// \headerfile <x86intrin.h>
2297///
2298/// This intrinsic corresponds to the \c MASKMOVQ instruction.
2299///
2300/// \param __d
2301/// A 64-bit integer vector containing the values with elements to be copied.
2302/// \param __n
2303/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2304/// element determines whether the corresponding element in operand __d is
2305/// copied. If the most significant bit of a given element is 1, the
2306/// corresponding element in operand __d is copied.
2307/// \param __p
2308/// A pointer to a 64-bit memory location that will receive the conditionally
2309/// copied integer values. The address of the memory location does not have
2310/// to be aligned.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002311static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002312_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Ying Wanga6720142011-12-20 14:43:20 -08002313{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002314 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
Ying Wanga6720142011-12-20 14:43:20 -08002315}
2316
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002317/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
2318/// values and writes the averages to the corresponding bits in the
2319/// destination.
2320///
2321/// \headerfile <x86intrin.h>
2322///
2323/// This intrinsic corresponds to the \c PAVGB instruction.
2324///
2325/// \param __a
2326/// A 64-bit integer vector containing one of the source operands.
2327/// \param __b
2328/// A 64-bit integer vector containing one of the source operands.
2329/// \returns A 64-bit integer vector containing the averages of both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002330static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002331_mm_avg_pu8(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002332{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002333 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002334}
2335
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002336/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
2337/// values and writes the averages to the corresponding bits in the
2338/// destination.
2339///
2340/// \headerfile <x86intrin.h>
2341///
2342/// This intrinsic corresponds to the \c PAVGW instruction.
2343///
2344/// \param __a
2345/// A 64-bit integer vector containing one of the source operands.
2346/// \param __b
2347/// A 64-bit integer vector containing one of the source operands.
2348/// \returns A 64-bit integer vector containing the averages of both operands.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002349static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002350_mm_avg_pu16(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002351{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002352 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002353}
2354
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002355/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
2356/// 64-bit vector operands and computes the absolute value for each of the
2357/// difference. Then sum of the 8 absolute differences is written to the
2358/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2359///
2360/// \headerfile <x86intrin.h>
2361///
2362/// This intrinsic corresponds to the \c PSADBW instruction.
2363///
2364/// \param __a
2365/// A 64-bit integer vector containing one of the source operands.
2366/// \param __b
2367/// A 64-bit integer vector containing one of the source operands.
2368/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2369/// sets of absolute differences between both operands. The upper bits are
2370/// cleared.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002371static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002372_mm_sad_pu8(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002373{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002374 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
Ying Wanga6720142011-12-20 14:43:20 -08002375}
2376
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002377/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
2378/// integer value. There are several groups of macros associated with this
2379/// intrinsic, including:
2380/// * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2381/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2382/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2383/// _MM_GET_EXCEPTION_STATE().
2384/// * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2385/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2386/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2387/// * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2388/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2389/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
2390/// * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2391/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2392/// * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2393/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2394/// _MM_GET_DENORMALS_ZERO_MODE().
2395///
2396/// For example, the expression below checks if an overflow exception has
2397/// occurred:
2398/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2399///
2400/// The following example gets the current rounding mode:
2401/// _MM_GET_ROUNDING_MODE()
2402///
2403/// \headerfile <x86intrin.h>
2404///
2405/// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
2406///
2407/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2408/// register.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002409static __inline__ unsigned int __DEFAULT_FN_ATTRS
Ying Wanga6720142011-12-20 14:43:20 -08002410_mm_getcsr(void)
2411{
2412 return __builtin_ia32_stmxcsr();
2413}
2414
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002415/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
2416/// are several groups of macros associated with this intrinsic, including:
2417/// * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2418/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2419/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2420/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2421/// * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2422/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2423/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2424/// of these macros.
2425/// * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2426/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2427/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2428/// * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2429/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2430/// one of these macros.
2431/// * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2432/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2433/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2434///
2435/// For example, the following expression causes subsequent floating-point
2436/// operations to round up:
2437/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2438///
2439/// The following example sets the DAZ and FTZ flags:
2440/// void setFlags() {
2441/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
2442/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
2443/// }
2444///
2445/// \headerfile <x86intrin.h>
2446///
2447/// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
2448///
2449/// \param __i
2450/// A 32-bit unsigned integer value to be written to the MXCSR register.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002451static __inline__ void __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002452_mm_setcsr(unsigned int __i)
Ying Wanga6720142011-12-20 14:43:20 -08002453{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002454 __builtin_ia32_ldmxcsr(__i);
Ying Wanga6720142011-12-20 14:43:20 -08002455}
2456
/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a and b. The argument is expanded four times and feeds
///    __builtin_shufflevector, so it must be an integer constant expression.
///    Bits [3:0] specify the values copied from operand a.
///    Bits [7:4] specify the values copied from operand b. The destinations
///    within the 128-bit destination are assigned values as follows:
///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
///    Bit value assignments:
///    00: Bits [31:0] copied from the specified operand.
///    01: Bits [63:32] copied from the specified operand.
///    10: Bits [95:64] copied from the specified operand.
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                  0 + (((mask) >> 0) & 0x3), \
                                  0 + (((mask) >> 2) & 0x3), \
                                  4 + (((mask) >> 4) & 0x3), \
                                  4 + (((mask) >> 6) & 0x3)); })
Ying Wanga6720142011-12-20 14:43:20 -08002494
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002495/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2496/// [4 x float] and interleaves them into a 128-bit vector of [4 x
2497/// float].
2498///
2499/// \headerfile <x86intrin.h>
2500///
2501/// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
2502///
2503/// \param __a
2504/// A 128-bit vector of [4 x float].
2505/// Bits [95:64] are written to bits [31:0] of the destination.
2506/// Bits [127:96] are written to bits [95:64] of the destination.
2507/// \param __b
2508/// A 128-bit vector of [4 x float].
2509/// Bits [95:64] are written to bits [63:32] of the destination.
2510/// Bits [127:96] are written to bits [127:96] of the destination.
2511/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002512static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002513_mm_unpackhi_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002514{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002515 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
Ying Wanga6720142011-12-20 14:43:20 -08002516}
2517
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002518/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2519/// [4 x float] and interleaves them into a 128-bit vector of [4 x
2520/// float].
2521///
2522/// \headerfile <x86intrin.h>
2523///
2524/// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
2525///
2526/// \param __a
2527/// A 128-bit vector of [4 x float].
2528/// Bits [31:0] are written to bits [31:0] of the destination.
2529/// Bits [63:32] are written to bits [95:64] of the destination.
2530/// \param __b
2531/// A 128-bit vector of [4 x float].
2532/// Bits [31:0] are written to bits [63:32] of the destination.
2533/// Bits [63:32] are written to bits [127:96] of the destination.
2534/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002535static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002536_mm_unpacklo_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002537{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002538 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
Ying Wanga6720142011-12-20 14:43:20 -08002539}
2540
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002541/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2542/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2543/// 96 bits are set to the upper 96 bits of the first parameter.
2544///
2545/// \headerfile <x86intrin.h>
2546///
2547/// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
2548///
2549/// \param __a
2550/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2551/// written to the upper 96 bits of the result.
2552/// \param __b
2553/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2554/// written to the lower 32 bits of the result.
2555/// \returns A 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002556static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002557_mm_move_ss(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002558{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002559 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -08002560}
2561
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002562/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2563/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2564/// 64 bits are set to the upper 64 bits of the first parameter.
2565///
2566/// \headerfile <x86intrin.h>
2567///
2568/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
2569///
2570/// \param __a
2571/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2572/// written to the upper 64 bits of the result.
2573/// \param __b
2574/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2575/// written to the lower 64 bits of the result.
2576/// \returns A 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002577static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002578_mm_movehl_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002579{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002580 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
Ying Wanga6720142011-12-20 14:43:20 -08002581}
2582
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002583/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2584/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2585/// 64 bits are set to the lower 64 bits of the second parameter.
2586///
2587/// \headerfile <x86intrin.h>
2588///
2589/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
2590///
2591/// \param __a
2592/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2593/// written to the lower 64 bits of the result.
2594/// \param __b
2595/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2596/// written to the upper 64 bits of the result.
2597/// \returns A 128-bit floating-point vector of [4 x float].
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002598static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002599_mm_movelh_ps(__m128 __a, __m128 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002600{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002601 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
Ying Wanga6720142011-12-20 14:43:20 -08002602}
2603
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002604/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2605/// float].
2606///
2607/// \headerfile <x86intrin.h>
2608///
2609/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2610///
2611/// \param __a
2612/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2613/// from the corresponding elements in this operand.
2614/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2615/// values from the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002616static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002617_mm_cvtpi16_ps(__m64 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002618{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002619 __m64 __b, __c;
2620 __m128 __r;
Ying Wanga6720142011-12-20 14:43:20 -08002621
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002622 __b = _mm_setzero_si64();
2623 __b = _mm_cmpgt_pi16(__b, __a);
2624 __c = _mm_unpackhi_pi16(__a, __b);
2625 __r = _mm_setzero_ps();
2626 __r = _mm_cvtpi32_ps(__r, __c);
2627 __r = _mm_movelh_ps(__r, __r);
2628 __c = _mm_unpacklo_pi16(__a, __b);
2629 __r = _mm_cvtpi32_ps(__r, __c);
Ying Wanga6720142011-12-20 14:43:20 -08002630
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002631 return __r;
Ying Wanga6720142011-12-20 14:43:20 -08002632}
2633
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002634/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
2635/// 128-bit vector of [4 x float].
2636///
2637/// \headerfile <x86intrin.h>
2638///
2639/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2640///
2641/// \param __a
2642/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2643/// destination are copied from the corresponding elements in this operand.
2644/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2645/// values from the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002646static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002647_mm_cvtpu16_ps(__m64 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002648{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002649 __m64 __b, __c;
2650 __m128 __r;
Ying Wanga6720142011-12-20 14:43:20 -08002651
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002652 __b = _mm_setzero_si64();
2653 __c = _mm_unpackhi_pi16(__a, __b);
2654 __r = _mm_setzero_ps();
2655 __r = _mm_cvtpi32_ps(__r, __c);
2656 __r = _mm_movelh_ps(__r, __r);
2657 __c = _mm_unpacklo_pi16(__a, __b);
2658 __r = _mm_cvtpi32_ps(__r, __c);
Ying Wanga6720142011-12-20 14:43:20 -08002659
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002660 return __r;
Ying Wanga6720142011-12-20 14:43:20 -08002661}
2662
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002663/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2664/// into a 128-bit vector of [4 x float].
2665///
2666/// \headerfile <x86intrin.h>
2667///
2668/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2669///
2670/// \param __a
2671/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2672/// from the corresponding lower 4 elements in this operand.
2673/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2674/// values from the operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002675static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002676_mm_cvtpi8_ps(__m64 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002677{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002678 __m64 __b;
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002679
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002680 __b = _mm_setzero_si64();
2681 __b = _mm_cmpgt_pi8(__b, __a);
2682 __b = _mm_unpacklo_pi8(__a, __b);
Ying Wanga6720142011-12-20 14:43:20 -08002683
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002684 return _mm_cvtpi16_ps(__b);
Ying Wanga6720142011-12-20 14:43:20 -08002685}
2686
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002687/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
2688/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2689///
2690/// \headerfile <x86intrin.h>
2691///
2692/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2693///
2694/// \param __a
2695/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2696/// destination are copied from the corresponding lower 4 elements in this
2697/// operand.
2698/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2699/// values from the source operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002700static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002701_mm_cvtpu8_ps(__m64 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002702{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002703 __m64 __b;
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002704
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002705 __b = _mm_setzero_si64();
2706 __b = _mm_unpacklo_pi8(__a, __b);
Ying Wanga6720142011-12-20 14:43:20 -08002707
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002708 return _mm_cvtpi16_ps(__b);
Ying Wanga6720142011-12-20 14:43:20 -08002709}
2710
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002711/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
2712/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2713///
2714/// \headerfile <x86intrin.h>
2715///
2716/// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2717///
2718/// \param __a
2719/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2720/// copied from the elements in this operand.
2721/// \param __b
2722/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2723/// copied from the elements in this operand.
2724/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2725/// copied and converted values from the first operand. The upper 64 bits
2726/// contain the copied and converted values from the second operand.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002727static __inline__ __m128 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002728_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Ying Wanga6720142011-12-20 14:43:20 -08002729{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002730 __m128 __c;
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002731
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002732 __c = _mm_setzero_ps();
2733 __c = _mm_cvtpi32_ps(__c, __b);
2734 __c = _mm_movelh_ps(__c, __c);
Ying Wanga6720142011-12-20 14:43:20 -08002735
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002736 return _mm_cvtpi32_ps(__c, __a);
Ying Wanga6720142011-12-20 14:43:20 -08002737}
2738
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002739/// \brief Converts each single-precision floating-point element of a 128-bit
2740/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2741/// packs the results into a 64-bit integer vector of [4 x i16]. If the
2742/// floating-point element is NaN or infinity, or if the floating-point
2743/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
2744/// to 0x8000. Otherwise if the floating-point element is greater
2745/// than 0x7FFF, it is converted to 0x7FFF.
2746///
2747/// \headerfile <x86intrin.h>
2748///
2749/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2750///
2751/// \param __a
2752/// A 128-bit floating-point vector of [4 x float].
2753/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2754/// values.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002755static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002756_mm_cvtps_pi16(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002757{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002758 __m64 __b, __c;
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002759
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002760 __b = _mm_cvtps_pi32(__a);
2761 __a = _mm_movehl_ps(__a, __a);
2762 __c = _mm_cvtps_pi32(__a);
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002763
Stephen Hinese65db132014-05-30 13:26:31 -07002764 return _mm_packs_pi32(__b, __c);
Ying Wanga6720142011-12-20 14:43:20 -08002765}
2766
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002767/// \brief Converts each single-precision floating-point element of a 128-bit
2768/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2769/// packs the results into the lower 32 bits of a 64-bit integer vector of
2770/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
2771/// floating-point element is NaN or infinity, or if the floating-point
2772/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
2773/// to 0x80. Otherwise if the floating-point element is greater
2774/// than 0x7F, it is converted to 0x7F.
2775///
2776/// \headerfile <x86intrin.h>
2777///
2778/// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2779///
2780/// \param __a
2781/// 128-bit floating-point vector of [4 x float].
2782/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2783/// converted values and the uppper 32 bits are set to zero.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002784static __inline__ __m64 __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002785_mm_cvtps_pi8(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002786{
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002787 __m64 __b, __c;
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002788
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002789 __b = _mm_cvtps_pi16(__a);
2790 __c = _mm_setzero_si64();
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002791
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002792 return _mm_packs_pi16(__b, __c);
Ying Wanga6720142011-12-20 14:43:20 -08002793}
2794
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002795/// \brief Extracts the sign bits from each single-precision floating-point
2796/// element of a 128-bit floating-point vector of [4 x float] and returns the
2797/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2798/// to zero.
2799///
2800/// \headerfile <x86intrin.h>
2801///
2802/// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
2803///
2804/// \param __a
2805/// A 128-bit floating-point vector of [4 x float].
2806/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2807/// single-precision floating-point element of the parameter. Bits [31:4] are
2808/// set to zero.
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002809static __inline__ int __DEFAULT_FN_ATTRS
Stephen Hinesc6ee7df2013-04-02 18:41:57 -07002810_mm_movemask_ps(__m128 __a)
Ying Wanga6720142011-12-20 14:43:20 -08002811{
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002812 return __builtin_ia32_movmskps((__v4sf)__a);
Ying Wanga6720142011-12-20 14:43:20 -08002813}
2814
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002815
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Builds an 8-bit shuffle immediate from four 2-bit selectors; z lands in
   bits [7:6] and w in bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception state flags, bits [5:0]. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* MXCSR exception mask bits, bits [12:7]. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* MXCSR rounding mode, a two-bit field in bits [14:13]. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* MXCSR flush-to-zero mode, bit 15. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
Ying Wanga6720142011-12-20 14:43:20 -08002845
/* Read one field of MXCSR. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one field of MXCSR: read the register, clear the field, OR in x and
   write the result back. x is not masked, so it must lie inside the field. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2855
/* Transposes in place the 4x4 float matrix whose rows are row0..row3. Each
   argument must be a modifiable __m128 lvalue; it is read once and assigned
   once. The temporaries use reserved names so that a caller's own variables
   (for example ones called tmp0..tmp3) are never shadowed by the expansion. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2868
/* Aliases for compatibility: each legacy _m_* spelling expands to the
   corresponding _mm_* name. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
2885
Pirama Arumuga Nainar4e74a022016-03-17 18:03:02 -07002886#undef __DEFAULT_FN_ATTRS
2887
Ying Wanga6720142011-12-20 14:43:20 -08002888/* Ugly hack for backwards-compatibility (compatible with gcc) */
Pirama Arumuga Nainarbb4374f2016-10-20 16:43:03 -07002889#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
Ying Wanga6720142011-12-20 14:43:20 -08002890#include <emmintrin.h>
2891#endif
2892
Ying Wanga6720142011-12-20 14:43:20 -08002893#endif /* __XMMINTRIN_H */