blob: 7f4e9761f1e2cd9c1d5149173777d45083296335 [file] [log] [blame]
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Element-typed 256-bit vector types. These are used for the casts inside
 * the intrinsic implementations below. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types, aligned to 32 bytes. */
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Variants of the public types with the alignment requirement lowered to
 * 1 byte. */
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
Logan Chien2833ffb2018-10-09 10:03:24 +080045
/* Arithmetic */
Logan Chien55afb0a2018-10-15 10:42:14 +080047/// Adds two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +080048///
49/// \headerfile <x86intrin.h>
50///
Logan Chien55afb0a2018-10-15 10:42:14 +080051/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080052///
53/// \param __a
54/// A 256-bit vector of [4 x double] containing one of the source operands.
55/// \param __b
56/// A 256-bit vector of [4 x double] containing one of the source operands.
57/// \returns A 256-bit vector of [4 x double] containing the sums of both
58/// operands.
59static __inline __m256d __DEFAULT_FN_ATTRS
60_mm256_add_pd(__m256d __a, __m256d __b)
61{
62 return (__m256d)((__v4df)__a+(__v4df)__b);
63}
64
Logan Chien55afb0a2018-10-15 10:42:14 +080065/// Adds two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +080066///
67/// \headerfile <x86intrin.h>
68///
Logan Chien55afb0a2018-10-15 10:42:14 +080069/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080070///
71/// \param __a
72/// A 256-bit vector of [8 x float] containing one of the source operands.
73/// \param __b
74/// A 256-bit vector of [8 x float] containing one of the source operands.
75/// \returns A 256-bit vector of [8 x float] containing the sums of both
76/// operands.
77static __inline __m256 __DEFAULT_FN_ATTRS
78_mm256_add_ps(__m256 __a, __m256 __b)
79{
80 return (__m256)((__v8sf)__a+(__v8sf)__b);
81}
82
Logan Chien55afb0a2018-10-15 10:42:14 +080083/// Subtracts two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +080084///
85/// \headerfile <x86intrin.h>
86///
Logan Chien55afb0a2018-10-15 10:42:14 +080087/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080088///
89/// \param __a
90/// A 256-bit vector of [4 x double] containing the minuend.
91/// \param __b
92/// A 256-bit vector of [4 x double] containing the subtrahend.
93/// \returns A 256-bit vector of [4 x double] containing the differences between
94/// both operands.
95static __inline __m256d __DEFAULT_FN_ATTRS
96_mm256_sub_pd(__m256d __a, __m256d __b)
97{
98 return (__m256d)((__v4df)__a-(__v4df)__b);
99}
100
Logan Chien55afb0a2018-10-15 10:42:14 +0800101/// Subtracts two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800102///
103/// \headerfile <x86intrin.h>
104///
Logan Chien55afb0a2018-10-15 10:42:14 +0800105/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800106///
107/// \param __a
108/// A 256-bit vector of [8 x float] containing the minuend.
109/// \param __b
110/// A 256-bit vector of [8 x float] containing the subtrahend.
111/// \returns A 256-bit vector of [8 x float] containing the differences between
112/// both operands.
113static __inline __m256 __DEFAULT_FN_ATTRS
114_mm256_sub_ps(__m256 __a, __m256 __b)
115{
116 return (__m256)((__v8sf)__a-(__v8sf)__b);
117}
118
Logan Chien55afb0a2018-10-15 10:42:14 +0800119/// Adds the even-indexed values and subtracts the odd-indexed values of
Logan Chien2833ffb2018-10-09 10:03:24 +0800120/// two 256-bit vectors of [4 x double].
121///
122/// \headerfile <x86intrin.h>
123///
Logan Chien55afb0a2018-10-15 10:42:14 +0800124/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800125///
126/// \param __a
127/// A 256-bit vector of [4 x double] containing the left source operand.
128/// \param __b
129/// A 256-bit vector of [4 x double] containing the right source operand.
130/// \returns A 256-bit vector of [4 x double] containing the alternating sums
131/// and differences between both operands.
132static __inline __m256d __DEFAULT_FN_ATTRS
133_mm256_addsub_pd(__m256d __a, __m256d __b)
134{
135 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
136}
137
Logan Chien55afb0a2018-10-15 10:42:14 +0800138/// Adds the even-indexed values and subtracts the odd-indexed values of
Logan Chien2833ffb2018-10-09 10:03:24 +0800139/// two 256-bit vectors of [8 x float].
140///
141/// \headerfile <x86intrin.h>
142///
Logan Chien55afb0a2018-10-15 10:42:14 +0800143/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800144///
145/// \param __a
146/// A 256-bit vector of [8 x float] containing the left source operand.
147/// \param __b
148/// A 256-bit vector of [8 x float] containing the right source operand.
149/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
150/// differences between both operands.
151static __inline __m256 __DEFAULT_FN_ATTRS
152_mm256_addsub_ps(__m256 __a, __m256 __b)
153{
154 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
155}
156
Logan Chien55afb0a2018-10-15 10:42:14 +0800157/// Divides two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +0800158///
159/// \headerfile <x86intrin.h>
160///
Logan Chien55afb0a2018-10-15 10:42:14 +0800161/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800162///
163/// \param __a
164/// A 256-bit vector of [4 x double] containing the dividend.
165/// \param __b
166/// A 256-bit vector of [4 x double] containing the divisor.
167/// \returns A 256-bit vector of [4 x double] containing the quotients of both
168/// operands.
169static __inline __m256d __DEFAULT_FN_ATTRS
170_mm256_div_pd(__m256d __a, __m256d __b)
171{
172 return (__m256d)((__v4df)__a/(__v4df)__b);
173}
174
Logan Chien55afb0a2018-10-15 10:42:14 +0800175/// Divides two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800176///
177/// \headerfile <x86intrin.h>
178///
Logan Chien55afb0a2018-10-15 10:42:14 +0800179/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800180///
181/// \param __a
182/// A 256-bit vector of [8 x float] containing the dividend.
183/// \param __b
184/// A 256-bit vector of [8 x float] containing the divisor.
185/// \returns A 256-bit vector of [8 x float] containing the quotients of both
186/// operands.
187static __inline __m256 __DEFAULT_FN_ATTRS
188_mm256_div_ps(__m256 __a, __m256 __b)
189{
190 return (__m256)((__v8sf)__a/(__v8sf)__b);
191}
192
Logan Chien55afb0a2018-10-15 10:42:14 +0800193/// Compares two 256-bit vectors of [4 x double] and returns the greater
Logan Chien2833ffb2018-10-09 10:03:24 +0800194/// of each pair of values.
195///
196/// \headerfile <x86intrin.h>
197///
Logan Chien55afb0a2018-10-15 10:42:14 +0800198/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800199///
200/// \param __a
201/// A 256-bit vector of [4 x double] containing one of the operands.
202/// \param __b
203/// A 256-bit vector of [4 x double] containing one of the operands.
204/// \returns A 256-bit vector of [4 x double] containing the maximum values
205/// between both operands.
206static __inline __m256d __DEFAULT_FN_ATTRS
207_mm256_max_pd(__m256d __a, __m256d __b)
208{
209 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
210}
211
Logan Chien55afb0a2018-10-15 10:42:14 +0800212/// Compares two 256-bit vectors of [8 x float] and returns the greater
Logan Chien2833ffb2018-10-09 10:03:24 +0800213/// of each pair of values.
214///
215/// \headerfile <x86intrin.h>
216///
Logan Chien55afb0a2018-10-15 10:42:14 +0800217/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800218///
219/// \param __a
220/// A 256-bit vector of [8 x float] containing one of the operands.
221/// \param __b
222/// A 256-bit vector of [8 x float] containing one of the operands.
223/// \returns A 256-bit vector of [8 x float] containing the maximum values
224/// between both operands.
225static __inline __m256 __DEFAULT_FN_ATTRS
226_mm256_max_ps(__m256 __a, __m256 __b)
227{
228 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
229}
230
Logan Chien55afb0a2018-10-15 10:42:14 +0800231/// Compares two 256-bit vectors of [4 x double] and returns the lesser
Logan Chien2833ffb2018-10-09 10:03:24 +0800232/// of each pair of values.
233///
234/// \headerfile <x86intrin.h>
235///
Logan Chien55afb0a2018-10-15 10:42:14 +0800236/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800237///
238/// \param __a
239/// A 256-bit vector of [4 x double] containing one of the operands.
240/// \param __b
241/// A 256-bit vector of [4 x double] containing one of the operands.
242/// \returns A 256-bit vector of [4 x double] containing the minimum values
243/// between both operands.
244static __inline __m256d __DEFAULT_FN_ATTRS
245_mm256_min_pd(__m256d __a, __m256d __b)
246{
247 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
248}
249
Logan Chien55afb0a2018-10-15 10:42:14 +0800250/// Compares two 256-bit vectors of [8 x float] and returns the lesser
Logan Chien2833ffb2018-10-09 10:03:24 +0800251/// of each pair of values.
252///
253/// \headerfile <x86intrin.h>
254///
Logan Chien55afb0a2018-10-15 10:42:14 +0800255/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800256///
257/// \param __a
258/// A 256-bit vector of [8 x float] containing one of the operands.
259/// \param __b
260/// A 256-bit vector of [8 x float] containing one of the operands.
261/// \returns A 256-bit vector of [8 x float] containing the minimum values
262/// between both operands.
263static __inline __m256 __DEFAULT_FN_ATTRS
264_mm256_min_ps(__m256 __a, __m256 __b)
265{
266 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
267}
268
Logan Chien55afb0a2018-10-15 10:42:14 +0800269/// Multiplies two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +0800270///
271/// \headerfile <x86intrin.h>
272///
Logan Chien55afb0a2018-10-15 10:42:14 +0800273/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800274///
275/// \param __a
276/// A 256-bit vector of [4 x double] containing one of the operands.
277/// \param __b
278/// A 256-bit vector of [4 x double] containing one of the operands.
279/// \returns A 256-bit vector of [4 x double] containing the products of both
280/// operands.
281static __inline __m256d __DEFAULT_FN_ATTRS
282_mm256_mul_pd(__m256d __a, __m256d __b)
283{
284 return (__m256d)((__v4df)__a * (__v4df)__b);
285}
286
Logan Chien55afb0a2018-10-15 10:42:14 +0800287/// Multiplies two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800288///
289/// \headerfile <x86intrin.h>
290///
Logan Chien55afb0a2018-10-15 10:42:14 +0800291/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800292///
293/// \param __a
294/// A 256-bit vector of [8 x float] containing one of the operands.
295/// \param __b
296/// A 256-bit vector of [8 x float] containing one of the operands.
297/// \returns A 256-bit vector of [8 x float] containing the products of both
298/// operands.
299static __inline __m256 __DEFAULT_FN_ATTRS
300_mm256_mul_ps(__m256 __a, __m256 __b)
301{
302 return (__m256)((__v8sf)__a * (__v8sf)__b);
303}
304
Logan Chien55afb0a2018-10-15 10:42:14 +0800305/// Calculates the square roots of the values in a 256-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +0800306/// [4 x double].
307///
308/// \headerfile <x86intrin.h>
309///
Logan Chien55afb0a2018-10-15 10:42:14 +0800310/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800311///
312/// \param __a
313/// A 256-bit vector of [4 x double].
314/// \returns A 256-bit vector of [4 x double] containing the square roots of the
315/// values in the operand.
316static __inline __m256d __DEFAULT_FN_ATTRS
317_mm256_sqrt_pd(__m256d __a)
318{
319 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
320}
321
Logan Chien55afb0a2018-10-15 10:42:14 +0800322/// Calculates the square roots of the values in a 256-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +0800323/// [8 x float].
324///
325/// \headerfile <x86intrin.h>
326///
Logan Chien55afb0a2018-10-15 10:42:14 +0800327/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800328///
329/// \param __a
330/// A 256-bit vector of [8 x float].
331/// \returns A 256-bit vector of [8 x float] containing the square roots of the
332/// values in the operand.
333static __inline __m256 __DEFAULT_FN_ATTRS
334_mm256_sqrt_ps(__m256 __a)
335{
336 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
337}
338
Logan Chien55afb0a2018-10-15 10:42:14 +0800339/// Calculates the reciprocal square roots of the values in a 256-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800340/// vector of [8 x float].
341///
342/// \headerfile <x86intrin.h>
343///
Logan Chien55afb0a2018-10-15 10:42:14 +0800344/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800345///
346/// \param __a
347/// A 256-bit vector of [8 x float].
348/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
349/// roots of the values in the operand.
350static __inline __m256 __DEFAULT_FN_ATTRS
351_mm256_rsqrt_ps(__m256 __a)
352{
353 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
354}
355
Logan Chien55afb0a2018-10-15 10:42:14 +0800356/// Calculates the reciprocals of the values in a 256-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +0800357/// [8 x float].
358///
359/// \headerfile <x86intrin.h>
360///
Logan Chien55afb0a2018-10-15 10:42:14 +0800361/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800362///
363/// \param __a
364/// A 256-bit vector of [8 x float].
365/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
366/// values in the operand.
367static __inline __m256 __DEFAULT_FN_ATTRS
368_mm256_rcp_ps(__m256 __a)
369{
370 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
371}
372
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800404
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800436
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
453
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
471
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
488
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
505
/* Logical */
Logan Chien55afb0a2018-10-15 10:42:14 +0800507/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +0800508///
509/// \headerfile <x86intrin.h>
510///
Logan Chien55afb0a2018-10-15 10:42:14 +0800511/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800512///
513/// \param __a
514/// A 256-bit vector of [4 x double] containing one of the source operands.
515/// \param __b
516/// A 256-bit vector of [4 x double] containing one of the source operands.
517/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
518/// values between both operands.
519static __inline __m256d __DEFAULT_FN_ATTRS
520_mm256_and_pd(__m256d __a, __m256d __b)
521{
522 return (__m256d)((__v4du)__a & (__v4du)__b);
523}
524
Logan Chien55afb0a2018-10-15 10:42:14 +0800525/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800526///
527/// \headerfile <x86intrin.h>
528///
Logan Chien55afb0a2018-10-15 10:42:14 +0800529/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800530///
531/// \param __a
532/// A 256-bit vector of [8 x float] containing one of the source operands.
533/// \param __b
534/// A 256-bit vector of [8 x float] containing one of the source operands.
535/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
536/// values between both operands.
537static __inline __m256 __DEFAULT_FN_ATTRS
538_mm256_and_ps(__m256 __a, __m256 __b)
539{
540 return (__m256)((__v8su)__a & (__v8su)__b);
541}
542
Logan Chien55afb0a2018-10-15 10:42:14 +0800543/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
Logan Chien2833ffb2018-10-09 10:03:24 +0800544/// the one's complement of the values contained in the first source operand.
545///
546/// \headerfile <x86intrin.h>
547///
Logan Chien55afb0a2018-10-15 10:42:14 +0800548/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800549///
550/// \param __a
551/// A 256-bit vector of [4 x double] containing the left source operand. The
552/// one's complement of this value is used in the bitwise AND.
553/// \param __b
554/// A 256-bit vector of [4 x double] containing the right source operand.
555/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
556/// values of the second operand and the one's complement of the first
557/// operand.
558static __inline __m256d __DEFAULT_FN_ATTRS
559_mm256_andnot_pd(__m256d __a, __m256d __b)
560{
561 return (__m256d)(~(__v4du)__a & (__v4du)__b);
562}
563
Logan Chien55afb0a2018-10-15 10:42:14 +0800564/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
Logan Chien2833ffb2018-10-09 10:03:24 +0800565/// the one's complement of the values contained in the first source operand.
566///
567/// \headerfile <x86intrin.h>
568///
Logan Chien55afb0a2018-10-15 10:42:14 +0800569/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800570///
571/// \param __a
572/// A 256-bit vector of [8 x float] containing the left source operand. The
573/// one's complement of this value is used in the bitwise AND.
574/// \param __b
575/// A 256-bit vector of [8 x float] containing the right source operand.
576/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
577/// values of the second operand and the one's complement of the first
578/// operand.
579static __inline __m256 __DEFAULT_FN_ATTRS
580_mm256_andnot_ps(__m256 __a, __m256 __b)
581{
582 return (__m256)(~(__v8su)__a & (__v8su)__b);
583}
584
Logan Chien55afb0a2018-10-15 10:42:14 +0800585/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +0800586///
587/// \headerfile <x86intrin.h>
588///
Logan Chien55afb0a2018-10-15 10:42:14 +0800589/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800590///
591/// \param __a
592/// A 256-bit vector of [4 x double] containing one of the source operands.
593/// \param __b
594/// A 256-bit vector of [4 x double] containing one of the source operands.
595/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
596/// values between both operands.
597static __inline __m256d __DEFAULT_FN_ATTRS
598_mm256_or_pd(__m256d __a, __m256d __b)
599{
600 return (__m256d)((__v4du)__a | (__v4du)__b);
601}
602
Logan Chien55afb0a2018-10-15 10:42:14 +0800603/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800604///
605/// \headerfile <x86intrin.h>
606///
Logan Chien55afb0a2018-10-15 10:42:14 +0800607/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800608///
609/// \param __a
610/// A 256-bit vector of [8 x float] containing one of the source operands.
611/// \param __b
612/// A 256-bit vector of [8 x float] containing one of the source operands.
613/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
614/// values between both operands.
615static __inline __m256 __DEFAULT_FN_ATTRS
616_mm256_or_ps(__m256 __a, __m256 __b)
617{
618 return (__m256)((__v8su)__a | (__v8su)__b);
619}
620
Logan Chien55afb0a2018-10-15 10:42:14 +0800621/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +0800622///
623/// \headerfile <x86intrin.h>
624///
Logan Chien55afb0a2018-10-15 10:42:14 +0800625/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800626///
627/// \param __a
628/// A 256-bit vector of [4 x double] containing one of the source operands.
629/// \param __b
630/// A 256-bit vector of [4 x double] containing one of the source operands.
631/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
632/// values between both operands.
633static __inline __m256d __DEFAULT_FN_ATTRS
634_mm256_xor_pd(__m256d __a, __m256d __b)
635{
636 return (__m256d)((__v4du)__a ^ (__v4du)__b);
637}
638
Logan Chien55afb0a2018-10-15 10:42:14 +0800639/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +0800640///
641/// \headerfile <x86intrin.h>
642///
Logan Chien55afb0a2018-10-15 10:42:14 +0800643/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800644///
645/// \param __a
646/// A 256-bit vector of [8 x float] containing one of the source operands.
647/// \param __b
648/// A 256-bit vector of [8 x float] containing one of the source operands.
649/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
650/// values between both operands.
651static __inline __m256 __DEFAULT_FN_ATTRS
652_mm256_xor_ps(__m256 __a, __m256 __b)
653{
654 return (__m256)((__v8su)__a ^ (__v8su)__b);
655}
656
/* Horizontal arithmetic */
Logan Chien55afb0a2018-10-15 10:42:14 +0800658/// Horizontally adds the adjacent pairs of values contained in two
Logan Chien2833ffb2018-10-09 10:03:24 +0800659/// 256-bit vectors of [4 x double].
660///
661/// \headerfile <x86intrin.h>
662///
Logan Chien55afb0a2018-10-15 10:42:14 +0800663/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800664///
665/// \param __a
666/// A 256-bit vector of [4 x double] containing one of the source operands.
667/// The horizontal sums of the values are returned in the even-indexed
668/// elements of a vector of [4 x double].
669/// \param __b
670/// A 256-bit vector of [4 x double] containing one of the source operands.
671/// The horizontal sums of the values are returned in the odd-indexed
672/// elements of a vector of [4 x double].
673/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
674/// both operands.
675static __inline __m256d __DEFAULT_FN_ATTRS
676_mm256_hadd_pd(__m256d __a, __m256d __b)
677{
678 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
679}
680
Logan Chien55afb0a2018-10-15 10:42:14 +0800681/// Horizontally adds the adjacent pairs of values contained in two
Logan Chien2833ffb2018-10-09 10:03:24 +0800682/// 256-bit vectors of [8 x float].
683///
684/// \headerfile <x86intrin.h>
685///
Logan Chien55afb0a2018-10-15 10:42:14 +0800686/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800687///
688/// \param __a
689/// A 256-bit vector of [8 x float] containing one of the source operands.
690/// The horizontal sums of the values are returned in the elements with
691/// index 0, 1, 4, 5 of a vector of [8 x float].
692/// \param __b
693/// A 256-bit vector of [8 x float] containing one of the source operands.
694/// The horizontal sums of the values are returned in the elements with
695/// index 2, 3, 6, 7 of a vector of [8 x float].
696/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
697/// both operands.
698static __inline __m256 __DEFAULT_FN_ATTRS
699_mm256_hadd_ps(__m256 __a, __m256 __b)
700{
701 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
702}
703
Logan Chien55afb0a2018-10-15 10:42:14 +0800704/// Horizontally subtracts the adjacent pairs of values contained in two
Logan Chien2833ffb2018-10-09 10:03:24 +0800705/// 256-bit vectors of [4 x double].
706///
707/// \headerfile <x86intrin.h>
708///
Logan Chien55afb0a2018-10-15 10:42:14 +0800709/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800710///
711/// \param __a
712/// A 256-bit vector of [4 x double] containing one of the source operands.
713/// The horizontal differences between the values are returned in the
714/// even-indexed elements of a vector of [4 x double].
715/// \param __b
716/// A 256-bit vector of [4 x double] containing one of the source operands.
717/// The horizontal differences between the values are returned in the
718/// odd-indexed elements of a vector of [4 x double].
719/// \returns A 256-bit vector of [4 x double] containing the horizontal
720/// differences of both operands.
721static __inline __m256d __DEFAULT_FN_ATTRS
722_mm256_hsub_pd(__m256d __a, __m256d __b)
723{
724 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
725}
726
Logan Chien55afb0a2018-10-15 10:42:14 +0800727/// Horizontally subtracts the adjacent pairs of values contained in two
Logan Chien2833ffb2018-10-09 10:03:24 +0800728/// 256-bit vectors of [8 x float].
729///
730/// \headerfile <x86intrin.h>
731///
Logan Chien55afb0a2018-10-15 10:42:14 +0800732/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800733///
734/// \param __a
735/// A 256-bit vector of [8 x float] containing one of the source operands.
736/// The horizontal differences between the values are returned in the
737/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
738/// \param __b
739/// A 256-bit vector of [8 x float] containing one of the source operands.
740/// The horizontal differences between the values are returned in the
741/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
742/// \returns A 256-bit vector of [8 x float] containing the horizontal
743/// differences of both operands.
744static __inline __m256 __DEFAULT_FN_ATTRS
745_mm256_hsub_ps(__m256 __a, __m256 __b)
746{
747 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
748}
749
/* Vector permutations */
Logan Chien55afb0a2018-10-15 10:42:14 +0800751/// Copies the values in a 128-bit vector of [2 x double] as specified
Logan Chien2833ffb2018-10-09 10:03:24 +0800752/// by the 128-bit integer vector operand.
753///
754/// \headerfile <x86intrin.h>
755///
Logan Chien55afb0a2018-10-15 10:42:14 +0800756/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800757///
758/// \param __a
759/// A 128-bit vector of [2 x double].
760/// \param __c
761/// A 128-bit integer vector operand specifying how the values are to be
Logan Chien55afb0a2018-10-15 10:42:14 +0800762/// copied. \n
763/// Bit [1]: \n
764/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
765/// vector. \n
766/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
767/// returned vector. \n
768/// Bit [65]: \n
769/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
770/// returned vector. \n
771/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
772/// returned vector.
Logan Chien2833ffb2018-10-09 10:03:24 +0800773/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800774static __inline __m128d __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +0800775_mm_permutevar_pd(__m128d __a, __m128i __c)
776{
777 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
778}
779
Logan Chien55afb0a2018-10-15 10:42:14 +0800780/// Copies the values in a 256-bit vector of [4 x double] as specified
781/// by the 256-bit integer vector operand.
Logan Chien2833ffb2018-10-09 10:03:24 +0800782///
783/// \headerfile <x86intrin.h>
784///
Logan Chien55afb0a2018-10-15 10:42:14 +0800785/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800786///
787/// \param __a
788/// A 256-bit vector of [4 x double].
789/// \param __c
790/// A 256-bit integer vector operand specifying how the values are to be
Logan Chien55afb0a2018-10-15 10:42:14 +0800791/// copied. \n
792/// Bit [1]: \n
793/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
794/// vector. \n
795/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
796/// returned vector. \n
797/// Bit [65]: \n
798/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
799/// returned vector. \n
800/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
801/// returned vector. \n
802/// Bit [129]: \n
803/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
804/// returned vector. \n
805/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
806/// returned vector. \n
807/// Bit [193]: \n
808/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
809/// returned vector. \n
810/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Logan Chien2833ffb2018-10-09 10:03:24 +0800811/// returned vector.
812/// \returns A 256-bit vector of [4 x double] containing the copied values.
813static __inline __m256d __DEFAULT_FN_ATTRS
814_mm256_permutevar_pd(__m256d __a, __m256i __c)
815{
816 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
817}
818
Logan Chien55afb0a2018-10-15 10:42:14 +0800819/// Copies the values stored in a 128-bit vector of [4 x float] as
Logan Chien2833ffb2018-10-09 10:03:24 +0800820/// specified by the 128-bit integer vector operand.
Logan Chien2833ffb2018-10-09 10:03:24 +0800821/// \headerfile <x86intrin.h>
822///
Logan Chien55afb0a2018-10-15 10:42:14 +0800823/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800824///
825/// \param __a
826/// A 128-bit vector of [4 x float].
827/// \param __c
828/// A 128-bit integer vector operand specifying how the values are to be
Logan Chien55afb0a2018-10-15 10:42:14 +0800829/// copied. \n
830/// Bits [1:0]: \n
831/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
832/// returned vector. \n
833/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
834/// returned vector. \n
835/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
836/// returned vector. \n
837/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
838/// returned vector. \n
839/// Bits [33:32]: \n
840/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
841/// returned vector. \n
842/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
843/// returned vector. \n
844/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
845/// returned vector. \n
846/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
847/// returned vector. \n
848/// Bits [65:64]: \n
849/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
850/// returned vector. \n
851/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
852/// returned vector. \n
853/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
854/// returned vector. \n
855/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
856/// returned vector. \n
857/// Bits [97:96]: \n
858/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
859/// returned vector. \n
860/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
861/// returned vector. \n
862/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
863/// returned vector. \n
864/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
865/// returned vector.
Logan Chien2833ffb2018-10-09 10:03:24 +0800866/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800867static __inline __m128 __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +0800868_mm_permutevar_ps(__m128 __a, __m128i __c)
869{
870 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
871}
872
Logan Chien55afb0a2018-10-15 10:42:14 +0800873/// Copies the values stored in a 256-bit vector of [8 x float] as
Logan Chien2833ffb2018-10-09 10:03:24 +0800874/// specified by the 256-bit integer vector operand.
875///
876/// \headerfile <x86intrin.h>
877///
Logan Chien55afb0a2018-10-15 10:42:14 +0800878/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800879///
880/// \param __a
881/// A 256-bit vector of [8 x float].
882/// \param __c
883/// A 256-bit integer vector operand specifying how the values are to be
Logan Chien55afb0a2018-10-15 10:42:14 +0800884/// copied. \n
885/// Bits [1:0]: \n
886/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
887/// returned vector. \n
888/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
889/// returned vector. \n
890/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
891/// returned vector. \n
892/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
893/// returned vector. \n
894/// Bits [33:32]: \n
895/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
896/// returned vector. \n
897/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
898/// returned vector. \n
899/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
900/// returned vector. \n
901/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
902/// returned vector. \n
903/// Bits [65:64]: \n
904/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
905/// returned vector. \n
906/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
907/// returned vector. \n
908/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
909/// returned vector. \n
910/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
911/// returned vector. \n
912/// Bits [97:96]: \n
913/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
914/// returned vector. \n
915/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
916/// returned vector. \n
917/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
918/// returned vector. \n
919/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
920/// returned vector. \n
921/// Bits [129:128]: \n
922/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
923/// returned vector. \n
924/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
925/// returned vector. \n
926/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
927/// returned vector. \n
928/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
929/// returned vector. \n
930/// Bits [161:160]: \n
931/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
932/// returned vector. \n
933/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
934/// returned vector. \n
935/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
936/// returned vector. \n
937/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
938/// returned vector. \n
939/// Bits [193:192]: \n
940/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
941/// returned vector. \n
942/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
943/// returned vector. \n
944/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
945/// returned vector. \n
946/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
947/// returned vector. \n
948/// Bits [225:224]: \n
949/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
950/// returned vector. \n
951/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
952/// returned vector. \n
953/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
954/// returned vector. \n
955/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
956/// returned vector.
Logan Chien2833ffb2018-10-09 10:03:24 +0800957/// \returns A 256-bit vector of [8 x float] containing the copied values.
958static __inline __m256 __DEFAULT_FN_ATTRS
959_mm256_permutevar_ps(__m256 __a, __m256i __c)
960{
961 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
962}
963
/// Copies the values in a 128-bit vector of [2 x double] as specified
///    by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800993
/// Copies the values in a 256-bit vector of [4 x double] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bit [0]: \n
///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
///         vector. \n
///      1: Bits [127:64] of the source are copied to bits [63:0] of the
///         returned vector. \n
///    Bit [1]: \n
///      0: Bits [63:0] of the source are copied to bits [127:64] of the
///         returned vector. \n
///      1: Bits [127:64] of the source are copied to bits [127:64] of the
///         returned vector. \n
///    Bit [2]: \n
///      0: Bits [191:128] of the source are copied to bits [191:128] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [191:128] of the
///         returned vector. \n
///    Bit [3]: \n
///      0: Bits [191:128] of the source are copied to bits [255:192] of the
///         returned vector. \n
///      1: Bits [255:192] of the source are copied to bits [255:192] of the
///         returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001033
/// Copies the values in a 128-bit vector of [4 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001089
/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [31:0] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [31:0] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [31:0] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [31:0] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [63:32] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [63:32] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [31:0] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [95:64] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [95:64] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [31:0] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      01: Bits [63:32] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      10: Bits [95:64] of the source are copied to bits [127:96] of the
///          returned vector. \n
///      11: Bits [127:96] of the source are copied to bits [127:96] of the
///          returned vector. \n
///    Bits [1:0]: \n
///      00: Bits [159:128] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [159:128] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [159:128] of the
///          returned vector. \n
///    Bits [3:2]: \n
///      00: Bits [159:128] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [191:160] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [191:160] of the
///          returned vector. \n
///    Bits [5:4]: \n
///      00: Bits [159:128] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [223:192] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [223:192] of the
///          returned vector. \n
///    Bits [7:6]: \n
///      00: Bits [159:128] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      01: Bits [191:160] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      10: Bits [223:192] of the source are copied to bits [255:224] of the
///          returned vector. \n
///      11: Bits [255:224] of the source are copied to bits [255:224] of the
///          returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001181
/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001222
/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001263
/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001303
/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
///    two 256-bit vectors of [4 x double], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 64-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
                                      (__v4df)(__m256d)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001332
/// Merges 32-bit single-precision data values stored in either of the
///    two 256-bit vectors of [8 x float], as specified by the immediate
///    integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the
///    destination. When a mask bit is 1, the corresponding 32-bit element in
///    operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
                                     (__v8sf)(__m256)(V2), (int)(M)))

Logan Chien55afb0a2018-10-15 10:42:14 +08001361/// Merges 64-bit double-precision data values stored in either of the
Logan Chien2833ffb2018-10-09 10:03:24 +08001362/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1363/// operand.
1364///
1365/// \headerfile <x86intrin.h>
1366///
Logan Chien55afb0a2018-10-15 10:42:14 +08001367/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001368///
1369/// \param __a
1370/// A 256-bit vector of [4 x double].
1371/// \param __b
1372/// A 256-bit vector of [4 x double].
1373/// \param __c
1374/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1375/// how the values are to be copied. The position of the mask bit corresponds
1376/// to the most significant bit of a copied value. When a mask bit is 0, the
Logan Chien55afb0a2018-10-15 10:42:14 +08001377/// corresponding 64-bit element in operand \a __a is copied to the same
Logan Chien2833ffb2018-10-09 10:03:24 +08001378/// position in the destination. When a mask bit is 1, the corresponding
Logan Chien55afb0a2018-10-15 10:42:14 +08001379/// 64-bit element in operand \a __b is copied to the same position in the
Logan Chien2833ffb2018-10-09 10:03:24 +08001380/// destination.
1381/// \returns A 256-bit vector of [4 x double] containing the copied values.
1382static __inline __m256d __DEFAULT_FN_ATTRS
1383_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1384{
1385 return (__m256d)__builtin_ia32_blendvpd256(
1386 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1387}
1388
Logan Chien55afb0a2018-10-15 10:42:14 +08001389/// Merges 32-bit single-precision data values stored in either of the
Logan Chien2833ffb2018-10-09 10:03:24 +08001390/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1391/// operand.
1392///
1393/// \headerfile <x86intrin.h>
1394///
Logan Chien55afb0a2018-10-15 10:42:14 +08001395/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001396///
1397/// \param __a
1398/// A 256-bit vector of [8 x float].
1399/// \param __b
1400/// A 256-bit vector of [8 x float].
1401/// \param __c
1402/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1403/// and 31 specifying how the values are to be copied. The position of the
1404/// mask bit corresponds to the most significant bit of a copied value. When
Logan Chien55afb0a2018-10-15 10:42:14 +08001405/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Logan Chien2833ffb2018-10-09 10:03:24 +08001406/// copied to the same position in the destination. When a mask bit is 1, the
Logan Chien55afb0a2018-10-15 10:42:14 +08001407/// corresponding 32-bit element in operand \a __b is copied to the same
Logan Chien2833ffb2018-10-09 10:03:24 +08001408/// position in the destination.
1409/// \returns A 256-bit vector of [8 x float] containing the copied values.
1410static __inline __m256 __DEFAULT_FN_ATTRS
1411_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1412{
1413 return (__m256)__builtin_ia32_blendvps256(
1414 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1415}
1416
/* Vector Dot Product */
/// Computes two dot products in parallel, using the lower and upper
///    halves of two [8 x float] vectors as input to the two computations, and
///    returning the two dot products in the lower and upper halves of the
///    [8 x float] result.
///
///    The immediate integer operand controls which input elements will
///    contribute to the dot product, and where the final results are returned.
///    In general, for each dot product, the four corresponding elements of the
///    input vectors are multiplied; the first two and second two products are
///    summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. Bits [7:4] determine which elements of
///    the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] subvector. If a bit is set, the corresponding elements from the
///    two input vectors are used as an input for dot product; otherwise that
///    input is treated as zero. Bits [3:0] determine which elements of the
///    result will receive a copy of the final dot product, with bit [0]
///    corresponding to the lowest element and bit [3] corresponding to the
///    highest element of each [4 x float] subvector. If a bit is set, the dot
///    product is returned in the corresponding element; otherwise that element
///    is set to zero. The bitmask is applied in the same way to each of the
///    two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)))

/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
///    specified by the immediate value operand.
///
///    The four selected elements in each operand are copied to the destination
///    according to the bits specified in the immediate operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] of the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
///    the destination. For example, if bits [7:0] of the immediate operand
///    contain a value of 0xFF, the 256-bit destination vector would contain the
///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [63:0] and bits [191:128] in the destination,
///    according to the bits specified in the immediate operand.
/// \param b
///    A 256-bit vector of [8 x float]. The four selected elements in this
///    operand are copied to bits [127:64] and bits [255:192] in the
///    destination, according to the bits specified in the immediate operand.
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 256-bit destination are assigned values as
///    follows, according to the bit value assignments described below: \n
///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
///    the destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
///    11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
                                    (__v8sf)(__m256)(b), (int)(mask)))

/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing 8-bit values specifying which elements to
///    copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))

/* Compare */
/* Comparison predicate values for the immediate operand of the _mm_cmp_* and
 * _mm256_cmp_* macros. Each trailing comment gives the predicate together
 * with its ordered/unordered and signaling/non-signaling variant. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))

/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))

/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))

/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))

Logan Chien55afb0a2018-10-15 10:42:14 +08001833/// Compares each of the corresponding scalar double-precision values of
Logan Chien2833ffb2018-10-09 10:03:24 +08001834/// two 128-bit vectors of [2 x double], using the operation specified by the
Logan Chien55afb0a2018-10-15 10:42:14 +08001835/// immediate integer operand.
1836///
1837/// If the result is true, all 64 bits of the destination vector are set;
1838/// otherwise they are cleared.
Logan Chien2833ffb2018-10-09 10:03:24 +08001839///
1840/// \headerfile <x86intrin.h>
1841///
1842/// \code
1843/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1844/// \endcode
1845///
Logan Chien55afb0a2018-10-15 10:42:14 +08001846/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001847///
1848/// \param a
1849/// A 128-bit vector of [2 x double].
1850/// \param b
1851/// A 128-bit vector of [2 x double].
1852/// \param c
1853/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien55afb0a2018-10-15 10:42:14 +08001854/// operation to use: \n
1855/// 0x00: Equal (ordered, non-signaling) \n
1856/// 0x01: Less-than (ordered, signaling) \n
1857/// 0x02: Less-than-or-equal (ordered, signaling) \n
1858/// 0x03: Unordered (non-signaling) \n
1859/// 0x04: Not-equal (unordered, non-signaling) \n
1860/// 0x05: Not-less-than (unordered, signaling) \n
1861/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
1862/// 0x07: Ordered (non-signaling) \n
1863/// 0x08: Equal (unordered, non-signaling) \n
1864/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
1865/// 0x0A: Not-greater-than (unordered, signaling) \n
1866/// 0x0B: False (ordered, non-signaling) \n
1867/// 0x0C: Not-equal (ordered, non-signaling) \n
1868/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
1869/// 0x0E: Greater-than (ordered, signaling) \n
1870/// 0x0F: True (unordered, non-signaling) \n
1871/// 0x10: Equal (ordered, signaling) \n
1872/// 0x11: Less-than (ordered, non-signaling) \n
1873/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
1874/// 0x13: Unordered (signaling) \n
1875/// 0x14: Not-equal (unordered, signaling) \n
1876/// 0x15: Not-less-than (unordered, non-signaling) \n
1877/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1878/// 0x17: Ordered (signaling) \n
1879/// 0x18: Equal (unordered, signaling) \n
1880/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1881/// 0x1A: Not-greater-than (unordered, non-signaling) \n
1882/// 0x1B: False (ordered, signaling) \n
1883/// 0x1C: Not-equal (ordered, signaling) \n
1884/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1885/// 0x1E: Greater-than (ordered, non-signaling) \n
1886/// 0x1F: True (unordered, signaling)
Logan Chien2833ffb2018-10-09 10:03:24 +08001887/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001891
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001950
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001968
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
/* The (unsigned short) cast is what makes the widening to int a zero
   extension rather than a sign extension. */
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08001987
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
/* The (unsigned char) cast is what makes the widening to int a zero
   extension rather than a sign extension. */
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002006
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2026
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002048
2049
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002071
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
Logan Chien2833ffb2018-10-09 10:03:24 +08002093
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2117
/* Conversion */
Logan Chien55afb0a2018-10-15 10:42:14 +08002119/// Converts a vector of [4 x i32] into a vector of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +08002120///
2121/// \headerfile <x86intrin.h>
2122///
Logan Chien55afb0a2018-10-15 10:42:14 +08002123/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002124///
2125/// \param __a
2126/// A 128-bit integer vector of [4 x i32].
2127/// \returns A 256-bit vector of [4 x double] containing the converted values.
2128static __inline __m256d __DEFAULT_FN_ATTRS
2129_mm256_cvtepi32_pd(__m128i __a)
2130{
2131 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2132}
2133
Logan Chien55afb0a2018-10-15 10:42:14 +08002134/// Converts a vector of [8 x i32] into a vector of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08002135///
2136/// \headerfile <x86intrin.h>
2137///
Logan Chien55afb0a2018-10-15 10:42:14 +08002138/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002139///
2140/// \param __a
2141/// A 256-bit integer vector.
2142/// \returns A 256-bit vector of [8 x float] containing the converted values.
2143static __inline __m256 __DEFAULT_FN_ATTRS
2144_mm256_cvtepi32_ps(__m256i __a)
2145{
Logan Chien55afb0a2018-10-15 10:42:14 +08002146 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
Logan Chien2833ffb2018-10-09 10:03:24 +08002147}
2148
Logan Chien55afb0a2018-10-15 10:42:14 +08002149/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Logan Chien2833ffb2018-10-09 10:03:24 +08002150/// [4 x float].
2151///
2152/// \headerfile <x86intrin.h>
2153///
Logan Chien55afb0a2018-10-15 10:42:14 +08002154/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002155///
2156/// \param __a
2157/// A 256-bit vector of [4 x double].
2158/// \returns A 128-bit vector of [4 x float] containing the converted values.
2159static __inline __m128 __DEFAULT_FN_ATTRS
2160_mm256_cvtpd_ps(__m256d __a)
2161{
2162 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2163}
2164
Logan Chien55afb0a2018-10-15 10:42:14 +08002165/// Converts a vector of [8 x float] into a vector of [8 x i32].
Logan Chien2833ffb2018-10-09 10:03:24 +08002166///
2167/// \headerfile <x86intrin.h>
2168///
Logan Chien55afb0a2018-10-15 10:42:14 +08002169/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08002170///
2171/// \param __a
2172/// A 256-bit vector of [8 x float].
2173/// \returns A 256-bit integer vector containing the converted values.
2174static __inline __m256i __DEFAULT_FN_ATTRS
2175_mm256_cvtps_epi32(__m256 __a)
2176{
2177 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2178}
2179
Logan Chien55afb0a2018-10-15 10:42:14 +08002180/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2181/// x double].
2182///
2183/// \headerfile <x86intrin.h>
2184///
2185/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2186///
2187/// \param __a
2188/// A 128-bit vector of [4 x float].
2189/// \returns A 256-bit vector of [4 x double] containing the converted values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002190static __inline __m256d __DEFAULT_FN_ATTRS
2191_mm256_cvtps_pd(__m128 __a)
2192{
2193 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2194}
2195
Logan Chien55afb0a2018-10-15 10:42:14 +08002196/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2197/// x i32], truncating the result by rounding towards zero when it is
2198/// inexact.
2199///
2200/// \headerfile <x86intrin.h>
2201///
2202/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2203///
2204/// \param __a
2205/// A 256-bit vector of [4 x double].
2206/// \returns A 128-bit integer vector containing the converted values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002207static __inline __m128i __DEFAULT_FN_ATTRS
2208_mm256_cvttpd_epi32(__m256d __a)
2209{
Logan Chien55afb0a2018-10-15 10:42:14 +08002210 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Logan Chien2833ffb2018-10-09 10:03:24 +08002211}
2212
Logan Chien55afb0a2018-10-15 10:42:14 +08002213/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2214/// x i32]. When a conversion is inexact, the value returned is rounded
2215/// according to the rounding control bits in the MXCSR register.
2216///
2217/// \headerfile <x86intrin.h>
2218///
2219/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2220///
2221/// \param __a
2222/// A 256-bit vector of [4 x double].
2223/// \returns A 128-bit integer vector containing the converted values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002224static __inline __m128i __DEFAULT_FN_ATTRS
2225_mm256_cvtpd_epi32(__m256d __a)
2226{
2227 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2228}
2229
Logan Chien55afb0a2018-10-15 10:42:14 +08002230/// Converts a vector of [8 x float] into a vector of [8 x i32],
2231/// truncating the result by rounding towards zero when it is inexact.
2232///
2233/// \headerfile <x86intrin.h>
2234///
2235/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2236///
2237/// \param __a
2238/// A 256-bit vector of [8 x float].
2239/// \returns A 256-bit integer vector containing the converted values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002240static __inline __m256i __DEFAULT_FN_ATTRS
2241_mm256_cvttps_epi32(__m256 __a)
2242{
Logan Chien55afb0a2018-10-15 10:42:14 +08002243 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Logan Chien2833ffb2018-10-09 10:03:24 +08002244}
2245
Logan Chien55afb0a2018-10-15 10:42:14 +08002246/// Returns the first element of the input vector of [4 x double].
2247///
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -07002248/// \headerfile <x86intrin.h>
Logan Chien55afb0a2018-10-15 10:42:14 +08002249///
2250/// This intrinsic is a utility function and does not correspond to a specific
2251/// instruction.
2252///
2253/// \param __a
2254/// A 256-bit vector of [4 x double].
2255/// \returns A 64 bit double containing the first element of the input vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08002256static __inline double __DEFAULT_FN_ATTRS
2257_mm256_cvtsd_f64(__m256d __a)
2258{
2259 return __a[0];
2260}
2261
Logan Chien55afb0a2018-10-15 10:42:14 +08002262/// Returns the first element of the input vector of [8 x i32].
2263///
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -07002264/// \headerfile <x86intrin.h>
Logan Chien55afb0a2018-10-15 10:42:14 +08002265///
2266/// This intrinsic is a utility function and does not correspond to a specific
2267/// instruction.
2268///
2269/// \param __a
2270/// A 256-bit vector of [8 x i32].
2271/// \returns A 32 bit integer containing the first element of the input vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08002272static __inline int __DEFAULT_FN_ATTRS
2273_mm256_cvtsi256_si32(__m256i __a)
2274{
2275 __v8si __b = (__v8si)__a;
2276 return __b[0];
2277}
2278
Logan Chien55afb0a2018-10-15 10:42:14 +08002279/// Returns the first element of the input vector of [8 x float].
2280///
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -07002281/// \headerfile <x86intrin.h>
Logan Chien55afb0a2018-10-15 10:42:14 +08002282///
2283/// This intrinsic is a utility function and does not correspond to a specific
2284/// instruction.
2285///
2286/// \param __a
2287/// A 256-bit vector of [8 x float].
2288/// \returns A 32 bit float containing the first element of the input vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08002289static __inline float __DEFAULT_FN_ATTRS
2290_mm256_cvtss_f32(__m256 __a)
2291{
2292 return __a[0];
2293}
2294
/* Vector replicate */
Logan Chien55afb0a2018-10-15 10:42:14 +08002296/// Moves and duplicates odd-indexed values from a 256-bit vector of
2297/// [8 x float] to float values in a 256-bit vector of [8 x float].
2298///
2299/// \headerfile <x86intrin.h>
2300///
2301/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2302///
2303/// \param __a
2304/// A 256-bit vector of [8 x float]. \n
2305/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2306/// the return value. \n
2307/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2308/// the return value. \n
2309/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2310/// return value. \n
2311/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2312/// return value.
2313/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2314/// values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002315static __inline __m256 __DEFAULT_FN_ATTRS
2316_mm256_movehdup_ps(__m256 __a)
2317{
2318 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2319}
2320
Logan Chien55afb0a2018-10-15 10:42:14 +08002321/// Moves and duplicates even-indexed values from a 256-bit vector of
2322/// [8 x float] to float values in a 256-bit vector of [8 x float].
2323///
2324/// \headerfile <x86intrin.h>
2325///
2326/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2327///
2328/// \param __a
2329/// A 256-bit vector of [8 x float]. \n
2330/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2331/// the return value. \n
2332/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2333/// the return value. \n
2334/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2335/// return value. \n
2336/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2337/// return value.
2338/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2339/// values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002340static __inline __m256 __DEFAULT_FN_ATTRS
2341_mm256_moveldup_ps(__m256 __a)
2342{
2343 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2344}
2345
Logan Chien55afb0a2018-10-15 10:42:14 +08002346/// Moves and duplicates double-precision floating point values from a
2347/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2348/// vector of [4 x double].
2349///
2350/// \headerfile <x86intrin.h>
2351///
2352/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2353///
2354/// \param __a
2355/// A 256-bit vector of [4 x double]. \n
2356/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2357/// return value. \n
2358/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2359/// the return value.
2360/// \returns A 256-bit vector of [4 x double] containing the moved and
2361/// duplicated values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002362static __inline __m256d __DEFAULT_FN_ATTRS
2363_mm256_movedup_pd(__m256d __a)
2364{
2365 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2366}
2367
/* Unpack and Interleave */
Logan Chien55afb0a2018-10-15 10:42:14 +08002369/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2370/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2371///
2372/// \headerfile <x86intrin.h>
2373///
2374/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2375///
2376/// \param __a
2377/// A 256-bit floating-point vector of [4 x double]. \n
2378/// Bits [127:64] are written to bits [63:0] of the return value. \n
2379/// Bits [255:192] are written to bits [191:128] of the return value. \n
2380/// \param __b
2381/// A 256-bit floating-point vector of [4 x double]. \n
2382/// Bits [127:64] are written to bits [127:64] of the return value. \n
2383/// Bits [255:192] are written to bits [255:192] of the return value. \n
2384/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002385static __inline __m256d __DEFAULT_FN_ATTRS
2386_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2387{
2388 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2389}
2390
Logan Chien55afb0a2018-10-15 10:42:14 +08002391/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2392/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2393///
2394/// \headerfile <x86intrin.h>
2395///
2396/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2397///
2398/// \param __a
2399/// A 256-bit floating-point vector of [4 x double]. \n
2400/// Bits [63:0] are written to bits [63:0] of the return value. \n
2401/// Bits [191:128] are written to bits [191:128] of the return value.
2402/// \param __b
2403/// A 256-bit floating-point vector of [4 x double]. \n
2404/// Bits [63:0] are written to bits [127:64] of the return value. \n
2405/// Bits [191:128] are written to bits [255:192] of the return value. \n
2406/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002407static __inline __m256d __DEFAULT_FN_ATTRS
2408_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2409{
2410 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2411}
2412
Logan Chien55afb0a2018-10-15 10:42:14 +08002413/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2414/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2415/// vector of [8 x float].
2416///
2417/// \headerfile <x86intrin.h>
2418///
2419/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2420///
2421/// \param __a
2422/// A 256-bit vector of [8 x float]. \n
2423/// Bits [95:64] are written to bits [31:0] of the return value. \n
2424/// Bits [127:96] are written to bits [95:64] of the return value. \n
2425/// Bits [223:192] are written to bits [159:128] of the return value. \n
2426/// Bits [255:224] are written to bits [223:192] of the return value.
2427/// \param __b
2428/// A 256-bit vector of [8 x float]. \n
2429/// Bits [95:64] are written to bits [63:32] of the return value. \n
2430/// Bits [127:96] are written to bits [127:96] of the return value. \n
2431/// Bits [223:192] are written to bits [191:160] of the return value. \n
2432/// Bits [255:224] are written to bits [255:224] of the return value.
2433/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002434static __inline __m256 __DEFAULT_FN_ATTRS
2435_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2436{
2437 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2438}
2439
Logan Chien55afb0a2018-10-15 10:42:14 +08002440/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2441/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2442/// vector of [8 x float].
2443///
2444/// \headerfile <x86intrin.h>
2445///
2446/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2447///
2448/// \param __a
2449/// A 256-bit vector of [8 x float]. \n
2450/// Bits [31:0] are written to bits [31:0] of the return value. \n
2451/// Bits [63:32] are written to bits [95:64] of the return value. \n
2452/// Bits [159:128] are written to bits [159:128] of the return value. \n
2453/// Bits [191:160] are written to bits [223:192] of the return value.
2454/// \param __b
2455/// A 256-bit vector of [8 x float]. \n
2456/// Bits [31:0] are written to bits [63:32] of the return value. \n
2457/// Bits [63:32] are written to bits [127:96] of the return value. \n
2458/// Bits [159:128] are written to bits [191:160] of the return value. \n
2459/// Bits [191:160] are written to bits [255:224] of the return value.
2460/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08002461static __inline __m256 __DEFAULT_FN_ATTRS
2462_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2463{
2464 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2465}
2466
/* Bit Test */
Logan Chien55afb0a2018-10-15 10:42:14 +08002468/// Given two 128-bit floating-point vectors of [2 x double], perform an
2469/// element-by-element comparison of the double-precision element in the
2470/// first source vector and the corresponding element in the second source
2471/// vector.
2472///
2473/// The EFLAGS register is updated as follows: \n
2474/// If there is at least one pair of double-precision elements where the
2475/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2476/// ZF flag is set to 1. \n
2477/// If there is at least one pair of double-precision elements where the
2478/// sign-bit of the first element is 0 and the sign-bit of the second element
2479/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2480/// This intrinsic returns the value of the ZF flag.
2481///
2482/// \headerfile <x86intrin.h>
2483///
2484/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2485///
2486/// \param __a
2487/// A 128-bit vector of [2 x double].
2488/// \param __b
2489/// A 128-bit vector of [2 x double].
2490/// \returns the ZF flag in the EFLAGS register.
2491static __inline int __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002492_mm_testz_pd(__m128d __a, __m128d __b)
2493{
2494 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2495}
2496
Logan Chien55afb0a2018-10-15 10:42:14 +08002497/// Given two 128-bit floating-point vectors of [2 x double], perform an
2498/// element-by-element comparison of the double-precision element in the
2499/// first source vector and the corresponding element in the second source
2500/// vector.
2501///
2502/// The EFLAGS register is updated as follows: \n
2503/// If there is at least one pair of double-precision elements where the
2504/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2505/// ZF flag is set to 1. \n
2506/// If there is at least one pair of double-precision elements where the
2507/// sign-bit of the first element is 0 and the sign-bit of the second element
2508/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2509/// This intrinsic returns the value of the CF flag.
2510///
2511/// \headerfile <x86intrin.h>
2512///
2513/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2514///
2515/// \param __a
2516/// A 128-bit vector of [2 x double].
2517/// \param __b
2518/// A 128-bit vector of [2 x double].
2519/// \returns the CF flag in the EFLAGS register.
2520static __inline int __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002521_mm_testc_pd(__m128d __a, __m128d __b)
2522{
2523 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2524}
2525
Logan Chien55afb0a2018-10-15 10:42:14 +08002526/// Given two 128-bit floating-point vectors of [2 x double], perform an
2527/// element-by-element comparison of the double-precision element in the
2528/// first source vector and the corresponding element in the second source
2529/// vector.
2530///
2531/// The EFLAGS register is updated as follows: \n
2532/// If there is at least one pair of double-precision elements where the
2533/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2534/// ZF flag is set to 1. \n
2535/// If there is at least one pair of double-precision elements where the
2536/// sign-bit of the first element is 0 and the sign-bit of the second element
2537/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2538/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2539/// otherwise it returns 0.
2540///
2541/// \headerfile <x86intrin.h>
2542///
2543/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2544///
2545/// \param __a
2546/// A 128-bit vector of [2 x double].
2547/// \param __b
2548/// A 128-bit vector of [2 x double].
2549/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2550static __inline int __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002551_mm_testnzc_pd(__m128d __a, __m128d __b)
2552{
2553 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2554}
2555
Logan Chien55afb0a2018-10-15 10:42:14 +08002556/// Given two 128-bit floating-point vectors of [4 x float], perform an
2557/// element-by-element comparison of the single-precision element in the
2558/// first source vector and the corresponding element in the second source
2559/// vector.
2560///
2561/// The EFLAGS register is updated as follows: \n
2562/// If there is at least one pair of single-precision elements where the
2563/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2564/// ZF flag is set to 1. \n
2565/// If there is at least one pair of single-precision elements where the
2566/// sign-bit of the first element is 0 and the sign-bit of the second element
2567/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2568/// This intrinsic returns the value of the ZF flag.
2569///
2570/// \headerfile <x86intrin.h>
2571///
2572/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2573///
2574/// \param __a
2575/// A 128-bit vector of [4 x float].
2576/// \param __b
2577/// A 128-bit vector of [4 x float].
2578/// \returns the ZF flag.
2579static __inline int __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002580_mm_testz_ps(__m128 __a, __m128 __b)
2581{
2582 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2583}
2584
Logan Chien55afb0a2018-10-15 10:42:14 +08002585/// Given two 128-bit floating-point vectors of [4 x float], perform an
2586/// element-by-element comparison of the single-precision element in the
2587/// first source vector and the corresponding element in the second source
2588/// vector.
2589///
2590/// The EFLAGS register is updated as follows: \n
2591/// If there is at least one pair of single-precision elements where the
2592/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2593/// ZF flag is set to 1. \n
2594/// If there is at least one pair of single-precision elements where the
2595/// sign-bit of the first element is 0 and the sign-bit of the second element
2596/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2597/// This intrinsic returns the value of the CF flag.
2598///
2599/// \headerfile <x86intrin.h>
2600///
2601/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2602///
2603/// \param __a
2604/// A 128-bit vector of [4 x float].
2605/// \param __b
2606/// A 128-bit vector of [4 x float].
2607/// \returns the CF flag.
2608static __inline int __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002609_mm_testc_ps(__m128 __a, __m128 __b)
2610{
2611 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2612}
2613
Logan Chien55afb0a2018-10-15 10:42:14 +08002614/// Given two 128-bit floating-point vectors of [4 x float], perform an
2615/// element-by-element comparison of the single-precision element in the
2616/// first source vector and the corresponding element in the second source
2617/// vector.
2618///
2619/// The EFLAGS register is updated as follows: \n
2620/// If there is at least one pair of single-precision elements where the
2621/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2622/// ZF flag is set to 1. \n
2623/// If there is at least one pair of single-precision elements where the
2624/// sign-bit of the first element is 0 and the sign-bit of the second element
2625/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2626/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2627/// otherwise it returns 0.
2628///
2629/// \headerfile <x86intrin.h>
2630///
2631/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2632///
2633/// \param __a
2634/// A 128-bit vector of [4 x float].
2635/// \param __b
2636/// A 128-bit vector of [4 x float].
2637/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2638static __inline int __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002639_mm_testnzc_ps(__m128 __a, __m128 __b)
2640{
2641 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2642}
2643
Logan Chien55afb0a2018-10-15 10:42:14 +08002644/// Given two 256-bit floating-point vectors of [4 x double], perform an
2645/// element-by-element comparison of the double-precision elements in the
2646/// first source vector and the corresponding elements in the second source
2647/// vector.
2648///
2649/// The EFLAGS register is updated as follows: \n
2650/// If there is at least one pair of double-precision elements where the
2651/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2652/// ZF flag is set to 1. \n
2653/// If there is at least one pair of double-precision elements where the
2654/// sign-bit of the first element is 0 and the sign-bit of the second element
2655/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2656/// This intrinsic returns the value of the ZF flag.
2657///
2658/// \headerfile <x86intrin.h>
2659///
2660/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2661///
2662/// \param __a
2663/// A 256-bit vector of [4 x double].
2664/// \param __b
2665/// A 256-bit vector of [4 x double].
2666/// \returns the ZF flag.
Logan Chien2833ffb2018-10-09 10:03:24 +08002667static __inline int __DEFAULT_FN_ATTRS
2668_mm256_testz_pd(__m256d __a, __m256d __b)
2669{
2670 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2671}
2672
Logan Chien55afb0a2018-10-15 10:42:14 +08002673/// Given two 256-bit floating-point vectors of [4 x double], perform an
2674/// element-by-element comparison of the double-precision elements in the
2675/// first source vector and the corresponding elements in the second source
2676/// vector.
2677///
2678/// The EFLAGS register is updated as follows: \n
2679/// If there is at least one pair of double-precision elements where the
2680/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2681/// ZF flag is set to 1. \n
2682/// If there is at least one pair of double-precision elements where the
2683/// sign-bit of the first element is 0 and the sign-bit of the second element
2684/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2685/// This intrinsic returns the value of the CF flag.
2686///
2687/// \headerfile <x86intrin.h>
2688///
2689/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2690///
2691/// \param __a
2692/// A 256-bit vector of [4 x double].
2693/// \param __b
2694/// A 256-bit vector of [4 x double].
2695/// \returns the CF flag.
Logan Chien2833ffb2018-10-09 10:03:24 +08002696static __inline int __DEFAULT_FN_ATTRS
2697_mm256_testc_pd(__m256d __a, __m256d __b)
2698{
2699 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2700}
2701
Logan Chien55afb0a2018-10-15 10:42:14 +08002702/// Given two 256-bit floating-point vectors of [4 x double], perform an
2703/// element-by-element comparison of the double-precision elements in the
2704/// first source vector and the corresponding elements in the second source
2705/// vector.
2706///
2707/// The EFLAGS register is updated as follows: \n
2708/// If there is at least one pair of double-precision elements where the
2709/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2710/// ZF flag is set to 1. \n
2711/// If there is at least one pair of double-precision elements where the
2712/// sign-bit of the first element is 0 and the sign-bit of the second element
2713/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2714/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2715/// otherwise it returns 0.
2716///
2717/// \headerfile <x86intrin.h>
2718///
2719/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2720///
2721/// \param __a
2722/// A 256-bit vector of [4 x double].
2723/// \param __b
2724/// A 256-bit vector of [4 x double].
2725/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Logan Chien2833ffb2018-10-09 10:03:24 +08002726static __inline int __DEFAULT_FN_ATTRS
2727_mm256_testnzc_pd(__m256d __a, __m256d __b)
2728{
2729 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2730}
2731
Logan Chien55afb0a2018-10-15 10:42:14 +08002732/// Given two 256-bit floating-point vectors of [8 x float], perform an
2733/// element-by-element comparison of the single-precision element in the
2734/// first source vector and the corresponding element in the second source
2735/// vector.
2736///
2737/// The EFLAGS register is updated as follows: \n
2738/// If there is at least one pair of single-precision elements where the
2739/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2740/// ZF flag is set to 1. \n
2741/// If there is at least one pair of single-precision elements where the
2742/// sign-bit of the first element is 0 and the sign-bit of the second element
2743/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2744/// This intrinsic returns the value of the ZF flag.
2745///
2746/// \headerfile <x86intrin.h>
2747///
2748/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2749///
2750/// \param __a
2751/// A 256-bit vector of [8 x float].
2752/// \param __b
2753/// A 256-bit vector of [8 x float].
2754/// \returns the ZF flag.
Logan Chien2833ffb2018-10-09 10:03:24 +08002755static __inline int __DEFAULT_FN_ATTRS
2756_mm256_testz_ps(__m256 __a, __m256 __b)
2757{
2758 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2759}
2760
Logan Chien55afb0a2018-10-15 10:42:14 +08002761/// Given two 256-bit floating-point vectors of [8 x float], perform an
2762/// element-by-element comparison of the single-precision element in the
2763/// first source vector and the corresponding element in the second source
2764/// vector.
2765///
2766/// The EFLAGS register is updated as follows: \n
2767/// If there is at least one pair of single-precision elements where the
2768/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2769/// ZF flag is set to 1. \n
2770/// If there is at least one pair of single-precision elements where the
2771/// sign-bit of the first element is 0 and the sign-bit of the second element
2772/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2773/// This intrinsic returns the value of the CF flag.
2774///
2775/// \headerfile <x86intrin.h>
2776///
2777/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2778///
2779/// \param __a
2780/// A 256-bit vector of [8 x float].
2781/// \param __b
2782/// A 256-bit vector of [8 x float].
2783/// \returns the CF flag.
Logan Chien2833ffb2018-10-09 10:03:24 +08002784static __inline int __DEFAULT_FN_ATTRS
2785_mm256_testc_ps(__m256 __a, __m256 __b)
2786{
2787 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2788}
2789
Logan Chien55afb0a2018-10-15 10:42:14 +08002790/// Given two 256-bit floating-point vectors of [8 x float], perform an
2791/// element-by-element comparison of the single-precision elements in the
2792/// first source vector and the corresponding elements in the second source
2793/// vector.
2794///
2795/// The EFLAGS register is updated as follows: \n
2796/// If there is at least one pair of single-precision elements where the
2797/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2798/// ZF flag is set to 1. \n
2799/// If there is at least one pair of single-precision elements where the
2800/// sign-bit of the first element is 0 and the sign-bit of the second element
2801/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2802/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2803/// otherwise it returns 0.
2804///
2805/// \headerfile <x86intrin.h>
2806///
2807/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2808///
2809/// \param __a
2810/// A 256-bit vector of [8 x float].
2811/// \param __b
2812/// A 256-bit vector of [8 x float].
2813/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Logan Chien2833ffb2018-10-09 10:03:24 +08002814static __inline int __DEFAULT_FN_ATTRS
2815_mm256_testnzc_ps(__m256 __a, __m256 __b)
2816{
2817 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2818}
2819
Logan Chien55afb0a2018-10-15 10:42:14 +08002820/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2821/// of the two source vectors.
2822///
2823/// The EFLAGS register is updated as follows: \n
2824/// If there is at least one pair of bits where both bits are 1, the ZF flag
2825/// is set to 0. Otherwise the ZF flag is set to 1. \n
2826/// If there is at least one pair of bits where the bit from the first source
2827/// vector is 0 and the bit from the second source vector is 1, the CF flag
2828/// is set to 0. Otherwise the CF flag is set to 1. \n
2829/// This intrinsic returns the value of the ZF flag.
2830///
2831/// \headerfile <x86intrin.h>
2832///
2833/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2834///
2835/// \param __a
2836/// A 256-bit integer vector.
2837/// \param __b
2838/// A 256-bit integer vector.
2839/// \returns the ZF flag.
Logan Chien2833ffb2018-10-09 10:03:24 +08002840static __inline int __DEFAULT_FN_ATTRS
2841_mm256_testz_si256(__m256i __a, __m256i __b)
2842{
2843 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2844}
2845
Logan Chien55afb0a2018-10-15 10:42:14 +08002846/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2847/// of the two source vectors.
2848///
2849/// The EFLAGS register is updated as follows: \n
2850/// If there is at least one pair of bits where both bits are 1, the ZF flag
2851/// is set to 0. Otherwise the ZF flag is set to 1. \n
2852/// If there is at least one pair of bits where the bit from the first source
2853/// vector is 0 and the bit from the second source vector is 1, the CF flag
2854/// is set to 0. Otherwise the CF flag is set to 1. \n
2855/// This intrinsic returns the value of the CF flag.
2856///
2857/// \headerfile <x86intrin.h>
2858///
2859/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2860///
2861/// \param __a
2862/// A 256-bit integer vector.
2863/// \param __b
2864/// A 256-bit integer vector.
2865/// \returns the CF flag.
Logan Chien2833ffb2018-10-09 10:03:24 +08002866static __inline int __DEFAULT_FN_ATTRS
2867_mm256_testc_si256(__m256i __a, __m256i __b)
2868{
2869 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2870}
2871
Logan Chien55afb0a2018-10-15 10:42:14 +08002872/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2873/// of the two source vectors.
2874///
2875/// The EFLAGS register is updated as follows: \n
2876/// If there is at least one pair of bits where both bits are 1, the ZF flag
2877/// is set to 0. Otherwise the ZF flag is set to 1. \n
2878/// If there is at least one pair of bits where the bit from the first source
2879/// vector is 0 and the bit from the second source vector is 1, the CF flag
2880/// is set to 0. Otherwise the CF flag is set to 1. \n
2881/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2882/// otherwise it returns 0.
2883///
2884/// \headerfile <x86intrin.h>
2885///
2886/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2887///
2888/// \param __a
2889/// A 256-bit integer vector.
2890/// \param __b
2891/// A 256-bit integer vector.
2892/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Logan Chien2833ffb2018-10-09 10:03:24 +08002893static __inline int __DEFAULT_FN_ATTRS
2894_mm256_testnzc_si256(__m256i __a, __m256i __b)
2895{
2896 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2897}
2898
2899/* Vector extract sign mask */
Logan Chien55afb0a2018-10-15 10:42:14 +08002900/// Extracts the sign bits of double-precision floating point elements
2901/// in a 256-bit vector of [4 x double] and writes them to the lower order
2902/// bits of the return value.
2903///
2904/// \headerfile <x86intrin.h>
2905///
2906/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2907///
2908/// \param __a
2909/// A 256-bit vector of [4 x double] containing the double-precision
2910/// floating point values with sign bits to be extracted.
2911/// \returns The sign bits from the operand, written to bits [3:0].
Logan Chien2833ffb2018-10-09 10:03:24 +08002912static __inline int __DEFAULT_FN_ATTRS
2913_mm256_movemask_pd(__m256d __a)
2914{
2915 return __builtin_ia32_movmskpd256((__v4df)__a);
2916}
2917
Logan Chien55afb0a2018-10-15 10:42:14 +08002918/// Extracts the sign bits of single-precision floating point elements
2919/// in a 256-bit vector of [8 x float] and writes them to the lower order
2920/// bits of the return value.
2921///
2922/// \headerfile <x86intrin.h>
2923///
2924/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2925///
2926/// \param __a
2927/// A 256-bit vector of [8 x float] containing the single-precision floating
2928/// point values with sign bits to be extracted.
2929/// \returns The sign bits from the operand, written to bits [7:0].
Logan Chien2833ffb2018-10-09 10:03:24 +08002930static __inline int __DEFAULT_FN_ATTRS
2931_mm256_movemask_ps(__m256 __a)
2932{
2933 return __builtin_ia32_movmskps256((__v8sf)__a);
2934}
2935
2936/* Vector __zero */
/// Sets the contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
2947
/// Sets the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
2958
2959/* Vector load with broadcast */
Logan Chien55afb0a2018-10-15 10:42:14 +08002960/// Loads a scalar single-precision floating point value from the
2961/// specified address pointed to by \a __a and broadcasts it to the elements
2962/// of a [4 x float] vector.
2963///
2964/// \headerfile <x86intrin.h>
2965///
2966/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2967///
2968/// \param __a
2969/// The single-precision floating point value to be broadcast.
2970/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2971/// equal to the broadcast value.
2972static __inline __m128 __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08002973_mm_broadcast_ss(float const *__a)
2974{
2975 float __f = *__a;
Logan Chien55afb0a2018-10-15 10:42:14 +08002976 return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
Logan Chien2833ffb2018-10-09 10:03:24 +08002977}
2978
Logan Chien55afb0a2018-10-15 10:42:14 +08002979/// Loads a scalar double-precision floating point value from the
2980/// specified address pointed to by \a __a and broadcasts it to the elements
2981/// of a [4 x double] vector.
2982///
2983/// \headerfile <x86intrin.h>
2984///
2985/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
2986///
2987/// \param __a
2988/// The double-precision floating point value to be broadcast.
2989/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
2990/// equal to the broadcast value.
Logan Chien2833ffb2018-10-09 10:03:24 +08002991static __inline __m256d __DEFAULT_FN_ATTRS
2992_mm256_broadcast_sd(double const *__a)
2993{
2994 double __d = *__a;
Logan Chien55afb0a2018-10-15 10:42:14 +08002995 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
Logan Chien2833ffb2018-10-09 10:03:24 +08002996}
2997
Logan Chien55afb0a2018-10-15 10:42:14 +08002998/// Loads a scalar single-precision floating point value from the
2999/// specified address pointed to by \a __a and broadcasts it to the elements
3000/// of a [8 x float] vector.
3001///
3002/// \headerfile <x86intrin.h>
3003///
3004/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3005///
3006/// \param __a
3007/// The single-precision floating point value to be broadcast.
3008/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3009/// equal to the broadcast value.
Logan Chien2833ffb2018-10-09 10:03:24 +08003010static __inline __m256 __DEFAULT_FN_ATTRS
3011_mm256_broadcast_ss(float const *__a)
3012{
3013 float __f = *__a;
Logan Chien55afb0a2018-10-15 10:42:14 +08003014 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Logan Chien2833ffb2018-10-09 10:03:24 +08003015}
3016
Logan Chien55afb0a2018-10-15 10:42:14 +08003017/// Loads the data from a 128-bit vector of [2 x double] from the
3018/// specified address pointed to by \a __a and broadcasts it to 128-bit
3019/// elements in a 256-bit vector of [4 x double].
3020///
3021/// \headerfile <x86intrin.h>
3022///
3023/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3024///
3025/// \param __a
3026/// The 128-bit vector of [2 x double] to be broadcast.
3027/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3028/// equal to the broadcast value.
Logan Chien2833ffb2018-10-09 10:03:24 +08003029static __inline __m256d __DEFAULT_FN_ATTRS
3030_mm256_broadcast_pd(__m128d const *__a)
3031{
Logan Chien55afb0a2018-10-15 10:42:14 +08003032 __m128d __b = _mm_loadu_pd((const double *)__a);
3033 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3034 0, 1, 0, 1);
Logan Chien2833ffb2018-10-09 10:03:24 +08003035}
3036
Logan Chien55afb0a2018-10-15 10:42:14 +08003037/// Loads the data from a 128-bit vector of [4 x float] from the
3038/// specified address pointed to by \a __a and broadcasts it to 128-bit
3039/// elements in a 256-bit vector of [8 x float].
3040///
3041/// \headerfile <x86intrin.h>
3042///
3043/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3044///
3045/// \param __a
3046/// The 128-bit vector of [4 x float] to be broadcast.
3047/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3048/// equal to the broadcast value.
Logan Chien2833ffb2018-10-09 10:03:24 +08003049static __inline __m256 __DEFAULT_FN_ATTRS
3050_mm256_broadcast_ps(__m128 const *__a)
3051{
Logan Chien55afb0a2018-10-15 10:42:14 +08003052 __m128 __b = _mm_loadu_ps((const float *)__a);
3053 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3054 0, 1, 2, 3, 0, 1, 2, 3);
Logan Chien2833ffb2018-10-09 10:03:24 +08003055}
3056
3057/* SIMD load ops */
Logan Chien55afb0a2018-10-15 10:42:14 +08003058/// Loads 4 double-precision floating point values from a 32-byte aligned
3059/// memory location pointed to by \a __p into a vector of [4 x double].
3060///
3061/// \headerfile <x86intrin.h>
3062///
3063/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3064///
3065/// \param __p
3066/// A 32-byte aligned pointer to a memory location containing
3067/// double-precision floating point values.
3068/// \returns A 256-bit vector of [4 x double] containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003069static __inline __m256d __DEFAULT_FN_ATTRS
3070_mm256_load_pd(double const *__p)
3071{
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07003072 return *(const __m256d *)__p;
Logan Chien2833ffb2018-10-09 10:03:24 +08003073}
3074
Logan Chien55afb0a2018-10-15 10:42:14 +08003075/// Loads 8 single-precision floating point values from a 32-byte aligned
3076/// memory location pointed to by \a __p into a vector of [8 x float].
3077///
3078/// \headerfile <x86intrin.h>
3079///
3080/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3081///
3082/// \param __p
3083/// A 32-byte aligned pointer to a memory location containing float values.
3084/// \returns A 256-bit vector of [8 x float] containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003085static __inline __m256 __DEFAULT_FN_ATTRS
3086_mm256_load_ps(float const *__p)
3087{
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07003088 return *(const __m256 *)__p;
Logan Chien2833ffb2018-10-09 10:03:24 +08003089}
3090
Logan Chien55afb0a2018-10-15 10:42:14 +08003091/// Loads 4 double-precision floating point values from an unaligned
3092/// memory location pointed to by \a __p into a vector of [4 x double].
3093///
3094/// \headerfile <x86intrin.h>
3095///
3096/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3097///
3098/// \param __p
3099/// A pointer to a memory location containing double-precision floating
3100/// point values.
3101/// \returns A 256-bit vector of [4 x double] containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003102static __inline __m256d __DEFAULT_FN_ATTRS
3103_mm256_loadu_pd(double const *__p)
3104{
3105 struct __loadu_pd {
Logan Chiendbcf4122019-03-21 10:50:25 +08003106 __m256d_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003107 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07003108 return ((const struct __loadu_pd*)__p)->__v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003109}
3110
Logan Chien55afb0a2018-10-15 10:42:14 +08003111/// Loads 8 single-precision floating point values from an unaligned
3112/// memory location pointed to by \a __p into a vector of [8 x float].
3113///
3114/// \headerfile <x86intrin.h>
3115///
3116/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3117///
3118/// \param __p
3119/// A pointer to a memory location containing single-precision floating
3120/// point values.
3121/// \returns A 256-bit vector of [8 x float] containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003122static __inline __m256 __DEFAULT_FN_ATTRS
3123_mm256_loadu_ps(float const *__p)
3124{
3125 struct __loadu_ps {
Logan Chiendbcf4122019-03-21 10:50:25 +08003126 __m256_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003127 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07003128 return ((const struct __loadu_ps*)__p)->__v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003129}
3130
Logan Chien55afb0a2018-10-15 10:42:14 +08003131/// Loads 256 bits of integer data from a 32-byte aligned memory
3132/// location pointed to by \a __p into elements of a 256-bit integer vector.
3133///
3134/// \headerfile <x86intrin.h>
3135///
3136/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3137///
3138/// \param __p
3139/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3140/// values.
3141/// \returns A 256-bit integer vector containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003142static __inline __m256i __DEFAULT_FN_ATTRS
3143_mm256_load_si256(__m256i const *__p)
3144{
3145 return *__p;
3146}
3147
Logan Chien55afb0a2018-10-15 10:42:14 +08003148/// Loads 256 bits of integer data from an unaligned memory location
3149/// pointed to by \a __p into a 256-bit integer vector.
3150///
3151/// \headerfile <x86intrin.h>
3152///
3153/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3154///
3155/// \param __p
3156/// A pointer to a 256-bit integer vector containing integer values.
3157/// \returns A 256-bit integer vector containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003158static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chiendbcf4122019-03-21 10:50:25 +08003159_mm256_loadu_si256(__m256i_u const *__p)
Logan Chien2833ffb2018-10-09 10:03:24 +08003160{
3161 struct __loadu_si256 {
Logan Chiendbcf4122019-03-21 10:50:25 +08003162 __m256i_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003163 } __attribute__((__packed__, __may_alias__));
Sasha Smundak33d5ddd2020-05-04 13:37:26 -07003164 return ((const struct __loadu_si256*)__p)->__v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003165}
3166
Logan Chien55afb0a2018-10-15 10:42:14 +08003167/// Loads 256 bits of integer data from an unaligned memory location
3168/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3169/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3170/// line boundary.
3171///
3172/// \headerfile <x86intrin.h>
3173///
3174/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3175///
3176/// \param __p
3177/// A pointer to a 256-bit integer vector containing integer values.
3178/// \returns A 256-bit integer vector containing the moved values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003179static __inline __m256i __DEFAULT_FN_ATTRS
3180_mm256_lddqu_si256(__m256i const *__p)
3181{
3182 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3183}
3184
3185/* SIMD store ops */
Logan Chien55afb0a2018-10-15 10:42:14 +08003186/// Stores double-precision floating point values from a 256-bit vector
3187/// of [4 x double] to a 32-byte aligned memory location pointed to by
3188/// \a __p.
3189///
3190/// \headerfile <x86intrin.h>
3191///
3192/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3193///
3194/// \param __p
3195/// A 32-byte aligned pointer to a memory location that will receive the
3196/// double-precision floaing point values.
3197/// \param __a
3198/// A 256-bit vector of [4 x double] containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003199static __inline void __DEFAULT_FN_ATTRS
3200_mm256_store_pd(double *__p, __m256d __a)
3201{
3202 *(__m256d *)__p = __a;
3203}
3204
Logan Chien55afb0a2018-10-15 10:42:14 +08003205/// Stores single-precision floating point values from a 256-bit vector
3206/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3207///
3208/// \headerfile <x86intrin.h>
3209///
3210/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3211///
3212/// \param __p
3213/// A 32-byte aligned pointer to a memory location that will receive the
3214/// float values.
3215/// \param __a
3216/// A 256-bit vector of [8 x float] containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003217static __inline void __DEFAULT_FN_ATTRS
3218_mm256_store_ps(float *__p, __m256 __a)
3219{
3220 *(__m256 *)__p = __a;
3221}
3222
Logan Chien55afb0a2018-10-15 10:42:14 +08003223/// Stores double-precision floating point values from a 256-bit vector
3224/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3225///
3226/// \headerfile <x86intrin.h>
3227///
3228/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3229///
3230/// \param __p
3231/// A pointer to a memory location that will receive the double-precision
3232/// floating point values.
3233/// \param __a
3234/// A 256-bit vector of [4 x double] containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003235static __inline void __DEFAULT_FN_ATTRS
3236_mm256_storeu_pd(double *__p, __m256d __a)
3237{
3238 struct __storeu_pd {
Logan Chiendbcf4122019-03-21 10:50:25 +08003239 __m256d_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003240 } __attribute__((__packed__, __may_alias__));
3241 ((struct __storeu_pd*)__p)->__v = __a;
3242}
3243
Logan Chien55afb0a2018-10-15 10:42:14 +08003244/// Stores single-precision floating point values from a 256-bit vector
3245/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3246///
3247/// \headerfile <x86intrin.h>
3248///
3249/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3250///
3251/// \param __p
3252/// A pointer to a memory location that will receive the float values.
3253/// \param __a
3254/// A 256-bit vector of [8 x float] containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003255static __inline void __DEFAULT_FN_ATTRS
3256_mm256_storeu_ps(float *__p, __m256 __a)
3257{
3258 struct __storeu_ps {
Logan Chiendbcf4122019-03-21 10:50:25 +08003259 __m256_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003260 } __attribute__((__packed__, __may_alias__));
3261 ((struct __storeu_ps*)__p)->__v = __a;
3262}
3263
Logan Chien55afb0a2018-10-15 10:42:14 +08003264/// Stores integer values from a 256-bit integer vector to a 32-byte
3265/// aligned memory location pointed to by \a __p.
3266///
3267/// \headerfile <x86intrin.h>
3268///
3269/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3270///
3271/// \param __p
3272/// A 32-byte aligned pointer to a memory location that will receive the
3273/// integer values.
3274/// \param __a
3275/// A 256-bit integer vector containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003276static __inline void __DEFAULT_FN_ATTRS
3277_mm256_store_si256(__m256i *__p, __m256i __a)
3278{
3279 *__p = __a;
3280}
3281
Logan Chien55afb0a2018-10-15 10:42:14 +08003282/// Stores integer values from a 256-bit integer vector to an unaligned
3283/// memory location pointed to by \a __p.
3284///
3285/// \headerfile <x86intrin.h>
3286///
3287/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3288///
3289/// \param __p
3290/// A pointer to a memory location that will receive the integer values.
3291/// \param __a
3292/// A 256-bit integer vector containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003293static __inline void __DEFAULT_FN_ATTRS
Logan Chiendbcf4122019-03-21 10:50:25 +08003294_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Logan Chien2833ffb2018-10-09 10:03:24 +08003295{
3296 struct __storeu_si256 {
Logan Chiendbcf4122019-03-21 10:50:25 +08003297 __m256i_u __v;
Logan Chien2833ffb2018-10-09 10:03:24 +08003298 } __attribute__((__packed__, __may_alias__));
3299 ((struct __storeu_si256*)__p)->__v = __a;
3300}
3301
3302/* Conditional load ops */
Logan Chien55afb0a2018-10-15 10:42:14 +08003303/// Conditionally loads double-precision floating point elements from a
3304/// memory location pointed to by \a __p into a 128-bit vector of
3305/// [2 x double], depending on the mask bits associated with each data
3306/// element.
3307///
3308/// \headerfile <x86intrin.h>
3309///
3310/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3311///
3312/// \param __p
3313/// A pointer to a memory location that contains the double-precision
3314/// floating point values.
3315/// \param __m
3316/// A 128-bit integer vector containing the mask. The most significant bit of
3317/// each data element represents the mask bits. If a mask bit is zero, the
3318/// corresponding value in the memory location is not loaded and the
3319/// corresponding field in the return value is set to zero.
3320/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3321static __inline __m128d __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08003322_mm_maskload_pd(double const *__p, __m128i __m)
3323{
3324 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3325}
3326
Logan Chien55afb0a2018-10-15 10:42:14 +08003327/// Conditionally loads double-precision floating point elements from a
3328/// memory location pointed to by \a __p into a 256-bit vector of
3329/// [4 x double], depending on the mask bits associated with each data
3330/// element.
3331///
3332/// \headerfile <x86intrin.h>
3333///
3334/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3335///
3336/// \param __p
3337/// A pointer to a memory location that contains the double-precision
3338/// floating point values.
3339/// \param __m
3340/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3341/// significant bit of each quadword element represents the mask bits. If a
3342/// mask bit is zero, the corresponding value in the memory location is not
3343/// loaded and the corresponding field in the return value is set to zero.
3344/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003345static __inline __m256d __DEFAULT_FN_ATTRS
3346_mm256_maskload_pd(double const *__p, __m256i __m)
3347{
3348 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3349 (__v4di)__m);
3350}
3351
Logan Chien55afb0a2018-10-15 10:42:14 +08003352/// Conditionally loads single-precision floating point elements from a
3353/// memory location pointed to by \a __p into a 128-bit vector of
3354/// [4 x float], depending on the mask bits associated with each data
3355/// element.
3356///
3357/// \headerfile <x86intrin.h>
3358///
3359/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3360///
3361/// \param __p
3362/// A pointer to a memory location that contains the single-precision
3363/// floating point values.
3364/// \param __m
3365/// A 128-bit integer vector containing the mask. The most significant bit of
3366/// each data element represents the mask bits. If a mask bit is zero, the
3367/// corresponding value in the memory location is not loaded and the
3368/// corresponding field in the return value is set to zero.
3369/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3370static __inline __m128 __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08003371_mm_maskload_ps(float const *__p, __m128i __m)
3372{
3373 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3374}
3375
Logan Chien55afb0a2018-10-15 10:42:14 +08003376/// Conditionally loads single-precision floating point elements from a
3377/// memory location pointed to by \a __p into a 256-bit vector of
3378/// [8 x float], depending on the mask bits associated with each data
3379/// element.
3380///
3381/// \headerfile <x86intrin.h>
3382///
3383/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3384///
3385/// \param __p
3386/// A pointer to a memory location that contains the single-precision
3387/// floating point values.
3388/// \param __m
3389/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3390/// significant bit of each dword element represents the mask bits. If a mask
3391/// bit is zero, the corresponding value in the memory location is not loaded
3392/// and the corresponding field in the return value is set to zero.
3393/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003394static __inline __m256 __DEFAULT_FN_ATTRS
3395_mm256_maskload_ps(float const *__p, __m256i __m)
3396{
3397 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3398}
3399
3400/* Conditional store ops */
Logan Chien55afb0a2018-10-15 10:42:14 +08003401/// Moves single-precision floating point values from a 256-bit vector
3402/// of [8 x float] to a memory location pointed to by \a __p, according to
3403/// the specified mask.
3404///
3405/// \headerfile <x86intrin.h>
3406///
3407/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3408///
3409/// \param __p
3410/// A pointer to a memory location that will receive the float values.
3411/// \param __m
3412/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3413/// significant bit of each dword element in the mask vector represents the
3414/// mask bits. If a mask bit is zero, the corresponding value from vector
3415/// \a __a is not stored and the corresponding field in the memory location
3416/// pointed to by \a __p is not changed.
3417/// \param __a
3418/// A 256-bit vector of [8 x float] containing the values to be stored.
Logan Chien2833ffb2018-10-09 10:03:24 +08003419static __inline void __DEFAULT_FN_ATTRS
3420_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3421{
3422 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3423}
3424
Logan Chien55afb0a2018-10-15 10:42:14 +08003425/// Moves double-precision values from a 128-bit vector of [2 x double]
3426/// to a memory location pointed to by \a __p, according to the specified
3427/// mask.
3428///
3429/// \headerfile <x86intrin.h>
3430///
3431/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3432///
3433/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
3435/// \param __m
3436/// A 128-bit integer vector containing the mask. The most significant bit of
3437/// each field in the mask vector represents the mask bits. If a mask bit is
3438/// zero, the corresponding value from vector \a __a is not stored and the
3439/// corresponding field in the memory location pointed to by \a __p is not
3440/// changed.
3441/// \param __a
3442/// A 128-bit vector of [2 x double] containing the values to be stored.
3443static __inline void __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08003444_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3445{
3446 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3447}
3448
Logan Chien55afb0a2018-10-15 10:42:14 +08003449/// Moves double-precision values from a 256-bit vector of [4 x double]
3450/// to a memory location pointed to by \a __p, according to the specified
3451/// mask.
3452///
3453/// \headerfile <x86intrin.h>
3454///
3455/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3456///
3457/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
3459/// \param __m
3460/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3461/// significant bit of each quadword element in the mask vector represents
3462/// the mask bits. If a mask bit is zero, the corresponding value from vector
/// \a __a is not stored and the corresponding field in the memory location
3464/// pointed to by \a __p is not changed.
3465/// \param __a
3466/// A 256-bit vector of [4 x double] containing the values to be stored.
Logan Chien2833ffb2018-10-09 10:03:24 +08003467static __inline void __DEFAULT_FN_ATTRS
3468_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3469{
3470 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3471}
3472
Logan Chien55afb0a2018-10-15 10:42:14 +08003473/// Moves single-precision floating point values from a 128-bit vector
3474/// of [4 x float] to a memory location pointed to by \a __p, according to
3475/// the specified mask.
3476///
3477/// \headerfile <x86intrin.h>
3478///
3479/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3480///
3481/// \param __p
3482/// A pointer to a memory location that will receive the float values.
3483/// \param __m
3484/// A 128-bit integer vector containing the mask. The most significant bit of
3485/// each field in the mask vector represents the mask bits. If a mask bit is
/// zero, the corresponding value from vector \a __a is not stored and the
3487/// corresponding field in the memory location pointed to by \a __p is not
3488/// changed.
3489/// \param __a
3490/// A 128-bit vector of [4 x float] containing the values to be stored.
3491static __inline void __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08003492_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3493{
3494 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3495}
3496
3497/* Cacheability support ops */
Logan Chien55afb0a2018-10-15 10:42:14 +08003498/// Moves integer data from a 256-bit integer vector to a 32-byte
3499/// aligned memory location. To minimize caching, the data is flagged as
3500/// non-temporal (unlikely to be used again soon).
3501///
3502/// \headerfile <x86intrin.h>
3503///
3504/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3505///
3506/// \param __a
3507/// A pointer to a 32-byte aligned memory location that will receive the
3508/// integer values.
3509/// \param __b
3510/// A 256-bit integer vector containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003511static __inline void __DEFAULT_FN_ATTRS
3512_mm256_stream_si256(__m256i *__a, __m256i __b)
3513{
Logan Chien55afb0a2018-10-15 10:42:14 +08003514 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3515 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +08003516}
3517
Logan Chien55afb0a2018-10-15 10:42:14 +08003518/// Moves double-precision values from a 256-bit vector of [4 x double]
3519/// to a 32-byte aligned memory location. To minimize caching, the data is
3520/// flagged as non-temporal (unlikely to be used again soon).
3521///
3522/// \headerfile <x86intrin.h>
3523///
3524/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3525///
3526/// \param __a
3527/// A pointer to a 32-byte aligned memory location that will receive the
3528/// double-precision floating-point values.
3529/// \param __b
3530/// A 256-bit vector of [4 x double] containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003531static __inline void __DEFAULT_FN_ATTRS
3532_mm256_stream_pd(double *__a, __m256d __b)
3533{
Logan Chien55afb0a2018-10-15 10:42:14 +08003534 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3535 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Logan Chien2833ffb2018-10-09 10:03:24 +08003536}
3537
Logan Chien55afb0a2018-10-15 10:42:14 +08003538/// Moves single-precision floating point values from a 256-bit vector
3539/// of [8 x float] to a 32-byte aligned memory location. To minimize
3540/// caching, the data is flagged as non-temporal (unlikely to be used again
3541/// soon).
3542///
3543/// \headerfile <x86intrin.h>
3544///
3545/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3546///
3547/// \param __p
3548/// A pointer to a 32-byte aligned memory location that will receive the
3549/// single-precision floating point values.
3550/// \param __a
3551/// A 256-bit vector of [8 x float] containing the values to be moved.
Logan Chien2833ffb2018-10-09 10:03:24 +08003552static __inline void __DEFAULT_FN_ATTRS
3553_mm256_stream_ps(float *__p, __m256 __a)
3554{
Logan Chien55afb0a2018-10-15 10:42:14 +08003555 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3556 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Logan Chien2833ffb2018-10-09 10:03:24 +08003557}
3558
3559/* Create vectors */
Logan Chien55afb0a2018-10-15 10:42:14 +08003560/// Create a 256-bit vector of [4 x double] with undefined values.
3561///
3562/// \headerfile <x86intrin.h>
3563///
3564/// This intrinsic has no corresponding instruction.
3565///
3566/// \returns A 256-bit vector of [4 x double] containing undefined values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003567static __inline__ __m256d __DEFAULT_FN_ATTRS
3568_mm256_undefined_pd(void)
3569{
3570 return (__m256d)__builtin_ia32_undef256();
3571}
3572
Logan Chien55afb0a2018-10-15 10:42:14 +08003573/// Create a 256-bit vector of [8 x float] with undefined values.
3574///
3575/// \headerfile <x86intrin.h>
3576///
3577/// This intrinsic has no corresponding instruction.
3578///
3579/// \returns A 256-bit vector of [8 x float] containing undefined values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003580static __inline__ __m256 __DEFAULT_FN_ATTRS
3581_mm256_undefined_ps(void)
3582{
3583 return (__m256)__builtin_ia32_undef256();
3584}
3585
Logan Chien55afb0a2018-10-15 10:42:14 +08003586/// Create a 256-bit integer vector with undefined values.
3587///
3588/// \headerfile <x86intrin.h>
3589///
3590/// This intrinsic has no corresponding instruction.
3591///
3592/// \returns A 256-bit integer vector containing undefined values.
Logan Chien2833ffb2018-10-09 10:03:24 +08003593static __inline__ __m256i __DEFAULT_FN_ATTRS
3594_mm256_undefined_si256(void)
3595{
3596 return (__m256i)__builtin_ia32_undef256();
3597}
3598
Logan Chien55afb0a2018-10-15 10:42:14 +08003599/// Constructs a 256-bit floating-point vector of [4 x double]
3600/// initialized with the specified double-precision floating-point values.
3601///
3602/// \headerfile <x86intrin.h>
3603///
3604/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3605/// instruction.
3606///
3607/// \param __a
3608/// A double-precision floating-point value used to initialize bits [255:192]
3609/// of the result.
3610/// \param __b
3611/// A double-precision floating-point value used to initialize bits [191:128]
3612/// of the result.
3613/// \param __c
3614/// A double-precision floating-point value used to initialize bits [127:64]
3615/// of the result.
3616/// \param __d
3617/// A double-precision floating-point value used to initialize bits [63:0]
3618/// of the result.
3619/// \returns An initialized 256-bit floating-point vector of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +08003620static __inline __m256d __DEFAULT_FN_ATTRS
3621_mm256_set_pd(double __a, double __b, double __c, double __d)
3622{
Logan Chien55afb0a2018-10-15 10:42:14 +08003623 return __extension__ (__m256d){ __d, __c, __b, __a };
Logan Chien2833ffb2018-10-09 10:03:24 +08003624}
3625
Logan Chien55afb0a2018-10-15 10:42:14 +08003626/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3627/// with the specified single-precision floating-point values.
3628///
3629/// \headerfile <x86intrin.h>
3630///
3631/// This intrinsic is a utility function and does not correspond to a specific
3632/// instruction.
3633///
3634/// \param __a
3635/// A single-precision floating-point value used to initialize bits [255:224]
3636/// of the result.
3637/// \param __b
3638/// A single-precision floating-point value used to initialize bits [223:192]
3639/// of the result.
3640/// \param __c
3641/// A single-precision floating-point value used to initialize bits [191:160]
3642/// of the result.
3643/// \param __d
3644/// A single-precision floating-point value used to initialize bits [159:128]
3645/// of the result.
3646/// \param __e
3647/// A single-precision floating-point value used to initialize bits [127:96]
3648/// of the result.
3649/// \param __f
3650/// A single-precision floating-point value used to initialize bits [95:64]
3651/// of the result.
3652/// \param __g
3653/// A single-precision floating-point value used to initialize bits [63:32]
3654/// of the result.
3655/// \param __h
3656/// A single-precision floating-point value used to initialize bits [31:0]
3657/// of the result.
3658/// \returns An initialized 256-bit floating-point vector of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08003659static __inline __m256 __DEFAULT_FN_ATTRS
3660_mm256_set_ps(float __a, float __b, float __c, float __d,
3661 float __e, float __f, float __g, float __h)
3662{
Logan Chien55afb0a2018-10-15 10:42:14 +08003663 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Logan Chien2833ffb2018-10-09 10:03:24 +08003664}
3665
Logan Chien55afb0a2018-10-15 10:42:14 +08003666/// Constructs a 256-bit integer vector initialized with the specified
3667/// 32-bit integral values.
3668///
3669/// \headerfile <x86intrin.h>
3670///
3671/// This intrinsic is a utility function and does not correspond to a specific
3672/// instruction.
3673///
3674/// \param __i0
3675/// A 32-bit integral value used to initialize bits [255:224] of the result.
3676/// \param __i1
3677/// A 32-bit integral value used to initialize bits [223:192] of the result.
3678/// \param __i2
3679/// A 32-bit integral value used to initialize bits [191:160] of the result.
3680/// \param __i3
3681/// A 32-bit integral value used to initialize bits [159:128] of the result.
3682/// \param __i4
3683/// A 32-bit integral value used to initialize bits [127:96] of the result.
3684/// \param __i5
3685/// A 32-bit integral value used to initialize bits [95:64] of the result.
3686/// \param __i6
3687/// A 32-bit integral value used to initialize bits [63:32] of the result.
3688/// \param __i7
3689/// A 32-bit integral value used to initialize bits [31:0] of the result.
3690/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08003691static __inline __m256i __DEFAULT_FN_ATTRS
3692_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3693 int __i4, int __i5, int __i6, int __i7)
3694{
Logan Chien55afb0a2018-10-15 10:42:14 +08003695 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08003696}
3697
Logan Chien55afb0a2018-10-15 10:42:14 +08003698/// Constructs a 256-bit integer vector initialized with the specified
3699/// 16-bit integral values.
3700///
3701/// \headerfile <x86intrin.h>
3702///
3703/// This intrinsic is a utility function and does not correspond to a specific
3704/// instruction.
3705///
3706/// \param __w15
3707/// A 16-bit integral value used to initialize bits [255:240] of the result.
3708/// \param __w14
3709/// A 16-bit integral value used to initialize bits [239:224] of the result.
3710/// \param __w13
3711/// A 16-bit integral value used to initialize bits [223:208] of the result.
3712/// \param __w12
3713/// A 16-bit integral value used to initialize bits [207:192] of the result.
3714/// \param __w11
3715/// A 16-bit integral value used to initialize bits [191:176] of the result.
3716/// \param __w10
3717/// A 16-bit integral value used to initialize bits [175:160] of the result.
3718/// \param __w09
3719/// A 16-bit integral value used to initialize bits [159:144] of the result.
3720/// \param __w08
3721/// A 16-bit integral value used to initialize bits [143:128] of the result.
3722/// \param __w07
3723/// A 16-bit integral value used to initialize bits [127:112] of the result.
3724/// \param __w06
3725/// A 16-bit integral value used to initialize bits [111:96] of the result.
3726/// \param __w05
3727/// A 16-bit integral value used to initialize bits [95:80] of the result.
3728/// \param __w04
3729/// A 16-bit integral value used to initialize bits [79:64] of the result.
3730/// \param __w03
3731/// A 16-bit integral value used to initialize bits [63:48] of the result.
3732/// \param __w02
3733/// A 16-bit integral value used to initialize bits [47:32] of the result.
3734/// \param __w01
3735/// A 16-bit integral value used to initialize bits [31:16] of the result.
3736/// \param __w00
3737/// A 16-bit integral value used to initialize bits [15:0] of the result.
3738/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08003739static __inline __m256i __DEFAULT_FN_ATTRS
3740_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3741 short __w11, short __w10, short __w09, short __w08,
3742 short __w07, short __w06, short __w05, short __w04,
3743 short __w03, short __w02, short __w01, short __w00)
3744{
Logan Chien55afb0a2018-10-15 10:42:14 +08003745 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
Logan Chien2833ffb2018-10-09 10:03:24 +08003746 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3747}
3748
Logan Chien55afb0a2018-10-15 10:42:14 +08003749/// Constructs a 256-bit integer vector initialized with the specified
3750/// 8-bit integral values.
3751///
3752/// \headerfile <x86intrin.h>
3753///
3754/// This intrinsic is a utility function and does not correspond to a specific
3755/// instruction.
3756///
3757/// \param __b31
3758/// An 8-bit integral value used to initialize bits [255:248] of the result.
3759/// \param __b30
3760/// An 8-bit integral value used to initialize bits [247:240] of the result.
3761/// \param __b29
3762/// An 8-bit integral value used to initialize bits [239:232] of the result.
3763/// \param __b28
3764/// An 8-bit integral value used to initialize bits [231:224] of the result.
3765/// \param __b27
3766/// An 8-bit integral value used to initialize bits [223:216] of the result.
3767/// \param __b26
3768/// An 8-bit integral value used to initialize bits [215:208] of the result.
3769/// \param __b25
3770/// An 8-bit integral value used to initialize bits [207:200] of the result.
3771/// \param __b24
3772/// An 8-bit integral value used to initialize bits [199:192] of the result.
3773/// \param __b23
3774/// An 8-bit integral value used to initialize bits [191:184] of the result.
3775/// \param __b22
3776/// An 8-bit integral value used to initialize bits [183:176] of the result.
3777/// \param __b21
3778/// An 8-bit integral value used to initialize bits [175:168] of the result.
3779/// \param __b20
3780/// An 8-bit integral value used to initialize bits [167:160] of the result.
3781/// \param __b19
3782/// An 8-bit integral value used to initialize bits [159:152] of the result.
3783/// \param __b18
3784/// An 8-bit integral value used to initialize bits [151:144] of the result.
3785/// \param __b17
3786/// An 8-bit integral value used to initialize bits [143:136] of the result.
3787/// \param __b16
3788/// An 8-bit integral value used to initialize bits [135:128] of the result.
3789/// \param __b15
3790/// An 8-bit integral value used to initialize bits [127:120] of the result.
3791/// \param __b14
3792/// An 8-bit integral value used to initialize bits [119:112] of the result.
3793/// \param __b13
3794/// An 8-bit integral value used to initialize bits [111:104] of the result.
3795/// \param __b12
3796/// An 8-bit integral value used to initialize bits [103:96] of the result.
3797/// \param __b11
3798/// An 8-bit integral value used to initialize bits [95:88] of the result.
3799/// \param __b10
3800/// An 8-bit integral value used to initialize bits [87:80] of the result.
3801/// \param __b09
3802/// An 8-bit integral value used to initialize bits [79:72] of the result.
3803/// \param __b08
3804/// An 8-bit integral value used to initialize bits [71:64] of the result.
3805/// \param __b07
3806/// An 8-bit integral value used to initialize bits [63:56] of the result.
3807/// \param __b06
3808/// An 8-bit integral value used to initialize bits [55:48] of the result.
3809/// \param __b05
3810/// An 8-bit integral value used to initialize bits [47:40] of the result.
3811/// \param __b04
3812/// An 8-bit integral value used to initialize bits [39:32] of the result.
3813/// \param __b03
3814/// An 8-bit integral value used to initialize bits [31:24] of the result.
3815/// \param __b02
3816/// An 8-bit integral value used to initialize bits [23:16] of the result.
3817/// \param __b01
3818/// An 8-bit integral value used to initialize bits [15:8] of the result.
3819/// \param __b00
3820/// An 8-bit integral value used to initialize bits [7:0] of the result.
3821/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08003822static __inline __m256i __DEFAULT_FN_ATTRS
3823_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3824 char __b27, char __b26, char __b25, char __b24,
3825 char __b23, char __b22, char __b21, char __b20,
3826 char __b19, char __b18, char __b17, char __b16,
3827 char __b15, char __b14, char __b13, char __b12,
3828 char __b11, char __b10, char __b09, char __b08,
3829 char __b07, char __b06, char __b05, char __b04,
3830 char __b03, char __b02, char __b01, char __b00)
3831{
Logan Chien55afb0a2018-10-15 10:42:14 +08003832 return __extension__ (__m256i)(__v32qi){
Logan Chien2833ffb2018-10-09 10:03:24 +08003833 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3834 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3835 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3836 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3837 };
3838}
3839
Logan Chien55afb0a2018-10-15 10:42:14 +08003840/// Constructs a 256-bit integer vector initialized with the specified
3841/// 64-bit integral values.
3842///
3843/// \headerfile <x86intrin.h>
3844///
3845/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3846/// instruction.
3847///
3848/// \param __a
3849/// A 64-bit integral value used to initialize bits [255:192] of the result.
3850/// \param __b
3851/// A 64-bit integral value used to initialize bits [191:128] of the result.
3852/// \param __c
3853/// A 64-bit integral value used to initialize bits [127:64] of the result.
3854/// \param __d
3855/// A 64-bit integral value used to initialize bits [63:0] of the result.
3856/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08003857static __inline __m256i __DEFAULT_FN_ATTRS
3858_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3859{
Logan Chien55afb0a2018-10-15 10:42:14 +08003860 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
Logan Chien2833ffb2018-10-09 10:03:24 +08003861}
3862
3863/* Create vectors with elements in reverse order */
Logan Chien55afb0a2018-10-15 10:42:14 +08003864/// Constructs a 256-bit floating-point vector of [4 x double],
3865/// initialized in reverse order with the specified double-precision
3866/// floating-point values.
3867///
3868/// \headerfile <x86intrin.h>
3869///
3870/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3871/// instruction.
3872///
3873/// \param __a
3874/// A double-precision floating-point value used to initialize bits [63:0]
3875/// of the result.
3876/// \param __b
3877/// A double-precision floating-point value used to initialize bits [127:64]
3878/// of the result.
3879/// \param __c
3880/// A double-precision floating-point value used to initialize bits [191:128]
3881/// of the result.
3882/// \param __d
3883/// A double-precision floating-point value used to initialize bits [255:192]
3884/// of the result.
3885/// \returns An initialized 256-bit floating-point vector of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +08003886static __inline __m256d __DEFAULT_FN_ATTRS
3887_mm256_setr_pd(double __a, double __b, double __c, double __d)
3888{
Logan Chien55afb0a2018-10-15 10:42:14 +08003889 return _mm256_set_pd(__d, __c, __b, __a);
Logan Chien2833ffb2018-10-09 10:03:24 +08003890}
3891
Logan Chien55afb0a2018-10-15 10:42:14 +08003892/// Constructs a 256-bit floating-point vector of [8 x float],
3893/// initialized in reverse order with the specified single-precision
/// floating-point values.
3895///
3896/// \headerfile <x86intrin.h>
3897///
3898/// This intrinsic is a utility function and does not correspond to a specific
3899/// instruction.
3900///
3901/// \param __a
3902/// A single-precision floating-point value used to initialize bits [31:0]
3903/// of the result.
3904/// \param __b
3905/// A single-precision floating-point value used to initialize bits [63:32]
3906/// of the result.
3907/// \param __c
3908/// A single-precision floating-point value used to initialize bits [95:64]
3909/// of the result.
3910/// \param __d
3911/// A single-precision floating-point value used to initialize bits [127:96]
3912/// of the result.
3913/// \param __e
3914/// A single-precision floating-point value used to initialize bits [159:128]
3915/// of the result.
3916/// \param __f
3917/// A single-precision floating-point value used to initialize bits [191:160]
3918/// of the result.
3919/// \param __g
3920/// A single-precision floating-point value used to initialize bits [223:192]
3921/// of the result.
3922/// \param __h
3923/// A single-precision floating-point value used to initialize bits [255:224]
3924/// of the result.
3925/// \returns An initialized 256-bit floating-point vector of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08003926static __inline __m256 __DEFAULT_FN_ATTRS
3927_mm256_setr_ps(float __a, float __b, float __c, float __d,
3928 float __e, float __f, float __g, float __h)
3929{
Logan Chien55afb0a2018-10-15 10:42:14 +08003930 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
Logan Chien2833ffb2018-10-09 10:03:24 +08003931}
3932
Logan Chien55afb0a2018-10-15 10:42:14 +08003933/// Constructs a 256-bit integer vector, initialized in reverse order
3934/// with the specified 32-bit integral values.
3935///
3936/// \headerfile <x86intrin.h>
3937///
3938/// This intrinsic is a utility function and does not correspond to a specific
3939/// instruction.
3940///
3941/// \param __i0
3942/// A 32-bit integral value used to initialize bits [31:0] of the result.
3943/// \param __i1
3944/// A 32-bit integral value used to initialize bits [63:32] of the result.
3945/// \param __i2
3946/// A 32-bit integral value used to initialize bits [95:64] of the result.
3947/// \param __i3
3948/// A 32-bit integral value used to initialize bits [127:96] of the result.
3949/// \param __i4
3950/// A 32-bit integral value used to initialize bits [159:128] of the result.
3951/// \param __i5
3952/// A 32-bit integral value used to initialize bits [191:160] of the result.
3953/// \param __i6
3954/// A 32-bit integral value used to initialize bits [223:192] of the result.
3955/// \param __i7
3956/// A 32-bit integral value used to initialize bits [255:224] of the result.
3957/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08003958static __inline __m256i __DEFAULT_FN_ATTRS
3959_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3960 int __i4, int __i5, int __i6, int __i7)
3961{
Logan Chien55afb0a2018-10-15 10:42:14 +08003962 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
Logan Chien2833ffb2018-10-09 10:03:24 +08003963}
3964
Logan Chien55afb0a2018-10-15 10:42:14 +08003965/// Constructs a 256-bit integer vector, initialized in reverse order
3966/// with the specified 16-bit integral values.
3967///
3968/// \headerfile <x86intrin.h>
3969///
3970/// This intrinsic is a utility function and does not correspond to a specific
3971/// instruction.
3972///
3973/// \param __w15
3974/// A 16-bit integral value used to initialize bits [15:0] of the result.
3975/// \param __w14
3976/// A 16-bit integral value used to initialize bits [31:16] of the result.
3977/// \param __w13
3978/// A 16-bit integral value used to initialize bits [47:32] of the result.
3979/// \param __w12
3980/// A 16-bit integral value used to initialize bits [63:48] of the result.
3981/// \param __w11
3982/// A 16-bit integral value used to initialize bits [79:64] of the result.
3983/// \param __w10
3984/// A 16-bit integral value used to initialize bits [95:80] of the result.
3985/// \param __w09
3986/// A 16-bit integral value used to initialize bits [111:96] of the result.
3987/// \param __w08
3988/// A 16-bit integral value used to initialize bits [127:112] of the result.
3989/// \param __w07
3990/// A 16-bit integral value used to initialize bits [143:128] of the result.
3991/// \param __w06
3992/// A 16-bit integral value used to initialize bits [159:144] of the result.
3993/// \param __w05
3994/// A 16-bit integral value used to initialize bits [175:160] of the result.
3995/// \param __w04
3996/// A 16-bit integral value used to initialize bits [191:176] of the result.
3997/// \param __w03
3998/// A 16-bit integral value used to initialize bits [207:192] of the result.
3999/// \param __w02
4000/// A 16-bit integral value used to initialize bits [223:208] of the result.
4001/// \param __w01
4002/// A 16-bit integral value used to initialize bits [239:224] of the result.
4003/// \param __w00
4004/// A 16-bit integral value used to initialize bits [255:240] of the result.
4005/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08004006static __inline __m256i __DEFAULT_FN_ATTRS
4007_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4008 short __w11, short __w10, short __w09, short __w08,
4009 short __w07, short __w06, short __w05, short __w04,
4010 short __w03, short __w02, short __w01, short __w00)
4011{
Logan Chien55afb0a2018-10-15 10:42:14 +08004012 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4013 __w04, __w05, __w06, __w07,
4014 __w08, __w09, __w10, __w11,
4015 __w12, __w13, __w14, __w15);
Logan Chien2833ffb2018-10-09 10:03:24 +08004016}
4017
Logan Chien55afb0a2018-10-15 10:42:14 +08004018/// Constructs a 256-bit integer vector, initialized in reverse order
4019/// with the specified 8-bit integral values.
4020///
4021/// \headerfile <x86intrin.h>
4022///
4023/// This intrinsic is a utility function and does not correspond to a specific
4024/// instruction.
4025///
4026/// \param __b31
4027/// An 8-bit integral value used to initialize bits [7:0] of the result.
4028/// \param __b30
4029/// An 8-bit integral value used to initialize bits [15:8] of the result.
4030/// \param __b29
4031/// An 8-bit integral value used to initialize bits [23:16] of the result.
4032/// \param __b28
4033/// An 8-bit integral value used to initialize bits [31:24] of the result.
4034/// \param __b27
4035/// An 8-bit integral value used to initialize bits [39:32] of the result.
4036/// \param __b26
4037/// An 8-bit integral value used to initialize bits [47:40] of the result.
4038/// \param __b25
4039/// An 8-bit integral value used to initialize bits [55:48] of the result.
4040/// \param __b24
4041/// An 8-bit integral value used to initialize bits [63:56] of the result.
4042/// \param __b23
4043/// An 8-bit integral value used to initialize bits [71:64] of the result.
4044/// \param __b22
4045/// An 8-bit integral value used to initialize bits [79:72] of the result.
4046/// \param __b21
4047/// An 8-bit integral value used to initialize bits [87:80] of the result.
4048/// \param __b20
4049/// An 8-bit integral value used to initialize bits [95:88] of the result.
4050/// \param __b19
4051/// An 8-bit integral value used to initialize bits [103:96] of the result.
4052/// \param __b18
4053/// An 8-bit integral value used to initialize bits [111:104] of the result.
4054/// \param __b17
4055/// An 8-bit integral value used to initialize bits [119:112] of the result.
4056/// \param __b16
4057/// An 8-bit integral value used to initialize bits [127:120] of the result.
4058/// \param __b15
4059/// An 8-bit integral value used to initialize bits [135:128] of the result.
4060/// \param __b14
4061/// An 8-bit integral value used to initialize bits [143:136] of the result.
4062/// \param __b13
4063/// An 8-bit integral value used to initialize bits [151:144] of the result.
4064/// \param __b12
4065/// An 8-bit integral value used to initialize bits [159:152] of the result.
4066/// \param __b11
4067/// An 8-bit integral value used to initialize bits [167:160] of the result.
4068/// \param __b10
4069/// An 8-bit integral value used to initialize bits [175:168] of the result.
4070/// \param __b09
4071/// An 8-bit integral value used to initialize bits [183:176] of the result.
4072/// \param __b08
4073/// An 8-bit integral value used to initialize bits [191:184] of the result.
4074/// \param __b07
4075/// An 8-bit integral value used to initialize bits [199:192] of the result.
4076/// \param __b06
4077/// An 8-bit integral value used to initialize bits [207:200] of the result.
4078/// \param __b05
4079/// An 8-bit integral value used to initialize bits [215:208] of the result.
4080/// \param __b04
4081/// An 8-bit integral value used to initialize bits [223:216] of the result.
4082/// \param __b03
4083/// An 8-bit integral value used to initialize bits [231:224] of the result.
4084/// \param __b02
4085/// An 8-bit integral value used to initialize bits [239:232] of the result.
4086/// \param __b01
4087/// An 8-bit integral value used to initialize bits [247:240] of the result.
4088/// \param __b00
4089/// An 8-bit integral value used to initialize bits [255:248] of the result.
4090/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08004091static __inline __m256i __DEFAULT_FN_ATTRS
4092_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4093 char __b27, char __b26, char __b25, char __b24,
4094 char __b23, char __b22, char __b21, char __b20,
4095 char __b19, char __b18, char __b17, char __b16,
4096 char __b15, char __b14, char __b13, char __b12,
4097 char __b11, char __b10, char __b09, char __b08,
4098 char __b07, char __b06, char __b05, char __b04,
4099 char __b03, char __b02, char __b01, char __b00)
4100{
Logan Chien55afb0a2018-10-15 10:42:14 +08004101 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4102 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4103 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4104 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
Logan Chien2833ffb2018-10-09 10:03:24 +08004105}
4106
Logan Chien55afb0a2018-10-15 10:42:14 +08004107/// Constructs a 256-bit integer vector, initialized in reverse order
4108/// with the specified 64-bit integral values.
4109///
4110/// \headerfile <x86intrin.h>
4111///
4112/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4113/// instruction.
4114///
4115/// \param __a
4116/// A 64-bit integral value used to initialize bits [63:0] of the result.
4117/// \param __b
4118/// A 64-bit integral value used to initialize bits [127:64] of the result.
4119/// \param __c
4120/// A 64-bit integral value used to initialize bits [191:128] of the result.
4121/// \param __d
4122/// A 64-bit integral value used to initialize bits [255:192] of the result.
4123/// \returns An initialized 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08004124static __inline __m256i __DEFAULT_FN_ATTRS
4125_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4126{
Logan Chien55afb0a2018-10-15 10:42:14 +08004127 return _mm256_set_epi64x(__d, __c, __b, __a);
Logan Chien2833ffb2018-10-09 10:03:24 +08004128}
4129
4130/* Create vectors with repeated elements */
Logan Chien55afb0a2018-10-15 10:42:14 +08004131/// Constructs a 256-bit floating-point vector of [4 x double], with each
4132/// of the four double-precision floating-point vector elements set to the
4133/// specified double-precision floating-point value.
4134///
4135/// \headerfile <x86intrin.h>
4136///
4137/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4138///
4139/// \param __w
4140/// A double-precision floating-point value used to initialize each vector
4141/// element of the result.
4142/// \returns An initialized 256-bit floating-point vector of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +08004143static __inline __m256d __DEFAULT_FN_ATTRS
4144_mm256_set1_pd(double __w)
4145{
Logan Chien55afb0a2018-10-15 10:42:14 +08004146 return _mm256_set_pd(__w, __w, __w, __w);
Logan Chien2833ffb2018-10-09 10:03:24 +08004147}
4148
Logan Chien55afb0a2018-10-15 10:42:14 +08004149/// Constructs a 256-bit floating-point vector of [8 x float], with each
4150/// of the eight single-precision floating-point vector elements set to the
4151/// specified single-precision floating-point value.
4152///
4153/// \headerfile <x86intrin.h>
4154///
4155/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4156/// instruction.
4157///
4158/// \param __w
4159/// A single-precision floating-point value used to initialize each vector
4160/// element of the result.
4161/// \returns An initialized 256-bit floating-point vector of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08004162static __inline __m256 __DEFAULT_FN_ATTRS
4163_mm256_set1_ps(float __w)
4164{
Logan Chien55afb0a2018-10-15 10:42:14 +08004165 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
Logan Chien2833ffb2018-10-09 10:03:24 +08004166}
4167
Logan Chien55afb0a2018-10-15 10:42:14 +08004168/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4169/// 32-bit integral vector elements set to the specified 32-bit integral
4170/// value.
4171///
4172/// \headerfile <x86intrin.h>
4173///
4174/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4175/// instruction.
4176///
4177/// \param __i
4178/// A 32-bit integral value used to initialize each vector element of the
4179/// result.
4180/// \returns An initialized 256-bit integer vector of [8 x i32].
Logan Chien2833ffb2018-10-09 10:03:24 +08004181static __inline __m256i __DEFAULT_FN_ATTRS
4182_mm256_set1_epi32(int __i)
4183{
Logan Chien55afb0a2018-10-15 10:42:14 +08004184 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
Logan Chien2833ffb2018-10-09 10:03:24 +08004185}
4186
Logan Chien55afb0a2018-10-15 10:42:14 +08004187/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4188/// 16-bit integral vector elements set to the specified 16-bit integral
4189/// value.
4190///
4191/// \headerfile <x86intrin.h>
4192///
4193/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4194///
4195/// \param __w
4196/// A 16-bit integral value used to initialize each vector element of the
4197/// result.
4198/// \returns An initialized 256-bit integer vector of [16 x i16].
Logan Chien2833ffb2018-10-09 10:03:24 +08004199static __inline __m256i __DEFAULT_FN_ATTRS
4200_mm256_set1_epi16(short __w)
4201{
Logan Chien55afb0a2018-10-15 10:42:14 +08004202 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4203 __w, __w, __w, __w, __w, __w, __w, __w);
Logan Chien2833ffb2018-10-09 10:03:24 +08004204}
4205
Logan Chien55afb0a2018-10-15 10:42:14 +08004206/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4207/// 8-bit integral vector elements set to the specified 8-bit integral value.
4208///
4209/// \headerfile <x86intrin.h>
4210///
4211/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4212///
4213/// \param __b
4214/// An 8-bit integral value used to initialize each vector element of the
4215/// result.
4216/// \returns An initialized 256-bit integer vector of [32 x i8].
Logan Chien2833ffb2018-10-09 10:03:24 +08004217static __inline __m256i __DEFAULT_FN_ATTRS
4218_mm256_set1_epi8(char __b)
4219{
Logan Chien55afb0a2018-10-15 10:42:14 +08004220 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4221 __b, __b, __b, __b, __b, __b, __b, __b,
4222 __b, __b, __b, __b, __b, __b, __b, __b,
4223 __b, __b, __b, __b, __b, __b, __b, __b);
Logan Chien2833ffb2018-10-09 10:03:24 +08004224}
4225
Logan Chien55afb0a2018-10-15 10:42:14 +08004226/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4227/// 64-bit integral vector elements set to the specified 64-bit integral
4228/// value.
4229///
4230/// \headerfile <x86intrin.h>
4231///
4232/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4233///
4234/// \param __q
4235/// A 64-bit integral value used to initialize each vector element of the
4236/// result.
4237/// \returns An initialized 256-bit integer vector of [4 x i64].
Logan Chien2833ffb2018-10-09 10:03:24 +08004238static __inline __m256i __DEFAULT_FN_ATTRS
4239_mm256_set1_epi64x(long long __q)
4240{
Logan Chien55afb0a2018-10-15 10:42:14 +08004241 return _mm256_set_epi64x(__q, __q, __q, __q);
Logan Chien2833ffb2018-10-09 10:03:24 +08004242}
4243
/* Create zeroed vectors */
Logan Chien55afb0a2018-10-15 10:42:14 +08004245/// Constructs a 256-bit floating-point vector of [4 x double] with all
4246/// vector elements initialized to zero.
4247///
4248/// \headerfile <x86intrin.h>
4249///
4250/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4251///
4252/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Logan Chien2833ffb2018-10-09 10:03:24 +08004253static __inline __m256d __DEFAULT_FN_ATTRS
4254_mm256_setzero_pd(void)
4255{
Logan Chien55afb0a2018-10-15 10:42:14 +08004256 return __extension__ (__m256d){ 0, 0, 0, 0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08004257}
4258
Logan Chien55afb0a2018-10-15 10:42:14 +08004259/// Constructs a 256-bit floating-point vector of [8 x float] with all
4260/// vector elements initialized to zero.
4261///
4262/// \headerfile <x86intrin.h>
4263///
4264/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4265///
4266/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Logan Chien2833ffb2018-10-09 10:03:24 +08004267static __inline __m256 __DEFAULT_FN_ATTRS
4268_mm256_setzero_ps(void)
4269{
Logan Chien55afb0a2018-10-15 10:42:14 +08004270 return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08004271}
4272
Logan Chien55afb0a2018-10-15 10:42:14 +08004273/// Constructs a 256-bit integer vector initialized to zero.
4274///
4275/// \headerfile <x86intrin.h>
4276///
4277/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4278///
4279/// \returns A 256-bit integer vector initialized to zero.
Logan Chien2833ffb2018-10-09 10:03:24 +08004280static __inline __m256i __DEFAULT_FN_ATTRS
4281_mm256_setzero_si256(void)
4282{
Logan Chien55afb0a2018-10-15 10:42:14 +08004283 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
Logan Chien2833ffb2018-10-09 10:03:24 +08004284}
4285
4286/* Cast between vector types */
Logan Chien55afb0a2018-10-15 10:42:14 +08004287/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4288/// floating-point vector of [8 x float].
4289///
4290/// \headerfile <x86intrin.h>
4291///
4292/// This intrinsic has no corresponding instruction.
4293///
4294/// \param __a
4295/// A 256-bit floating-point vector of [4 x double].
4296/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4297/// bitwise pattern as the parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004298static __inline __m256 __DEFAULT_FN_ATTRS
4299_mm256_castpd_ps(__m256d __a)
4300{
4301 return (__m256)__a;
4302}
4303
Logan Chien55afb0a2018-10-15 10:42:14 +08004304/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4305/// integer vector.
4306///
4307/// \headerfile <x86intrin.h>
4308///
4309/// This intrinsic has no corresponding instruction.
4310///
4311/// \param __a
4312/// A 256-bit floating-point vector of [4 x double].
4313/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4314/// parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004315static __inline __m256i __DEFAULT_FN_ATTRS
4316_mm256_castpd_si256(__m256d __a)
4317{
4318 return (__m256i)__a;
4319}
4320
Logan Chien55afb0a2018-10-15 10:42:14 +08004321/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4322/// floating-point vector of [4 x double].
4323///
4324/// \headerfile <x86intrin.h>
4325///
4326/// This intrinsic has no corresponding instruction.
4327///
4328/// \param __a
4329/// A 256-bit floating-point vector of [8 x float].
4330/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4331/// bitwise pattern as the parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004332static __inline __m256d __DEFAULT_FN_ATTRS
4333_mm256_castps_pd(__m256 __a)
4334{
4335 return (__m256d)__a;
4336}
4337
Logan Chien55afb0a2018-10-15 10:42:14 +08004338/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4339/// integer vector.
4340///
4341/// \headerfile <x86intrin.h>
4342///
4343/// This intrinsic has no corresponding instruction.
4344///
4345/// \param __a
4346/// A 256-bit floating-point vector of [8 x float].
4347/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4348/// parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004349static __inline __m256i __DEFAULT_FN_ATTRS
4350_mm256_castps_si256(__m256 __a)
4351{
4352 return (__m256i)__a;
4353}
4354
Logan Chien55afb0a2018-10-15 10:42:14 +08004355/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4356/// of [8 x float].
4357///
4358/// \headerfile <x86intrin.h>
4359///
4360/// This intrinsic has no corresponding instruction.
4361///
4362/// \param __a
4363/// A 256-bit integer vector.
4364/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4365/// bitwise pattern as the parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004366static __inline __m256 __DEFAULT_FN_ATTRS
4367_mm256_castsi256_ps(__m256i __a)
4368{
4369 return (__m256)__a;
4370}
4371
Logan Chien55afb0a2018-10-15 10:42:14 +08004372/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4373/// of [4 x double].
4374///
4375/// \headerfile <x86intrin.h>
4376///
4377/// This intrinsic has no corresponding instruction.
4378///
4379/// \param __a
4380/// A 256-bit integer vector.
4381/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4382/// bitwise pattern as the parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004383static __inline __m256d __DEFAULT_FN_ATTRS
4384_mm256_castsi256_pd(__m256i __a)
4385{
4386 return (__m256d)__a;
4387}
4388
Logan Chien55afb0a2018-10-15 10:42:14 +08004389/// Returns the lower 128 bits of a 256-bit floating-point vector of
4390/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4391///
4392/// \headerfile <x86intrin.h>
4393///
4394/// This intrinsic has no corresponding instruction.
4395///
4396/// \param __a
4397/// A 256-bit floating-point vector of [4 x double].
4398/// \returns A 128-bit floating-point vector of [2 x double] containing the
4399/// lower 128 bits of the parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004400static __inline __m128d __DEFAULT_FN_ATTRS
4401_mm256_castpd256_pd128(__m256d __a)
4402{
4403 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4404}
4405
Logan Chien55afb0a2018-10-15 10:42:14 +08004406/// Returns the lower 128 bits of a 256-bit floating-point vector of
4407/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4408///
4409/// \headerfile <x86intrin.h>
4410///
4411/// This intrinsic has no corresponding instruction.
4412///
4413/// \param __a
4414/// A 256-bit floating-point vector of [8 x float].
4415/// \returns A 128-bit floating-point vector of [4 x float] containing the
4416/// lower 128 bits of the parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004417static __inline __m128 __DEFAULT_FN_ATTRS
4418_mm256_castps256_ps128(__m256 __a)
4419{
4420 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4421}
4422
Logan Chien55afb0a2018-10-15 10:42:14 +08004423/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4424///
4425/// \headerfile <x86intrin.h>
4426///
4427/// This intrinsic has no corresponding instruction.
4428///
4429/// \param __a
4430/// A 256-bit integer vector.
4431/// \returns A 128-bit integer vector containing the lower 128 bits of the
4432/// parameter.
Logan Chien2833ffb2018-10-09 10:03:24 +08004433static __inline __m128i __DEFAULT_FN_ATTRS
4434_mm256_castsi256_si128(__m256i __a)
4435{
4436 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4437}
4438
Logan Chien55afb0a2018-10-15 10:42:14 +08004439/// Constructs a 256-bit floating-point vector of [4 x double] from a
4440/// 128-bit floating-point vector of [2 x double].
4441///
4442/// The lower 128 bits contain the value of the source vector. The contents
4443/// of the upper 128 bits are undefined.
4444///
4445/// \headerfile <x86intrin.h>
4446///
4447/// This intrinsic has no corresponding instruction.
4448///
4449/// \param __a
4450/// A 128-bit vector of [2 x double].
4451/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4452/// contain the value of the parameter. The contents of the upper 128 bits
4453/// are undefined.
Logan Chien2833ffb2018-10-09 10:03:24 +08004454static __inline __m256d __DEFAULT_FN_ATTRS
4455_mm256_castpd128_pd256(__m128d __a)
4456{
4457 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4458}
4459
Logan Chien55afb0a2018-10-15 10:42:14 +08004460/// Constructs a 256-bit floating-point vector of [8 x float] from a
4461/// 128-bit floating-point vector of [4 x float].
4462///
4463/// The lower 128 bits contain the value of the source vector. The contents
4464/// of the upper 128 bits are undefined.
4465///
4466/// \headerfile <x86intrin.h>
4467///
4468/// This intrinsic has no corresponding instruction.
4469///
4470/// \param __a
4471/// A 128-bit vector of [4 x float].
4472/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4473/// contain the value of the parameter. The contents of the upper 128 bits
4474/// are undefined.
Logan Chien2833ffb2018-10-09 10:03:24 +08004475static __inline __m256 __DEFAULT_FN_ATTRS
4476_mm256_castps128_ps256(__m128 __a)
4477{
4478 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4479}
4480
Logan Chien55afb0a2018-10-15 10:42:14 +08004481/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4482///
4483/// The lower 128 bits contain the value of the source vector. The contents
4484/// of the upper 128 bits are undefined.
4485///
4486/// \headerfile <x86intrin.h>
4487///
4488/// This intrinsic has no corresponding instruction.
4489///
4490/// \param __a
4491/// A 128-bit integer vector.
4492/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4493/// the parameter. The contents of the upper 128 bits are undefined.
Logan Chien2833ffb2018-10-09 10:03:24 +08004494static __inline __m256i __DEFAULT_FN_ATTRS
4495_mm256_castsi128_si256(__m128i __a)
4496{
4497 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4498}
4499
Logan Chien55afb0a2018-10-15 10:42:14 +08004500/// Constructs a 256-bit floating-point vector of [4 x double] from a
4501/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4502/// contain the value of the source vector. The upper 128 bits are set
4503/// to zero.
4504///
4505/// \headerfile <x86intrin.h>
4506///
4507/// This intrinsic has no corresponding instruction.
4508///
4509/// \param __a
4510/// A 128-bit vector of [2 x double].
4511/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4512/// contain the value of the parameter. The upper 128 bits are set to zero.
4513static __inline __m256d __DEFAULT_FN_ATTRS
4514_mm256_zextpd128_pd256(__m128d __a)
4515{
4516 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4517}
4518
4519/// Constructs a 256-bit floating-point vector of [8 x float] from a
4520/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4521/// the value of the source vector. The upper 128 bits are set to zero.
4522///
4523/// \headerfile <x86intrin.h>
4524///
4525/// This intrinsic has no corresponding instruction.
4526///
4527/// \param __a
4528/// A 128-bit vector of [4 x float].
4529/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4530/// contain the value of the parameter. The upper 128 bits are set to zero.
4531static __inline __m256 __DEFAULT_FN_ATTRS
4532_mm256_zextps128_ps256(__m128 __a)
4533{
4534 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4535}
4536
4537/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4538/// The lower 128 bits contain the value of the source vector. The upper
4539/// 128 bits are set to zero.
4540///
4541/// \headerfile <x86intrin.h>
4542///
4543/// This intrinsic has no corresponding instruction.
4544///
4545/// \param __a
4546/// A 128-bit integer vector.
4547/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4548/// the parameter. The upper 128 bits are set to zero.
4549static __inline __m256i __DEFAULT_FN_ATTRS
4550_mm256_zextsi128_si256(__m128i __a)
4551{
4552 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4553}
4554
Logan Chien2833ffb2018-10-09 10:03:24 +08004555/*
4556 Vector insert.
4557 We use macros rather than inlines because we only want to accept
4558 invocations where the immediate M is a constant expression.
4559*/
Logan Chien55afb0a2018-10-15 10:42:14 +08004560/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4561/// a 256-bit vector of [8 x float] given in the first parameter, and then
4562/// replacing either the upper or the lower 128 bits with the contents of a
4563/// 128-bit vector of [4 x float] in the second parameter.
4564///
4565/// The immediate integer parameter determines between the upper or the lower
4566/// 128 bits.
4567///
4568/// \headerfile <x86intrin.h>
4569///
4570/// \code
4571/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4572/// \endcode
4573///
4574/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4575///
4576/// \param V1
4577/// A 256-bit vector of [8 x float]. This vector is copied to the result
4578/// first, and then either the upper or the lower 128 bits of the result will
4579/// be replaced by the contents of \a V2.
4580/// \param V2
4581/// A 128-bit vector of [4 x float]. The contents of this parameter are
4582/// written to either the upper or the lower 128 bits of the result depending
4583/// on the value of parameter \a M.
4584/// \param M
4585/// An immediate integer. The least significant bit determines how the values
4586/// from the two parameters are interleaved: \n
/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
/// result. \n
/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
/// result.
4593/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M)                                     \
  ((__m256)__builtin_ia32_vinsertf128_ps256(                                \
      (__v8sf)(__m256)(V1), /* 256-bit vector copied to the result */       \
      (__v4sf)(__m128)(V2), /* 128-bit vector written into one half */      \
      (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08004597
Logan Chien55afb0a2018-10-15 10:42:14 +08004598/// Constructs a new 256-bit vector of [4 x double] by first duplicating
4599/// a 256-bit vector of [4 x double] given in the first parameter, and then
4600/// replacing either the upper or the lower 128 bits with the contents of a
4601/// 128-bit vector of [2 x double] in the second parameter.
4602///
4603/// The immediate integer parameter determines between the upper or the lower
4604/// 128 bits.
4605///
4606/// \headerfile <x86intrin.h>
4607///
4608/// \code
4609/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4610/// \endcode
4611///
4612/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4613///
4614/// \param V1
4615/// A 256-bit vector of [4 x double]. This vector is copied to the result
4616/// first, and then either the upper or the lower 128 bits of the result will
4617/// be replaced by the contents of \a V2.
4618/// \param V2
4619/// A 128-bit vector of [2 x double]. The contents of this parameter are
4620/// written to either the upper or the lower 128 bits of the result depending
4621/// on the value of parameter \a M.
4622/// \param M
4623/// An immediate integer. The least significant bit determines how the values
4624/// from the two parameters are interleaved: \n
/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
/// result. \n
/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
/// result.
4631/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
4632#define _mm256_insertf128_pd(V1, V2, M) \
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08004633 ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
4634 (__v2df)(__m128d)(V2), (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08004635
/// Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector given in the first parameter, in which either the upper or the
///    lower 128 bits have been overwritten with the contents of the 128-bit
///    integer vector given in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It is copied to the result first; afterwards
///    either the upper or the lower 128 bits of the result are replaced by
///    the contents of \a V2.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the upper
///    or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit determines how the
///    values from the two parameters are combined: \n
///    If bit [0] of \a M is 0, the contents of \a V2 are copied to bits
///    [127:0] of the result, and bits [255:128] of \a V1 are copied to bits
///    [255:128] of the result. \n
///    If bit [0] of \a M is 1, the contents of \a V2 are copied to bits
///    [255:128] of the result, and bits [127:0] of \a V1 are copied to bits
///    [127:0] of the result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M)     \
  ((__m256i)__builtin_ia32_vinsertf128_si256(  \
      (__v8si)(__m256i)(V1),                   \
      (__v4si)(__m128i)(V2),                   \
      (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08004673
/*
   Vector extract.
   These are macros rather than inline functions because only invocations
   in which the immediate M is a constant expression should be accepted.
*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
///    vector of [4 x float]. The immediate integer parameter selects whether
///    the upper or the lower 128 bits are extracted.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M)            \
  ((__m128)__builtin_ia32_vextractf128_ps256(  \
      (__v8sf)(__m256)(V),                     \
      (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08004702
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a 128-bit
///    vector of [2 x double]. The immediate integer parameter selects whether
///    the upper or the lower 128 bits are extracted.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M)             \
  ((__m128d)__builtin_ia32_vextractf128_pd256(  \
      (__v4df)(__m256d)(V),                     \
      (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08004726
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. The immediate integer parameter selects whether the
///    upper or the lower 128 bits are extracted.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits
///    are extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M)          \
  ((__m128i)__builtin_ia32_vextractf128_si256(  \
      (__v8si)(__m256i)(V),                     \
      (int)(M)))
Logan Chien2833ffb2018-10-09 10:03:24 +08004750
4751/* SIMD load ops (unaligned) */
Logan Chien55afb0a2018-10-15 10:42:14 +08004752/// Loads two 128-bit floating-point vectors of [4 x float] from
4753/// unaligned memory locations and constructs a 256-bit floating-point vector
4754/// of [8 x float] by concatenating the two 128-bit vectors.
4755///
4756/// \headerfile <x86intrin.h>
4757///
4758/// This intrinsic corresponds to load instructions followed by the
4759/// <c> VINSERTF128 </c> instruction.
4760///
4761/// \param __addr_hi
4762/// A pointer to a 128-bit memory location containing 4 consecutive
4763/// single-precision floating-point values. These values are to be copied to
4764/// bits[255:128] of the result. The address of the memory location does not
4765/// have to be aligned.
4766/// \param __addr_lo
4767/// A pointer to a 128-bit memory location containing 4 consecutive
4768/// single-precision floating-point values. These values are to be copied to
4769/// bits[127:0] of the result. The address of the memory location does not
4770/// have to be aligned.
4771/// \returns A 256-bit floating-point vector of [8 x float] containing the
4772/// concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08004773static __inline __m256 __DEFAULT_FN_ATTRS
4774_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4775{
4776 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4777 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4778}
4779
Logan Chien55afb0a2018-10-15 10:42:14 +08004780/// Loads two 128-bit floating-point vectors of [2 x double] from
4781/// unaligned memory locations and constructs a 256-bit floating-point vector
4782/// of [4 x double] by concatenating the two 128-bit vectors.
4783///
4784/// \headerfile <x86intrin.h>
4785///
4786/// This intrinsic corresponds to load instructions followed by the
4787/// <c> VINSERTF128 </c> instruction.
4788///
4789/// \param __addr_hi
4790/// A pointer to a 128-bit memory location containing two consecutive
4791/// double-precision floating-point values. These values are to be copied to
4792/// bits[255:128] of the result. The address of the memory location does not
4793/// have to be aligned.
4794/// \param __addr_lo
4795/// A pointer to a 128-bit memory location containing two consecutive
4796/// double-precision floating-point values. These values are to be copied to
4797/// bits[127:0] of the result. The address of the memory location does not
4798/// have to be aligned.
4799/// \returns A 256-bit floating-point vector of [4 x double] containing the
4800/// concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08004801static __inline __m256d __DEFAULT_FN_ATTRS
4802_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4803{
4804 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4805 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4806}
4807
Logan Chien55afb0a2018-10-15 10:42:14 +08004808/// Loads two 128-bit integer vectors from unaligned memory locations and
4809/// constructs a 256-bit integer vector by concatenating the two 128-bit
4810/// vectors.
4811///
4812/// \headerfile <x86intrin.h>
4813///
4814/// This intrinsic corresponds to load instructions followed by the
4815/// <c> VINSERTF128 </c> instruction.
4816///
4817/// \param __addr_hi
4818/// A pointer to a 128-bit memory location containing a 128-bit integer
4819/// vector. This vector is to be copied to bits[255:128] of the result. The
4820/// address of the memory location does not have to be aligned.
4821/// \param __addr_lo
4822/// A pointer to a 128-bit memory location containing a 128-bit integer
4823/// vector. This vector is to be copied to bits[127:0] of the result. The
4824/// address of the memory location does not have to be aligned.
4825/// \returns A 256-bit integer vector containing the concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08004826static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chiendbcf4122019-03-21 10:50:25 +08004827_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
Logan Chien2833ffb2018-10-09 10:03:24 +08004828{
4829 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4830 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4831}
4832
4833/* SIMD store ops (unaligned) */
Logan Chien55afb0a2018-10-15 10:42:14 +08004834/// Stores the upper and lower 128 bits of a 256-bit floating-point
4835/// vector of [8 x float] into two different unaligned memory locations.
4836///
4837/// \headerfile <x86intrin.h>
4838///
4839/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4840/// store instructions.
4841///
4842/// \param __addr_hi
4843/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4844/// copied to this memory location. The address of this memory location does
4845/// not have to be aligned.
4846/// \param __addr_lo
4847/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4848/// copied to this memory location. The address of this memory location does
4849/// not have to be aligned.
4850/// \param __a
4851/// A 256-bit floating-point vector of [8 x float].
Logan Chien2833ffb2018-10-09 10:03:24 +08004852static __inline void __DEFAULT_FN_ATTRS
4853_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4854{
4855 __m128 __v128;
4856
4857 __v128 = _mm256_castps256_ps128(__a);
4858 _mm_storeu_ps(__addr_lo, __v128);
4859 __v128 = _mm256_extractf128_ps(__a, 1);
4860 _mm_storeu_ps(__addr_hi, __v128);
4861}
4862
Logan Chien55afb0a2018-10-15 10:42:14 +08004863/// Stores the upper and lower 128 bits of a 256-bit floating-point
4864/// vector of [4 x double] into two different unaligned memory locations.
4865///
4866/// \headerfile <x86intrin.h>
4867///
4868/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4869/// store instructions.
4870///
4871/// \param __addr_hi
4872/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4873/// copied to this memory location. The address of this memory location does
4874/// not have to be aligned.
4875/// \param __addr_lo
4876/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4877/// copied to this memory location. The address of this memory location does
4878/// not have to be aligned.
4879/// \param __a
4880/// A 256-bit floating-point vector of [4 x double].
Logan Chien2833ffb2018-10-09 10:03:24 +08004881static __inline void __DEFAULT_FN_ATTRS
4882_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4883{
4884 __m128d __v128;
4885
4886 __v128 = _mm256_castpd256_pd128(__a);
4887 _mm_storeu_pd(__addr_lo, __v128);
4888 __v128 = _mm256_extractf128_pd(__a, 1);
4889 _mm_storeu_pd(__addr_hi, __v128);
4890}
4891
Logan Chien55afb0a2018-10-15 10:42:14 +08004892/// Stores the upper and lower 128 bits of a 256-bit integer vector into
4893/// two different unaligned memory locations.
4894///
4895/// \headerfile <x86intrin.h>
4896///
4897/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4898/// store instructions.
4899///
4900/// \param __addr_hi
4901/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4902/// copied to this memory location. The address of this memory location does
4903/// not have to be aligned.
4904/// \param __addr_lo
4905/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4906/// copied to this memory location. The address of this memory location does
4907/// not have to be aligned.
4908/// \param __a
4909/// A 256-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +08004910static __inline void __DEFAULT_FN_ATTRS
Logan Chiendbcf4122019-03-21 10:50:25 +08004911_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
Logan Chien2833ffb2018-10-09 10:03:24 +08004912{
4913 __m128i __v128;
4914
4915 __v128 = _mm256_castsi256_si128(__a);
4916 _mm_storeu_si128(__addr_lo, __v128);
4917 __v128 = _mm256_extractf128_si256(__a, 1);
4918 _mm_storeu_si128(__addr_hi, __v128);
4919}
4920
Logan Chien55afb0a2018-10-15 10:42:14 +08004921/// Constructs a 256-bit floating-point vector of [8 x float] by
4922/// concatenating two 128-bit floating-point vectors of [4 x float].
4923///
4924/// \headerfile <x86intrin.h>
4925///
4926/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4927///
4928/// \param __hi
4929/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4930/// 128 bits of the result.
4931/// \param __lo
4932/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4933/// 128 bits of the result.
4934/// \returns A 256-bit floating-point vector of [8 x float] containing the
4935/// concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08004936static __inline __m256 __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08004937_mm256_set_m128 (__m128 __hi, __m128 __lo)
4938{
Logan Chien2833ffb2018-10-09 10:03:24 +08004939 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4940}
4941
Logan Chien55afb0a2018-10-15 10:42:14 +08004942/// Constructs a 256-bit floating-point vector of [4 x double] by
4943/// concatenating two 128-bit floating-point vectors of [2 x double].
4944///
4945/// \headerfile <x86intrin.h>
4946///
4947/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4948///
4949/// \param __hi
4950/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4951/// 128 bits of the result.
4952/// \param __lo
4953/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4954/// 128 bits of the result.
4955/// \returns A 256-bit floating-point vector of [4 x double] containing the
4956/// concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08004957static __inline __m256d __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08004958_mm256_set_m128d (__m128d __hi, __m128d __lo)
4959{
4960 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
Logan Chien2833ffb2018-10-09 10:03:24 +08004961}
4962
Logan Chien55afb0a2018-10-15 10:42:14 +08004963/// Constructs a 256-bit integer vector by concatenating two 128-bit
4964/// integer vectors.
4965///
4966/// \headerfile <x86intrin.h>
4967///
4968/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4969///
4970/// \param __hi
4971/// A 128-bit integer vector to be copied to the upper 128 bits of the
4972/// result.
4973/// \param __lo
4974/// A 128-bit integer vector to be copied to the lower 128 bits of the
4975/// result.
4976/// \returns A 256-bit integer vector containing the concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08004977static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08004978_mm256_set_m128i (__m128i __hi, __m128i __lo)
4979{
4980 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
Logan Chien2833ffb2018-10-09 10:03:24 +08004981}
4982
Logan Chien55afb0a2018-10-15 10:42:14 +08004983/// Constructs a 256-bit floating-point vector of [8 x float] by
4984/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4985/// similar to _mm256_set_m128, but the order of the input parameters is
4986/// swapped.
4987///
4988/// \headerfile <x86intrin.h>
4989///
4990/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4991///
4992/// \param __lo
4993/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4994/// 128 bits of the result.
4995/// \param __hi
4996/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4997/// 128 bits of the result.
4998/// \returns A 256-bit floating-point vector of [8 x float] containing the
4999/// concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08005000static __inline __m256 __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08005001_mm256_setr_m128 (__m128 __lo, __m128 __hi)
5002{
Logan Chien2833ffb2018-10-09 10:03:24 +08005003 return _mm256_set_m128(__hi, __lo);
5004}
5005
Logan Chien55afb0a2018-10-15 10:42:14 +08005006/// Constructs a 256-bit floating-point vector of [4 x double] by
5007/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
5008/// similar to _mm256_set_m128d, but the order of the input parameters is
5009/// swapped.
5010///
5011/// \headerfile <x86intrin.h>
5012///
5013/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5014///
5015/// \param __lo
5016/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5017/// 128 bits of the result.
5018/// \param __hi
5019/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5020/// 128 bits of the result.
5021/// \returns A 256-bit floating-point vector of [4 x double] containing the
5022/// concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08005023static __inline __m256d __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08005024_mm256_setr_m128d (__m128d __lo, __m128d __hi)
5025{
5026 return (__m256d)_mm256_set_m128d(__hi, __lo);
Logan Chien2833ffb2018-10-09 10:03:24 +08005027}
5028
Logan Chien55afb0a2018-10-15 10:42:14 +08005029/// Constructs a 256-bit integer vector by concatenating two 128-bit
5030/// integer vectors. This is similar to _mm256_set_m128i, but the order of
5031/// the input parameters is swapped.
5032///
5033/// \headerfile <x86intrin.h>
5034///
5035/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5036///
5037/// \param __lo
5038/// A 128-bit integer vector to be copied to the lower 128 bits of the
5039/// result.
5040/// \param __hi
5041/// A 128-bit integer vector to be copied to the upper 128 bits of the
5042/// result.
5043/// \returns A 256-bit integer vector containing the concatenated result.
Logan Chien2833ffb2018-10-09 10:03:24 +08005044static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chien55afb0a2018-10-15 10:42:14 +08005045_mm256_setr_m128i (__m128i __lo, __m128i __hi)
5046{
5047 return (__m256i)_mm256_set_m128i(__hi, __lo);
Logan Chien2833ffb2018-10-09 10:03:24 +08005048}
5049
/* Remove the __DEFAULT_FN_ATTRS* helper macros (__DEFAULT_FN_ATTRS decorates
 * the inline functions above) so they do not stay defined past the end of
 * this header. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
Logan Chien2833ffb2018-10-09 10:03:24 +08005052
5053#endif /* __AVXINTRIN_H */