blob: 2a733230c6019017a5f8ffd49dd86da3500f9213 [file] [log] [blame]
Ben Murdoch097c5b22016-05-18 11:27:45 +01001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
27
28#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Element-typed 256-bit (32-byte) vector views; the intrinsics below cast
 * their operands to these before applying operators or builtins. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* 256-bit vector types used in the signatures of the intrinsics below:
 * 8 x float, 4 x double, and integer (declared as 4 x long long). */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
48
49/* Arithmetic */
50/// \brief Adds two 256-bit vectors of [4 x double].
51///
52/// \headerfile <x86intrin.h>
53///
54/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction.
55///
56/// \param __a
57/// A 256-bit vector of [4 x double] containing one of the source operands.
58/// \param __b
59/// A 256-bit vector of [4 x double] containing one of the source operands.
60/// \returns A 256-bit vector of [4 x double] containing the sums of both
61/// operands.
62static __inline __m256d __DEFAULT_FN_ATTRS
63_mm256_add_pd(__m256d __a, __m256d __b)
64{
65 return __a+__b;
66}
67
68/// \brief Adds two 256-bit vectors of [8 x float].
69///
70/// \headerfile <x86intrin.h>
71///
72/// This intrinsic corresponds to the \c VADDPS / ADDPS instruction.
73///
74/// \param __a
75/// A 256-bit vector of [8 x float] containing one of the source operands.
76/// \param __b
77/// A 256-bit vector of [8 x float] containing one of the source operands.
78/// \returns A 256-bit vector of [8 x float] containing the sums of both
79/// operands.
80static __inline __m256 __DEFAULT_FN_ATTRS
81_mm256_add_ps(__m256 __a, __m256 __b)
82{
83 return __a+__b;
84}
85
86/// \brief Subtracts two 256-bit vectors of [4 x double].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction.
91///
92/// \param __a
93/// A 256-bit vector of [4 x double] containing the minuend.
94/// \param __b
95/// A 256-bit vector of [4 x double] containing the subtrahend.
96/// \returns A 256-bit vector of [4 x double] containing the differences between
97/// both operands.
98static __inline __m256d __DEFAULT_FN_ATTRS
99_mm256_sub_pd(__m256d __a, __m256d __b)
100{
101 return __a-__b;
102}
103
104/// \brief Subtracts two 256-bit vectors of [8 x float].
105///
106/// \headerfile <x86intrin.h>
107///
108/// This intrinsic corresponds to the \c VSUBPS / SUBPS instruction.
109///
110/// \param __a
111/// A 256-bit vector of [8 x float] containing the minuend.
112/// \param __b
113/// A 256-bit vector of [8 x float] containing the subtrahend.
114/// \returns A 256-bit vector of [8 x float] containing the differences between
115/// both operands.
116static __inline __m256 __DEFAULT_FN_ATTRS
117_mm256_sub_ps(__m256 __a, __m256 __b)
118{
119 return __a-__b;
120}
121
/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
/// two 256-bit vectors of [4 x double].
124///
125/// \headerfile <x86intrin.h>
126///
127/// This intrinsic corresponds to the \c VADDSUBPD / ADDSUBPD instruction.
128///
129/// \param __a
130/// A 256-bit vector of [4 x double] containing the left source operand.
131/// \param __b
132/// A 256-bit vector of [4 x double] containing the right source operand.
133/// \returns A 256-bit vector of [4 x double] containing the alternating sums
134/// and differences between both operands.
135static __inline __m256d __DEFAULT_FN_ATTRS
136_mm256_addsub_pd(__m256d __a, __m256d __b)
137{
138 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
139}
140
/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
/// two 256-bit vectors of [8 x float].
143///
144/// \headerfile <x86intrin.h>
145///
146/// This intrinsic corresponds to the \c VADDSUBPS / ADDSUBPS instruction.
147///
148/// \param __a
149/// A 256-bit vector of [8 x float] containing the left source operand.
150/// \param __b
151/// A 256-bit vector of [8 x float] containing the right source operand.
152/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
153/// differences between both operands.
154static __inline __m256 __DEFAULT_FN_ATTRS
155_mm256_addsub_ps(__m256 __a, __m256 __b)
156{
157 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
158}
159
160/// \brief Divides two 256-bit vectors of [4 x double].
161///
162/// \headerfile <x86intrin.h>
163///
164/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction.
165///
166/// \param __a
167/// A 256-bit vector of [4 x double] containing the dividend.
168/// \param __b
169/// A 256-bit vector of [4 x double] containing the divisor.
170/// \returns A 256-bit vector of [4 x double] containing the quotients between
171/// both operands.
172static __inline __m256d __DEFAULT_FN_ATTRS
173_mm256_div_pd(__m256d __a, __m256d __b)
174{
175 return __a / __b;
176}
177
178/// \brief Divides two 256-bit vectors of [8 x float].
179///
180/// \headerfile <x86intrin.h>
181///
182/// This intrinsic corresponds to the \c VDIVPS / DIVPS instruction.
183///
184/// \param __a
185/// A 256-bit vector of [8 x float] containing the dividend.
186/// \param __b
187/// A 256-bit vector of [8 x float] containing the divisor.
188/// \returns A 256-bit vector of [8 x float] containing the quotients between
189/// both operands.
190static __inline __m256 __DEFAULT_FN_ATTRS
191_mm256_div_ps(__m256 __a, __m256 __b)
192{
193 return __a / __b;
194}
195
196/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
197/// of each pair of values.
198///
199/// \headerfile <x86intrin.h>
200///
201/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction.
202///
203/// \param __a
204/// A 256-bit vector of [4 x double] containing one of the operands.
205/// \param __b
206/// A 256-bit vector of [4 x double] containing one of the operands.
207/// \returns A 256-bit vector of [4 x double] containing the maximum values
208/// between both operands.
209static __inline __m256d __DEFAULT_FN_ATTRS
210_mm256_max_pd(__m256d __a, __m256d __b)
211{
212 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
213}
214
215/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
216/// of each pair of values.
217///
218/// \headerfile <x86intrin.h>
219///
220/// This intrinsic corresponds to the \c VMAXPS / MAXPS instruction.
221///
222/// \param __a
223/// A 256-bit vector of [8 x float] containing one of the operands.
224/// \param __b
225/// A 256-bit vector of [8 x float] containing one of the operands.
226/// \returns A 256-bit vector of [8 x float] containing the maximum values
227/// between both operands.
228static __inline __m256 __DEFAULT_FN_ATTRS
229_mm256_max_ps(__m256 __a, __m256 __b)
230{
231 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
232}
233
234/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
235/// of each pair of values.
236///
237/// \headerfile <x86intrin.h>
238///
239/// This intrinsic corresponds to the \c VMINPD / MINPD instruction.
240///
241/// \param __a
242/// A 256-bit vector of [4 x double] containing one of the operands.
243/// \param __b
244/// A 256-bit vector of [4 x double] containing one of the operands.
245/// \returns A 256-bit vector of [4 x double] containing the minimum values
246/// between both operands.
247static __inline __m256d __DEFAULT_FN_ATTRS
248_mm256_min_pd(__m256d __a, __m256d __b)
249{
250 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
251}
252
253/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
254/// of each pair of values.
255///
256/// \headerfile <x86intrin.h>
257///
258/// This intrinsic corresponds to the \c VMINPS / MINPS instruction.
259///
260/// \param __a
261/// A 256-bit vector of [8 x float] containing one of the operands.
262/// \param __b
263/// A 256-bit vector of [8 x float] containing one of the operands.
264/// \returns A 256-bit vector of [8 x float] containing the minimum values
265/// between both operands.
266static __inline __m256 __DEFAULT_FN_ATTRS
267_mm256_min_ps(__m256 __a, __m256 __b)
268{
269 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
270}
271
272/// \brief Multiplies two 256-bit vectors of [4 x double].
273///
274/// \headerfile <x86intrin.h>
275///
276/// This intrinsic corresponds to the \c VMULPD / MULPD instruction.
277///
278/// \param __a
279/// A 256-bit vector of [4 x double] containing one of the operands.
280/// \param __b
281/// A 256-bit vector of [4 x double] containing one of the operands.
282/// \returns A 256-bit vector of [4 x double] containing the products between
283/// both operands.
284static __inline __m256d __DEFAULT_FN_ATTRS
285_mm256_mul_pd(__m256d __a, __m256d __b)
286{
287 return __a * __b;
288}
289
290/// \brief Multiplies two 256-bit vectors of [8 x float].
291///
292/// \headerfile <x86intrin.h>
293///
294/// This intrinsic corresponds to the \c VMULPS / MULPS instruction.
295///
296/// \param __a
297/// A 256-bit vector of [8 x float] containing one of the operands.
298/// \param __b
299/// A 256-bit vector of [8 x float] containing one of the operands.
300/// \returns A 256-bit vector of [8 x float] containing the products between
301/// both operands.
302static __inline __m256 __DEFAULT_FN_ATTRS
303_mm256_mul_ps(__m256 __a, __m256 __b)
304{
305 return __a * __b;
306}
307
308/// \brief Calculates the square roots of the values stored in a 256-bit vector
309/// of [4 x double].
310///
311/// \headerfile <x86intrin.h>
312///
313/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction.
314///
315/// \param __a
316/// A 256-bit vector of [4 x double].
317/// \returns A 256-bit vector of [4 x double] containing the square roots of the
318/// values in the operand.
319static __inline __m256d __DEFAULT_FN_ATTRS
320_mm256_sqrt_pd(__m256d __a)
321{
322 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
323}
324
325/// \brief Calculates the square roots of the values stored in a 256-bit vector
326/// of [8 x float].
327///
328/// \headerfile <x86intrin.h>
329///
330/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instruction.
331///
332/// \param __a
333/// A 256-bit vector of [8 x float].
334/// \returns A 256-bit vector of [8 x float] containing the square roots of the
335/// values in the operand.
336static __inline __m256 __DEFAULT_FN_ATTRS
337_mm256_sqrt_ps(__m256 __a)
338{
339 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
340}
341
342/// \brief Calculates the reciprocal square roots of the values stored in a
343/// 256-bit vector of [8 x float].
344///
345/// \headerfile <x86intrin.h>
346///
347/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instruction.
348///
349/// \param __a
350/// A 256-bit vector of [8 x float].
351/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
352/// roots of the values in the operand.
353static __inline __m256 __DEFAULT_FN_ATTRS
354_mm256_rsqrt_ps(__m256 __a)
355{
356 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
357}
358
359/// \brief Calculates the reciprocals of the values stored in a 256-bit vector
360/// of [8 x float].
361///
362/// \headerfile <x86intrin.h>
363///
364/// This intrinsic corresponds to the \c VRCPPS / RCPPS instruction.
365///
366/// \param __a
367/// A 256-bit vector of [8 x float].
368/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
369/// values in the operand.
370static __inline __m256 __DEFAULT_FN_ATTRS
371_mm256_rcp_ps(__m256 __a)
372{
373 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
374}
375
376/// \brief Rounds the values stored in a 256-bit vector of [4 x double] as
377/// specified by the byte operand. The source values are rounded to integer
378/// values and returned as 64-bit double-precision floating-point values.
379///
380/// \headerfile <x86intrin.h>
381///
382/// \code
383/// __m256d _mm256_round_pd(__m256d V, const int M);
384/// \endcode
385///
386/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
387///
388/// \param V
389/// A 256-bit vector of [4 x double].
390/// \param M
391/// An integer value that specifies the rounding operation.
392/// Bits [7:4] are reserved.
393/// Bit [3] is a precision exception value:
394/// 0: A normal PE exception is used
395/// 1: The PE field is not updated
396/// Bit [2] is the rounding control source:
397/// 0: Use bits [1:0] of M
398/// 1: Use the current MXCSR setting
399/// Bits [1:0] contain the rounding control definition:
400/// 00: Nearest
401/// 01: Downward (toward negative infinity)
402/// 10: Upward (toward positive infinity)
403/// 11: Truncated
404/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Expands to one roundpd256 builtin call; M is forwarded unchanged. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
407
408/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
409/// specified by the byte operand. The source values are rounded to integer
410/// values and returned as floating-point values.
411///
412/// \headerfile <x86intrin.h>
413///
414/// \code
415/// __m256 _mm256_round_ps(__m256 V, const int M);
416/// \endcode
417///
418/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
419///
420/// \param V
421/// A 256-bit vector of [8 x float].
422/// \param M
423/// An integer value that specifies the rounding operation.
424/// Bits [7:4] are reserved.
425/// Bit [3] is a precision exception value:
426/// 0: A normal PE exception is used
427/// 1: The PE field is not updated
428/// Bit [2] is the rounding control source:
429/// 0: Use bits [1:0] of M
430/// 1: Use the current MXCSR setting
431/// Bits [1:0] contain the rounding control definition:
432/// 00: Nearest
433/// 01: Downward (toward negative infinity)
434/// 10: Upward (toward positive infinity)
435/// 11: Truncated
436/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Expands to one roundps256 builtin call; M is forwarded unchanged. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
439
440/// \brief Round up the values stored in a 256-bit vector of [4 x double]. The
441/// source values are rounded up to integer values and returned as 64-bit
442/// double-precision floating-point values.
443///
444/// \headerfile <x86intrin.h>
445///
446/// \code
447/// __m256d _mm256_ceil_pd(__m256d V);
448/// \endcode
449///
450/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
451///
452/// \param V
453/// A 256-bit vector of [4 x double].
454/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Shorthand for _mm256_round_pd with the mode fixed to _MM_FROUND_CEIL. */
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
456
457/// \brief Round down the values stored in a 256-bit vector of [4 x double].
458/// The source values are rounded down to integer values and returned as
459/// 64-bit double-precision floating-point values.
460///
461/// \headerfile <x86intrin.h>
462///
463/// \code
464/// __m256d _mm256_floor_pd(__m256d V);
465/// \endcode
466///
467/// This intrinsic corresponds to the \c VROUNDPD / ROUNDPD instruction.
468///
469/// \param V
470/// A 256-bit vector of [4 x double].
471/// \returns A 256-bit vector of [4 x double] containing the rounded down
472/// values.
/* Shorthand for _mm256_round_pd with the mode fixed to _MM_FROUND_FLOOR. */
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
474
475/// \brief Round up the values stored in a 256-bit vector of [8 x float]. The
476/// source values are rounded up to integer values and returned as
477/// floating-point values.
478///
479/// \headerfile <x86intrin.h>
480///
481/// \code
482/// __m256 _mm256_ceil_ps(__m256 V);
483/// \endcode
484///
485/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
486///
487/// \param V
488/// A 256-bit vector of [8 x float].
489/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Shorthand for _mm256_round_ps with the mode fixed to _MM_FROUND_CEIL. */
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
491
492/// \brief Round down the values stored in a 256-bit vector of [8 x float]. The
493/// source values are rounded down to integer values and returned as
494/// floating-point values.
495///
496/// \headerfile <x86intrin.h>
497///
498/// \code
499/// __m256 _mm256_floor_ps(__m256 V);
500/// \endcode
501///
502/// This intrinsic corresponds to the \c VROUNDPS / ROUNDPS instruction.
503///
504/// \param V
505/// A 256-bit vector of [8 x float].
506/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Shorthand for _mm256_round_ps with the mode fixed to _MM_FROUND_FLOOR. */
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
508
509/* Logical */
510/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
511///
512/// \headerfile <x86intrin.h>
513///
514/// This intrinsic corresponds to the \c VANDPD / ANDPD instruction.
515///
516/// \param __a
517/// A 256-bit vector of [4 x double] containing one of the source operands.
518/// \param __b
519/// A 256-bit vector of [4 x double] containing one of the source operands.
520/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
521/// values between both operands.
522static __inline __m256d __DEFAULT_FN_ATTRS
523_mm256_and_pd(__m256d __a, __m256d __b)
524{
525 return (__m256d)((__v4di)__a & (__v4di)__b);
526}
527
528/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
529///
530/// \headerfile <x86intrin.h>
531///
532/// This intrinsic corresponds to the \c VANDPS / ANDPS instruction.
533///
534/// \param __a
535/// A 256-bit vector of [8 x float] containing one of the source operands.
536/// \param __b
537/// A 256-bit vector of [8 x float] containing one of the source operands.
538/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
539/// values between both operands.
540static __inline __m256 __DEFAULT_FN_ATTRS
541_mm256_and_ps(__m256 __a, __m256 __b)
542{
543 return (__m256)((__v8si)__a & (__v8si)__b);
544}
545
546/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
547/// the one's complement of the values contained in the first source operand.
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the \c VANDNPD / ANDNPD instruction.
552///
553/// \param __a
554/// A 256-bit vector of [4 x double] containing the left source operand. The
555/// one's complement of this value is used in the bitwise AND.
556/// \param __b
557/// A 256-bit vector of [4 x double] containing the right source operand.
558/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
559/// values of the second operand and the one's complement of the first
560/// operand.
561static __inline __m256d __DEFAULT_FN_ATTRS
562_mm256_andnot_pd(__m256d __a, __m256d __b)
563{
564 return (__m256d)(~(__v4di)__a & (__v4di)__b);
565}
566
567/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
568/// the one's complement of the values contained in the first source operand.
569///
570/// \headerfile <x86intrin.h>
571///
572/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instruction.
573///
574/// \param __a
575/// A 256-bit vector of [8 x float] containing the left source operand. The
576/// one's complement of this value is used in the bitwise AND.
577/// \param __b
578/// A 256-bit vector of [8 x float] containing the right source operand.
579/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
580/// values of the second operand and the one's complement of the first
581/// operand.
582static __inline __m256 __DEFAULT_FN_ATTRS
583_mm256_andnot_ps(__m256 __a, __m256 __b)
584{
585 return (__m256)(~(__v8si)__a & (__v8si)__b);
586}
587
588/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
589///
590/// \headerfile <x86intrin.h>
591///
592/// This intrinsic corresponds to the \c VORPD / ORPD instruction.
593///
594/// \param __a
595/// A 256-bit vector of [4 x double] containing one of the source operands.
596/// \param __b
597/// A 256-bit vector of [4 x double] containing one of the source operands.
598/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
599/// values between both operands.
600static __inline __m256d __DEFAULT_FN_ATTRS
601_mm256_or_pd(__m256d __a, __m256d __b)
602{
603 return (__m256d)((__v4di)__a | (__v4di)__b);
604}
605
606/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
607///
608/// \headerfile <x86intrin.h>
609///
610/// This intrinsic corresponds to the \c VORPS / ORPS instruction.
611///
612/// \param __a
613/// A 256-bit vector of [8 x float] containing one of the source operands.
614/// \param __b
615/// A 256-bit vector of [8 x float] containing one of the source operands.
616/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
617/// values between both operands.
618static __inline __m256 __DEFAULT_FN_ATTRS
619_mm256_or_ps(__m256 __a, __m256 __b)
620{
621 return (__m256)((__v8si)__a | (__v8si)__b);
622}
623
624/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
625///
626/// \headerfile <x86intrin.h>
627///
628/// This intrinsic corresponds to the \c VXORPD / XORPD instruction.
629///
630/// \param __a
631/// A 256-bit vector of [4 x double] containing one of the source operands.
632/// \param __b
633/// A 256-bit vector of [4 x double] containing one of the source operands.
634/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
635/// values between both operands.
636static __inline __m256d __DEFAULT_FN_ATTRS
637_mm256_xor_pd(__m256d __a, __m256d __b)
638{
639 return (__m256d)((__v4di)__a ^ (__v4di)__b);
640}
641
642/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
643///
644/// \headerfile <x86intrin.h>
645///
646/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
647///
648/// \param __a
649/// A 256-bit vector of [8 x float] containing one of the source operands.
650/// \param __b
651/// A 256-bit vector of [8 x float] containing one of the source operands.
652/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
653/// values between both operands.
654static __inline __m256 __DEFAULT_FN_ATTRS
655_mm256_xor_ps(__m256 __a, __m256 __b)
656{
657 return (__m256)((__v8si)__a ^ (__v8si)__b);
658}
659
660/* Horizontal arithmetic */
661/// \brief Horizontally adds the adjacent pairs of values contained in two
662/// 256-bit vectors of [4 x double].
663///
664/// \headerfile <x86intrin.h>
665///
666/// This intrinsic corresponds to the \c VHADDPD / HADDPD instruction.
667///
668/// \param __a
669/// A 256-bit vector of [4 x double] containing one of the source operands.
670/// The horizontal sums of the values are returned in the even-indexed
671/// elements of a vector of [4 x double].
672/// \param __b
673/// A 256-bit vector of [4 x double] containing one of the source operands.
674/// The horizontal sums of the values are returned in the odd-indexed
675/// elements of a vector of [4 x double].
676/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
677/// both operands.
678static __inline __m256d __DEFAULT_FN_ATTRS
679_mm256_hadd_pd(__m256d __a, __m256d __b)
680{
681 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
682}
683
684/// \brief Horizontally adds the adjacent pairs of values contained in two
685/// 256-bit vectors of [8 x float].
686///
687/// \headerfile <x86intrin.h>
688///
689/// This intrinsic corresponds to the \c VHADDPS / HADDPS instruction.
690///
691/// \param __a
692/// A 256-bit vector of [8 x float] containing one of the source operands.
693/// The horizontal sums of the values are returned in the elements with
694/// index 0, 1, 4, 5 of a vector of [8 x float].
695/// \param __b
696/// A 256-bit vector of [8 x float] containing one of the source operands.
697/// The horizontal sums of the values are returned in the elements with
698/// index 2, 3, 6, 7 of a vector of [8 x float].
699/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
700/// both operands.
701static __inline __m256 __DEFAULT_FN_ATTRS
702_mm256_hadd_ps(__m256 __a, __m256 __b)
703{
704 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
705}
706
707/// \brief Horizontally subtracts the adjacent pairs of values contained in two
708/// 256-bit vectors of [4 x double].
709///
710/// \headerfile <x86intrin.h>
711///
712/// This intrinsic corresponds to the \c VHSUBPD / HSUBPD instruction.
713///
714/// \param __a
715/// A 256-bit vector of [4 x double] containing one of the source operands.
716/// The horizontal differences between the values are returned in the
717/// even-indexed elements of a vector of [4 x double].
718/// \param __b
719/// A 256-bit vector of [4 x double] containing one of the source operands.
720/// The horizontal differences between the values are returned in the
721/// odd-indexed elements of a vector of [4 x double].
722/// \returns A 256-bit vector of [4 x double] containing the horizontal
723/// differences of both operands.
724static __inline __m256d __DEFAULT_FN_ATTRS
725_mm256_hsub_pd(__m256d __a, __m256d __b)
726{
727 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
728}
729
730/// \brief Horizontally subtracts the adjacent pairs of values contained in two
731/// 256-bit vectors of [8 x float].
732///
733/// \headerfile <x86intrin.h>
734///
735/// This intrinsic corresponds to the \c VHSUBPS / HSUBPS instruction.
736///
737/// \param __a
738/// A 256-bit vector of [8 x float] containing one of the source operands.
739/// The horizontal differences between the values are returned in the
740/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
741/// \param __b
742/// A 256-bit vector of [8 x float] containing one of the source operands.
743/// The horizontal differences between the values are returned in the
744/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
745/// \returns A 256-bit vector of [8 x float] containing the horizontal
746/// differences of both operands.
747static __inline __m256 __DEFAULT_FN_ATTRS
748_mm256_hsub_ps(__m256 __a, __m256 __b)
749{
750 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
751}
752
753/* Vector permutations */
754/// \brief Copies the values stored in a 128-bit vector of [2 x double] as
755/// specified by the 128-bit integer vector operand.
756///
757/// \headerfile <x86intrin.h>
758///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
760///
761/// \param __a
762/// A 128-bit vector of [2 x double].
763/// \param __c
764/// A 128-bit integer vector operand specifying how the values are to be
765/// copied.
766/// Bit [1]:
767/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
768/// returned vector
769/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
770/// returned vector
771/// Bit [65]:
772/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
773/// returned vector
774/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
775/// returned vector
776/// \returns A 128-bit vector of [2 x double] containing the copied values.
777static __inline __m128d __DEFAULT_FN_ATTRS
778_mm_permutevar_pd(__m128d __a, __m128i __c)
779{
780 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
781}
782
783/// \brief Copies the values stored in a 256-bit vector of [4 x double] as
784/// specified by the 256-bit integer vector operand.
785///
786/// \headerfile <x86intrin.h>
787///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
789///
790/// \param __a
791/// A 256-bit vector of [4 x double].
792/// \param __c
793/// A 256-bit integer vector operand specifying how the values are to be
794/// copied.
795/// Bit [1]:
796/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
797/// returned vector
798/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
799/// returned vector
800/// Bit [65]:
801/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
802/// returned vector
803/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
804/// returned vector
805/// Bit [129]:
806/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
807/// returned vector
808/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
809/// returned vector
810/// Bit [193]:
811/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
812/// returned vector
813/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
814/// returned vector
815/// \returns A 256-bit vector of [4 x double] containing the copied values.
816static __inline __m256d __DEFAULT_FN_ATTRS
817_mm256_permutevar_pd(__m256d __a, __m256i __c)
818{
819 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
820}
821
822/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
823/// specified by the 128-bit integer vector operand.
824///
825/// \headerfile <x86intrin.h>
826///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
828///
829/// \param __a
830/// A 128-bit vector of [4 x float].
831/// \param __c
832/// A 128-bit integer vector operand specifying how the values are to be
833/// copied.
834/// Bits [1:0]:
835/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
836/// returned vector
837/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
838/// returned vector
839/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
840/// returned vector
841/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
842/// returned vector
843/// Bits [33:32]:
844/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
845/// returned vector
846/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
847/// returned vector
848/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
849/// returned vector
850/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
851/// returned vector
852/// Bits [65:64]:
853/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
854/// returned vector
855/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
856/// returned vector
857/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
858/// returned vector
859/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
860/// returned vector
861/// Bits [97:96]:
862/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
863/// returned vector
864/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
865/// returned vector
866/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
867/// returned vector
868/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
869/// returned vector
870/// \returns A 128-bit vector of [4 x float] containing the copied values.
871static __inline __m128 __DEFAULT_FN_ATTRS
872_mm_permutevar_ps(__m128 __a, __m128i __c)
873{
874 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
875}
876
877/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
878/// specified by the 256-bit integer vector operand.
879///
880/// \headerfile <x86intrin.h>
881///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
883///
884/// \param __a
885/// A 256-bit vector of [8 x float].
886/// \param __c
887/// A 256-bit integer vector operand specifying how the values are to be
888/// copied.
889/// Bits [1:0]:
890/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
891/// returned vector
892/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
893/// returned vector
894/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
895/// returned vector
896/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
897/// returned vector
898/// Bits [33:32]:
899/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
900/// returned vector
901/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
902/// returned vector
903/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
904/// returned vector
905/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
906/// returned vector
907/// Bits [65:64]:
908/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
909/// returned vector
910/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
911/// returned vector
912/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
913/// returned vector
914/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
915/// returned vector
916/// Bits [97:96]:
917/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
918/// returned vector
919/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
920/// returned vector
921/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
922/// returned vector
923/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
924/// returned vector
925/// Bits [129:128]:
926/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
927/// returned vector
928/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
929/// returned vector
930/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
931/// returned vector
932/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
933/// returned vector
934/// Bits [161:160]:
935/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
936/// returned vector
937/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
938/// returned vector
939/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
940/// returned vector
941/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
942/// returned vector
943/// Bits [193:192]:
944/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
945/// returned vector
946/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
947/// returned vector
948/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
949/// returned vector
950/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
951/// returned vector
952/// Bits [225:224]:
953/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
954/// returned vector
955/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
956/// returned vector
957/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
958/// returned vector
959/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
960/// returned vector
961/// \returns A 256-bit vector of [8 x float] containing the copied values.
962static __inline __m256 __DEFAULT_FN_ATTRS
963_mm256_permutevar_ps(__m256 __a, __m256i __c)
964{
965 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
966}
967
968/// \brief Copies the values stored in a 128-bit vector of [2 x double] as
969/// specified by the immediate integer operand.
970///
971/// \headerfile <x86intrin.h>
972///
973/// \code
974/// __m128d _mm_permute_pd(__m128d A, const int C);
975/// \endcode
976///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
978///
979/// \param A
980/// A 128-bit vector of [2 x double].
981/// \param C
982/// An immediate integer operand specifying how the values are to be copied.
983/// Bit [0]:
984/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
985/// returned vector
986/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
987/// returned vector
988/// Bit [1]:
989/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
990/// returned vector
991/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
992/// returned vector
993/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
      (__v2df)(__m128d)(A),     /* source vector */ \
      (__v2df)_mm_setzero_pd(), /* second operand; the indices never reach it */ \
      (C) & 0x1,                /* result element 0: bit 0 of C */ \
      ((C) & 0x2) >> 1); })     /* result element 1: bit 1 of C */
998
999/// \brief Copies the values stored in a 256-bit vector of [4 x double] as
1000/// specified by the immediate integer operand.
1001///
1002/// \headerfile <x86intrin.h>
1003///
1004/// \code
1005/// __m256d _mm256_permute_pd(__m256d A, const int C);
1006/// \endcode
1007///
/// This intrinsic corresponds to the \c VPERMILPD instruction.
1009///
1010/// \param A
1011/// A 256-bit vector of [4 x double].
1012/// \param C
1013/// An immediate integer operand specifying how the values are to be copied.
1014/// Bit [0]:
1015/// 0: Bits [63:0] of the source are copied to bits [63:0] of the
1016/// returned vector
1017/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1018/// returned vector
1019/// Bit [1]:
1020/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1021/// returned vector
1022/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1023/// returned vector
1024/// Bit [2]:
1025/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1026/// returned vector
1027/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1028/// returned vector
1029/// Bit [3]:
1030/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1031/// returned vector
1032/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1033/// returned vector
1034/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(A), \
      (__v4df)_mm256_setzero_pd(), \
      /* low 128-bit lane: indices 0..1 chosen by bits 0 and 1 of C */ \
      (C) & 0x1, \
      ((C) & 0x2) >> 1, \
      /* high 128-bit lane: indices 2..3 chosen by bits 2 and 3 of C */ \
      2 + (((C) & 0x4) >> 2), \
      2 + (((C) & 0x8) >> 3)); })
1041
1042/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
1043/// specified by the immediate integer operand.
1044///
1045/// \headerfile <x86intrin.h>
1046///
1047/// \code
1048/// __m128 _mm_permute_ps(__m128 A, const int C);
1049/// \endcode
1050///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
1052///
1053/// \param A
1054/// A 128-bit vector of [4 x float].
1055/// \param C
1056/// An immediate integer operand specifying how the values are to be copied.
1057/// Bits [1:0]:
1058/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1059/// returned vector
1060/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1061/// returned vector
1062/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1063/// returned vector
1064/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1065/// returned vector
1066/// Bits [3:2]:
1067/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1068/// returned vector
1069/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1070/// returned vector
1071/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1072/// returned vector
1073/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1074/// returned vector
1075/// Bits [5:4]:
1076/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1077/// returned vector
1078/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1079/// returned vector
1080/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1081/// returned vector
1082/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1083/// returned vector
1084/// Bits [7:6]:
1085/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1086/// returned vector
1087/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1088/// returned vector
1089/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1090/// returned vector
1091/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1092/// returned vector
1093/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
      (__v4sf)(__m128)(A), \
      (__v4sf)_mm_setzero_ps(), \
      /* one 2-bit selector of C per result element, lowest bits first */ \
      (C) & 0x3, \
      ((C) & 0xc) >> 2, \
      ((C) & 0x30) >> 4, \
      ((C) & 0xc0) >> 6); })
1099
1100/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
1101/// specified by the immediate integer operand.
1102///
1103/// \headerfile <x86intrin.h>
1104///
1105/// \code
1106/// __m256 _mm256_permute_ps(__m256 A, const int C);
1107/// \endcode
1108///
/// This intrinsic corresponds to the \c VPERMILPS instruction.
1110///
1111/// \param A
1112/// A 256-bit vector of [8 x float].
1113/// \param C
1114/// An immediate integer operand specifying how the values are to be copied.
1115/// Bits [1:0]:
1116/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1117/// returned vector
1118/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1119/// returned vector
1120/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1121/// returned vector
1122/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1123/// returned vector
1124/// Bits [3:2]:
1125/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1126/// returned vector
1127/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1128/// returned vector
1129/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1130/// returned vector
1131/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1132/// returned vector
1133/// Bits [5:4]:
1134/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1135/// returned vector
1136/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1137/// returned vector
1138/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1139/// returned vector
1140/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1141/// returned vector
1142/// Bits [7:6]:
1143/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1144/// returned vector
1145/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1146/// returned vector
1147/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1148/// returned vector
1149/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1150/// returned vector
1151/// Bits [1:0]:
1152/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1153/// returned vector
1154/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1155/// returned vector
1156/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1157/// returned vector
1158/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1159/// returned vector
1160/// Bits [3:2]:
1161/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1162/// returned vector
1163/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1164/// returned vector
1165/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1166/// returned vector
1167/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1168/// returned vector
1169/// Bits [5:4]:
1170/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1171/// returned vector
1172/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1173/// returned vector
1174/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1175/// returned vector
1176/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1177/// returned vector
1178/// Bits [7:6]:
1179/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1180/// returned vector
1181/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1182/// returned vector
1183/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1184/// returned vector
1185/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1186/// returned vector
1187/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(A), \
      (__v8sf)_mm256_setzero_ps(), \
      /* low 128-bit lane: indices 0..3 */ \
      (C) & 0x3, \
      ((C) & 0xc) >> 2, \
      ((C) & 0x30) >> 4, \
      ((C) & 0xc0) >> 6, \
      /* high 128-bit lane: the same four selectors, offset by 4 */ \
      4 + (((C) & 0x03) >> 0), \
      4 + (((C) & 0x0c) >> 2), \
      4 + (((C) & 0x30) >> 4), \
      4 + (((C) & 0xc0) >> 6)); })
1197
/// \brief Expands to a call of the vperm2f128_pd256 builtin on \a V1 and
///    \a V2 (each viewed as [4 x double]) with control value \a M.
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_ia32_vperm2f128_pd256( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (M)); })
1201
/// \brief Expands to a call of the vperm2f128_ps256 builtin on \a V1 and
///    \a V2 (each viewed as [8 x float]) with control value \a M.
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (M)); })
1205
/// \brief Expands to a call of the vperm2f128_si256 builtin on \a V1 and
///    \a V2 (each viewed as [8 x int]) with control value \a M.
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (M)); })
1209
1210/* Vector Blend */
/// \brief Builds a [4 x double] result element by element: when bit i of
///    \a M is set, element i comes from \a V2 (shuffle index 4 + i),
///    otherwise from \a V1 (shuffle index i).
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(V1), \
      (__v4df)(__m256d)(V2), \
      (((M) & 0x01) ? 4 : 0), \
      (((M) & 0x02) ? 5 : 1), \
      (((M) & 0x04) ? 6 : 2), \
      (((M) & 0x08) ? 7 : 3)); })
1218
/// \brief Builds an [8 x float] result element by element: when bit i of
///    \a M is set, element i comes from \a V2 (shuffle index 8 + i),
///    otherwise from \a V1 (shuffle index i).
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (((M) & 0x01) ?  8 : 0), \
      (((M) & 0x02) ?  9 : 1), \
      (((M) & 0x04) ? 10 : 2), \
      (((M) & 0x08) ? 11 : 3), \
      (((M) & 0x10) ? 12 : 4), \
      (((M) & 0x20) ? 13 : 5), \
      (((M) & 0x40) ? 14 : 6), \
      (((M) & 0x80) ? 15 : 7)); })
1230
1231static __inline __m256d __DEFAULT_FN_ATTRS
1232_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1233{
1234 return (__m256d)__builtin_ia32_blendvpd256(
1235 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1236}
1237
1238static __inline __m256 __DEFAULT_FN_ATTRS
1239_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1240{
1241 return (__m256)__builtin_ia32_blendvps256(
1242 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1243}
1244
1245/* Vector Dot Product */
/// \brief Expands to a call of the dpps256 builtin on \a V1 and \a V2 (each
///    viewed as [8 x float]) with control value \a M.
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_dpps256( \
      (__v8sf)(__m256)(V1), \
      (__v8sf)(__m256)(V2), \
      (M)); })
1249
1250/* Vector shuffle */
/// \brief Shuffles two [8 x float] vectors using four 2-bit selectors taken
///    from \a mask. In each 128-bit lane the two lower result elements are
///    picked from \a a and the two upper ones from \a b; the same selectors
///    are applied to both lanes.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      /* low lane: a[0..3], a[0..3], b[0..3], b[0..3] */ \
      (mask) & 0x3, \
      ((mask) & 0xc) >> 2, \
      (((mask) & 0x30) >> 4) + 8, \
      (((mask) & 0xc0) >> 6) + 8, \
      /* high lane: a[4..7], a[4..7], b[4..7], b[4..7] */ \
      ((mask) & 0x3) + 4, \
      (((mask) & 0xc) >> 2) + 4, \
      (((mask) & 0x30) >> 4) + 12, \
      (((mask) & 0xc0) >> 6) + 12); })
1262
/// \brief Shuffles two [4 x double] vectors using one bit of \a mask per
///    result element. Even result elements are picked from \a a and odd ones
///    from \a b, each within its own 128-bit lane.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (mask) & 0x1,              /* a[0] or a[1] */ \
      (((mask) & 0x2) >> 1) + 4, /* b[0] or b[1] */ \
      (((mask) & 0x4) >> 2) + 2, /* a[2] or a[3] */ \
      (((mask) & 0x8) >> 3) + 6); }) /* b[2] or b[3] */
1270
1271/* Compare */
/* Comparison predicate codes, 0x00 through 0x1f. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
1304
/// \brief Expands to a call of the cmppd builtin on \a a and \a b (each cast
///    to __v2df) with predicate \a c.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (c)); })
1308
/// \brief Expands to a call of the cmpps builtin on \a a and \a b (each cast
///    to __v4sf) with predicate \a c.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      (c)); })
1312
/// \brief Expands to a call of the cmppd256 builtin on \a a and \a b (each
///    viewed as [4 x double]) with predicate \a c.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256( \
      (__v4df)(__m256d)(a), \
      (__v4df)(__m256d)(b), \
      (c)); })
1316
/// \brief Expands to a call of the cmpps256 builtin on \a a and \a b (each
///    viewed as [8 x float]) with predicate \a c.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256( \
      (__v8sf)(__m256)(a), \
      (__v8sf)(__m256)(b), \
      (c)); })
1320
/// \brief Expands to a call of the cmpsd builtin on \a a and \a b (each cast
///    to __v2df) with predicate \a c.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmpsd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (c)); })
1324
/// \brief Expands to a call of the cmpss builtin on \a a and \a b (each cast
///    to __v4sf) with predicate \a c.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpss( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      (c)); })
1328
1329static __inline int __DEFAULT_FN_ATTRS
1330_mm256_extract_epi32(__m256i __a, const int __imm)
1331{
1332 __v8si __b = (__v8si)__a;
1333 return __b[__imm & 7];
1334}
1335
1336static __inline int __DEFAULT_FN_ATTRS
1337_mm256_extract_epi16(__m256i __a, const int __imm)
1338{
1339 __v16hi __b = (__v16hi)__a;
1340 return __b[__imm & 15];
1341}
1342
1343static __inline int __DEFAULT_FN_ATTRS
1344_mm256_extract_epi8(__m256i __a, const int __imm)
1345{
1346 __v32qi __b = (__v32qi)__a;
1347 return __b[__imm & 31];
1348}
1349
1350#ifdef __x86_64__
1351static __inline long long __DEFAULT_FN_ATTRS
1352_mm256_extract_epi64(__m256i __a, const int __imm)
1353{
1354 __v4di __b = (__v4di)__a;
1355 return __b[__imm & 3];
1356}
1357#endif
1358
1359static __inline __m256i __DEFAULT_FN_ATTRS
1360_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
1361{
1362 __v8si __c = (__v8si)__a;
1363 __c[__imm & 7] = __b;
1364 return (__m256i)__c;
1365}
1366
1367static __inline __m256i __DEFAULT_FN_ATTRS
1368_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
1369{
1370 __v16hi __c = (__v16hi)__a;
1371 __c[__imm & 15] = __b;
1372 return (__m256i)__c;
1373}
1374
1375static __inline __m256i __DEFAULT_FN_ATTRS
1376_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
1377{
1378 __v32qi __c = (__v32qi)__a;
1379 __c[__imm & 31] = __b;
1380 return (__m256i)__c;
1381}
1382
1383#ifdef __x86_64__
1384static __inline __m256i __DEFAULT_FN_ATTRS
1385_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
1386{
1387 __v4di __c = (__v4di)__a;
1388 __c[__imm & 3] = __b;
1389 return (__m256i)__c;
1390}
1391#endif
1392
1393/* Conversion */
1394static __inline __m256d __DEFAULT_FN_ATTRS
1395_mm256_cvtepi32_pd(__m128i __a)
1396{
1397 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
1398}
1399
1400static __inline __m256 __DEFAULT_FN_ATTRS
1401_mm256_cvtepi32_ps(__m256i __a)
1402{
1403 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
1404}
1405
1406static __inline __m128 __DEFAULT_FN_ATTRS
1407_mm256_cvtpd_ps(__m256d __a)
1408{
1409 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
1410}
1411
1412static __inline __m256i __DEFAULT_FN_ATTRS
1413_mm256_cvtps_epi32(__m256 __a)
1414{
1415 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
1416}
1417
1418static __inline __m256d __DEFAULT_FN_ATTRS
1419_mm256_cvtps_pd(__m128 __a)
1420{
1421 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
1422}
1423
1424static __inline __m128i __DEFAULT_FN_ATTRS
1425_mm256_cvttpd_epi32(__m256d __a)
1426{
1427 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
1428}
1429
1430static __inline __m128i __DEFAULT_FN_ATTRS
1431_mm256_cvtpd_epi32(__m256d __a)
1432{
1433 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
1434}
1435
1436static __inline __m256i __DEFAULT_FN_ATTRS
1437_mm256_cvttps_epi32(__m256 __a)
1438{
1439 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
1440}
1441
1442/* Vector replicate */
1443static __inline __m256 __DEFAULT_FN_ATTRS
1444_mm256_movehdup_ps(__m256 __a)
1445{
1446 return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
1447}
1448
1449static __inline __m256 __DEFAULT_FN_ATTRS
1450_mm256_moveldup_ps(__m256 __a)
1451{
1452 return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
1453}
1454
1455static __inline __m256d __DEFAULT_FN_ATTRS
1456_mm256_movedup_pd(__m256d __a)
1457{
1458 return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
1459}
1460
1461/* Unpack and Interleave */
1462static __inline __m256d __DEFAULT_FN_ATTRS
1463_mm256_unpackhi_pd(__m256d __a, __m256d __b)
1464{
1465 return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
1466}
1467
1468static __inline __m256d __DEFAULT_FN_ATTRS
1469_mm256_unpacklo_pd(__m256d __a, __m256d __b)
1470{
1471 return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
1472}
1473
1474static __inline __m256 __DEFAULT_FN_ATTRS
1475_mm256_unpackhi_ps(__m256 __a, __m256 __b)
1476{
1477 return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
1478}
1479
1480static __inline __m256 __DEFAULT_FN_ATTRS
1481_mm256_unpacklo_ps(__m256 __a, __m256 __b)
1482{
1483 return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
1484}
1485
1486/* Bit Test */
1487static __inline int __DEFAULT_FN_ATTRS
1488_mm_testz_pd(__m128d __a, __m128d __b)
1489{
1490 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
1491}
1492
1493static __inline int __DEFAULT_FN_ATTRS
1494_mm_testc_pd(__m128d __a, __m128d __b)
1495{
1496 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
1497}
1498
1499static __inline int __DEFAULT_FN_ATTRS
1500_mm_testnzc_pd(__m128d __a, __m128d __b)
1501{
1502 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
1503}
1504
1505static __inline int __DEFAULT_FN_ATTRS
1506_mm_testz_ps(__m128 __a, __m128 __b)
1507{
1508 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
1509}
1510
1511static __inline int __DEFAULT_FN_ATTRS
1512_mm_testc_ps(__m128 __a, __m128 __b)
1513{
1514 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
1515}
1516
1517static __inline int __DEFAULT_FN_ATTRS
1518_mm_testnzc_ps(__m128 __a, __m128 __b)
1519{
1520 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
1521}
1522
1523static __inline int __DEFAULT_FN_ATTRS
1524_mm256_testz_pd(__m256d __a, __m256d __b)
1525{
1526 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
1527}
1528
1529static __inline int __DEFAULT_FN_ATTRS
1530_mm256_testc_pd(__m256d __a, __m256d __b)
1531{
1532 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
1533}
1534
1535static __inline int __DEFAULT_FN_ATTRS
1536_mm256_testnzc_pd(__m256d __a, __m256d __b)
1537{
1538 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
1539}
1540
1541static __inline int __DEFAULT_FN_ATTRS
1542_mm256_testz_ps(__m256 __a, __m256 __b)
1543{
1544 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
1545}
1546
1547static __inline int __DEFAULT_FN_ATTRS
1548_mm256_testc_ps(__m256 __a, __m256 __b)
1549{
1550 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
1551}
1552
1553static __inline int __DEFAULT_FN_ATTRS
1554_mm256_testnzc_ps(__m256 __a, __m256 __b)
1555{
1556 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
1557}
1558
1559static __inline int __DEFAULT_FN_ATTRS
1560_mm256_testz_si256(__m256i __a, __m256i __b)
1561{
1562 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
1563}
1564
1565static __inline int __DEFAULT_FN_ATTRS
1566_mm256_testc_si256(__m256i __a, __m256i __b)
1567{
1568 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
1569}
1570
1571static __inline int __DEFAULT_FN_ATTRS
1572_mm256_testnzc_si256(__m256i __a, __m256i __b)
1573{
1574 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
1575}
1576
1577/* Vector extract sign mask */
1578static __inline int __DEFAULT_FN_ATTRS
1579_mm256_movemask_pd(__m256d __a)
1580{
1581 return __builtin_ia32_movmskpd256((__v4df)__a);
1582}
1583
1584static __inline int __DEFAULT_FN_ATTRS
1585_mm256_movemask_ps(__m256 __a)
1586{
1587 return __builtin_ia32_movmskps256((__v8sf)__a);
1588}
1589
1590/* Vector __zero */
1591static __inline void __DEFAULT_FN_ATTRS
1592_mm256_zeroall(void)
1593{
1594 __builtin_ia32_vzeroall();
1595}
1596
1597static __inline void __DEFAULT_FN_ATTRS
1598_mm256_zeroupper(void)
1599{
1600 __builtin_ia32_vzeroupper();
1601}
1602
1603/* Vector load with broadcast */
1604static __inline __m128 __DEFAULT_FN_ATTRS
1605_mm_broadcast_ss(float const *__a)
1606{
1607 float __f = *__a;
1608 return (__m128)(__v4sf){ __f, __f, __f, __f };
1609}
1610
1611static __inline __m256d __DEFAULT_FN_ATTRS
1612_mm256_broadcast_sd(double const *__a)
1613{
1614 double __d = *__a;
1615 return (__m256d)(__v4df){ __d, __d, __d, __d };
1616}
1617
1618static __inline __m256 __DEFAULT_FN_ATTRS
1619_mm256_broadcast_ss(float const *__a)
1620{
1621 float __f = *__a;
1622 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
1623}
1624
1625static __inline __m256d __DEFAULT_FN_ATTRS
1626_mm256_broadcast_pd(__m128d const *__a)
1627{
1628 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
1629}
1630
1631static __inline __m256 __DEFAULT_FN_ATTRS
1632_mm256_broadcast_ps(__m128 const *__a)
1633{
1634 return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
1635}
1636
1637/* SIMD load ops */
1638static __inline __m256d __DEFAULT_FN_ATTRS
1639_mm256_load_pd(double const *__p)
1640{
1641 return *(__m256d *)__p;
1642}
1643
1644static __inline __m256 __DEFAULT_FN_ATTRS
1645_mm256_load_ps(float const *__p)
1646{
1647 return *(__m256 *)__p;
1648}
1649
1650static __inline __m256d __DEFAULT_FN_ATTRS
1651_mm256_loadu_pd(double const *__p)
1652{
1653 struct __loadu_pd {
1654 __m256d __v;
1655 } __attribute__((__packed__, __may_alias__));
1656 return ((struct __loadu_pd*)__p)->__v;
1657}
1658
1659static __inline __m256 __DEFAULT_FN_ATTRS
1660_mm256_loadu_ps(float const *__p)
1661{
1662 struct __loadu_ps {
1663 __m256 __v;
1664 } __attribute__((__packed__, __may_alias__));
1665 return ((struct __loadu_ps*)__p)->__v;
1666}
1667
1668static __inline __m256i __DEFAULT_FN_ATTRS
1669_mm256_load_si256(__m256i const *__p)
1670{
1671 return *__p;
1672}
1673
1674static __inline __m256i __DEFAULT_FN_ATTRS
1675_mm256_loadu_si256(__m256i const *__p)
1676{
1677 struct __loadu_si256 {
1678 __m256i __v;
1679 } __attribute__((__packed__, __may_alias__));
1680 return ((struct __loadu_si256*)__p)->__v;
1681}
1682
1683static __inline __m256i __DEFAULT_FN_ATTRS
1684_mm256_lddqu_si256(__m256i const *__p)
1685{
1686 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
1687}
1688
1689/* SIMD store ops */
1690static __inline void __DEFAULT_FN_ATTRS
1691_mm256_store_pd(double *__p, __m256d __a)
1692{
1693 *(__m256d *)__p = __a;
1694}
1695
1696static __inline void __DEFAULT_FN_ATTRS
1697_mm256_store_ps(float *__p, __m256 __a)
1698{
1699 *(__m256 *)__p = __a;
1700}
1701
1702static __inline void __DEFAULT_FN_ATTRS
1703_mm256_storeu_pd(double *__p, __m256d __a)
1704{
1705 __builtin_ia32_storeupd256(__p, (__v4df)__a);
1706}
1707
1708static __inline void __DEFAULT_FN_ATTRS
1709_mm256_storeu_ps(float *__p, __m256 __a)
1710{
1711 __builtin_ia32_storeups256(__p, (__v8sf)__a);
1712}
1713
1714static __inline void __DEFAULT_FN_ATTRS
1715_mm256_store_si256(__m256i *__p, __m256i __a)
1716{
1717 *__p = __a;
1718}
1719
1720static __inline void __DEFAULT_FN_ATTRS
1721_mm256_storeu_si256(__m256i *__p, __m256i __a)
1722{
1723 __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
1724}
1725
1726/* Conditional load ops */
1727static __inline __m128d __DEFAULT_FN_ATTRS
1728_mm_maskload_pd(double const *__p, __m128i __m)
1729{
1730 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
1731}
1732
1733static __inline __m256d __DEFAULT_FN_ATTRS
1734_mm256_maskload_pd(double const *__p, __m256i __m)
1735{
1736 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
1737 (__v4di)__m);
1738}
1739
1740static __inline __m128 __DEFAULT_FN_ATTRS
1741_mm_maskload_ps(float const *__p, __m128i __m)
1742{
1743 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
1744}
1745
1746static __inline __m256 __DEFAULT_FN_ATTRS
1747_mm256_maskload_ps(float const *__p, __m256i __m)
1748{
1749 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
1750}
1751
1752/* Conditional store ops */
1753static __inline void __DEFAULT_FN_ATTRS
1754_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
1755{
1756 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
1757}
1758
1759static __inline void __DEFAULT_FN_ATTRS
1760_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
1761{
1762 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
1763}
1764
1765static __inline void __DEFAULT_FN_ATTRS
1766_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
1767{
1768 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
1769}
1770
1771static __inline void __DEFAULT_FN_ATTRS
1772_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
1773{
1774 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
1775}
1776
1777/* Cacheability support ops */
1778static __inline void __DEFAULT_FN_ATTRS
1779_mm256_stream_si256(__m256i *__a, __m256i __b)
1780{
1781 __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
1782}
1783
1784static __inline void __DEFAULT_FN_ATTRS
1785_mm256_stream_pd(double *__a, __m256d __b)
1786{
1787 __builtin_ia32_movntpd256(__a, (__v4df)__b);
1788}
1789
1790static __inline void __DEFAULT_FN_ATTRS
1791_mm256_stream_ps(float *__p, __m256 __a)
1792{
1793 __builtin_ia32_movntps256(__p, (__v8sf)__a);
1794}
1795
1796/* Create vectors */
1797static __inline__ __m256d __DEFAULT_FN_ATTRS
1798_mm256_undefined_pd()
1799{
1800 return (__m256d)__builtin_ia32_undef256();
1801}
1802
1803static __inline__ __m256 __DEFAULT_FN_ATTRS
1804_mm256_undefined_ps()
1805{
1806 return (__m256)__builtin_ia32_undef256();
1807}
1808
1809static __inline__ __m256i __DEFAULT_FN_ATTRS
1810_mm256_undefined_si256()
1811{
1812 return (__m256i)__builtin_ia32_undef256();
1813}
1814
1815static __inline __m256d __DEFAULT_FN_ATTRS
1816_mm256_set_pd(double __a, double __b, double __c, double __d)
1817{
1818 return (__m256d){ __d, __c, __b, __a };
1819}
1820
1821static __inline __m256 __DEFAULT_FN_ATTRS
1822_mm256_set_ps(float __a, float __b, float __c, float __d,
1823 float __e, float __f, float __g, float __h)
1824{
1825 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
1826}
1827
1828static __inline __m256i __DEFAULT_FN_ATTRS
1829_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
1830 int __i4, int __i5, int __i6, int __i7)
1831{
1832 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
1833}
1834
1835static __inline __m256i __DEFAULT_FN_ATTRS
1836_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
1837 short __w11, short __w10, short __w09, short __w08,
1838 short __w07, short __w06, short __w05, short __w04,
1839 short __w03, short __w02, short __w01, short __w00)
1840{
1841 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
1842 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
1843}
1844
1845static __inline __m256i __DEFAULT_FN_ATTRS
1846_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
1847 char __b27, char __b26, char __b25, char __b24,
1848 char __b23, char __b22, char __b21, char __b20,
1849 char __b19, char __b18, char __b17, char __b16,
1850 char __b15, char __b14, char __b13, char __b12,
1851 char __b11, char __b10, char __b09, char __b08,
1852 char __b07, char __b06, char __b05, char __b04,
1853 char __b03, char __b02, char __b01, char __b00)
1854{
1855 return (__m256i)(__v32qi){
1856 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
1857 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
1858 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
1859 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
1860 };
1861}
1862
1863static __inline __m256i __DEFAULT_FN_ATTRS
1864_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
1865{
1866 return (__m256i)(__v4di){ __d, __c, __b, __a };
1867}
1868
1869/* Create vectors with elements in reverse order */
1870static __inline __m256d __DEFAULT_FN_ATTRS
1871_mm256_setr_pd(double __a, double __b, double __c, double __d)
1872{
1873 return (__m256d){ __a, __b, __c, __d };
1874}
1875
1876static __inline __m256 __DEFAULT_FN_ATTRS
1877_mm256_setr_ps(float __a, float __b, float __c, float __d,
1878 float __e, float __f, float __g, float __h)
1879{
1880 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
1881}
1882
1883static __inline __m256i __DEFAULT_FN_ATTRS
1884_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
1885 int __i4, int __i5, int __i6, int __i7)
1886{
1887 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
1888}
1889
1890static __inline __m256i __DEFAULT_FN_ATTRS
1891_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
1892 short __w11, short __w10, short __w09, short __w08,
1893 short __w07, short __w06, short __w05, short __w04,
1894 short __w03, short __w02, short __w01, short __w00)
1895{
1896 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
1897 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
1898}
1899
1900static __inline __m256i __DEFAULT_FN_ATTRS
1901_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
1902 char __b27, char __b26, char __b25, char __b24,
1903 char __b23, char __b22, char __b21, char __b20,
1904 char __b19, char __b18, char __b17, char __b16,
1905 char __b15, char __b14, char __b13, char __b12,
1906 char __b11, char __b10, char __b09, char __b08,
1907 char __b07, char __b06, char __b05, char __b04,
1908 char __b03, char __b02, char __b01, char __b00)
1909{
1910 return (__m256i)(__v32qi){
1911 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
1912 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
1913 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
1914 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
1915}
1916
1917static __inline __m256i __DEFAULT_FN_ATTRS
1918_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
1919{
1920 return (__m256i)(__v4di){ __a, __b, __c, __d };
1921}
1922
1923/* Create vectors with repeated elements */
1924static __inline __m256d __DEFAULT_FN_ATTRS
1925_mm256_set1_pd(double __w)
1926{
1927 return (__m256d){ __w, __w, __w, __w };
1928}
1929
1930static __inline __m256 __DEFAULT_FN_ATTRS
1931_mm256_set1_ps(float __w)
1932{
1933 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
1934}
1935
1936static __inline __m256i __DEFAULT_FN_ATTRS
1937_mm256_set1_epi32(int __i)
1938{
1939 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
1940}
1941
1942static __inline __m256i __DEFAULT_FN_ATTRS
1943_mm256_set1_epi16(short __w)
1944{
1945 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
1946 __w, __w, __w, __w, __w, __w };
1947}
1948
1949static __inline __m256i __DEFAULT_FN_ATTRS
1950_mm256_set1_epi8(char __b)
1951{
1952 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1953 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1954 __b, __b, __b, __b, __b, __b, __b };
1955}
1956
1957static __inline __m256i __DEFAULT_FN_ATTRS
1958_mm256_set1_epi64x(long long __q)
1959{
1960 return (__m256i)(__v4di){ __q, __q, __q, __q };
1961}
1962
/* Create zeroed vectors */
1964static __inline __m256d __DEFAULT_FN_ATTRS
1965_mm256_setzero_pd(void)
1966{
1967 return (__m256d){ 0, 0, 0, 0 };
1968}
1969
1970static __inline __m256 __DEFAULT_FN_ATTRS
1971_mm256_setzero_ps(void)
1972{
1973 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1974}
1975
1976static __inline __m256i __DEFAULT_FN_ATTRS
1977_mm256_setzero_si256(void)
1978{
1979 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1980}
1981
1982/* Cast between vector types */
1983static __inline __m256 __DEFAULT_FN_ATTRS
1984_mm256_castpd_ps(__m256d __a)
1985{
1986 return (__m256)__a;
1987}
1988
1989static __inline __m256i __DEFAULT_FN_ATTRS
1990_mm256_castpd_si256(__m256d __a)
1991{
1992 return (__m256i)__a;
1993}
1994
1995static __inline __m256d __DEFAULT_FN_ATTRS
1996_mm256_castps_pd(__m256 __a)
1997{
1998 return (__m256d)__a;
1999}
2000
2001static __inline __m256i __DEFAULT_FN_ATTRS
2002_mm256_castps_si256(__m256 __a)
2003{
2004 return (__m256i)__a;
2005}
2006
2007static __inline __m256 __DEFAULT_FN_ATTRS
2008_mm256_castsi256_ps(__m256i __a)
2009{
2010 return (__m256)__a;
2011}
2012
2013static __inline __m256d __DEFAULT_FN_ATTRS
2014_mm256_castsi256_pd(__m256i __a)
2015{
2016 return (__m256d)__a;
2017}
2018
2019static __inline __m128d __DEFAULT_FN_ATTRS
2020_mm256_castpd256_pd128(__m256d __a)
2021{
2022 return __builtin_shufflevector(__a, __a, 0, 1);
2023}
2024
2025static __inline __m128 __DEFAULT_FN_ATTRS
2026_mm256_castps256_ps128(__m256 __a)
2027{
2028 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
2029}
2030
2031static __inline __m128i __DEFAULT_FN_ATTRS
2032_mm256_castsi256_si128(__m256i __a)
2033{
2034 return __builtin_shufflevector(__a, __a, 0, 1);
2035}
2036
2037static __inline __m256d __DEFAULT_FN_ATTRS
2038_mm256_castpd128_pd256(__m128d __a)
2039{
2040 return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
2041}
2042
2043static __inline __m256 __DEFAULT_FN_ATTRS
2044_mm256_castps128_ps256(__m128 __a)
2045{
2046 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
2047}
2048
2049static __inline __m256i __DEFAULT_FN_ATTRS
2050_mm256_castsi128_si256(__m128i __a)
2051{
2052 return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
2053}
2054
2055/*
2056 Vector insert.
2057 We use macros rather than inlines because we only want to accept
2058 invocations where the immediate M is a constant expression.
2059*/
/* Replace one 128-bit half of V1 with V2.  Only bit 0 of M is examined:
   when it is clear, V2 becomes elements 0-3 and elements 4-7 of V1 are
   kept; when it is set, elements 0-3 of V1 are kept and V2 becomes
   elements 4-7.  Shuffle indices 0-7 name V1, 8-11 name the widened V2. */
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(__m256)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    ((((M) & 1) == 0) ?  8 :  0), \
    ((((M) & 1) == 0) ?  9 :  1), \
    ((((M) & 1) == 0) ? 10 :  2), \
    ((((M) & 1) == 0) ? 11 :  3), \
    ((((M) & 1) == 0) ?  4 :  8), \
    ((((M) & 1) == 0) ?  5 :  9), \
    ((((M) & 1) == 0) ?  6 : 10), \
    ((((M) & 1) == 0) ?  7 : 11) );})
2072
/* Replace one 128-bit half of V1 with V2.  Only bit 0 of M is examined:
   when it is clear, V2 becomes elements 0-1 and elements 2-3 of V1 are
   kept; when it is set, elements 0-1 of V1 are kept and V2 becomes
   elements 2-3.  Shuffle indices 0-3 name V1, 4-5 name the widened V2. */
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
2081
/* Replace one 128-bit half of V1 with V2.  Only bit 0 of M is examined:
   when it is clear, V2 becomes elements 0-1 and elements 2-3 of V1 are
   kept; when it is set, elements 0-1 of V1 are kept and V2 becomes
   elements 2-3.  Shuffle indices 0-3 name V1, 4-5 name the widened V2. */
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    ((((M) & 1) == 0) ? 4 : 0), \
    ((((M) & 1) == 0) ? 5 : 1), \
    ((((M) & 1) == 0) ? 2 : 4), \
    ((((M) & 1) == 0) ? 3 : 5) );})
2090
2091/*
2092 Vector extract.
2093 We use macros rather than inlines because we only want to accept
2094 invocations where the immediate M is a constant expression.
2095*/
/* Extract one 128-bit half of V.  Only bit 0 of M is examined: clear
   selects elements 0-3, set selects elements 4-7.  The zero vector is
   only the shuffle's second operand; no index refers to it. */
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_setzero_ps()), \
    ((((M) & 1) == 0) ? 0 : 4), \
    ((((M) & 1) == 0) ? 1 : 5), \
    ((((M) & 1) == 0) ? 2 : 6), \
    ((((M) & 1) == 0) ? 3 : 7) );})
2104
/* Extract one 128-bit half of V.  Only bit 0 of M is examined: clear
   selects elements 0-1, set selects elements 2-3.  The zero vector is
   only the shuffle's second operand; no index refers to it. */
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_setzero_pd()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
2111
/* Extract one 128-bit half of V.  Only bit 0 of M is examined: clear
   selects elements 0-1, set selects elements 2-3.  The zero vector is
   only the shuffle's second operand; no index refers to it. */
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_setzero_si256()), \
    ((((M) & 1) == 0) ? 0 : 2), \
    ((((M) & 1) == 0) ? 1 : 3) );})
2118
2119/* SIMD load ops (unaligned) */
2120static __inline __m256 __DEFAULT_FN_ATTRS
2121_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
2122{
2123 struct __loadu_ps {
2124 __m128 __v;
2125 } __attribute__((__packed__, __may_alias__));
2126
2127 __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
2128 return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
2129}
2130
2131static __inline __m256d __DEFAULT_FN_ATTRS
2132_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
2133{
2134 struct __loadu_pd {
2135 __m128d __v;
2136 } __attribute__((__packed__, __may_alias__));
2137
2138 __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
2139 return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
2140}
2141
2142static __inline __m256i __DEFAULT_FN_ATTRS
2143_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
2144{
2145 struct __loadu_si128 {
2146 __m128i __v;
2147 } __attribute__((__packed__, __may_alias__));
2148 __m256i __v256 = _mm256_castsi128_si256(
2149 ((struct __loadu_si128*)__addr_lo)->__v);
2150 return _mm256_insertf128_si256(__v256,
2151 ((struct __loadu_si128*)__addr_hi)->__v, 1);
2152}
2153
2154/* SIMD store ops (unaligned) */
2155static __inline void __DEFAULT_FN_ATTRS
2156_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
2157{
2158 __m128 __v128;
2159
2160 __v128 = _mm256_castps256_ps128(__a);
2161 __builtin_ia32_storeups(__addr_lo, __v128);
2162 __v128 = _mm256_extractf128_ps(__a, 1);
2163 __builtin_ia32_storeups(__addr_hi, __v128);
2164}
2165
2166static __inline void __DEFAULT_FN_ATTRS
2167_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
2168{
2169 __m128d __v128;
2170
2171 __v128 = _mm256_castpd256_pd128(__a);
2172 __builtin_ia32_storeupd(__addr_lo, __v128);
2173 __v128 = _mm256_extractf128_pd(__a, 1);
2174 __builtin_ia32_storeupd(__addr_hi, __v128);
2175}
2176
2177static __inline void __DEFAULT_FN_ATTRS
2178_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
2179{
2180 __m128i __v128;
2181
2182 __v128 = _mm256_castsi256_si128(__a);
2183 __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
2184 __v128 = _mm256_extractf128_si256(__a, 1);
2185 __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
2186}
2187
2188static __inline __m256 __DEFAULT_FN_ATTRS
2189_mm256_set_m128 (__m128 __hi, __m128 __lo) {
2190 return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7);
2191}
2192
2193static __inline __m256d __DEFAULT_FN_ATTRS
2194_mm256_set_m128d (__m128d __hi, __m128d __lo) {
2195 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2196}
2197
2198static __inline __m256i __DEFAULT_FN_ATTRS
2199_mm256_set_m128i (__m128i __hi, __m128i __lo) {
2200 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2201}
2202
2203static __inline __m256 __DEFAULT_FN_ATTRS
2204_mm256_setr_m128 (__m128 __lo, __m128 __hi) {
2205 return _mm256_set_m128(__hi, __lo);
2206}
2207
2208static __inline __m256d __DEFAULT_FN_ATTRS
2209_mm256_setr_m128d (__m128d __lo, __m128d __hi) {
2210 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2211}
2212
2213static __inline __m256i __DEFAULT_FN_ATTRS
2214_mm256_setr_m128i (__m128i __lo, __m128i __hi) {
2215 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
2216}
2217
2218#undef __DEFAULT_FN_ATTRS
2219
2220#endif /* __AVXINTRIN_H */