/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
Benjamin Kramer6f35f3c2010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000027
Richard Smith49e56442013-07-14 05:41:45 +000028#ifndef __AVXINTRIN_H
29#define __AVXINTRIN_H
30
/* Internal 32-byte vector views, one per element type. */
typedef double __v4df __attribute__((__vector_size__(32)));
typedef float __v8sf __attribute__((__vector_size__(32)));
typedef long long __v4di __attribute__((__vector_size__(32)));
typedef int __v8si __attribute__((__vector_size__(32)));
typedef short __v16hi __attribute__((__vector_size__(32)));
typedef char __v32qi __attribute__((__vector_size__(32)));

/* Public 32-byte vector types used in the intrinsic signatures. */
typedef float __m256 __attribute__((__vector_size__(32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
41
42/* Arithmetic */
43static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000044_mm256_add_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000045{
David Blaikie3302f2b2013-01-16 23:08:36 +000046 return __a+__b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000047}
48
49static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000050_mm256_add_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000051{
David Blaikie3302f2b2013-01-16 23:08:36 +000052 return __a+__b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000053}
54
55static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000056_mm256_sub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000057{
David Blaikie3302f2b2013-01-16 23:08:36 +000058 return __a-__b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000059}
60
61static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000062_mm256_sub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000063{
David Blaikie3302f2b2013-01-16 23:08:36 +000064 return __a-__b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000065}
66
67static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000068_mm256_addsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000069{
David Blaikie3302f2b2013-01-16 23:08:36 +000070 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000071}
72
73static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000074_mm256_addsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000075{
David Blaikie3302f2b2013-01-16 23:08:36 +000076 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000077}
78
79static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000080_mm256_div_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000081{
David Blaikie3302f2b2013-01-16 23:08:36 +000082 return __a / __b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000083}
84
85static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000086_mm256_div_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000087{
David Blaikie3302f2b2013-01-16 23:08:36 +000088 return __a / __b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000089}
90
91static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000092_mm256_max_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000093{
David Blaikie3302f2b2013-01-16 23:08:36 +000094 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000095}
96
97static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +000098_mm256_max_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +000099{
David Blaikie3302f2b2013-01-16 23:08:36 +0000100 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000101}
102
103static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000104_mm256_min_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000105{
David Blaikie3302f2b2013-01-16 23:08:36 +0000106 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000107}
108
109static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000110_mm256_min_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000111{
David Blaikie3302f2b2013-01-16 23:08:36 +0000112 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000113}
114
115static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000116_mm256_mul_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000117{
David Blaikie3302f2b2013-01-16 23:08:36 +0000118 return __a * __b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000119}
120
121static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000122_mm256_mul_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000123{
David Blaikie3302f2b2013-01-16 23:08:36 +0000124 return __a * __b;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000125}
126
127static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000128_mm256_sqrt_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000129{
David Blaikie3302f2b2013-01-16 23:08:36 +0000130 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000131}
132
133static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000134_mm256_sqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000135{
David Blaikie3302f2b2013-01-16 23:08:36 +0000136 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000137}
138
139static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000140_mm256_rsqrt_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000141{
David Blaikie3302f2b2013-01-16 23:08:36 +0000142 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000143}
144
145static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000146_mm256_rcp_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000147{
David Blaikie3302f2b2013-01-16 23:08:36 +0000148 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000149}
150
/*
 * Evaluates V once into a __m256d temporary, then forwards it (as __v4df)
 * together with the immediate M to the __builtin_ia32_roundpd256 builtin.
 * A macro rather than a function so that M reaches the builtin unchanged.
 */
#define _mm256_round_pd(V, M) __extension__ ({ \
  __m256d __V = (V); \
  (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000154
/*
 * Evaluates V once into a __m256 temporary, then forwards it (as __v8sf)
 * together with the immediate M to the __builtin_ia32_roundps256 builtin.
 * A macro rather than a function so that M reaches the builtin unchanged.
 */
#define _mm256_round_ps(V, M) __extension__ ({ \
  __m256 __V = (V); \
  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000158
/*
 * Ceiling / floor shorthands: each expands to the matching _mm256_round_*
 * macro with _MM_FROUND_CEIL or _MM_FROUND_FLOOR as the immediate. Those
 * two constants are not defined in this part of the header.
 */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
163
164/* Logical */
165static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000166_mm256_and_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000167{
David Blaikie3302f2b2013-01-16 23:08:36 +0000168 return (__m256d)((__v4di)__a & (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000169}
170
171static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000172_mm256_and_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000173{
David Blaikie3302f2b2013-01-16 23:08:36 +0000174 return (__m256)((__v8si)__a & (__v8si)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000175}
176
177static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000178_mm256_andnot_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000179{
David Blaikie3302f2b2013-01-16 23:08:36 +0000180 return (__m256d)(~(__v4di)__a & (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000181}
182
183static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000184_mm256_andnot_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000185{
David Blaikie3302f2b2013-01-16 23:08:36 +0000186 return (__m256)(~(__v8si)__a & (__v8si)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000187}
188
189static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000190_mm256_or_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000191{
David Blaikie3302f2b2013-01-16 23:08:36 +0000192 return (__m256d)((__v4di)__a | (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000193}
194
195static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000196_mm256_or_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000197{
David Blaikie3302f2b2013-01-16 23:08:36 +0000198 return (__m256)((__v8si)__a | (__v8si)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000199}
200
201static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000202_mm256_xor_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000203{
David Blaikie3302f2b2013-01-16 23:08:36 +0000204 return (__m256d)((__v4di)__a ^ (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000205}
206
207static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000208_mm256_xor_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000209{
David Blaikie3302f2b2013-01-16 23:08:36 +0000210 return (__m256)((__v8si)__a ^ (__v8si)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000211}
212
213/* Horizontal arithmetic */
214static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000215_mm256_hadd_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000216{
David Blaikie3302f2b2013-01-16 23:08:36 +0000217 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000218}
219
220static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000221_mm256_hadd_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000222{
David Blaikie3302f2b2013-01-16 23:08:36 +0000223 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000224}
225
226static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000227_mm256_hsub_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000228{
David Blaikie3302f2b2013-01-16 23:08:36 +0000229 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000230}
231
232static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000233_mm256_hsub_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000234{
David Blaikie3302f2b2013-01-16 23:08:36 +0000235 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000236}
237
238/* Vector permutations */
239static __inline __m128d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000240_mm_permutevar_pd(__m128d __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000241{
David Blaikie3302f2b2013-01-16 23:08:36 +0000242 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000243}
244
245static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000246_mm256_permutevar_pd(__m256d __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000247{
David Blaikie3302f2b2013-01-16 23:08:36 +0000248 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000249}
250
251static __inline __m128 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000252_mm_permutevar_ps(__m128 __a, __m128i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000253{
David Blaikie3302f2b2013-01-16 23:08:36 +0000254 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000255}
256
257static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000258_mm256_permutevar_ps(__m256 __a, __m256i __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000259{
Craig Topper9fee8ab2015-01-31 06:33:59 +0000260 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000261}
262
/*
 * Permute the two doubles of A under immediate C:
 *   result[0] = A[C bit 0], result[1] = A[C bit 1].
 * Both shuffle indices are below 2, so the zero vector given as the second
 * shuffle operand is never selected. A is evaluated once; C is expanded
 * several times and must be a constant expression.
 */
#define _mm_permute_pd(A, C) __extension__ ({ \
  __m128d __A = (A); \
  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000267
/*
 * Permute the four doubles of A under immediate C, pairwise:
 *   result[0] = A[C bit 0],     result[1] = A[C bit 1],
 *   result[2] = A[2 + C bit 2], result[3] = A[2 + C bit 3].
 * All shuffle indices are below 4, so the zero vector given as the second
 * shuffle operand is never selected. A is evaluated once; C is expanded
 * several times and must be a constant expression.
 */
#define _mm256_permute_pd(A, C) __extension__ ({ \
  __m256d __A = (A); \
  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
                                   (C) & 0x1, ((C) & 0x2) >> 1, \
                                   2 + (((C) & 0x4) >> 2), \
                                   2 + (((C) & 0x8) >> 3)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000274
/*
 * Permute the four floats of A under immediate C: result[i] is the element
 * of A indexed by the 2-bit field C[2i+1:2i]. All shuffle indices are
 * below 4, so the zero vector given as the second shuffle operand is never
 * selected. A is evaluated once; C is expanded several times and must be a
 * constant expression.
 */
#define _mm_permute_ps(A, C) __extension__ ({ \
  __m128 __A = (A); \
  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000280
/*
 * Permute the eight floats of A under immediate C. The four 2-bit fields
 * of C select within A[0..3] for result[0..3]; the same four fields, offset
 * by 4, select within A[4..7] for result[4..7]. All shuffle indices are
 * below 8, so the zero vector given as the second shuffle operand is never
 * selected. A is evaluated once; C is expanded several times and must be a
 * constant expression.
 */
#define _mm256_permute_ps(A, C) __extension__ ({ \
  __m256 __A = (A); \
  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
                                  (C) & 0x3, ((C) & 0xc) >> 2, \
                                  ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                  4 + (((C) & 0x03) >> 0), \
                                  4 + (((C) & 0x0c) >> 2), \
                                  4 + (((C) & 0x30) >> 4), \
                                  4 + (((C) & 0xc0) >> 6)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000290
/*
 * Evaluates V1 and V2 once each, then forwards them (as __v4df) with the
 * immediate M to the __builtin_ia32_vperm2f128_pd256 builtin.
 */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000295
/*
 * Evaluates V1 and V2 once each, then forwards them (as __v8sf) with the
 * immediate M to the __builtin_ia32_vperm2f128_ps256 builtin.
 */
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000300
/*
 * Evaluates V1 and V2 once each, then forwards them (as __v8si) with the
 * immediate M to the __builtin_ia32_vperm2f128_si256 builtin.
 */
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000305
/* Vector Blend */

/*
 * Blend four doubles under immediate M: result[i] is V2[i] when bit i of M
 * is set, otherwise V1[i] (shuffle indices 4..7 address V2). V1 and V2 are
 * evaluated once; M is expanded several times and must be a constant
 * expression.
 */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000315
/*
 * Blend eight floats under immediate M: result[i] is V2[i] when bit i of M
 * is set, otherwise V1[i] (shuffle indices 8..15 address V2). V1 and V2 are
 * evaluated once; M is expanded several times and must be a constant
 * expression.
 */
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
                                  (((M) & 0x01) ?  8 : 0), \
                                  (((M) & 0x02) ?  9 : 1), \
                                  (((M) & 0x04) ? 10 : 2), \
                                  (((M) & 0x08) ? 11 : 3), \
                                  (((M) & 0x10) ? 12 : 4), \
                                  (((M) & 0x20) ? 13 : 5), \
                                  (((M) & 0x40) ? 14 : 6), \
                                  (((M) & 0x80) ? 15 : 7)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000328
329static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000330_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000331{
David Blaikie3302f2b2013-01-16 23:08:36 +0000332 return (__m256d)__builtin_ia32_blendvpd256(
333 (__v4df)__a, (__v4df)__b, (__v4df)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000334}
335
336static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000337_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000338{
David Blaikie5bb70032013-01-16 23:13:42 +0000339 return (__m256)__builtin_ia32_blendvps256(
David Blaikie3302f2b2013-01-16 23:08:36 +0000340 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000341}
342
/* Vector Dot Product */

/*
 * Evaluates V1 and V2 once each, then forwards them (as __v8sf) with the
 * immediate M to the __builtin_ia32_dpps256 builtin.
 */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000348
/* Vector shuffle */

/*
 * Shuffle floats from a and b under immediate mask. With f0..f3 denoting
 * the 2-bit fields mask[1:0], mask[3:2], mask[5:4], mask[7:6]:
 *   result[0] = a[f0],     result[1] = a[f1],
 *   result[2] = b[f2],     result[3] = b[f3],
 *   result[4] = a[4 + f0], result[5] = a[4 + f1],
 *   result[6] = b[4 + f2], result[7] = b[4 + f3].
 * (Shuffle indices 8..15 address b.) a and b are evaluated once; mask is
 * expanded several times and must be a constant expression.
 */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3,                ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000358
/*
 * Shuffle doubles from a and b under immediate mask:
 *   result[0] = a[mask bit 0],     result[1] = b[mask bit 1],
 *   result[2] = a[2 + mask bit 2], result[3] = b[2 + mask bit 3].
 * (Shuffle indices 4..7 address b.) a and b are evaluated once; mask is
 * expanded several times and must be a constant expression.
 */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000367
/* Compare */

/* Predicate immediates accepted as the last argument of the *_cmp_* macros. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */
401
/*
 * Evaluates a and b once each, then forwards them (as __v2df) with the
 * predicate immediate c (one of the _CMP_* constants) to the
 * __builtin_ia32_cmppd builtin.
 */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000406
/*
 * Evaluates a and b once each, then forwards them (as __v4sf) with the
 * predicate immediate c (one of the _CMP_* constants) to the
 * __builtin_ia32_cmpps builtin.
 */
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000411
/*
 * Evaluates a and b once each, then forwards them (as __v4df) with the
 * predicate immediate c (one of the _CMP_* constants) to the
 * __builtin_ia32_cmppd256 builtin.
 */
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000416
/*
 * Evaluates a and b once each, then forwards them (as __v8sf) with the
 * predicate immediate c (one of the _CMP_* constants) to the
 * __builtin_ia32_cmpps256 builtin.
 */
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000421
/*
 * Evaluates a and b once each, then forwards them (as __v2df) with the
 * predicate immediate c (one of the _CMP_* constants) to the
 * __builtin_ia32_cmpsd builtin.
 */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000426
/*
 * Evaluates a and b once each, then forwards them (as __v4sf) with the
 * predicate immediate c (one of the _CMP_* constants) to the
 * __builtin_ia32_cmpss builtin.
 */
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000431
/* Vector extract */

/*
 * Evaluates A once, then forwards it (as __v4df) with the immediate O to
 * the __builtin_ia32_vextractf128_pd256 builtin; the result is a __m128d.
 */
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
  __m256d __A = (A); \
  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000436
/*
 * Evaluates A once, then forwards it (as __v8sf) with the immediate O to
 * the __builtin_ia32_vextractf128_ps256 builtin; the result is a __m128.
 */
#define _mm256_extractf128_ps(A, O) __extension__ ({ \
  __m256 __A = (A); \
  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000440
/*
 * Evaluates A once, then forwards it (as __v8si) with the immediate O to
 * the __builtin_ia32_vextractf128_si256 builtin; the result is a __m128i.
 */
#define _mm256_extractf128_si256(A, O) __extension__ ({ \
  __m256i __A = (A); \
  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000444
445static __inline int __attribute__((__always_inline__, __nodebug__))
Craig Topper459554f2015-01-31 06:31:30 +0000446_mm256_extract_epi32(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000447{
David Blaikie3302f2b2013-01-16 23:08:36 +0000448 __v8si __b = (__v8si)__a;
Manman Renc94122e2013-10-23 20:33:14 +0000449 return __b[__imm & 7];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000450}
451
452static __inline int __attribute__((__always_inline__, __nodebug__))
Craig Topper459554f2015-01-31 06:31:30 +0000453_mm256_extract_epi16(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000454{
David Blaikie3302f2b2013-01-16 23:08:36 +0000455 __v16hi __b = (__v16hi)__a;
Manman Renc94122e2013-10-23 20:33:14 +0000456 return __b[__imm & 15];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000457}
458
459static __inline int __attribute__((__always_inline__, __nodebug__))
Craig Topper459554f2015-01-31 06:31:30 +0000460_mm256_extract_epi8(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000461{
David Blaikie3302f2b2013-01-16 23:08:36 +0000462 __v32qi __b = (__v32qi)__a;
Manman Renc94122e2013-10-23 20:33:14 +0000463 return __b[__imm & 31];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000464}
465
466#ifdef __x86_64__
467static __inline long long __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000468_mm256_extract_epi64(__m256i __a, const int __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000469{
David Blaikie3302f2b2013-01-16 23:08:36 +0000470 __v4di __b = (__v4di)__a;
Manman Renc94122e2013-10-23 20:33:14 +0000471 return __b[__imm & 3];
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000472}
473#endif
474
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000475static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000476_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000477{
David Blaikie3302f2b2013-01-16 23:08:36 +0000478 __v8si __c = (__v8si)__a;
479 __c[__imm & 7] = __b;
480 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000481}
482
483static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000484_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000485{
David Blaikie3302f2b2013-01-16 23:08:36 +0000486 __v16hi __c = (__v16hi)__a;
487 __c[__imm & 15] = __b;
488 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000489}
490
491static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000492_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000493{
David Blaikie3302f2b2013-01-16 23:08:36 +0000494 __v32qi __c = (__v32qi)__a;
495 __c[__imm & 31] = __b;
496 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000497}
498
499#ifdef __x86_64__
500static __inline __m256i __attribute__((__always_inline__, __nodebug__))
Filipe Cabecinhasd7400292015-02-19 19:00:33 +0000501_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000502{
David Blaikie3302f2b2013-01-16 23:08:36 +0000503 __v4di __c = (__v4di)__a;
504 __c[__imm & 3] = __b;
505 return (__m256i)__c;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000506}
507#endif
508
509/* Conversion */
510static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000511_mm256_cvtepi32_pd(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000512{
David Blaikie3302f2b2013-01-16 23:08:36 +0000513 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000514}
515
516static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000517_mm256_cvtepi32_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000518{
David Blaikie3302f2b2013-01-16 23:08:36 +0000519 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000520}
521
522static __inline __m128 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000523_mm256_cvtpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000524{
David Blaikie3302f2b2013-01-16 23:08:36 +0000525 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000526}
527
528static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000529_mm256_cvtps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000530{
David Blaikie3302f2b2013-01-16 23:08:36 +0000531 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000532}
533
534static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000535_mm256_cvtps_pd(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000536{
David Blaikie3302f2b2013-01-16 23:08:36 +0000537 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000538}
539
540static __inline __m128i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000541_mm256_cvttpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000542{
David Blaikie3302f2b2013-01-16 23:08:36 +0000543 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000544}
545
546static __inline __m128i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000547_mm256_cvtpd_epi32(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000548{
David Blaikie3302f2b2013-01-16 23:08:36 +0000549 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000550}
551
552static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000553_mm256_cvttps_epi32(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000554{
David Blaikie3302f2b2013-01-16 23:08:36 +0000555 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000556}
557
558/* Vector replicate */
559static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000560_mm256_movehdup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000561{
David Blaikie3302f2b2013-01-16 23:08:36 +0000562 return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000563}
564
565static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000566_mm256_moveldup_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000567{
David Blaikie3302f2b2013-01-16 23:08:36 +0000568 return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000569}
570
571static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000572_mm256_movedup_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000573{
David Blaikie3302f2b2013-01-16 23:08:36 +0000574 return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000575}
576
577/* Unpack and Interleave */
578static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000579_mm256_unpackhi_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000580{
David Blaikie3302f2b2013-01-16 23:08:36 +0000581 return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000582}
583
584static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000585_mm256_unpacklo_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000586{
David Blaikie3302f2b2013-01-16 23:08:36 +0000587 return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000588}
589
590static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000591_mm256_unpackhi_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000592{
David Blaikie3302f2b2013-01-16 23:08:36 +0000593 return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000594}
595
596static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000597_mm256_unpacklo_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000598{
David Blaikie3302f2b2013-01-16 23:08:36 +0000599 return __builtin_shufflevector(__a, __b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000600}
601
602/* Bit Test */
603static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000604_mm_testz_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000605{
David Blaikie3302f2b2013-01-16 23:08:36 +0000606 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000607}
608
609static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000610_mm_testc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000611{
David Blaikie3302f2b2013-01-16 23:08:36 +0000612 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000613}
614
615static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000616_mm_testnzc_pd(__m128d __a, __m128d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000617{
David Blaikie3302f2b2013-01-16 23:08:36 +0000618 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000619}
620
621static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000622_mm_testz_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000623{
David Blaikie3302f2b2013-01-16 23:08:36 +0000624 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000625}
626
627static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000628_mm_testc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000629{
David Blaikie3302f2b2013-01-16 23:08:36 +0000630 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000631}
632
633static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000634_mm_testnzc_ps(__m128 __a, __m128 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000635{
David Blaikie3302f2b2013-01-16 23:08:36 +0000636 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000637}
638
639static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000640_mm256_testz_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000641{
David Blaikie3302f2b2013-01-16 23:08:36 +0000642 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000643}
644
645static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000646_mm256_testc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000647{
David Blaikie3302f2b2013-01-16 23:08:36 +0000648 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000649}
650
651static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000652_mm256_testnzc_pd(__m256d __a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000653{
David Blaikie3302f2b2013-01-16 23:08:36 +0000654 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000655}
656
657static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000658_mm256_testz_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000659{
David Blaikie3302f2b2013-01-16 23:08:36 +0000660 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000661}
662
663static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000664_mm256_testc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000665{
David Blaikie3302f2b2013-01-16 23:08:36 +0000666 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000667}
668
669static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000670_mm256_testnzc_ps(__m256 __a, __m256 __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000671{
David Blaikie3302f2b2013-01-16 23:08:36 +0000672 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000673}
674
675static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000676_mm256_testz_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000677{
David Blaikie3302f2b2013-01-16 23:08:36 +0000678 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000679}
680
681static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000682_mm256_testc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000683{
David Blaikie3302f2b2013-01-16 23:08:36 +0000684 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000685}
686
687static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000688_mm256_testnzc_si256(__m256i __a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000689{
David Blaikie3302f2b2013-01-16 23:08:36 +0000690 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000691}
692
693/* Vector extract sign mask */
694static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000695_mm256_movemask_pd(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000696{
David Blaikie3302f2b2013-01-16 23:08:36 +0000697 return __builtin_ia32_movmskpd256((__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000698}
699
700static __inline int __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000701_mm256_movemask_ps(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000702{
David Blaikie3302f2b2013-01-16 23:08:36 +0000703 return __builtin_ia32_movmskps256((__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000704}
705
/* Vector zero */
/* Invokes the vzeroall builtin.  Takes no arguments and returns nothing. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
  return;
}
712
/* Invokes the vzeroupper builtin.  Takes no arguments and returns nothing. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
  return;
}
718
719/* Vector load with broadcast */
720static __inline __m128 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000721_mm_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000722{
Adam Nemet286ae082014-05-29 20:47:29 +0000723 float __f = *__a;
724 return (__m128)(__v4sf){ __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000725}
726
727static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000728_mm256_broadcast_sd(double const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000729{
Adam Nemet286ae082014-05-29 20:47:29 +0000730 double __d = *__a;
731 return (__m256d)(__v4df){ __d, __d, __d, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000732}
733
734static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000735_mm256_broadcast_ss(float const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000736{
Adam Nemet286ae082014-05-29 20:47:29 +0000737 float __f = *__a;
738 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000739}
740
741static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000742_mm256_broadcast_pd(__m128d const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000743{
David Blaikie3302f2b2013-01-16 23:08:36 +0000744 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000745}
746
747static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000748_mm256_broadcast_ps(__m128 const *__a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000749{
David Blaikie3302f2b2013-01-16 23:08:36 +0000750 return (__m256)__builtin_ia32_vbroadcastf128_ps256(__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000751}
752
753/* SIMD load ops */
754static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000755_mm256_load_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000756{
David Blaikie3302f2b2013-01-16 23:08:36 +0000757 return *(__m256d *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000758}
759
760static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000761_mm256_load_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000762{
David Blaikie3302f2b2013-01-16 23:08:36 +0000763 return *(__m256 *)__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000764}
765
766static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000767_mm256_loadu_pd(double const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000768{
Craig Topper9e9301a2012-01-25 04:26:17 +0000769 struct __loadu_pd {
David Blaikie3302f2b2013-01-16 23:08:36 +0000770 __m256d __v;
David Majnemer1cf22e62015-02-04 00:26:10 +0000771 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +0000772 return ((struct __loadu_pd*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000773}
774
775static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000776_mm256_loadu_ps(float const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000777{
Craig Topper9e9301a2012-01-25 04:26:17 +0000778 struct __loadu_ps {
David Blaikie3302f2b2013-01-16 23:08:36 +0000779 __m256 __v;
David Majnemer1cf22e62015-02-04 00:26:10 +0000780 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +0000781 return ((struct __loadu_ps*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000782}
783
784static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000785_mm256_load_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000786{
David Blaikie3302f2b2013-01-16 23:08:36 +0000787 return *__p;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000788}
789
790static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000791_mm256_loadu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000792{
Craig Topper9e9301a2012-01-25 04:26:17 +0000793 struct __loadu_si256 {
David Blaikie3302f2b2013-01-16 23:08:36 +0000794 __m256i __v;
David Majnemer1cf22e62015-02-04 00:26:10 +0000795 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +0000796 return ((struct __loadu_si256*)__p)->__v;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000797}
798
799static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000800_mm256_lddqu_si256(__m256i const *__p)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000801{
David Blaikie3302f2b2013-01-16 23:08:36 +0000802 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000803}
804
805/* SIMD store ops */
806static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000807_mm256_store_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000808{
David Blaikie3302f2b2013-01-16 23:08:36 +0000809 *(__m256d *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000810}
811
812static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000813_mm256_store_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000814{
David Blaikie3302f2b2013-01-16 23:08:36 +0000815 *(__m256 *)__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000816}
817
818static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000819_mm256_storeu_pd(double *__p, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000820{
David Blaikie3302f2b2013-01-16 23:08:36 +0000821 __builtin_ia32_storeupd256(__p, (__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000822}
823
824static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000825_mm256_storeu_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000826{
David Blaikie3302f2b2013-01-16 23:08:36 +0000827 __builtin_ia32_storeups256(__p, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000828}
829
830static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000831_mm256_store_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000832{
David Blaikie3302f2b2013-01-16 23:08:36 +0000833 *__p = __a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000834}
835
836static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000837_mm256_storeu_si256(__m256i *__p, __m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000838{
David Blaikie3302f2b2013-01-16 23:08:36 +0000839 __builtin_ia32_storedqu256((char *)__p, (__v32qi)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000840}
841
842/* Conditional load ops */
843static __inline __m128d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000844_mm_maskload_pd(double const *__p, __m128d __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000845{
David Blaikie3302f2b2013-01-16 23:08:36 +0000846 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000847}
848
849static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000850_mm256_maskload_pd(double const *__p, __m256d __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000851{
David Blaikie3302f2b2013-01-16 23:08:36 +0000852 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
853 (__v4df)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000854}
855
856static __inline __m128 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000857_mm_maskload_ps(float const *__p, __m128 __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000858{
David Blaikie3302f2b2013-01-16 23:08:36 +0000859 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000860}
861
862static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000863_mm256_maskload_ps(float const *__p, __m256 __m)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000864{
David Blaikie3302f2b2013-01-16 23:08:36 +0000865 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000866}
867
868/* Conditional store ops */
869static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000870_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000871{
David Blaikie3302f2b2013-01-16 23:08:36 +0000872 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000873}
874
875static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000876_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000877{
David Blaikie3302f2b2013-01-16 23:08:36 +0000878 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000879}
880
881static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000882_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000883{
David Blaikie3302f2b2013-01-16 23:08:36 +0000884 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000885}
886
887static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000888_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000889{
David Blaikie3302f2b2013-01-16 23:08:36 +0000890 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000891}
892
893/* Cacheability support ops */
894static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000895_mm256_stream_si256(__m256i *__a, __m256i __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000896{
David Blaikie3302f2b2013-01-16 23:08:36 +0000897 __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000898}
899
900static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000901_mm256_stream_pd(double *__a, __m256d __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000902{
David Blaikie3302f2b2013-01-16 23:08:36 +0000903 __builtin_ia32_movntpd256(__a, (__v4df)__b);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000904}
905
906static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000907_mm256_stream_ps(float *__p, __m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000908{
David Blaikie3302f2b2013-01-16 23:08:36 +0000909 __builtin_ia32_movntps256(__p, (__v8sf)__a);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000910}
911
912/* Create vectors */
913static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000914_mm256_set_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000915{
David Blaikie3302f2b2013-01-16 23:08:36 +0000916 return (__m256d){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000917}
918
919static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000920_mm256_set_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000921 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000922{
David Blaikie3302f2b2013-01-16 23:08:36 +0000923 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000924}
925
926static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000927_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000928 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000929{
David Blaikie3302f2b2013-01-16 23:08:36 +0000930 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000931}
932
933static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000934_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000935 short __w11, short __w10, short __w09, short __w08,
936 short __w07, short __w06, short __w05, short __w04,
937 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000938{
David Blaikie3302f2b2013-01-16 23:08:36 +0000939 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
940 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000941}
942
943static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000944_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000945 char __b27, char __b26, char __b25, char __b24,
946 char __b23, char __b22, char __b21, char __b20,
947 char __b19, char __b18, char __b17, char __b16,
948 char __b15, char __b14, char __b13, char __b12,
949 char __b11, char __b10, char __b09, char __b08,
950 char __b07, char __b06, char __b05, char __b04,
951 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000952{
953 return (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +0000954 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
955 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
956 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
957 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000958 };
959}
960
961static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000962_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000963{
David Blaikie3302f2b2013-01-16 23:08:36 +0000964 return (__m256i)(__v4di){ __d, __c, __b, __a };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000965}
966
967/* Create vectors with elements in reverse order */
968static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000969_mm256_setr_pd(double __a, double __b, double __c, double __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000970{
David Blaikie3302f2b2013-01-16 23:08:36 +0000971 return (__m256d){ __a, __b, __c, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000972}
973
974static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000975_mm256_setr_ps(float __a, float __b, float __c, float __d,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000976 float __e, float __f, float __g, float __h)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000977{
David Blaikie3302f2b2013-01-16 23:08:36 +0000978 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000979}
980
981static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000982_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000983 int __i4, int __i5, int __i6, int __i7)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000984{
David Blaikie3302f2b2013-01-16 23:08:36 +0000985 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000986}
987
988static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000989_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
Craig Topper9fee8ab2015-01-31 06:33:59 +0000990 short __w11, short __w10, short __w09, short __w08,
991 short __w07, short __w06, short __w05, short __w04,
992 short __w03, short __w02, short __w01, short __w00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000993{
David Blaikie3302f2b2013-01-16 23:08:36 +0000994 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
995 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +0000996}
997
998static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +0000999_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
Craig Topper9fee8ab2015-01-31 06:33:59 +00001000 char __b27, char __b26, char __b25, char __b24,
1001 char __b23, char __b22, char __b21, char __b20,
1002 char __b19, char __b18, char __b17, char __b16,
1003 char __b15, char __b14, char __b13, char __b12,
1004 char __b11, char __b10, char __b09, char __b08,
1005 char __b07, char __b06, char __b05, char __b04,
1006 char __b03, char __b02, char __b01, char __b00)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001007{
1008 return (__m256i)(__v32qi){
David Blaikie3302f2b2013-01-16 23:08:36 +00001009 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
Craig Topper9fee8ab2015-01-31 06:33:59 +00001010 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
1011 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
1012 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001013}
1014
1015static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001016_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001017{
David Blaikie3302f2b2013-01-16 23:08:36 +00001018 return (__m256i)(__v4di){ __a, __b, __c, __d };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001019}
1020
1021/* Create vectors with repeated elements */
1022static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001023_mm256_set1_pd(double __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001024{
David Blaikie3302f2b2013-01-16 23:08:36 +00001025 return (__m256d){ __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001026}
1027
1028static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001029_mm256_set1_ps(float __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001030{
David Blaikie3302f2b2013-01-16 23:08:36 +00001031 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001032}
1033
1034static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001035_mm256_set1_epi32(int __i)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001036{
David Blaikie3302f2b2013-01-16 23:08:36 +00001037 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001038}
1039
1040static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001041_mm256_set1_epi16(short __w)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001042{
David Blaikie3302f2b2013-01-16 23:08:36 +00001043 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
1044 __w, __w, __w, __w, __w, __w };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001045}
1046
1047static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001048_mm256_set1_epi8(char __b)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001049{
David Blaikie3302f2b2013-01-16 23:08:36 +00001050 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1051 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
1052 __b, __b, __b, __b, __b, __b, __b };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001053}
1054
1055static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001056_mm256_set1_epi64x(long long __q)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001057{
David Blaikie3302f2b2013-01-16 23:08:36 +00001058 return (__m256i)(__v4di){ __q, __q, __q, __q };
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001059}
1060
/* Create zeroed vectors */
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001062static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1063_mm256_setzero_pd(void)
1064{
1065 return (__m256d){ 0, 0, 0, 0 };
1066}
1067
1068static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1069_mm256_setzero_ps(void)
1070{
1071 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1072}
1073
1074static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1075_mm256_setzero_si256(void)
1076{
1077 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1078}
1079
1080/* Cast between vector types */
1081static __inline __m256 __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001082_mm256_castpd_ps(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001083{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001084 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001085}
1086
1087static __inline __m256i __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001088_mm256_castpd_si256(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001089{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001090 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001091}
1092
1093static __inline __m256d __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001094_mm256_castps_pd(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001095{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001096 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001097}
1098
1099static __inline __m256i __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001100_mm256_castps_si256(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001101{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001102 return (__m256i)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001103}
1104
1105static __inline __m256 __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001106_mm256_castsi256_ps(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001107{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001108 return (__m256)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001109}
1110
1111static __inline __m256d __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001112_mm256_castsi256_pd(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001113{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001114 return (__m256d)__a;
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001115}
1116
1117static __inline __m128d __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001118_mm256_castpd256_pd128(__m256d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001119{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001120 return __builtin_shufflevector(__a, __a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001121}
1122
1123static __inline __m128 __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001124_mm256_castps256_ps128(__m256 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001125{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001126 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001127}
1128
1129static __inline __m128i __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001130_mm256_castsi256_si128(__m256i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001131{
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001132 return __builtin_shufflevector(__a, __a, 0, 1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001133}
1134
1135static __inline __m256d __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001136_mm256_castpd128_pd256(__m128d __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001137{
Craig Topperc5244512013-08-05 06:17:21 +00001138 return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001139}
1140
1141static __inline __m256 __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001142_mm256_castps128_ps256(__m128 __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001143{
Craig Topperc5244512013-08-05 06:17:21 +00001144 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001145}
1146
1147static __inline __m256i __attribute__((__always_inline__, __nodebug__))
Reid Kleckner7ab75b32013-04-19 17:00:14 +00001148_mm256_castsi128_si256(__m128i __a)
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001149{
Craig Topperc5244512013-08-05 06:17:21 +00001150 return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
Bruno Cardoso Lopes7c4b5132010-08-04 22:03:36 +00001151}
Chad Rosierf8df4f42012-03-20 16:40:00 +00001152
Sanjay Patel7f6aa522015-03-10 15:19:26 +00001153/*
1154 Vector insert.
1155 We use macros rather than inlines because we only want to accept
1156 invocations where the immediate M is a constant expression.
1157*/
/* Replace one 128-bit half of V1 (8 floats) with V2 (4 floats).
   V2 is widened to 256 bits so it can be the second shuffle operand; its four
   elements are then addressed as shuffle indices 8..11.
   Bit 0 of M set:   result = { V1[0..3], V2[0..3] }  (upper half replaced).
   Bit 0 of M clear: result = { V2[0..3], V1[4..7] }  (lower half replaced). */
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_shufflevector( \
    (__v8sf)(V1), \
    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
    (((M) & 1) ?  0 :  8), \
    (((M) & 1) ?  1 :  9), \
    (((M) & 1) ?  2 : 10), \
    (((M) & 1) ?  3 : 11), \
    (((M) & 1) ?  8 :  4), \
    (((M) & 1) ?  9 :  5), \
    (((M) & 1) ? 10 :  6), \
    (((M) & 1) ? 11 :  7) );})
1170
/* Replace one 128-bit half of V1 (4 doubles) with V2 (2 doubles).
   V2 is widened to 256 bits so it can be the second shuffle operand; its two
   elements are then addressed as shuffle indices 4 and 5.
   Bit 0 of M set:   result = { V1[0], V1[1], V2[0], V2[1] }.
   Bit 0 of M clear: result = { V2[0], V2[1], V1[2], V1[3] }. */
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
  (__m256d)__builtin_shufflevector( \
    (__v4df)(V1), \
    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})
1179
/* Replace one 128-bit half of the integer vector V1 with V2, operating on
   64-bit elements. V2 is widened to 256 bits so it can be the second shuffle
   operand; its two elements are then addressed as shuffle indices 4 and 5.
   Bit 0 of M set:   result = { V1[0], V1[1], V2[0], V2[1] }.
   Bit 0 of M clear: result = { V2[0], V2[1], V1[2], V1[3] }. */
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})
1188
Chad Rosierf8df4f42012-03-20 16:40:00 +00001189/* SIMD load ops (unaligned) */
1190static __inline __m256 __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001191_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00001192{
1193 struct __loadu_ps {
David Blaikie3302f2b2013-01-16 23:08:36 +00001194 __m128 __v;
Chad Rosierf8df4f42012-03-20 16:40:00 +00001195 } __attribute__((__packed__, __may_alias__));
1196
David Blaikie3302f2b2013-01-16 23:08:36 +00001197 __m256 __v256 = _mm256_castps128_ps256(((struct __loadu_ps*)__addr_lo)->__v);
1198 return _mm256_insertf128_ps(__v256, ((struct __loadu_ps*)__addr_hi)->__v, 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00001199}
1200
1201static __inline __m256d __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001202_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00001203{
1204 struct __loadu_pd {
David Blaikie3302f2b2013-01-16 23:08:36 +00001205 __m128d __v;
Chad Rosierf8df4f42012-03-20 16:40:00 +00001206 } __attribute__((__packed__, __may_alias__));
1207
David Blaikie3302f2b2013-01-16 23:08:36 +00001208 __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
1209 return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00001210}
1211
1212static __inline __m256i __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001213_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
Chad Rosierf8df4f42012-03-20 16:40:00 +00001214{
1215 struct __loadu_si128 {
David Blaikie3302f2b2013-01-16 23:08:36 +00001216 __m128i __v;
David Majnemer1cf22e62015-02-04 00:26:10 +00001217 } __attribute__((__packed__, __may_alias__));
David Blaikie3302f2b2013-01-16 23:08:36 +00001218 __m256i __v256 = _mm256_castsi128_si256(
1219 ((struct __loadu_si128*)__addr_lo)->__v);
1220 return _mm256_insertf128_si256(__v256,
1221 ((struct __loadu_si128*)__addr_hi)->__v, 1);
Chad Rosierf8df4f42012-03-20 16:40:00 +00001222}
1223
1224/* SIMD store ops (unaligned) */
1225static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001226_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00001227{
David Blaikie3302f2b2013-01-16 23:08:36 +00001228 __m128 __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00001229
David Blaikie3302f2b2013-01-16 23:08:36 +00001230 __v128 = _mm256_castps256_ps128(__a);
1231 __builtin_ia32_storeups(__addr_lo, __v128);
1232 __v128 = _mm256_extractf128_ps(__a, 1);
1233 __builtin_ia32_storeups(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00001234}
1235
1236static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001237_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00001238{
David Blaikie3302f2b2013-01-16 23:08:36 +00001239 __m128d __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00001240
David Blaikie3302f2b2013-01-16 23:08:36 +00001241 __v128 = _mm256_castpd256_pd128(__a);
1242 __builtin_ia32_storeupd(__addr_lo, __v128);
1243 __v128 = _mm256_extractf128_pd(__a, 1);
1244 __builtin_ia32_storeupd(__addr_hi, __v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00001245}
1246
1247static __inline void __attribute__((__always_inline__, __nodebug__))
David Blaikie3302f2b2013-01-16 23:08:36 +00001248_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
Chad Rosierf8df4f42012-03-20 16:40:00 +00001249{
David Blaikie3302f2b2013-01-16 23:08:36 +00001250 __m128i __v128;
Chad Rosierf8df4f42012-03-20 16:40:00 +00001251
David Blaikie3302f2b2013-01-16 23:08:36 +00001252 __v128 = _mm256_castsi256_si128(__a);
1253 __builtin_ia32_storedqu((char *)__addr_lo, (__v16qi)__v128);
1254 __v128 = _mm256_extractf128_si256(__a, 1);
1255 __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128);
Chad Rosierf8df4f42012-03-20 16:40:00 +00001256}
Richard Smith49e56442013-07-14 05:41:45 +00001257
1258#endif /* __AVXINTRIN_H */