blob: 7a0ec3fbd63613520577297e4fe886fe05d5506c [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal 256-bit vector element views (used for casts into builtins). */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public 256-bit vector types exposed by the AVX API. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
/* Round each lane of V according to rounding-control immediate M
   (one of the _MM_FROUND_* constants).  Implemented as macros because
   the builtin requires an immediate operand. */
#define _mm256_round_pd(V, M) __extension__ ({ \
    __m256d __V = (V); \
    (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

#define _mm256_round_ps(V, M) __extension__ ({ \
    __m256 __V = (V); \
    (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

/* Convenience wrappers for the common rounding modes. */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
160
161/* Logical */
162static __inline __m256d __attribute__((__always_inline__, __nodebug__))
163_mm256_and_pd(__m256d a, __m256d b)
164{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000165 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000166}
167
168static __inline __m256 __attribute__((__always_inline__, __nodebug__))
169_mm256_and_ps(__m256 a, __m256 b)
170{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000171 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000172}
173
174static __inline __m256d __attribute__((__always_inline__, __nodebug__))
175_mm256_andnot_pd(__m256d a, __m256d b)
176{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000177 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000178}
179
180static __inline __m256 __attribute__((__always_inline__, __nodebug__))
181_mm256_andnot_ps(__m256 a, __m256 b)
182{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000183 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000184}
185
186static __inline __m256d __attribute__((__always_inline__, __nodebug__))
187_mm256_or_pd(__m256d a, __m256d b)
188{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000189 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000190}
191
192static __inline __m256 __attribute__((__always_inline__, __nodebug__))
193_mm256_or_ps(__m256 a, __m256 b)
194{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000195 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000196}
197
198static __inline __m256d __attribute__((__always_inline__, __nodebug__))
199_mm256_xor_pd(__m256d a, __m256d b)
200{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000201 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000202}
203
204static __inline __m256 __attribute__((__always_inline__, __nodebug__))
205_mm256_xor_ps(__m256 a, __m256 b)
206{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000207 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000208}
209
210/* Horizontal arithmetic */
211static __inline __m256d __attribute__((__always_inline__, __nodebug__))
212_mm256_hadd_pd(__m256d a, __m256d b)
213{
214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
215}
216
217static __inline __m256 __attribute__((__always_inline__, __nodebug__))
218_mm256_hadd_ps(__m256 a, __m256 b)
219{
220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
221}
222
223static __inline __m256d __attribute__((__always_inline__, __nodebug__))
224_mm256_hsub_pd(__m256d a, __m256d b)
225{
226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
227}
228
229static __inline __m256 __attribute__((__always_inline__, __nodebug__))
230_mm256_hsub_ps(__m256 a, __m256 b)
231{
232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
233}
234
235/* Vector permutations */
236static __inline __m128d __attribute__((__always_inline__, __nodebug__))
237_mm_permutevar_pd(__m128d a, __m128i c)
238{
239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
240}
241
242static __inline __m256d __attribute__((__always_inline__, __nodebug__))
243_mm256_permutevar_pd(__m256d a, __m256i c)
244{
245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
246}
247
248static __inline __m128 __attribute__((__always_inline__, __nodebug__))
249_mm_permutevar_ps(__m128 a, __m128i c)
250{
251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
252}
253
254static __inline __m256 __attribute__((__always_inline__, __nodebug__))
255_mm256_permutevar_ps(__m256 a, __m256i c)
256{
257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
258 (__v8si)c);
259}
260
Chad Rosierc17f88e2011-12-17 01:39:56 +0000261#define _mm_permute_pd(A, C) __extension__ ({ \
262 __m128d __A = (A); \
Craig Topper10c57a82012-02-08 05:16:54 +0000263 (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
264 (C) & 0x1, ((C) & 0x2) >> 1); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000265
Chad Rosierc17f88e2011-12-17 01:39:56 +0000266#define _mm256_permute_pd(A, C) __extension__ ({ \
267 __m256d __A = (A); \
Craig Topper10c57a82012-02-08 05:16:54 +0000268 (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
269 (C) & 0x1, ((C) & 0x2) >> 1, \
270 2 + (((C) & 0x4) >> 2), \
271 2 + (((C) & 0x8) >> 3)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000272
Chad Rosierd7dd7752011-12-17 01:51:05 +0000273#define _mm_permute_ps(A, C) __extension__ ({ \
274 __m128 __A = (A); \
Craig Topper10c57a82012-02-08 05:16:54 +0000275 (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
276 (C) & 0x3, ((C) & 0xc) >> 2, \
Craig Topper56296462012-03-30 05:09:18 +0000277 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000278
Chad Rosierd7dd7752011-12-17 01:51:05 +0000279#define _mm256_permute_ps(A, C) __extension__ ({ \
280 __m256 __A = (A); \
Craig Topper10c57a82012-02-08 05:16:54 +0000281 (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
282 (C) & 0x3, ((C) & 0xc) >> 2, \
283 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
284 4 + (((C) & 0x03) >> 0), \
285 4 + (((C) & 0x0c) >> 2), \
286 4 + (((C) & 0x30) >> 4), \
287 4 + (((C) & 0xc0) >> 6)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000288
Chad Rosierc5cda112011-12-16 21:07:34 +0000289#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
290 __m256d __V1 = (V1); \
291 __m256d __V2 = (V2); \
Craig Toppercfa8e652012-02-08 07:33:36 +0000292 (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
293 ((M) & 0x3) * 2, \
294 ((M) & 0x3) * 2 + 1, \
295 (((M) & 0x30) >> 4) * 2, \
296 (((M) & 0x30) >> 4) * 2 + 1); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000297
Chad Rosierc5cda112011-12-16 21:07:34 +0000298#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
299 __m256 __V1 = (V1); \
300 __m256 __V2 = (V2); \
Craig Toppercfa8e652012-02-08 07:33:36 +0000301 (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
302 ((M) & 0x3) * 4, \
303 ((M) & 0x3) * 4 + 1, \
304 ((M) & 0x3) * 4 + 2, \
305 ((M) & 0x3) * 4 + 3, \
306 (((M) & 0x30) >> 4) * 4, \
307 (((M) & 0x30) >> 4) * 4 + 1, \
308 (((M) & 0x30) >> 4) * 4 + 2, \
309 (((M) & 0x30) >> 4) * 4 + 3); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000310
Chad Rosierc5cda112011-12-16 21:07:34 +0000311#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
312 __m256i __V1 = (V1); \
313 __m256i __V2 = (V2); \
Craig Toppercfa8e652012-02-08 07:33:36 +0000314 (__m256i)__builtin_shufflevector((__v8si)__V1, (__v8si)__V2, \
315 ((M) & 0x3) * 4, \
316 ((M) & 0x3) * 4 + 1, \
317 ((M) & 0x3) * 4 + 2, \
318 ((M) & 0x3) * 4 + 3, \
319 (((M) & 0x30) >> 4) * 4, \
320 (((M) & 0x30) >> 4) * 4 + 1, \
321 (((M) & 0x30) >> 4) * 4 + 2, \
322 (((M) & 0x30) >> 4) * 4 + 3); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000323
324/* Vector Blend */
Eli Friedman34720892011-11-10 00:11:13 +0000325#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
326 __m256d __V1 = (V1); \
327 __m256d __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000328 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000329
Eli Friedman34720892011-11-10 00:11:13 +0000330#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
331 __m256 __V1 = (V1); \
332 __m256 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000333 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000334
335static __inline __m256d __attribute__((__always_inline__, __nodebug__))
336_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
337{
338 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
339}
340
341static __inline __m256 __attribute__((__always_inline__, __nodebug__))
342_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
343{
344 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
345}
346
347/* Vector Dot Product */
Eli Friedman34720892011-11-10 00:11:13 +0000348#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
349 __m256 __V1 = (V1); \
350 __m256 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000351 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000352
/* Vector shuffle.
   Same selector layout as the 128-bit SHUFPS/SHUFPD immediates, applied
   independently to each 128-bit half of the operands. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000371
/* Compare.
   Predicate immediates for the 32-predicate VCMP instructions.  The low
   five bits encode the relation; OQ/OS/UQ/US distinguish ordered vs.
   unordered and quiet (non-signaling) vs. signaling NaN behaviour. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* Packed/scalar compares; c is one of the _CMP_* predicates above.
   Result lanes are all-ones on true, all-zeros on false. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000435
436/* Vector extract */
Chad Rosier1e4faf52011-12-17 01:22:27 +0000437#define _mm256_extractf128_pd(A, O) __extension__ ({ \
438 __m256d __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000439 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000440
Chad Rosier1e4faf52011-12-17 01:22:27 +0000441#define _mm256_extractf128_ps(A, O) __extension__ ({ \
442 __m256 __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000443 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000444
Chad Rosier1e4faf52011-12-17 01:22:27 +0000445#define _mm256_extractf128_si256(A, O) __extension__ ({ \
446 __m256i __A = (A); \
Craig Topper34a1da42011-12-24 07:55:14 +0000447 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000448
449static __inline int __attribute__((__always_inline__, __nodebug__))
450_mm256_extract_epi32(__m256i a, int const imm)
451{
452 __v8si b = (__v8si)a;
453 return b[imm];
454}
455
456static __inline int __attribute__((__always_inline__, __nodebug__))
457_mm256_extract_epi16(__m256i a, int const imm)
458{
459 __v16hi b = (__v16hi)a;
460 return b[imm];
461}
462
463static __inline int __attribute__((__always_inline__, __nodebug__))
464_mm256_extract_epi8(__m256i a, int const imm)
465{
466 __v32qi b = (__v32qi)a;
467 return b[imm];
468}
469
470#ifdef __x86_64__
471static __inline long long __attribute__((__always_inline__, __nodebug__))
472_mm256_extract_epi64(__m256i a, const int imm)
473{
474 __v4di b = (__v4di)a;
475 return b[imm];
476}
477#endif
478
479/* Vector insert */
Chad Rosierb95ddf12011-12-16 21:40:31 +0000480#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
481 __m256d __V1 = (V1); \
482 __m128d __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000483 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000484
Chad Rosierb95ddf12011-12-16 21:40:31 +0000485#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
486 __m256 __V1 = (V1); \
487 __m128 __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000488 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000489
Chad Rosierb95ddf12011-12-16 21:40:31 +0000490#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
491 __m256i __V1 = (V1); \
492 __m128i __V2 = (V2); \
Craig Topper34a1da42011-12-24 07:55:14 +0000493 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000494
495static __inline __m256i __attribute__((__always_inline__, __nodebug__))
496_mm256_insert_epi32(__m256i a, int b, int const imm)
497{
498 __v8si c = (__v8si)a;
499 c[imm & 7] = b;
500 return (__m256i)c;
501}
502
503static __inline __m256i __attribute__((__always_inline__, __nodebug__))
504_mm256_insert_epi16(__m256i a, int b, int const imm)
505{
506 __v16hi c = (__v16hi)a;
507 c[imm & 15] = b;
508 return (__m256i)c;
509}
510
511static __inline __m256i __attribute__((__always_inline__, __nodebug__))
512_mm256_insert_epi8(__m256i a, int b, int const imm)
513{
514 __v32qi c = (__v32qi)a;
515 c[imm & 31] = b;
516 return (__m256i)c;
517}
518
519#ifdef __x86_64__
520static __inline __m256i __attribute__((__always_inline__, __nodebug__))
521_mm256_insert_epi64(__m256i a, int b, int const imm)
522{
523 __v4di c = (__v4di)a;
524 c[imm & 3] = b;
525 return (__m256i)c;
526}
527#endif
528
529/* Conversion */
530static __inline __m256d __attribute__((__always_inline__, __nodebug__))
531_mm256_cvtepi32_pd(__m128i a)
532{
533 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
534}
535
536static __inline __m256 __attribute__((__always_inline__, __nodebug__))
537_mm256_cvtepi32_ps(__m256i a)
538{
539 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
540}
541
542static __inline __m128 __attribute__((__always_inline__, __nodebug__))
543_mm256_cvtpd_ps(__m256d a)
544{
545 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
546}
547
548static __inline __m256i __attribute__((__always_inline__, __nodebug__))
549_mm256_cvtps_epi32(__m256 a)
550{
551 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
552}
553
554static __inline __m256d __attribute__((__always_inline__, __nodebug__))
555_mm256_cvtps_pd(__m128 a)
556{
557 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
558}
559
560static __inline __m128i __attribute__((__always_inline__, __nodebug__))
561_mm256_cvttpd_epi32(__m256d a)
562{
563 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
564}
565
566static __inline __m128i __attribute__((__always_inline__, __nodebug__))
567_mm256_cvtpd_epi32(__m256d a)
568{
569 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
570}
571
572static __inline __m256i __attribute__((__always_inline__, __nodebug__))
573_mm256_cvttps_epi32(__m256 a)
574{
575 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
576}
577
578/* Vector replicate */
579static __inline __m256 __attribute__((__always_inline__, __nodebug__))
580_mm256_movehdup_ps(__m256 a)
581{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000582 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000583}
584
585static __inline __m256 __attribute__((__always_inline__, __nodebug__))
586_mm256_moveldup_ps(__m256 a)
587{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000588 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000589}
590
591static __inline __m256d __attribute__((__always_inline__, __nodebug__))
592_mm256_movedup_pd(__m256d a)
593{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000594 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000595}
596
597/* Unpack and Interleave */
598static __inline __m256d __attribute__((__always_inline__, __nodebug__))
599_mm256_unpackhi_pd(__m256d a, __m256d b)
600{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000601 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000602}
603
604static __inline __m256d __attribute__((__always_inline__, __nodebug__))
605_mm256_unpacklo_pd(__m256d a, __m256d b)
606{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000607 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000608}
609
610static __inline __m256 __attribute__((__always_inline__, __nodebug__))
611_mm256_unpackhi_ps(__m256 a, __m256 b)
612{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000613 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000614}
615
616static __inline __m256 __attribute__((__always_inline__, __nodebug__))
617_mm256_unpacklo_ps(__m256 a, __m256 b)
618{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000619 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000620}
621
622/* Bit Test */
623static __inline int __attribute__((__always_inline__, __nodebug__))
624_mm_testz_pd(__m128d a, __m128d b)
625{
626 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
627}
628
629static __inline int __attribute__((__always_inline__, __nodebug__))
630_mm_testc_pd(__m128d a, __m128d b)
631{
632 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
633}
634
635static __inline int __attribute__((__always_inline__, __nodebug__))
636_mm_testnzc_pd(__m128d a, __m128d b)
637{
638 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
639}
640
641static __inline int __attribute__((__always_inline__, __nodebug__))
642_mm_testz_ps(__m128 a, __m128 b)
643{
644 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
645}
646
647static __inline int __attribute__((__always_inline__, __nodebug__))
648_mm_testc_ps(__m128 a, __m128 b)
649{
650 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
651}
652
653static __inline int __attribute__((__always_inline__, __nodebug__))
654_mm_testnzc_ps(__m128 a, __m128 b)
655{
656 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
657}
658
659static __inline int __attribute__((__always_inline__, __nodebug__))
660_mm256_testz_pd(__m256d a, __m256d b)
661{
662 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
663}
664
665static __inline int __attribute__((__always_inline__, __nodebug__))
666_mm256_testc_pd(__m256d a, __m256d b)
667{
668 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
669}
670
671static __inline int __attribute__((__always_inline__, __nodebug__))
672_mm256_testnzc_pd(__m256d a, __m256d b)
673{
674 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
675}
676
677static __inline int __attribute__((__always_inline__, __nodebug__))
678_mm256_testz_ps(__m256 a, __m256 b)
679{
680 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
681}
682
683static __inline int __attribute__((__always_inline__, __nodebug__))
684_mm256_testc_ps(__m256 a, __m256 b)
685{
686 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
687}
688
689static __inline int __attribute__((__always_inline__, __nodebug__))
690_mm256_testnzc_ps(__m256 a, __m256 b)
691{
692 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
693}
694
695static __inline int __attribute__((__always_inline__, __nodebug__))
696_mm256_testz_si256(__m256i a, __m256i b)
697{
698 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
699}
700
701static __inline int __attribute__((__always_inline__, __nodebug__))
702_mm256_testc_si256(__m256i a, __m256i b)
703{
704 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
705}
706
707static __inline int __attribute__((__always_inline__, __nodebug__))
708_mm256_testnzc_si256(__m256i a, __m256i b)
709{
710 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
711}
712
713/* Vector extract sign mask */
714static __inline int __attribute__((__always_inline__, __nodebug__))
715_mm256_movemask_pd(__m256d a)
716{
717 return __builtin_ia32_movmskpd256((__v4df)a);
718}
719
720static __inline int __attribute__((__always_inline__, __nodebug__))
721_mm256_movemask_ps(__m256 a)
722{
723 return __builtin_ia32_movmskps256((__v8sf)a);
724}
725
/* Vector zero. */

/* Zero all YMM registers (VZEROALL). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zero the upper 128 bits of all YMM registers (VZEROUPPER). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
738
739/* Vector load with broadcast */
740static __inline __m128 __attribute__((__always_inline__, __nodebug__))
741_mm_broadcast_ss(float const *a)
742{
743 return (__m128)__builtin_ia32_vbroadcastss(a);
744}
745
746static __inline __m256d __attribute__((__always_inline__, __nodebug__))
747_mm256_broadcast_sd(double const *a)
748{
749 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
750}
751
752static __inline __m256 __attribute__((__always_inline__, __nodebug__))
753_mm256_broadcast_ss(float const *a)
754{
755 return (__m256)__builtin_ia32_vbroadcastss256(a);
756}
757
758static __inline __m256d __attribute__((__always_inline__, __nodebug__))
759_mm256_broadcast_pd(__m128d const *a)
760{
761 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
762}
763
764static __inline __m256 __attribute__((__always_inline__, __nodebug__))
765_mm256_broadcast_ps(__m128 const *a)
766{
767 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
768}
769
770/* SIMD load ops */
771static __inline __m256d __attribute__((__always_inline__, __nodebug__))
772_mm256_load_pd(double const *p)
773{
774 return *(__m256d *)p;
775}
776
777static __inline __m256 __attribute__((__always_inline__, __nodebug__))
778_mm256_load_ps(float const *p)
779{
780 return *(__m256 *)p;
781}
782
783static __inline __m256d __attribute__((__always_inline__, __nodebug__))
784_mm256_loadu_pd(double const *p)
785{
Craig Topper2ee2ac22012-01-25 04:26:17 +0000786 struct __loadu_pd {
787 __m256d v;
788 } __attribute__((packed, may_alias));
789 return ((struct __loadu_pd*)p)->v;
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000790}
791
792static __inline __m256 __attribute__((__always_inline__, __nodebug__))
793_mm256_loadu_ps(float const *p)
794{
Craig Topper2ee2ac22012-01-25 04:26:17 +0000795 struct __loadu_ps {
796 __m256 v;
797 } __attribute__((packed, may_alias));
798 return ((struct __loadu_ps*)p)->v;
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000799}
800
/* Loads 256 bits of integer data from 'p', which must be 32-byte
   aligned. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}
806
807static __inline __m256i __attribute__((__always_inline__, __nodebug__))
808_mm256_loadu_si256(__m256i const *p)
809{
Craig Topper2ee2ac22012-01-25 04:26:17 +0000810 struct __loadu_si256 {
811 __m256i v;
812 } __attribute__((packed, may_alias));
813 return ((struct __loadu_si256*)p)->v;
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000814}
815
/* Loads 256 bits of integer data from an unaligned address (VLDDQU);
   may perform better than _mm256_loadu_si256 when the data crosses a
   cache-line boundary. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}
821
/* SIMD store ops */

/* Stores 4 doubles to 'p', which must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

/* Stores 8 floats to 'p', which must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

/* Stores 4 doubles to an arbitrarily aligned address (VMOVUPD). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

/* Stores 8 floats to an arbitrarily aligned address (VMOVUPS). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

/* Stores 256 bits of integer data to 'p', which must be 32-byte
   aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

/* Stores 256 bits of integer data to an arbitrarily aligned address
   (VMOVDQU). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
858
/* Conditional load ops */

/* Per-element masked load (VMASKMOVPD): an element is loaded from 'p'
   only where the sign bit of the corresponding mask element is set;
   other result elements are zeroed.  The mask is carried in an FP vector
   but only its sign bits matter. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

/* 256-bit variant of the masked double load (VMASKMOVPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

/* Per-element masked float load (VMASKMOVPS); see _mm_maskload_pd for
   the mask convention. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

/* 256-bit variant of the masked float load (VMASKMOVPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}
883
/* Conditional store ops */

/* Per-element masked store (VMASKMOVPS): an element of 'a' is written to
   'p' only where the sign bit of the corresponding mask element is set;
   other memory locations are left untouched. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

/* 128-bit masked double store (VMASKMOVPD); same mask convention. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

/* 256-bit masked double store (VMASKMOVPD). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

/* 128-bit masked float store (VMASKMOVPS). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
908
/* Cacheability support ops */

/* Non-temporal (cache-bypassing) store of 256 bits of integer data
   (VMOVNTDQ).  NOTE(review): per the ISA the destination must be 32-byte
   aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

/* Non-temporal store of 4 doubles (VMOVNTPD); destination must be
   32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

/* Non-temporal store of 8 floats (VMOVNTPS); destination must be
   32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
927
/* Create vectors */

/* The _mm256_set_* family takes its arguments from the HIGHEST element
   down to the lowest, so each initializer list below stores them in
   reverse: the last argument becomes element 0. */

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };  /* d is element 0, a is element 3 */
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  /* Parameter wNN lands in element NN. */
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  /* Parameter bNN lands in byte NN. */
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}
982
/* Create vectors with elements in reverse order */

/* The _mm256_setr_* family takes its arguments from element 0 upward, so
   the initializer lists store them in argument order.  NOTE(review): in
   the epi16/epi8 variants the parameter NAMES are numbered high-to-low
   for historical reasons, but the first argument still becomes
   element 0. */

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };  /* a is element 0 */
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}
1036
/* Create vectors with repeated elements */

/* Each _mm256_set1_* replicates one scalar into every element of the
   256-bit result. */

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}
1074
/* Create zeroed vectors */

/* Returns a 256-bit vector with all 4 doubles set to 0.0. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

/* Returns a 256-bit vector with all 8 floats set to 0.0f. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

/* Returns a 256-bit integer vector of all zero bits. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}
1093
/* Cast between vector types */

/* These casts reinterpret the 256-bit pattern as another vector type;
   they change only the static type, not the bits, and generate no
   instructions. */

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}
1130
/* Returns the low 2 doubles of 'in' as a 128-bit vector (the shuffle
   selects elements 0 and 1). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

/* Returns the low 4 floats of 'in' as a 128-bit vector. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

/* Returns the low 128 bits of 'in' (two 64-bit lanes) as a 128-bit
   integer vector. */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1148
/* Widens a 128-bit vector to 256 bits.  The Intel spec leaves the upper
   128 bits of the result undefined; this implementation zeroes them by
   shuffling in a zero vector (indices >= the input's length select from
   'zero'). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

/* Float variant; upper 4 elements are zeroed. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

/* Integer variant; upper 128 bits are zeroed. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
Chad Rosierdb163c82012-03-20 16:40:00 +00001169
/* SIMD load ops (unaligned) */

/* Loads two unaligned 128-bit float chunks and concatenates them:
   *addr_lo becomes the low half of the result and *addr_hi the high
   half.  The packed, may_alias struct forces an unaligned,
   aliasing-safe load. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
{
  struct __loadu_ps {
    __m128 v;
  } __attribute__((__packed__, __may_alias__));

  __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
  return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
}

/* Double-precision variant: low half from addr_lo, high half from
   addr_hi, both unaligned. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
{
  struct __loadu_pd {
    __m128d v;
  } __attribute__((__packed__, __may_alias__));

  __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
  return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
}
1192
1193static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1194_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
1195{
1196 struct __loadu_si128 {
1197 __m128i v;
1198 } __attribute__((packed, may_alias));
1199 __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
1200 return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
1201}
1202
/* SIMD store ops (unaligned) */

/* Stores the low 128 bits of 'a' to addr_lo and the high 128 bits to
   addr_hi; both addresses may be unaligned (unaligned store builtins). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
{
  __m128 v128;

  v128 = _mm256_castps256_ps128(a);       /* low half */
  __builtin_ia32_storeups(addr_lo, v128);
  v128 = _mm256_extractf128_ps(a, 1);     /* high half */
  __builtin_ia32_storeups(addr_hi, v128);
}

/* Double-precision variant of the split unaligned store. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
{
  __m128d v128;

  v128 = _mm256_castpd256_pd128(a);
  __builtin_ia32_storeupd(addr_lo, v128);
  v128 = _mm256_extractf128_pd(a, 1);
  __builtin_ia32_storeupd(addr_hi, v128);
}

/* Integer variant of the split unaligned store. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
{
  __m128i v128;

  v128 = _mm256_castsi256_si128(a);
  __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
  v128 = _mm256_extractf128_si256(a, 1);
  __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
}