blob: 3ae5dc76fcd26e3013d94526659a0435548742a0 [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal element-view vector types (32 bytes each).  These exist so the
   intrinsics below can reinterpret the public types lane-wise:
   __v4df = 4 x double, __v8sf = 8 x float, __v4di = 4 x i64,
   __v8si = 8 x i32, __v16hi = 16 x i16, __v32qi = 32 x i8. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public AVX vector types exposed to users of <immintrin.h>. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
148static __inline __m256d __attribute__((__always_inline__, __nodebug__))
149_mm256_round_pd(__m256d v, const int m)
150{
151 return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
152}
153
154static __inline __m256 __attribute__((__always_inline__, __nodebug__))
155_mm256_round_ps(__m256 v, const int m)
156{
157 return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
158}
159
160#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
161#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
162#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
163#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
164
165/* Logical */
166static __inline __m256d __attribute__((__always_inline__, __nodebug__))
167_mm256_and_pd(__m256d a, __m256d b)
168{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000169 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000170}
171
172static __inline __m256 __attribute__((__always_inline__, __nodebug__))
173_mm256_and_ps(__m256 a, __m256 b)
174{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000175 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000176}
177
178static __inline __m256d __attribute__((__always_inline__, __nodebug__))
179_mm256_andnot_pd(__m256d a, __m256d b)
180{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000181 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000182}
183
184static __inline __m256 __attribute__((__always_inline__, __nodebug__))
185_mm256_andnot_ps(__m256 a, __m256 b)
186{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000187 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000188}
189
190static __inline __m256d __attribute__((__always_inline__, __nodebug__))
191_mm256_or_pd(__m256d a, __m256d b)
192{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000193 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000194}
195
196static __inline __m256 __attribute__((__always_inline__, __nodebug__))
197_mm256_or_ps(__m256 a, __m256 b)
198{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000199 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000200}
201
202static __inline __m256d __attribute__((__always_inline__, __nodebug__))
203_mm256_xor_pd(__m256d a, __m256d b)
204{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000205 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000206}
207
208static __inline __m256 __attribute__((__always_inline__, __nodebug__))
209_mm256_xor_ps(__m256 a, __m256 b)
210{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000211 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000212}
213
214/* Horizontal arithmetic */
215static __inline __m256d __attribute__((__always_inline__, __nodebug__))
216_mm256_hadd_pd(__m256d a, __m256d b)
217{
218 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
219}
220
221static __inline __m256 __attribute__((__always_inline__, __nodebug__))
222_mm256_hadd_ps(__m256 a, __m256 b)
223{
224 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
225}
226
227static __inline __m256d __attribute__((__always_inline__, __nodebug__))
228_mm256_hsub_pd(__m256d a, __m256d b)
229{
230 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
231}
232
233static __inline __m256 __attribute__((__always_inline__, __nodebug__))
234_mm256_hsub_ps(__m256 a, __m256 b)
235{
236 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
237}
238
239/* Vector permutations */
240static __inline __m128d __attribute__((__always_inline__, __nodebug__))
241_mm_permutevar_pd(__m128d a, __m128i c)
242{
243 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
244}
245
246static __inline __m256d __attribute__((__always_inline__, __nodebug__))
247_mm256_permutevar_pd(__m256d a, __m256i c)
248{
249 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
250}
251
252static __inline __m128 __attribute__((__always_inline__, __nodebug__))
253_mm_permutevar_ps(__m128 a, __m128i c)
254{
255 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
256}
257
258static __inline __m256 __attribute__((__always_inline__, __nodebug__))
259_mm256_permutevar_ps(__m256 a, __m256i c)
260{
261 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
262 (__v8si)c);
263}
264
265static __inline __m128d __attribute__((__always_inline__, __nodebug__))
266_mm_permute_pd(__m128d a, const int c)
267{
268 return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
269}
270
271static __inline __m256d __attribute__((__always_inline__, __nodebug__))
272_mm256_permute_pd(__m256d a, const int c)
273{
274 return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
275}
276
277static __inline __m128 __attribute__((__always_inline__, __nodebug__))
278_mm_permute_ps(__m128 a, const int c)
279{
280 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
281}
282
283static __inline __m256 __attribute__((__always_inline__, __nodebug__))
284_mm256_permute_ps(__m256 a, const int c)
285{
286 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
287}
288
Chad Rosierc5cda112011-12-16 21:07:34 +0000289#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
290 __m256d __V1 = (V1); \
291 __m256d __V2 = (V2); \
292 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000293
Chad Rosierc5cda112011-12-16 21:07:34 +0000294#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
295 __m256 __V1 = (V1); \
296 __m256 __V2 = (V2); \
297 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000298
Chad Rosierc5cda112011-12-16 21:07:34 +0000299#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
300 __m256i __V1 = (V1); \
301 __m256i __V2 = (V2); \
302 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000303
304/* Vector Blend */
Eli Friedman34720892011-11-10 00:11:13 +0000305#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
306 __m256d __V1 = (V1); \
307 __m256d __V2 = (V2); \
308 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000309
Eli Friedman34720892011-11-10 00:11:13 +0000310#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
311 __m256 __V1 = (V1); \
312 __m256 __V2 = (V2); \
313 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000314
315static __inline __m256d __attribute__((__always_inline__, __nodebug__))
316_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
317{
318 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
319}
320
321static __inline __m256 __attribute__((__always_inline__, __nodebug__))
322_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
323{
324 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
325}
326
327/* Vector Dot Product */
Eli Friedman34720892011-11-10 00:11:13 +0000328#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
329 __m256 __V1 = (V1); \
330 __m256 __V2 = (V2); \
331 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, M); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000332
333/* Vector shuffle */
Bob Wilson32bae372011-11-05 06:08:06 +0000334#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
335 __m256 __a = (a); \
336 __m256 __b = (b); \
337 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
Bruno Cardoso Lopesb33aa0f2010-08-11 01:17:34 +0000338 (mask) & 0x3, ((mask) & 0xc) >> 2, \
Bruno Cardoso Lopes70141c22010-08-11 18:45:43 +0000339 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
Bruno Cardoso Lopes426344d2011-08-23 23:29:45 +0000340 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
Bob Wilson32bae372011-11-05 06:08:06 +0000341 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000342
Bob Wilson32bae372011-11-05 06:08:06 +0000343#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
344 __m256d __a = (a); \
345 __m256d __b = (b); \
346 (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
Bruno Cardoso Lopesb33aa0f2010-08-11 01:17:34 +0000347 (mask) & 0x1, \
348 (((mask) & 0x2) >> 1) + 4, \
349 (((mask) & 0x4) >> 2) + 2, \
Bob Wilson32bae372011-11-05 06:08:06 +0000350 (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000351
/* Compare */

/* Predicate immediates for the 5-bit AVX compare (vcmppd/vcmpps).
   Naming: O/U = ordered/unordered (NaN handling), S/Q = signaling/quiet
   (whether QNaN operands raise the invalid exception). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* Compare macros: c must be an immediate predicate from the table above;
   the statement-expression locals evaluate each vector argument once. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar forms: compare only the low element, pass the rest of a through. */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000415
416/* Vector extract */
417static __inline __m128d __attribute__((__always_inline__, __nodebug__))
418_mm256_extractf128_pd(__m256d a, const int o)
419{
420 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
421}
422
423static __inline __m128 __attribute__((__always_inline__, __nodebug__))
424_mm256_extractf128_ps(__m256 a, const int o)
425{
426 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
427}
428
429static __inline __m128i __attribute__((__always_inline__, __nodebug__))
430_mm256_extractf128_si256(__m256i a, const int o)
431{
432 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
433}
434
435static __inline int __attribute__((__always_inline__, __nodebug__))
436_mm256_extract_epi32(__m256i a, int const imm)
437{
438 __v8si b = (__v8si)a;
439 return b[imm];
440}
441
442static __inline int __attribute__((__always_inline__, __nodebug__))
443_mm256_extract_epi16(__m256i a, int const imm)
444{
445 __v16hi b = (__v16hi)a;
446 return b[imm];
447}
448
449static __inline int __attribute__((__always_inline__, __nodebug__))
450_mm256_extract_epi8(__m256i a, int const imm)
451{
452 __v32qi b = (__v32qi)a;
453 return b[imm];
454}
455
456#ifdef __x86_64__
457static __inline long long __attribute__((__always_inline__, __nodebug__))
458_mm256_extract_epi64(__m256i a, const int imm)
459{
460 __v4di b = (__v4di)a;
461 return b[imm];
462}
463#endif
464
465/* Vector insert */
466static __inline __m256d __attribute__((__always_inline__, __nodebug__))
467_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
468{
469 return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
470}
471
472static __inline __m256 __attribute__((__always_inline__, __nodebug__))
473_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
474{
475 return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
476}
477
478static __inline __m256i __attribute__((__always_inline__, __nodebug__))
479_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
480{
481 return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
482}
483
484static __inline __m256i __attribute__((__always_inline__, __nodebug__))
485_mm256_insert_epi32(__m256i a, int b, int const imm)
486{
487 __v8si c = (__v8si)a;
488 c[imm & 7] = b;
489 return (__m256i)c;
490}
491
492static __inline __m256i __attribute__((__always_inline__, __nodebug__))
493_mm256_insert_epi16(__m256i a, int b, int const imm)
494{
495 __v16hi c = (__v16hi)a;
496 c[imm & 15] = b;
497 return (__m256i)c;
498}
499
500static __inline __m256i __attribute__((__always_inline__, __nodebug__))
501_mm256_insert_epi8(__m256i a, int b, int const imm)
502{
503 __v32qi c = (__v32qi)a;
504 c[imm & 31] = b;
505 return (__m256i)c;
506}
507
508#ifdef __x86_64__
509static __inline __m256i __attribute__((__always_inline__, __nodebug__))
510_mm256_insert_epi64(__m256i a, int b, int const imm)
511{
512 __v4di c = (__v4di)a;
513 c[imm & 3] = b;
514 return (__m256i)c;
515}
516#endif
517
518/* Conversion */
519static __inline __m256d __attribute__((__always_inline__, __nodebug__))
520_mm256_cvtepi32_pd(__m128i a)
521{
522 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
523}
524
525static __inline __m256 __attribute__((__always_inline__, __nodebug__))
526_mm256_cvtepi32_ps(__m256i a)
527{
528 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
529}
530
531static __inline __m128 __attribute__((__always_inline__, __nodebug__))
532_mm256_cvtpd_ps(__m256d a)
533{
534 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
535}
536
537static __inline __m256i __attribute__((__always_inline__, __nodebug__))
538_mm256_cvtps_epi32(__m256 a)
539{
540 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
541}
542
543static __inline __m256d __attribute__((__always_inline__, __nodebug__))
544_mm256_cvtps_pd(__m128 a)
545{
546 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
547}
548
549static __inline __m128i __attribute__((__always_inline__, __nodebug__))
550_mm256_cvttpd_epi32(__m256d a)
551{
552 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
553}
554
555static __inline __m128i __attribute__((__always_inline__, __nodebug__))
556_mm256_cvtpd_epi32(__m256d a)
557{
558 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
559}
560
561static __inline __m256i __attribute__((__always_inline__, __nodebug__))
562_mm256_cvttps_epi32(__m256 a)
563{
564 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
565}
566
567/* Vector replicate */
568static __inline __m256 __attribute__((__always_inline__, __nodebug__))
569_mm256_movehdup_ps(__m256 a)
570{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000571 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000572}
573
574static __inline __m256 __attribute__((__always_inline__, __nodebug__))
575_mm256_moveldup_ps(__m256 a)
576{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000577 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000578}
579
580static __inline __m256d __attribute__((__always_inline__, __nodebug__))
581_mm256_movedup_pd(__m256d a)
582{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000583 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000584}
585
586/* Unpack and Interleave */
587static __inline __m256d __attribute__((__always_inline__, __nodebug__))
588_mm256_unpackhi_pd(__m256d a, __m256d b)
589{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000590 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000591}
592
593static __inline __m256d __attribute__((__always_inline__, __nodebug__))
594_mm256_unpacklo_pd(__m256d a, __m256d b)
595{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000596 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000597}
598
599static __inline __m256 __attribute__((__always_inline__, __nodebug__))
600_mm256_unpackhi_ps(__m256 a, __m256 b)
601{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000602 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000603}
604
605static __inline __m256 __attribute__((__always_inline__, __nodebug__))
606_mm256_unpacklo_ps(__m256 a, __m256 b)
607{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000608 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000609}
610
611/* Bit Test */
612static __inline int __attribute__((__always_inline__, __nodebug__))
613_mm_testz_pd(__m128d a, __m128d b)
614{
615 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
616}
617
618static __inline int __attribute__((__always_inline__, __nodebug__))
619_mm_testc_pd(__m128d a, __m128d b)
620{
621 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
622}
623
624static __inline int __attribute__((__always_inline__, __nodebug__))
625_mm_testnzc_pd(__m128d a, __m128d b)
626{
627 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
628}
629
630static __inline int __attribute__((__always_inline__, __nodebug__))
631_mm_testz_ps(__m128 a, __m128 b)
632{
633 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
634}
635
636static __inline int __attribute__((__always_inline__, __nodebug__))
637_mm_testc_ps(__m128 a, __m128 b)
638{
639 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
640}
641
642static __inline int __attribute__((__always_inline__, __nodebug__))
643_mm_testnzc_ps(__m128 a, __m128 b)
644{
645 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
646}
647
648static __inline int __attribute__((__always_inline__, __nodebug__))
649_mm256_testz_pd(__m256d a, __m256d b)
650{
651 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
652}
653
654static __inline int __attribute__((__always_inline__, __nodebug__))
655_mm256_testc_pd(__m256d a, __m256d b)
656{
657 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
658}
659
660static __inline int __attribute__((__always_inline__, __nodebug__))
661_mm256_testnzc_pd(__m256d a, __m256d b)
662{
663 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
664}
665
666static __inline int __attribute__((__always_inline__, __nodebug__))
667_mm256_testz_ps(__m256 a, __m256 b)
668{
669 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
670}
671
672static __inline int __attribute__((__always_inline__, __nodebug__))
673_mm256_testc_ps(__m256 a, __m256 b)
674{
675 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
676}
677
678static __inline int __attribute__((__always_inline__, __nodebug__))
679_mm256_testnzc_ps(__m256 a, __m256 b)
680{
681 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
682}
683
684static __inline int __attribute__((__always_inline__, __nodebug__))
685_mm256_testz_si256(__m256i a, __m256i b)
686{
687 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
688}
689
690static __inline int __attribute__((__always_inline__, __nodebug__))
691_mm256_testc_si256(__m256i a, __m256i b)
692{
693 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
694}
695
696static __inline int __attribute__((__always_inline__, __nodebug__))
697_mm256_testnzc_si256(__m256i a, __m256i b)
698{
699 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
700}
701
702/* Vector extract sign mask */
703static __inline int __attribute__((__always_inline__, __nodebug__))
704_mm256_movemask_pd(__m256d a)
705{
706 return __builtin_ia32_movmskpd256((__v4df)a);
707}
708
709static __inline int __attribute__((__always_inline__, __nodebug__))
710_mm256_movemask_ps(__m256 a)
711{
712 return __builtin_ia32_movmskps256((__v8sf)a);
713}
714
715/* Vector zero */
716static __inline void __attribute__((__always_inline__, __nodebug__))
717_mm256_zeroall(void)
718{
719 __builtin_ia32_vzeroall();
720}
721
722static __inline void __attribute__((__always_inline__, __nodebug__))
723_mm256_zeroupper(void)
724{
725 __builtin_ia32_vzeroupper();
726}
727
728/* Vector load with broadcast */
729static __inline __m128 __attribute__((__always_inline__, __nodebug__))
730_mm_broadcast_ss(float const *a)
731{
732 return (__m128)__builtin_ia32_vbroadcastss(a);
733}
734
735static __inline __m256d __attribute__((__always_inline__, __nodebug__))
736_mm256_broadcast_sd(double const *a)
737{
738 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
739}
740
741static __inline __m256 __attribute__((__always_inline__, __nodebug__))
742_mm256_broadcast_ss(float const *a)
743{
744 return (__m256)__builtin_ia32_vbroadcastss256(a);
745}
746
747static __inline __m256d __attribute__((__always_inline__, __nodebug__))
748_mm256_broadcast_pd(__m128d const *a)
749{
750 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
751}
752
753static __inline __m256 __attribute__((__always_inline__, __nodebug__))
754_mm256_broadcast_ps(__m128 const *a)
755{
756 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
757}
758
759/* SIMD load ops */
760static __inline __m256d __attribute__((__always_inline__, __nodebug__))
761_mm256_load_pd(double const *p)
762{
763 return *(__m256d *)p;
764}
765
766static __inline __m256 __attribute__((__always_inline__, __nodebug__))
767_mm256_load_ps(float const *p)
768{
769 return *(__m256 *)p;
770}
771
772static __inline __m256d __attribute__((__always_inline__, __nodebug__))
773_mm256_loadu_pd(double const *p)
774{
775 return (__m256d)__builtin_ia32_loadupd256(p);
776}
777
778static __inline __m256 __attribute__((__always_inline__, __nodebug__))
779_mm256_loadu_ps(float const *p)
780{
781 return (__m256)__builtin_ia32_loadups256(p);
782}
783
784static __inline __m256i __attribute__((__always_inline__, __nodebug__))
785_mm256_load_si256(__m256i const *p)
786{
787 return *p;
788}
789
790static __inline __m256i __attribute__((__always_inline__, __nodebug__))
791_mm256_loadu_si256(__m256i const *p)
792{
793 return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
794}
795
796static __inline __m256i __attribute__((__always_inline__, __nodebug__))
797_mm256_lddqu_si256(__m256i const *p)
798{
799 return (__m256i)__builtin_ia32_lddqu256((char const *)p);
800}
801
802/* SIMD store ops */
803static __inline void __attribute__((__always_inline__, __nodebug__))
804_mm256_store_pd(double *p, __m256d a)
805{
806 *(__m256d *)p = a;
807}
808
809static __inline void __attribute__((__always_inline__, __nodebug__))
810_mm256_store_ps(float *p, __m256 a)
811{
812 *(__m256 *)p = a;
813}
814
815static __inline void __attribute__((__always_inline__, __nodebug__))
816_mm256_storeu_pd(double *p, __m256d a)
817{
818 __builtin_ia32_storeupd256(p, (__v4df)a);
819}
820
821static __inline void __attribute__((__always_inline__, __nodebug__))
822_mm256_storeu_ps(float *p, __m256 a)
823{
824 __builtin_ia32_storeups256(p, (__v8sf)a);
825}
826
827static __inline void __attribute__((__always_inline__, __nodebug__))
828_mm256_store_si256(__m256i *p, __m256i a)
829{
830 *p = a;
831}
832
833static __inline void __attribute__((__always_inline__, __nodebug__))
834_mm256_storeu_si256(__m256i *p, __m256i a)
835{
836 __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
837}
838
839/* Conditional load ops */
840static __inline __m128d __attribute__((__always_inline__, __nodebug__))
841_mm_maskload_pd(double const *p, __m128d m)
842{
843 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
844}
845
846static __inline __m256d __attribute__((__always_inline__, __nodebug__))
847_mm256_maskload_pd(double const *p, __m256d m)
848{
849 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
850}
851
852static __inline __m128 __attribute__((__always_inline__, __nodebug__))
853_mm_maskload_ps(float const *p, __m128 m)
854{
855 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
856}
857
858static __inline __m256 __attribute__((__always_inline__, __nodebug__))
859_mm256_maskload_ps(float const *p, __m256 m)
860{
861 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
862}
863
/* Conditional store ops (VMASKMOV).  Per the Intel documentation, each
   element of a is stored to p only when the sign bit of the corresponding
   mask element is set; memory for unselected elements is left untouched. */

/* Masked store of 8 floats (VMASKMOVPS, 256-bit form). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

/* Masked store of 2 doubles (VMASKMOVPD, 128-bit form). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

/* Masked store of 4 doubles (VMASKMOVPD, 256-bit form). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

/* Masked store of 4 floats (VMASKMOVPS, 128-bit form). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
888
/* Cacheability support ops: non-temporal (streaming) 256-bit stores.  The
   movnt* builtins emit stores with a non-temporal hint; per the Intel
   documentation the destination must be 32-byte aligned. */

/* Non-temporal store of a 256-bit integer vector (VMOVNTDQ). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

/* Non-temporal store of 4 doubles (VMOVNTPD). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

/* Non-temporal store of 8 floats (VMOVNTPS). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
907
908/* Create vectors */
909static __inline __m256d __attribute__((__always_inline__, __nodebug__))
910_mm256_set_pd(double a, double b, double c, double d)
911{
912 return (__m256d){ d, c, b, a };
913}
914
915static __inline __m256 __attribute__((__always_inline__, __nodebug__))
916_mm256_set_ps(float a, float b, float c, float d,
917 float e, float f, float g, float h)
918{
919 return (__m256){ h, g, f, e, d, c, b, a };
920}
921
922static __inline __m256i __attribute__((__always_inline__, __nodebug__))
923_mm256_set_epi32(int i0, int i1, int i2, int i3,
924 int i4, int i5, int i6, int i7)
925{
926 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
927}
928
929static __inline __m256i __attribute__((__always_inline__, __nodebug__))
930_mm256_set_epi16(short w15, short w14, short w13, short w12,
931 short w11, short w10, short w09, short w08,
932 short w07, short w06, short w05, short w04,
933 short w03, short w02, short w01, short w00)
934{
935 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
936 w08, w09, w10, w11, w12, w13, w14, w15 };
937}
938
939static __inline __m256i __attribute__((__always_inline__, __nodebug__))
940_mm256_set_epi8(char b31, char b30, char b29, char b28,
941 char b27, char b26, char b25, char b24,
942 char b23, char b22, char b21, char b20,
943 char b19, char b18, char b17, char b16,
944 char b15, char b14, char b13, char b12,
945 char b11, char b10, char b09, char b08,
946 char b07, char b06, char b05, char b04,
947 char b03, char b02, char b01, char b00)
948{
949 return (__m256i)(__v32qi){
950 b00, b01, b02, b03, b04, b05, b06, b07,
951 b08, b09, b10, b11, b12, b13, b14, b15,
952 b16, b17, b18, b19, b20, b21, b22, b23,
953 b24, b25, b26, b27, b28, b29, b30, b31
954 };
955}
956
957static __inline __m256i __attribute__((__always_inline__, __nodebug__))
958_mm256_set_epi64x(long long a, long long b, long long c, long long d)
959{
960 return (__m256i)(__v4di){ d, c, b, a };
961}
962
963/* Create vectors with elements in reverse order */
964static __inline __m256d __attribute__((__always_inline__, __nodebug__))
965_mm256_setr_pd(double a, double b, double c, double d)
966{
967 return (__m256d){ a, b, c, d };
968}
969
970static __inline __m256 __attribute__((__always_inline__, __nodebug__))
971_mm256_setr_ps(float a, float b, float c, float d,
972 float e, float f, float g, float h)
973{
974 return (__m256){ a, b, c, d, e, f, g, h };
975}
976
977static __inline __m256i __attribute__((__always_inline__, __nodebug__))
978_mm256_setr_epi32(int i0, int i1, int i2, int i3,
979 int i4, int i5, int i6, int i7)
980{
981 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
982}
983
984static __inline __m256i __attribute__((__always_inline__, __nodebug__))
985_mm256_setr_epi16(short w15, short w14, short w13, short w12,
986 short w11, short w10, short w09, short w08,
987 short w07, short w06, short w05, short w04,
988 short w03, short w02, short w01, short w00)
989{
990 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
991 w07, w06, w05, w04, w03, w02, w01, w00 };
992}
993
994static __inline __m256i __attribute__((__always_inline__, __nodebug__))
995_mm256_setr_epi8(char b31, char b30, char b29, char b28,
996 char b27, char b26, char b25, char b24,
997 char b23, char b22, char b21, char b20,
998 char b19, char b18, char b17, char b16,
999 char b15, char b14, char b13, char b12,
1000 char b11, char b10, char b09, char b08,
1001 char b07, char b06, char b05, char b04,
1002 char b03, char b02, char b01, char b00)
1003{
1004 return (__m256i)(__v32qi){
1005 b31, b30, b29, b28, b27, b26, b25, b24,
1006 b23, b22, b21, b20, b19, b18, b17, b16,
1007 b15, b14, b13, b12, b11, b10, b09, b08,
1008 b07, b06, b05, b04, b03, b02, b01, b00 };
1009}
1010
1011static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1012_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1013{
1014 return (__m256i)(__v4di){ a, b, c, d };
1015}
1016
1017/* Create vectors with repeated elements */
1018static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1019_mm256_set1_pd(double w)
1020{
1021 return (__m256d){ w, w, w, w };
1022}
1023
1024static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1025_mm256_set1_ps(float w)
1026{
1027 return (__m256){ w, w, w, w, w, w, w, w };
1028}
1029
1030static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1031_mm256_set1_epi32(int i)
1032{
1033 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1034}
1035
1036static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1037_mm256_set1_epi16(short w)
1038{
1039 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1040}
1041
1042static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1043_mm256_set1_epi8(char b)
1044{
1045 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1046 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1047}
1048
1049static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1050_mm256_set1_epi64x(long long q)
1051{
1052 return (__m256i)(__v4di){ q, q, q, q };
1053}
1054
1055/* Create zeroed vectors */
1056static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1057_mm256_setzero_pd(void)
1058{
1059 return (__m256d){ 0, 0, 0, 0 };
1060}
1061
1062static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1063_mm256_setzero_ps(void)
1064{
1065 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1066}
1067
1068static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1069_mm256_setzero_si256(void)
1070{
1071 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1072}
1073
1074/* Cast between vector types */
1075static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1076_mm256_castpd_ps(__m256d in)
1077{
1078 return (__m256)in;
1079}
1080
1081static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1082_mm256_castpd_si256(__m256d in)
1083{
1084 return (__m256i)in;
1085}
1086
1087static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1088_mm256_castps_pd(__m256 in)
1089{
1090 return (__m256d)in;
1091}
1092
1093static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1094_mm256_castps_si256(__m256 in)
1095{
1096 return (__m256i)in;
1097}
1098
1099static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1100_mm256_castsi256_ps(__m256i in)
1101{
1102 return (__m256)in;
1103}
1104
1105static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1106_mm256_castsi256_pd(__m256i in)
1107{
1108 return (__m256d)in;
1109}
1110
/* Narrow a 256-bit vector to its lower 128 bits.  The shufflevector
   indices select only the low-half elements of 'in'. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  /* Elements 0-1 of the 4 doubles = low 128 bits. */
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  /* Elements 0-3 of the 8 floats = low 128 bits. */
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  /* Elements 0-1 of the 4 long longs = low 128 bits. */
  return __builtin_shufflevector(in, in, 0, 1);
}
1128
/* Widen a 128-bit vector to 256 bits.  The low half is 'in'; the shuffle
   indices that fall in the high half select elements of 'zero', so this
   implementation zeroes the upper 128 bits.  NOTE(review): Intel documents
   the upper bits of the cast result as undefined; zeroing is a valid
   (conservative) choice here. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  /* Indices 0,1 -> in[0],in[1]; indices 2,2 -> zero[0] twice. */
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  /* Indices 0-3 -> in; indices 4,4,4,4 -> zero[0] four times. */
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  /* Indices 0,1 -> in[0],in[1]; indices 2,2 -> zero[0] twice. */
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}