/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Element-typed 256-bit vector views, used for casting into the
   __builtin_ia32_* intrinsics (4 doubles, 8 floats, 4/8/16/32 integers). */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public opaque 256-bit vector types exposed to users of the AVX API. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
39/* Arithmetic */
40static __inline __m256d __attribute__((__always_inline__, __nodebug__))
41_mm256_add_pd(__m256d a, __m256d b)
42{
43 return a+b;
44}
45
46static __inline __m256 __attribute__((__always_inline__, __nodebug__))
47_mm256_add_ps(__m256 a, __m256 b)
48{
49 return a+b;
50}
51
52static __inline __m256d __attribute__((__always_inline__, __nodebug__))
53_mm256_sub_pd(__m256d a, __m256d b)
54{
55 return a-b;
56}
57
58static __inline __m256 __attribute__((__always_inline__, __nodebug__))
59_mm256_sub_ps(__m256 a, __m256 b)
60{
61 return a-b;
62}
63
64static __inline __m256d __attribute__((__always_inline__, __nodebug__))
65_mm256_addsub_pd(__m256d a, __m256d b)
66{
67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
68}
69
70static __inline __m256 __attribute__((__always_inline__, __nodebug__))
71_mm256_addsub_ps(__m256 a, __m256 b)
72{
73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
74}
75
76static __inline __m256d __attribute__((__always_inline__, __nodebug__))
77_mm256_div_pd(__m256d a, __m256d b)
78{
79 return a / b;
80}
81
82static __inline __m256 __attribute__((__always_inline__, __nodebug__))
83_mm256_div_ps(__m256 a, __m256 b)
84{
85 return a / b;
86}
87
88static __inline __m256d __attribute__((__always_inline__, __nodebug__))
89_mm256_max_pd(__m256d a, __m256d b)
90{
91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
92}
93
94static __inline __m256 __attribute__((__always_inline__, __nodebug__))
95_mm256_max_ps(__m256 a, __m256 b)
96{
97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
98}
99
100static __inline __m256d __attribute__((__always_inline__, __nodebug__))
101_mm256_min_pd(__m256d a, __m256d b)
102{
103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
104}
105
106static __inline __m256 __attribute__((__always_inline__, __nodebug__))
107_mm256_min_ps(__m256 a, __m256 b)
108{
109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
110}
111
112static __inline __m256d __attribute__((__always_inline__, __nodebug__))
113_mm256_mul_pd(__m256d a, __m256d b)
114{
115 return a * b;
116}
117
118static __inline __m256 __attribute__((__always_inline__, __nodebug__))
119_mm256_mul_ps(__m256 a, __m256 b)
120{
121 return a * b;
122}
123
124static __inline __m256d __attribute__((__always_inline__, __nodebug__))
125_mm256_sqrt_pd(__m256d a)
126{
127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
128}
129
130static __inline __m256 __attribute__((__always_inline__, __nodebug__))
131_mm256_sqrt_ps(__m256 a)
132{
133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
134}
135
136static __inline __m256 __attribute__((__always_inline__, __nodebug__))
137_mm256_rsqrt_ps(__m256 a)
138{
139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
140}
141
142static __inline __m256 __attribute__((__always_inline__, __nodebug__))
143_mm256_rcp_ps(__m256 a)
144{
145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
146}
147
148static __inline __m256d __attribute__((__always_inline__, __nodebug__))
149_mm256_round_pd(__m256d v, const int m)
150{
151 return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
152}
153
154static __inline __m256 __attribute__((__always_inline__, __nodebug__))
155_mm256_round_ps(__m256 v, const int m)
156{
157 return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
158}
159
160#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
161#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
162#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
163#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
164
165/* Logical */
166static __inline __m256d __attribute__((__always_inline__, __nodebug__))
167_mm256_and_pd(__m256d a, __m256d b)
168{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000169 return (__m256d)((__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000170}
171
172static __inline __m256 __attribute__((__always_inline__, __nodebug__))
173_mm256_and_ps(__m256 a, __m256 b)
174{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000175 return (__m256)((__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000176}
177
178static __inline __m256d __attribute__((__always_inline__, __nodebug__))
179_mm256_andnot_pd(__m256d a, __m256d b)
180{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000181 return (__m256d)(~(__v4di)a & (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000182}
183
184static __inline __m256 __attribute__((__always_inline__, __nodebug__))
185_mm256_andnot_ps(__m256 a, __m256 b)
186{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000187 return (__m256)(~(__v8si)a & (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000188}
189
190static __inline __m256d __attribute__((__always_inline__, __nodebug__))
191_mm256_or_pd(__m256d a, __m256d b)
192{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000193 return (__m256d)((__v4di)a | (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000194}
195
196static __inline __m256 __attribute__((__always_inline__, __nodebug__))
197_mm256_or_ps(__m256 a, __m256 b)
198{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000199 return (__m256)((__v8si)a | (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000200}
201
202static __inline __m256d __attribute__((__always_inline__, __nodebug__))
203_mm256_xor_pd(__m256d a, __m256d b)
204{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000205 return (__m256d)((__v4di)a ^ (__v4di)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000206}
207
208static __inline __m256 __attribute__((__always_inline__, __nodebug__))
209_mm256_xor_ps(__m256 a, __m256 b)
210{
Bruno Cardoso Lopesda6adc42010-08-05 23:04:58 +0000211 return (__m256)((__v8si)a ^ (__v8si)b);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000212}
213
214/* Horizontal arithmetic */
215static __inline __m256d __attribute__((__always_inline__, __nodebug__))
216_mm256_hadd_pd(__m256d a, __m256d b)
217{
218 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
219}
220
221static __inline __m256 __attribute__((__always_inline__, __nodebug__))
222_mm256_hadd_ps(__m256 a, __m256 b)
223{
224 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
225}
226
227static __inline __m256d __attribute__((__always_inline__, __nodebug__))
228_mm256_hsub_pd(__m256d a, __m256d b)
229{
230 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
231}
232
233static __inline __m256 __attribute__((__always_inline__, __nodebug__))
234_mm256_hsub_ps(__m256 a, __m256 b)
235{
236 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
237}
238
239/* Vector permutations */
240static __inline __m128d __attribute__((__always_inline__, __nodebug__))
241_mm_permutevar_pd(__m128d a, __m128i c)
242{
243 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
244}
245
246static __inline __m256d __attribute__((__always_inline__, __nodebug__))
247_mm256_permutevar_pd(__m256d a, __m256i c)
248{
249 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
250}
251
252static __inline __m128 __attribute__((__always_inline__, __nodebug__))
253_mm_permutevar_ps(__m128 a, __m128i c)
254{
255 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
256}
257
258static __inline __m256 __attribute__((__always_inline__, __nodebug__))
259_mm256_permutevar_ps(__m256 a, __m256i c)
260{
261 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
262 (__v8si)c);
263}
264
265static __inline __m128d __attribute__((__always_inline__, __nodebug__))
266_mm_permute_pd(__m128d a, const int c)
267{
268 return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
269}
270
271static __inline __m256d __attribute__((__always_inline__, __nodebug__))
272_mm256_permute_pd(__m256d a, const int c)
273{
274 return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
275}
276
277static __inline __m128 __attribute__((__always_inline__, __nodebug__))
278_mm_permute_ps(__m128 a, const int c)
279{
280 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
281}
282
283static __inline __m256 __attribute__((__always_inline__, __nodebug__))
284_mm256_permute_ps(__m256 a, const int c)
285{
286 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
287}
288
289static __inline __m256d __attribute__((__always_inline__, __nodebug__))
290_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
291{
292 return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
293}
294
295static __inline __m256 __attribute__((__always_inline__, __nodebug__))
296_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
297{
298 return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
299}
300
301static __inline __m256i __attribute__((__always_inline__, __nodebug__))
302_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
303{
304 return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
305}
306
307/* Vector Blend */
308static __inline __m256d __attribute__((__always_inline__, __nodebug__))
309_mm256_blend_pd(__m256d a, __m256d b, const int c)
310{
311 return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c);
312}
313
314static __inline __m256 __attribute__((__always_inline__, __nodebug__))
315_mm256_blend_ps(__m256 a, __m256 b, const int c)
316{
317 return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c);
318}
319
320static __inline __m256d __attribute__((__always_inline__, __nodebug__))
321_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
322{
323 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
324}
325
326static __inline __m256 __attribute__((__always_inline__, __nodebug__))
327_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
328{
329 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
330}
331
332/* Vector Dot Product */
333static __inline __m256 __attribute__((__always_inline__, __nodebug__))
334_mm256_dp_ps(__m256 a, __m256 b, const int c)
335{
336 return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c);
337}
338
339/* Vector shuffle */
Bob Wilson32bae372011-11-05 06:08:06 +0000340#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
341 __m256 __a = (a); \
342 __m256 __b = (b); \
343 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
Bruno Cardoso Lopesb33aa0f2010-08-11 01:17:34 +0000344 (mask) & 0x3, ((mask) & 0xc) >> 2, \
Bruno Cardoso Lopes70141c22010-08-11 18:45:43 +0000345 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
Bruno Cardoso Lopes426344d2011-08-23 23:29:45 +0000346 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
Bob Wilson32bae372011-11-05 06:08:06 +0000347 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000348
Bob Wilson32bae372011-11-05 06:08:06 +0000349#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
350 __m256d __a = (a); \
351 __m256d __b = (b); \
352 (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
Bruno Cardoso Lopesb33aa0f2010-08-11 01:17:34 +0000353 (mask) & 0x1, \
354 (((mask) & 0x2) >> 1) + 4, \
355 (((mask) & 0x4) >> 2) + 2, \
Bob Wilson32bae372011-11-05 06:08:06 +0000356 (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000357
/* Compare */
/* Predicate encodings for the 5-bit immediate of VCMPPD/VCMPPS/VCMPSD/VCMPSS.
   Suffix convention: O/U = ordered/unordered (NaN handling),
   Q/S = quiet (non-signaling) / signaling on QNaN operands. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
391
Bob Wilson32bae372011-11-05 06:08:06 +0000392#define _mm_cmp_pd(a, b, c) __extension__ ({ \
393 __m128d __a = (a); \
394 __m128d __b = (b); \
395 (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000396
Bob Wilson32bae372011-11-05 06:08:06 +0000397#define _mm_cmp_ps(a, b, c) __extension__ ({ \
398 __m128 __a = (a); \
399 __m128 __b = (b); \
400 (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000401
Bob Wilson32bae372011-11-05 06:08:06 +0000402#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
403 __m256d __a = (a); \
404 __m256d __b = (b); \
405 (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000406
Bob Wilson32bae372011-11-05 06:08:06 +0000407#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
408 __m256 __a = (a); \
409 __m256 __b = (b); \
410 (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000411
Bob Wilson32bae372011-11-05 06:08:06 +0000412#define _mm_cmp_sd(a, b, c) __extension__ ({ \
413 __m128d __a = (a); \
414 __m128d __b = (b); \
415 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000416
Bob Wilson32bae372011-11-05 06:08:06 +0000417#define _mm_cmp_ss(a, b, c) __extension__ ({ \
418 __m128 __a = (a); \
419 __m128 __b = (b); \
420 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000421
422/* Vector extract */
423static __inline __m128d __attribute__((__always_inline__, __nodebug__))
424_mm256_extractf128_pd(__m256d a, const int o)
425{
426 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
427}
428
429static __inline __m128 __attribute__((__always_inline__, __nodebug__))
430_mm256_extractf128_ps(__m256 a, const int o)
431{
432 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
433}
434
435static __inline __m128i __attribute__((__always_inline__, __nodebug__))
436_mm256_extractf128_si256(__m256i a, const int o)
437{
438 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
439}
440
441static __inline int __attribute__((__always_inline__, __nodebug__))
442_mm256_extract_epi32(__m256i a, int const imm)
443{
444 __v8si b = (__v8si)a;
445 return b[imm];
446}
447
448static __inline int __attribute__((__always_inline__, __nodebug__))
449_mm256_extract_epi16(__m256i a, int const imm)
450{
451 __v16hi b = (__v16hi)a;
452 return b[imm];
453}
454
455static __inline int __attribute__((__always_inline__, __nodebug__))
456_mm256_extract_epi8(__m256i a, int const imm)
457{
458 __v32qi b = (__v32qi)a;
459 return b[imm];
460}
461
462#ifdef __x86_64__
463static __inline long long __attribute__((__always_inline__, __nodebug__))
464_mm256_extract_epi64(__m256i a, const int imm)
465{
466 __v4di b = (__v4di)a;
467 return b[imm];
468}
469#endif
470
471/* Vector insert */
472static __inline __m256d __attribute__((__always_inline__, __nodebug__))
473_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
474{
475 return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
476}
477
478static __inline __m256 __attribute__((__always_inline__, __nodebug__))
479_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
480{
481 return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
482}
483
484static __inline __m256i __attribute__((__always_inline__, __nodebug__))
485_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
486{
487 return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
488}
489
490static __inline __m256i __attribute__((__always_inline__, __nodebug__))
491_mm256_insert_epi32(__m256i a, int b, int const imm)
492{
493 __v8si c = (__v8si)a;
494 c[imm & 7] = b;
495 return (__m256i)c;
496}
497
498static __inline __m256i __attribute__((__always_inline__, __nodebug__))
499_mm256_insert_epi16(__m256i a, int b, int const imm)
500{
501 __v16hi c = (__v16hi)a;
502 c[imm & 15] = b;
503 return (__m256i)c;
504}
505
506static __inline __m256i __attribute__((__always_inline__, __nodebug__))
507_mm256_insert_epi8(__m256i a, int b, int const imm)
508{
509 __v32qi c = (__v32qi)a;
510 c[imm & 31] = b;
511 return (__m256i)c;
512}
513
514#ifdef __x86_64__
515static __inline __m256i __attribute__((__always_inline__, __nodebug__))
516_mm256_insert_epi64(__m256i a, int b, int const imm)
517{
518 __v4di c = (__v4di)a;
519 c[imm & 3] = b;
520 return (__m256i)c;
521}
522#endif
523
524/* Conversion */
525static __inline __m256d __attribute__((__always_inline__, __nodebug__))
526_mm256_cvtepi32_pd(__m128i a)
527{
528 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
529}
530
531static __inline __m256 __attribute__((__always_inline__, __nodebug__))
532_mm256_cvtepi32_ps(__m256i a)
533{
534 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
535}
536
537static __inline __m128 __attribute__((__always_inline__, __nodebug__))
538_mm256_cvtpd_ps(__m256d a)
539{
540 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
541}
542
543static __inline __m256i __attribute__((__always_inline__, __nodebug__))
544_mm256_cvtps_epi32(__m256 a)
545{
546 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
547}
548
549static __inline __m256d __attribute__((__always_inline__, __nodebug__))
550_mm256_cvtps_pd(__m128 a)
551{
552 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
553}
554
555static __inline __m128i __attribute__((__always_inline__, __nodebug__))
556_mm256_cvttpd_epi32(__m256d a)
557{
558 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
559}
560
561static __inline __m128i __attribute__((__always_inline__, __nodebug__))
562_mm256_cvtpd_epi32(__m256d a)
563{
564 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
565}
566
567static __inline __m256i __attribute__((__always_inline__, __nodebug__))
568_mm256_cvttps_epi32(__m256 a)
569{
570 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
571}
572
573/* Vector replicate */
574static __inline __m256 __attribute__((__always_inline__, __nodebug__))
575_mm256_movehdup_ps(__m256 a)
576{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000577 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000578}
579
580static __inline __m256 __attribute__((__always_inline__, __nodebug__))
581_mm256_moveldup_ps(__m256 a)
582{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000583 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000584}
585
586static __inline __m256d __attribute__((__always_inline__, __nodebug__))
587_mm256_movedup_pd(__m256d a)
588{
Bruno Cardoso Lopes4a5496b2010-08-10 02:23:54 +0000589 return __builtin_shufflevector(a, a, 0, 0, 2, 2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000590}
591
592/* Unpack and Interleave */
593static __inline __m256d __attribute__((__always_inline__, __nodebug__))
594_mm256_unpackhi_pd(__m256d a, __m256d b)
595{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000596 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000597}
598
599static __inline __m256d __attribute__((__always_inline__, __nodebug__))
600_mm256_unpacklo_pd(__m256d a, __m256d b)
601{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000602 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000603}
604
605static __inline __m256 __attribute__((__always_inline__, __nodebug__))
606_mm256_unpackhi_ps(__m256 a, __m256 b)
607{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000608 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000609}
610
611static __inline __m256 __attribute__((__always_inline__, __nodebug__))
612_mm256_unpacklo_ps(__m256 a, __m256 b)
613{
Bruno Cardoso Lopesf0e96c92010-08-11 01:43:24 +0000614 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000615}
616
617/* Bit Test */
618static __inline int __attribute__((__always_inline__, __nodebug__))
619_mm_testz_pd(__m128d a, __m128d b)
620{
621 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
622}
623
624static __inline int __attribute__((__always_inline__, __nodebug__))
625_mm_testc_pd(__m128d a, __m128d b)
626{
627 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
628}
629
630static __inline int __attribute__((__always_inline__, __nodebug__))
631_mm_testnzc_pd(__m128d a, __m128d b)
632{
633 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
634}
635
636static __inline int __attribute__((__always_inline__, __nodebug__))
637_mm_testz_ps(__m128 a, __m128 b)
638{
639 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
640}
641
642static __inline int __attribute__((__always_inline__, __nodebug__))
643_mm_testc_ps(__m128 a, __m128 b)
644{
645 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
646}
647
648static __inline int __attribute__((__always_inline__, __nodebug__))
649_mm_testnzc_ps(__m128 a, __m128 b)
650{
651 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
652}
653
654static __inline int __attribute__((__always_inline__, __nodebug__))
655_mm256_testz_pd(__m256d a, __m256d b)
656{
657 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
658}
659
660static __inline int __attribute__((__always_inline__, __nodebug__))
661_mm256_testc_pd(__m256d a, __m256d b)
662{
663 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
664}
665
666static __inline int __attribute__((__always_inline__, __nodebug__))
667_mm256_testnzc_pd(__m256d a, __m256d b)
668{
669 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
670}
671
672static __inline int __attribute__((__always_inline__, __nodebug__))
673_mm256_testz_ps(__m256 a, __m256 b)
674{
675 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
676}
677
678static __inline int __attribute__((__always_inline__, __nodebug__))
679_mm256_testc_ps(__m256 a, __m256 b)
680{
681 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
682}
683
684static __inline int __attribute__((__always_inline__, __nodebug__))
685_mm256_testnzc_ps(__m256 a, __m256 b)
686{
687 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
688}
689
690static __inline int __attribute__((__always_inline__, __nodebug__))
691_mm256_testz_si256(__m256i a, __m256i b)
692{
693 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
694}
695
696static __inline int __attribute__((__always_inline__, __nodebug__))
697_mm256_testc_si256(__m256i a, __m256i b)
698{
699 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
700}
701
702static __inline int __attribute__((__always_inline__, __nodebug__))
703_mm256_testnzc_si256(__m256i a, __m256i b)
704{
705 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
706}
707
708/* Vector extract sign mask */
709static __inline int __attribute__((__always_inline__, __nodebug__))
710_mm256_movemask_pd(__m256d a)
711{
712 return __builtin_ia32_movmskpd256((__v4df)a);
713}
714
715static __inline int __attribute__((__always_inline__, __nodebug__))
716_mm256_movemask_ps(__m256 a)
717{
718 return __builtin_ia32_movmskps256((__v8sf)a);
719}
720
/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  /* VZEROALL: zero the entire contents of all YMM registers. */
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  /* VZEROUPPER: zero only the upper 128 bits of every YMM register;
     used to avoid AVX<->legacy-SSE transition penalties. */
  __builtin_ia32_vzeroupper();
}
733
734/* Vector load with broadcast */
735static __inline __m128 __attribute__((__always_inline__, __nodebug__))
736_mm_broadcast_ss(float const *a)
737{
738 return (__m128)__builtin_ia32_vbroadcastss(a);
739}
740
741static __inline __m256d __attribute__((__always_inline__, __nodebug__))
742_mm256_broadcast_sd(double const *a)
743{
744 return (__m256d)__builtin_ia32_vbroadcastsd256(a);
745}
746
747static __inline __m256 __attribute__((__always_inline__, __nodebug__))
748_mm256_broadcast_ss(float const *a)
749{
750 return (__m256)__builtin_ia32_vbroadcastss256(a);
751}
752
753static __inline __m256d __attribute__((__always_inline__, __nodebug__))
754_mm256_broadcast_pd(__m128d const *a)
755{
756 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
757}
758
759static __inline __m256 __attribute__((__always_inline__, __nodebug__))
760_mm256_broadcast_ps(__m128 const *a)
761{
762 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
763}
764
765/* SIMD load ops */
766static __inline __m256d __attribute__((__always_inline__, __nodebug__))
767_mm256_load_pd(double const *p)
768{
769 return *(__m256d *)p;
770}
771
772static __inline __m256 __attribute__((__always_inline__, __nodebug__))
773_mm256_load_ps(float const *p)
774{
775 return *(__m256 *)p;
776}
777
778static __inline __m256d __attribute__((__always_inline__, __nodebug__))
779_mm256_loadu_pd(double const *p)
780{
781 return (__m256d)__builtin_ia32_loadupd256(p);
782}
783
784static __inline __m256 __attribute__((__always_inline__, __nodebug__))
785_mm256_loadu_ps(float const *p)
786{
787 return (__m256)__builtin_ia32_loadups256(p);
788}
789
790static __inline __m256i __attribute__((__always_inline__, __nodebug__))
791_mm256_load_si256(__m256i const *p)
792{
793 return *p;
794}
795
796static __inline __m256i __attribute__((__always_inline__, __nodebug__))
797_mm256_loadu_si256(__m256i const *p)
798{
799 return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
800}
801
802static __inline __m256i __attribute__((__always_inline__, __nodebug__))
803_mm256_lddqu_si256(__m256i const *p)
804{
805 return (__m256i)__builtin_ia32_lddqu256((char const *)p);
806}
807
808/* SIMD store ops */
809static __inline void __attribute__((__always_inline__, __nodebug__))
810_mm256_store_pd(double *p, __m256d a)
811{
812 *(__m256d *)p = a;
813}
814
815static __inline void __attribute__((__always_inline__, __nodebug__))
816_mm256_store_ps(float *p, __m256 a)
817{
818 *(__m256 *)p = a;
819}
820
821static __inline void __attribute__((__always_inline__, __nodebug__))
822_mm256_storeu_pd(double *p, __m256d a)
823{
824 __builtin_ia32_storeupd256(p, (__v4df)a);
825}
826
827static __inline void __attribute__((__always_inline__, __nodebug__))
828_mm256_storeu_ps(float *p, __m256 a)
829{
830 __builtin_ia32_storeups256(p, (__v8sf)a);
831}
832
833static __inline void __attribute__((__always_inline__, __nodebug__))
834_mm256_store_si256(__m256i *p, __m256i a)
835{
836 *p = a;
837}
838
839static __inline void __attribute__((__always_inline__, __nodebug__))
840_mm256_storeu_si256(__m256i *p, __m256i a)
841{
842 __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
843}
844
845/* Conditional load ops */
846static __inline __m128d __attribute__((__always_inline__, __nodebug__))
847_mm_maskload_pd(double const *p, __m128d m)
848{
849 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
850}
851
852static __inline __m256d __attribute__((__always_inline__, __nodebug__))
853_mm256_maskload_pd(double const *p, __m256d m)
854{
855 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
856}
857
858static __inline __m128 __attribute__((__always_inline__, __nodebug__))
859_mm_maskload_ps(float const *p, __m128 m)
860{
861 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
862}
863
864static __inline __m256 __attribute__((__always_inline__, __nodebug__))
865_mm256_maskload_ps(float const *p, __m256 m)
866{
867 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
868}
869
870/* Conditional store ops */
871static __inline void __attribute__((__always_inline__, __nodebug__))
872_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
873{
874 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
875}
876
877static __inline void __attribute__((__always_inline__, __nodebug__))
878_mm_maskstore_pd(double *p, __m128d m, __m128d a)
879{
880 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
881}
882
/* 256-bit conditional store of doubles: elements of 'a' whose mask sign bit
 * is set are written to 'p'; other locations are untouched. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}
888
/* 128-bit conditional store of floats: elements of 'a' whose mask sign bit
 * is set are written to 'p'; other locations are untouched. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
894
/* Cacheability support ops */
/* Non-temporal 256-bit integer store ("movntdq"): writes 'b' to *a while
 * minimizing cache pollution. Per Intel docs 'a' must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}
901
/* Non-temporal store of four doubles ("movntpd"): writes 'b' to memory at
 * 'a' bypassing the cache hierarchy. Per Intel docs 'a' must be 32-byte
 * aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}
907
/* Non-temporal store of eight floats ("movntps"): writes 'a' to memory at
 * 'p' bypassing the cache hierarchy. Per Intel docs 'p' must be 32-byte
 * aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
913
914/* Create vectors */
915static __inline __m256d __attribute__((__always_inline__, __nodebug__))
916_mm256_set_pd(double a, double b, double c, double d)
917{
918 return (__m256d){ d, c, b, a };
919}
920
921static __inline __m256 __attribute__((__always_inline__, __nodebug__))
922_mm256_set_ps(float a, float b, float c, float d,
923 float e, float f, float g, float h)
924{
925 return (__m256){ h, g, f, e, d, c, b, a };
926}
927
928static __inline __m256i __attribute__((__always_inline__, __nodebug__))
929_mm256_set_epi32(int i0, int i1, int i2, int i3,
930 int i4, int i5, int i6, int i7)
931{
932 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
933}
934
935static __inline __m256i __attribute__((__always_inline__, __nodebug__))
936_mm256_set_epi16(short w15, short w14, short w13, short w12,
937 short w11, short w10, short w09, short w08,
938 short w07, short w06, short w05, short w04,
939 short w03, short w02, short w01, short w00)
940{
941 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
942 w08, w09, w10, w11, w12, w13, w14, w15 };
943}
944
945static __inline __m256i __attribute__((__always_inline__, __nodebug__))
946_mm256_set_epi8(char b31, char b30, char b29, char b28,
947 char b27, char b26, char b25, char b24,
948 char b23, char b22, char b21, char b20,
949 char b19, char b18, char b17, char b16,
950 char b15, char b14, char b13, char b12,
951 char b11, char b10, char b09, char b08,
952 char b07, char b06, char b05, char b04,
953 char b03, char b02, char b01, char b00)
954{
955 return (__m256i)(__v32qi){
956 b00, b01, b02, b03, b04, b05, b06, b07,
957 b08, b09, b10, b11, b12, b13, b14, b15,
958 b16, b17, b18, b19, b20, b21, b22, b23,
959 b24, b25, b26, b27, b28, b29, b30, b31
960 };
961}
962
963static __inline __m256i __attribute__((__always_inline__, __nodebug__))
964_mm256_set_epi64x(long long a, long long b, long long c, long long d)
965{
966 return (__m256i)(__v4di){ d, c, b, a };
967}
968
969/* Create vectors with elements in reverse order */
970static __inline __m256d __attribute__((__always_inline__, __nodebug__))
971_mm256_setr_pd(double a, double b, double c, double d)
972{
973 return (__m256d){ a, b, c, d };
974}
975
976static __inline __m256 __attribute__((__always_inline__, __nodebug__))
977_mm256_setr_ps(float a, float b, float c, float d,
978 float e, float f, float g, float h)
979{
980 return (__m256){ a, b, c, d, e, f, g, h };
981}
982
983static __inline __m256i __attribute__((__always_inline__, __nodebug__))
984_mm256_setr_epi32(int i0, int i1, int i2, int i3,
985 int i4, int i5, int i6, int i7)
986{
987 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
988}
989
990static __inline __m256i __attribute__((__always_inline__, __nodebug__))
991_mm256_setr_epi16(short w15, short w14, short w13, short w12,
992 short w11, short w10, short w09, short w08,
993 short w07, short w06, short w05, short w04,
994 short w03, short w02, short w01, short w00)
995{
996 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
997 w07, w06, w05, w04, w03, w02, w01, w00 };
998}
999
1000static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1001_mm256_setr_epi8(char b31, char b30, char b29, char b28,
1002 char b27, char b26, char b25, char b24,
1003 char b23, char b22, char b21, char b20,
1004 char b19, char b18, char b17, char b16,
1005 char b15, char b14, char b13, char b12,
1006 char b11, char b10, char b09, char b08,
1007 char b07, char b06, char b05, char b04,
1008 char b03, char b02, char b01, char b00)
1009{
1010 return (__m256i)(__v32qi){
1011 b31, b30, b29, b28, b27, b26, b25, b24,
1012 b23, b22, b21, b20, b19, b18, b17, b16,
1013 b15, b14, b13, b12, b11, b10, b09, b08,
1014 b07, b06, b05, b04, b03, b02, b01, b00 };
1015}
1016
1017static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1018_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1019{
1020 return (__m256i)(__v4di){ a, b, c, d };
1021}
1022
1023/* Create vectors with repeated elements */
1024static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1025_mm256_set1_pd(double w)
1026{
1027 return (__m256d){ w, w, w, w };
1028}
1029
1030static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1031_mm256_set1_ps(float w)
1032{
1033 return (__m256){ w, w, w, w, w, w, w, w };
1034}
1035
1036static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1037_mm256_set1_epi32(int i)
1038{
1039 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1040}
1041
1042static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1043_mm256_set1_epi16(short w)
1044{
1045 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1046}
1047
1048static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1049_mm256_set1_epi8(char b)
1050{
1051 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1052 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1053}
1054
1055static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1056_mm256_set1_epi64x(long long q)
1057{
1058 return (__m256i)(__v4di){ q, q, q, q };
1059}
1060
1061/* Create zeroed vectors */
1062static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1063_mm256_setzero_pd(void)
1064{
1065 return (__m256d){ 0, 0, 0, 0 };
1066}
1067
1068static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1069_mm256_setzero_ps(void)
1070{
1071 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1072}
1073
1074static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1075_mm256_setzero_si256(void)
1076{
1077 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1078}
1079
1080/* Cast between vector types */
1081static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1082_mm256_castpd_ps(__m256d in)
1083{
1084 return (__m256)in;
1085}
1086
1087static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1088_mm256_castpd_si256(__m256d in)
1089{
1090 return (__m256i)in;
1091}
1092
1093static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1094_mm256_castps_pd(__m256 in)
1095{
1096 return (__m256d)in;
1097}
1098
1099static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1100_mm256_castps_si256(__m256 in)
1101{
1102 return (__m256i)in;
1103}
1104
1105static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1106_mm256_castsi256_ps(__m256i in)
1107{
1108 return (__m256)in;
1109}
1110
1111static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1112_mm256_castsi256_pd(__m256i in)
1113{
1114 return (__m256d)in;
1115}
1116
/* Truncating cast: return the low two doubles (elements 0-1) of 'in' as a
 * 128-bit vector. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1122
/* Truncating cast: return the low four floats (elements 0-3) of 'in' as a
 * 128-bit vector. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}
1128
/* Truncating cast: return the low 128 bits (the low two 64-bit lanes) of
 * 'in' as a 128-bit integer vector. */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1134
/* Widening cast: low half is 'in', upper half zeroed. Shuffle indices 0-1
 * pick 'in'; index 2 (twice) picks element 0 of the zero vector for both
 * upper lanes. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
1141
/* Widening cast: low half is 'in', upper half zeroed. Shuffle indices 0-3
 * pick 'in'; index 4 (four times) picks element 0 of the zero vector for
 * the upper lanes. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}
1148
/* Widening cast: low 128 bits are 'in', upper 128 bits zeroed. Shuffle
 * indices 0-1 pick 'in'; index 2 (twice) picks 64-bit lane 0 of the zero
 * vector for both upper lanes. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}