blob: 4c7c4ced915d4faf5a0b56fe04854d1505dc4790 [file] [log] [blame]
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +00001/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
Benjamin Kramer01b57e32010-08-20 23:00:03 +000024#ifndef __IMMINTRIN_H
25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26#endif
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +000027
/* Internal 256-bit vector types used to implement the intrinsics below.
   The __v* names encode element type and count: df = double, sf = float,
   di = 64-bit int, si = 32-bit int, hi = 16-bit int, qi = 8-bit int. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Public AVX vector types exposed to users: eight floats, four doubles,
   and a generic 256-bit integer vector. */
typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));
38
/* Arithmetic */

/* Lane-wise addition of four doubles (VADDPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

/* Lane-wise addition of eight floats (VADDPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

/* Lane-wise subtraction a - b of four doubles (VSUBPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

/* Lane-wise subtraction a - b of eight floats (VSUBPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

/* Alternating subtract/add: even-indexed lanes get a - b, odd-indexed
   lanes get a + b (VADDSUBPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

/* Alternating subtract/add for eight floats (VADDSUBPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

/* Lane-wise division a / b of four doubles (VDIVPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

/* Lane-wise division a / b of eight floats (VDIVPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

/* Lane-wise maximum (VMAXPD).  NOTE(review): hardware max/min are not
   symmetric with respect to NaN operands — the second operand is
   returned when either input is NaN. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

/* Lane-wise maximum of eight floats (VMAXPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

/* Lane-wise minimum of four doubles (VMINPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

/* Lane-wise minimum of eight floats (VMINPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

/* Lane-wise multiplication of four doubles (VMULPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

/* Lane-wise multiplication of eight floats (VMULPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

/* Lane-wise square root of four doubles (VSQRTPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

/* Lane-wise square root of eight floats (VSQRTPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

/* Approximate lane-wise reciprocal square root (VRSQRTPS); low-precision
   hardware estimate, not an exact 1/sqrt(x). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

/* Approximate lane-wise reciprocal (VRCPPS); hardware estimate. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

/* Rounds each double lane according to mode m (VROUNDPD).  m must be a
   compile-time constant built from the _MM_FROUND_* flags. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

/* Rounds each float lane according to mode m (VROUNDPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

/* Convenience wrappers: round toward +inf (ceil) / -inf (floor). */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
164
/* Logical */

/* Bitwise AND of the raw bit patterns of two double vectors; the
   round-trip through __v4di keeps the operation integral. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

/* Bitwise AND of the raw bit patterns of two float vectors. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

/* Bitwise AND-NOT: (~a) & b — note the complement applies to the FIRST
   operand, matching the VANDNPD instruction. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

/* Bitwise AND-NOT (~a) & b for float vectors. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

/* Bitwise OR of the raw bit patterns of two double vectors. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

/* Bitwise OR of the raw bit patterns of two float vectors. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

/* Bitwise XOR of the raw bit patterns of two double vectors. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

/* Bitwise XOR of the raw bit patterns of two float vectors. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

/* Horizontal arithmetic */

/* Adds adjacent pairs of lanes within each 128-bit half (VHADDPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

/* Adds adjacent pairs of float lanes within each 128-bit half (VHADDPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

/* Subtracts adjacent pairs of lanes within each 128-bit half (VHSUBPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

/* Subtracts adjacent pairs of float lanes within each half (VHSUBPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}
238
/* Vector permutations */

/* Permutes doubles within the 128-bit vector using the low bits of the
   corresponding 64-bit control lanes in c (VPERMILPD). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

/* Variable in-lane permute of four doubles; lanes never cross the
   128-bit halves (VPERMILPD, 256-bit form). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

/* Variable permute of four floats controlled by c (VPERMILPS). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

/* Variable in-lane permute of eight floats (VPERMILPS, 256-bit form). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
						(__v8si)c);
}

/* Immediate-controlled permute of two doubles; c must be a compile-time
   constant (VPERMILPD with immediate). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

/* Immediate-controlled in-lane permute of four doubles. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

/* Immediate-controlled permute of four floats. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

/* Immediate-controlled in-lane permute of eight floats. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

/* Selects/permutes whole 128-bit halves from V1/V2 per immediate M
   (VPERM2F128).  Implemented as statement-expression macros so M stays
   a compile-time constant; arguments are copied to locals to avoid
   multiple evaluation. */
#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, M); })

#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, M); })

#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  __m256i __V1 = (V1); \
  __m256i __V2 = (V2); \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, M); })

/* Vector Blend */

/* Per-lane select controlled by immediate mask M: bit i set picks the
   lane from V2, clear picks V1 (VBLENDPD). */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })

/* Per-lane select of floats controlled by immediate mask M (VBLENDPS). */
#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })

/* Variable blend: the sign bit of each lane of c selects b (set) or
   a (clear) — VBLENDVPD. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

/* Variable blend of floats selected by the sign bits of c (VBLENDVPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */

/* Conditional dot product within each 128-bit half, controlled by
   immediate M (VDPPS): high nibble selects multiplied lanes, low nibble
   selects result lanes. */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, M); })

/* Vector shuffle */

/* VSHUFPS semantics expressed as a generic shuffle: the same 2-bit
   selectors are applied independently to the low and high 128-bit
   halves; indices 0-7 pick from __a, 8-15 from __b. */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
        __m256 __a = (a); \
        __m256 __b = (b); \
        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
        (mask) & 0x3, ((mask) & 0xc) >> 2, \
        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

/* VSHUFPD semantics: one selector bit per result lane; indices 0-3 pick
   from __a, 4-7 from __b, alternating source per lane. */
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
        __m256d __a = (a); \
        __m256d __b = (b); \
        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
        (mask) & 0x1, \
        (((mask) & 0x2) >> 1) + 4, \
        (((mask) & 0x4) >> 2) + 2, \
        (((mask) & 0x8) >> 3) + 6); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000351
/* Compare */
/* Predicate immediates for the VCMPPD/VCMPPS/VCMPSD/VCMPSS intrinsics
   below.  Suffix key: O = ordered, U = unordered, S = signaling,
   Q = quiet (non-signaling). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/* Lane-wise compare with predicate c (one of the _CMP_* constants); each
   result lane is all-ones when the predicate holds, all-zeros otherwise.
   Statement-expression macros keep c a compile-time immediate. */
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

/* Scalar compares: only the lowest lane is compared; the remaining lanes
   of the result are copied from __a. */
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000415
416/* Vector extract */
417static __inline __m128d __attribute__((__always_inline__, __nodebug__))
418_mm256_extractf128_pd(__m256d a, const int o)
419{
420 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
421}
422
423static __inline __m128 __attribute__((__always_inline__, __nodebug__))
424_mm256_extractf128_ps(__m256 a, const int o)
425{
426 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
427}
428
429static __inline __m128i __attribute__((__always_inline__, __nodebug__))
430_mm256_extractf128_si256(__m256i a, const int o)
431{
432 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
433}
434
435static __inline int __attribute__((__always_inline__, __nodebug__))
436_mm256_extract_epi32(__m256i a, int const imm)
437{
438 __v8si b = (__v8si)a;
439 return b[imm];
440}
441
442static __inline int __attribute__((__always_inline__, __nodebug__))
443_mm256_extract_epi16(__m256i a, int const imm)
444{
445 __v16hi b = (__v16hi)a;
446 return b[imm];
447}
448
449static __inline int __attribute__((__always_inline__, __nodebug__))
450_mm256_extract_epi8(__m256i a, int const imm)
451{
452 __v32qi b = (__v32qi)a;
453 return b[imm];
454}
455
456#ifdef __x86_64__
457static __inline long long __attribute__((__always_inline__, __nodebug__))
458_mm256_extract_epi64(__m256i a, const int imm)
459{
460 __v4di b = (__v4di)a;
461 return b[imm];
462}
463#endif
464
465/* Vector insert */
Chad Rosierb95ddf12011-12-16 21:40:31 +0000466#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
467 __m256d __V1 = (V1); \
468 __m128d __V2 = (V2); \
469 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000470
Chad Rosierb95ddf12011-12-16 21:40:31 +0000471#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
472 __m256 __V1 = (V1); \
473 __m128 __V2 = (V2); \
474 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000475
Chad Rosierb95ddf12011-12-16 21:40:31 +0000476#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
477 __m256i __V1 = (V1); \
478 __m128i __V2 = (V2); \
479 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, O); })
Bruno Cardoso Lopes55db5b82010-08-04 22:03:36 +0000480
481static __inline __m256i __attribute__((__always_inline__, __nodebug__))
482_mm256_insert_epi32(__m256i a, int b, int const imm)
483{
484 __v8si c = (__v8si)a;
485 c[imm & 7] = b;
486 return (__m256i)c;
487}
488
489static __inline __m256i __attribute__((__always_inline__, __nodebug__))
490_mm256_insert_epi16(__m256i a, int b, int const imm)
491{
492 __v16hi c = (__v16hi)a;
493 c[imm & 15] = b;
494 return (__m256i)c;
495}
496
497static __inline __m256i __attribute__((__always_inline__, __nodebug__))
498_mm256_insert_epi8(__m256i a, int b, int const imm)
499{
500 __v32qi c = (__v32qi)a;
501 c[imm & 31] = b;
502 return (__m256i)c;
503}
504
505#ifdef __x86_64__
506static __inline __m256i __attribute__((__always_inline__, __nodebug__))
507_mm256_insert_epi64(__m256i a, int b, int const imm)
508{
509 __v4di c = (__v4di)a;
510 c[imm & 3] = b;
511 return (__m256i)c;
512}
513#endif
514
/* Conversion */

/* Widens four 32-bit ints to four doubles (VCVTDQ2PD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

/* Converts eight 32-bit ints to eight floats (VCVTDQ2PS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

/* Narrows four doubles to four floats (VCVTPD2PS). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

/* Converts eight floats to 32-bit ints, rounding per the current MXCSR
   rounding mode (VCVTPS2DQ). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

/* Widens four floats to four doubles (VCVTPS2PD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

/* Converts four doubles to 32-bit ints with TRUNCATION toward zero
   (VCVTTPD2DQ) — note the extra 't'. */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

/* Converts four doubles to 32-bit ints, rounding per MXCSR (VCVTPD2DQ). */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

/* Converts eight floats to 32-bit ints with truncation (VCVTTPS2DQ). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}
563
/* Vector replicate */

/* Duplicates the odd-indexed float lanes: {1,1,3,3,5,5,7,7} (VMOVSHDUP). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

/* Duplicates the even-indexed float lanes: {0,0,2,2,4,4,6,6} (VMOVSLDUP). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

/* Duplicates the even-indexed double lanes: {0,0,2,2} (VMOVDDUP). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */

/* Interleaves the HIGH double of each 128-bit half of a and b; the "+2"
   terms produce the same pattern in the upper half (VUNPCKHPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

/* Interleaves the LOW double of each 128-bit half of a and b (VUNPCKLPD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

/* Interleaves the high two floats of each 128-bit half of a and b;
   indices 8+ select from b (VUNPCKHPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

/* Interleaves the low two floats of each 128-bit half of a and b
   (VUNPCKLPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}
607
/* Bit Test */
/* VTESTPD/VTESTPS/PTEST wrappers.  Naming convention:
     testz   -> returns the ZF flag: 1 when (a AND b) tests all-zero;
     testc   -> returns the CF flag: 1 when (NOT a AND b) tests all-zero;
     testnzc -> 1 when BOTH ZF and CF would be 0.
   For the pd/ps forms only the SIGN bits of each lane participate; the
   si256 form tests all 256 bits. */

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

/* 256-bit forms of the same sign-bit tests. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

/* Full 256-bit integer tests (VPTEST): all bits participate, not just
   sign bits. */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}
698
/* Vector extract sign mask */

/* Packs the sign bit of each double lane into the low 4 bits of the
   result (VMOVMSKPD). */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

/* Packs the sign bit of each float lane into the low 8 bits of the
   result (VMOVMSKPS). */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */

/* Zeroes all YMM registers (VZEROALL). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

/* Zeroes the upper 128 bits of all YMM registers (VZEROUPPER); used to
   avoid AVX/SSE transition penalties. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
724
/* Vector load with broadcast */

/* Loads one float from memory and replicates it to all 4 lanes
   (VBROADCASTSS). */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

/* Loads one double and replicates it to all 4 lanes (VBROADCASTSD). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

/* Loads one float and replicates it to all 8 lanes (VBROADCASTSS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

/* Loads 128 bits from memory and replicates them into both halves of
   the 256-bit result (VBROADCASTF128). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}
755
/* SIMD load ops */

/* Aligned load of four doubles; p must be 32-byte aligned or the
   resulting VMOVAPD may fault. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

/* Aligned load of eight floats; p must be 32-byte aligned. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

/* Unaligned load of four doubles (VMOVUPD); no alignment requirement. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

/* Unaligned load of eight floats (VMOVUPS). */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

/* Aligned 256-bit integer load; p must be 32-byte aligned. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

/* Unaligned 256-bit integer load (VMOVDQU). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

/* Unaligned load optimized for data that crosses cache-line boundaries
   (VLDDQU). */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */

/* Aligned store of four doubles; p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

/* Aligned store of eight floats; p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

/* Unaligned store of four doubles (VMOVUPD). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

/* Unaligned store of eight floats (VMOVUPS). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

/* Aligned 256-bit integer store; p must be 32-byte aligned. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

/* Unaligned 256-bit integer store (VMOVDQU). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}
835
836/* Conditional load ops */
837static __inline __m128d __attribute__((__always_inline__, __nodebug__))
838_mm_maskload_pd(double const *p, __m128d m)
839{
840 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
841}
842
843static __inline __m256d __attribute__((__always_inline__, __nodebug__))
844_mm256_maskload_pd(double const *p, __m256d m)
845{
846 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
847}
848
849static __inline __m128 __attribute__((__always_inline__, __nodebug__))
850_mm_maskload_ps(float const *p, __m128 m)
851{
852 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
853}
854
855static __inline __m256 __attribute__((__always_inline__, __nodebug__))
856_mm256_maskload_ps(float const *p, __m256 m)
857{
858 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
859}
860
861/* Conditional store ops */
/* Conditional 256-bit store (VMASKMOVPS): each of the 8 floats of a is
   written to p only when the sign bit of the corresponding element of m
   is set; other memory locations are left unmodified. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}
867
/* Conditional store (VMASKMOVPD): each of the 2 doubles of a is written
   to p only when the sign bit of the corresponding element of m is set;
   other memory locations are left unmodified. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}
873
/* Conditional 256-bit store (VMASKMOVPD): each of the 4 doubles of a is
   written to p only when the sign bit of the corresponding element of m
   is set; other memory locations are left unmodified. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}
879
/* Conditional store (VMASKMOVPS): each of the 4 floats of a is written
   to p only when the sign bit of the corresponding element of m is set;
   other memory locations are left unmodified. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}
885
886/* Cacheability support ops */
887static __inline void __attribute__((__always_inline__, __nodebug__))
888_mm256_stream_si256(__m256i *a, __m256i b)
889{
890 __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
891}
892
893static __inline void __attribute__((__always_inline__, __nodebug__))
894_mm256_stream_pd(double *a, __m256d b)
895{
896 __builtin_ia32_movntpd256(a, (__v4df)b);
897}
898
/* Non-temporal store (VMOVNTPS): write the 8 floats of a to *p, hinting
   the hardware to bypass the cache hierarchy.  p is assumed to be
   32-byte aligned, as the underlying instruction requires. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}
904
905/* Create vectors */
906static __inline __m256d __attribute__((__always_inline__, __nodebug__))
907_mm256_set_pd(double a, double b, double c, double d)
908{
909 return (__m256d){ d, c, b, a };
910}
911
912static __inline __m256 __attribute__((__always_inline__, __nodebug__))
913_mm256_set_ps(float a, float b, float c, float d,
914 float e, float f, float g, float h)
915{
916 return (__m256){ h, g, f, e, d, c, b, a };
917}
918
919static __inline __m256i __attribute__((__always_inline__, __nodebug__))
920_mm256_set_epi32(int i0, int i1, int i2, int i3,
921 int i4, int i5, int i6, int i7)
922{
923 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
924}
925
926static __inline __m256i __attribute__((__always_inline__, __nodebug__))
927_mm256_set_epi16(short w15, short w14, short w13, short w12,
928 short w11, short w10, short w09, short w08,
929 short w07, short w06, short w05, short w04,
930 short w03, short w02, short w01, short w00)
931{
932 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
933 w08, w09, w10, w11, w12, w13, w14, w15 };
934}
935
936static __inline __m256i __attribute__((__always_inline__, __nodebug__))
937_mm256_set_epi8(char b31, char b30, char b29, char b28,
938 char b27, char b26, char b25, char b24,
939 char b23, char b22, char b21, char b20,
940 char b19, char b18, char b17, char b16,
941 char b15, char b14, char b13, char b12,
942 char b11, char b10, char b09, char b08,
943 char b07, char b06, char b05, char b04,
944 char b03, char b02, char b01, char b00)
945{
946 return (__m256i)(__v32qi){
947 b00, b01, b02, b03, b04, b05, b06, b07,
948 b08, b09, b10, b11, b12, b13, b14, b15,
949 b16, b17, b18, b19, b20, b21, b22, b23,
950 b24, b25, b26, b27, b28, b29, b30, b31
951 };
952}
953
954static __inline __m256i __attribute__((__always_inline__, __nodebug__))
955_mm256_set_epi64x(long long a, long long b, long long c, long long d)
956{
957 return (__m256i)(__v4di){ d, c, b, a };
958}
959
960/* Create vectors with elements in reverse order */
961static __inline __m256d __attribute__((__always_inline__, __nodebug__))
962_mm256_setr_pd(double a, double b, double c, double d)
963{
964 return (__m256d){ a, b, c, d };
965}
966
967static __inline __m256 __attribute__((__always_inline__, __nodebug__))
968_mm256_setr_ps(float a, float b, float c, float d,
969 float e, float f, float g, float h)
970{
971 return (__m256){ a, b, c, d, e, f, g, h };
972}
973
974static __inline __m256i __attribute__((__always_inline__, __nodebug__))
975_mm256_setr_epi32(int i0, int i1, int i2, int i3,
976 int i4, int i5, int i6, int i7)
977{
978 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
979}
980
981static __inline __m256i __attribute__((__always_inline__, __nodebug__))
982_mm256_setr_epi16(short w15, short w14, short w13, short w12,
983 short w11, short w10, short w09, short w08,
984 short w07, short w06, short w05, short w04,
985 short w03, short w02, short w01, short w00)
986{
987 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
988 w07, w06, w05, w04, w03, w02, w01, w00 };
989}
990
991static __inline __m256i __attribute__((__always_inline__, __nodebug__))
992_mm256_setr_epi8(char b31, char b30, char b29, char b28,
993 char b27, char b26, char b25, char b24,
994 char b23, char b22, char b21, char b20,
995 char b19, char b18, char b17, char b16,
996 char b15, char b14, char b13, char b12,
997 char b11, char b10, char b09, char b08,
998 char b07, char b06, char b05, char b04,
999 char b03, char b02, char b01, char b00)
1000{
1001 return (__m256i)(__v32qi){
1002 b31, b30, b29, b28, b27, b26, b25, b24,
1003 b23, b22, b21, b20, b19, b18, b17, b16,
1004 b15, b14, b13, b12, b11, b10, b09, b08,
1005 b07, b06, b05, b04, b03, b02, b01, b00 };
1006}
1007
1008static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1009_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
1010{
1011 return (__m256i)(__v4di){ a, b, c, d };
1012}
1013
1014/* Create vectors with repeated elements */
1015static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1016_mm256_set1_pd(double w)
1017{
1018 return (__m256d){ w, w, w, w };
1019}
1020
1021static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1022_mm256_set1_ps(float w)
1023{
1024 return (__m256){ w, w, w, w, w, w, w, w };
1025}
1026
1027static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1028_mm256_set1_epi32(int i)
1029{
1030 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
1031}
1032
1033static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1034_mm256_set1_epi16(short w)
1035{
1036 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
1037}
1038
1039static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1040_mm256_set1_epi8(char b)
1041{
1042 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
1043 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1044}
1045
1046static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1047_mm256_set1_epi64x(long long q)
1048{
1049 return (__m256i)(__v4di){ q, q, q, q };
1050}
1051
1052/* Create zeroed vectors */
1053static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1054_mm256_setzero_pd(void)
1055{
1056 return (__m256d){ 0, 0, 0, 0 };
1057}
1058
1059static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1060_mm256_setzero_ps(void)
1061{
1062 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
1063}
1064
1065static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1066_mm256_setzero_si256(void)
1067{
1068 return (__m256i){ 0LL, 0LL, 0LL, 0LL };
1069}
1070
1071/* Cast between vector types */
1072static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1073_mm256_castpd_ps(__m256d in)
1074{
1075 return (__m256)in;
1076}
1077
1078static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1079_mm256_castpd_si256(__m256d in)
1080{
1081 return (__m256i)in;
1082}
1083
1084static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1085_mm256_castps_pd(__m256 in)
1086{
1087 return (__m256d)in;
1088}
1089
1090static __inline __m256i __attribute__((__always_inline__, __nodebug__))
1091_mm256_castps_si256(__m256 in)
1092{
1093 return (__m256i)in;
1094}
1095
1096static __inline __m256 __attribute__((__always_inline__, __nodebug__))
1097_mm256_castsi256_ps(__m256i in)
1098{
1099 return (__m256)in;
1100}
1101
1102static __inline __m256d __attribute__((__always_inline__, __nodebug__))
1103_mm256_castsi256_pd(__m256i in)
1104{
1105 return (__m256d)in;
1106}
1107
/* Return the low 128 bits of a 256-bit double vector: the shuffle keeps
   lanes 0-1 and discards the upper half. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1113
/* Return the low 128 bits of a 256-bit float vector: the shuffle keeps
   lanes 0-3 and discards the upper half. */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}
1119
/* Return the low 128 bits of a 256-bit integer vector: the shuffle
   keeps the two low 64-bit lanes and discards the upper half. */
static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}
1125
/* Widen a 128-bit double vector to 256 bits.  Shuffle indices 2-2 pick
   from the zero vector, so this implementation zeroes the upper 128
   bits; NOTE(review): Intel defines the upper bits of this cast as
   undefined, so callers must not rely on the zeroing. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}
1132
/* Widen a 128-bit float vector to 256 bits.  Shuffle indices 4-4-4-4
   pick from the zero vector, so this implementation zeroes the upper
   128 bits; NOTE(review): Intel defines the upper bits of this cast as
   undefined, so callers must not rely on the zeroing. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}
1139
/* Widen a 128-bit integer vector to 256 bits.  Shuffle indices 2-2 pick
   from the zero vector, so this implementation zeroes the upper 128
   bits; NOTE(review): Intel defines the upper bits of this cast as
   undefined, so callers must not rely on the zeroing. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}