blob: b8cd554717a03f90146ddeac965ffe45d2a47a3a [file] [log] [blame]
/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX512VLFP16INTRIN_H
15#define __AVX512VLFP16INTRIN_H
16
/* Default attributes for the functions in this file that are tagged with
 * __DEFAULT_FN_ATTRS256: always inlined, no debug info, compiled for the
 * "avx512fp16, avx512vl" target, minimum vector width 256. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx512fp16, avx512vl"), \
                 __min_vector_width__(256)))
/* Default attributes for the functions in this file that are tagged with
 * __DEFAULT_FN_ATTRS128: always inlined, no debug info, compiled for the
 * "avx512fp16, avx512vl" target, minimum vector width 128. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx512fp16, avx512vl"), \
                 __min_vector_width__(128)))
26
27static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
28 return __a[0];
29}
30
31static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
32 return __a[0];
33}
34
35static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
36 return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
37}
38
39static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
40 return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
41}
42
43static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
44 return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
45 __h, __h, __h, __h, __h, __h, __h, __h};
46}
47
48static __inline __m128h __DEFAULT_FN_ATTRS128
49_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
50 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
51 return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
52}
53
54static __inline __m256h __DEFAULT_FN_ATTRS256
55_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
56 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
57 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
58 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
59 return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
60 __h10, __h9, __h8, __h7, __h6, __h5,
61 __h4, __h3, __h2, __h1};
62}
63
/* Same as _mm_set_ph but with the eight arguments supplied in the opposite
 * order. */
#define _mm_setr_ph(e1, e2, e3, e4, e5, e6, e7, e8) \
  _mm_set_ph((e8), (e7), (e6), (e5), \
             (e4), (e3), (e2), (e1))
66
/* Same as _mm256_set_ph but with the sixteen arguments supplied in the
 * opposite order. */
#define _mm256_setr_ph(e1, e2, e3, e4, e5, e6, e7, e8, \
                       e9, e10, e11, e12, e13, e14, e15, e16) \
  _mm256_set_ph((e16), (e15), (e14), (e13), \
                (e12), (e11), (e10), (e9), \
                (e8), (e7), (e6), (e5), \
                (e4), (e3), (e2), (e1))
71
72static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
73 __m256h __B) {
74 return (__m256h)((__v16hf)__A + (__v16hf)__B);
75}
76
77static __inline__ __m256h __DEFAULT_FN_ATTRS256
78_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
79 return (__m256h)__builtin_ia32_selectph_256(
80 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
81}
82
83static __inline__ __m256h __DEFAULT_FN_ATTRS256
84_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
85 return (__m256h)__builtin_ia32_selectph_256(
86 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
87}
88
89static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
90 __m128h __B) {
91 return (__m128h)((__v8hf)__A + (__v8hf)__B);
92}
93
94static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
95 __mmask8 __U,
96 __m128h __A,
97 __m128h __B) {
98 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
99 (__v8hf)__W);
100}
101
102static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
103 __m128h __A,
104 __m128h __B) {
105 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
106 (__v8hf)_mm_setzero_ph());
107}
108
109static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
110 __m256h __B) {
111 return (__m256h)((__v16hf)__A - (__v16hf)__B);
112}
113
114static __inline__ __m256h __DEFAULT_FN_ATTRS256
115_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
116 return (__m256h)__builtin_ia32_selectph_256(
117 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
118}
119
120static __inline__ __m256h __DEFAULT_FN_ATTRS256
121_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
122 return (__m256h)__builtin_ia32_selectph_256(
123 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
124}
125
126static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
127 __m128h __B) {
128 return (__m128h)((__v8hf)__A - (__v8hf)__B);
129}
130
131static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
132 __mmask8 __U,
133 __m128h __A,
134 __m128h __B) {
135 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
136 (__v8hf)__W);
137}
138
139static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
140 __m128h __A,
141 __m128h __B) {
142 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
143 (__v8hf)_mm_setzero_ph());
144}
145
146static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
147 __m256h __B) {
148 return (__m256h)((__v16hf)__A * (__v16hf)__B);
149}
150
151static __inline__ __m256h __DEFAULT_FN_ATTRS256
152_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
153 return (__m256h)__builtin_ia32_selectph_256(
154 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
155}
156
157static __inline__ __m256h __DEFAULT_FN_ATTRS256
158_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
159 return (__m256h)__builtin_ia32_selectph_256(
160 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
161}
162
163static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
164 __m128h __B) {
165 return (__m128h)((__v8hf)__A * (__v8hf)__B);
166}
167
168static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
169 __mmask8 __U,
170 __m128h __A,
171 __m128h __B) {
172 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
173 (__v8hf)__W);
174}
175
176static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
177 __m128h __A,
178 __m128h __B) {
179 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
180 (__v8hf)_mm_setzero_ph());
181}
182
183static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
184 __m256h __B) {
185 return (__m256h)((__v16hf)__A / (__v16hf)__B);
186}
187
188static __inline__ __m256h __DEFAULT_FN_ATTRS256
189_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
190 return (__m256h)__builtin_ia32_selectph_256(
191 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
192}
193
194static __inline__ __m256h __DEFAULT_FN_ATTRS256
195_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
196 return (__m256h)__builtin_ia32_selectph_256(
197 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
198}
199
200static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
201 __m128h __B) {
202 return (__m128h)((__v8hf)__A / (__v8hf)__B);
203}
204
205static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
206 __mmask8 __U,
207 __m128h __A,
208 __m128h __B) {
209 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
210 (__v8hf)__W);
211}
212
213static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
214 __m128h __A,
215 __m128h __B) {
216 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
217 (__v8hf)_mm_setzero_ph());
218}
219
220static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
221 __m256h __B) {
222 return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
223}
224
225static __inline__ __m256h __DEFAULT_FN_ATTRS256
226_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
227 return (__m256h)__builtin_ia32_selectph_256(
228 (__mmask16)__U,
229 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
230 (__v16hf)__W);
231}
232
233static __inline__ __m256h __DEFAULT_FN_ATTRS256
234_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
235 return (__m256h)__builtin_ia32_selectph_256(
236 (__mmask16)__U,
237 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
238 (__v16hf)_mm256_setzero_ph());
239}
240
241static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
242 __m128h __B) {
243 return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
244}
245
246static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
247 __mmask8 __U,
248 __m128h __A,
249 __m128h __B) {
250 return (__m128h)__builtin_ia32_selectph_128(
251 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
252 (__v8hf)__W);
253}
254
255static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
256 __m128h __A,
257 __m128h __B) {
258 return (__m128h)__builtin_ia32_selectph_128(
259 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
260 (__v8hf)_mm_setzero_ph());
261}
262
263static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
264 __m256h __B) {
265 return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
266}
267
268static __inline__ __m256h __DEFAULT_FN_ATTRS256
269_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
270 return (__m256h)__builtin_ia32_selectph_256(
271 (__mmask16)__U,
272 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
273 (__v16hf)__W);
274}
275
276static __inline__ __m256h __DEFAULT_FN_ATTRS256
277_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
278 return (__m256h)__builtin_ia32_selectph_256(
279 (__mmask16)__U,
280 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
281 (__v16hf)_mm256_setzero_ph());
282}
283
284static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
285 __m128h __B) {
286 return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
287}
288
289static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
290 __mmask8 __U,
291 __m128h __A,
292 __m128h __B) {
293 return (__m128h)__builtin_ia32_selectph_128(
294 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
295 (__v8hf)__W);
296}
297
298static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
299 __m128h __A,
300 __m128h __B) {
301 return (__m128h)__builtin_ia32_selectph_128(
302 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
303 (__v8hf)_mm_setzero_ph());
304}
305
306static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
307 return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
308}
309
310static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
311 return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
312}
313
314static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
315 return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
316}
317
318static __inline__ __m256h __DEFAULT_FN_ATTRS256
319_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
320 return (__m256h)__builtin_ia32_selectps_256(
321 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
322}
323
324static __inline__ __m256h __DEFAULT_FN_ATTRS256
325_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
326 return (__m256h)__builtin_ia32_selectps_256(
327 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
328}
329
330static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
331 return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
332}
333
334static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
335 __mmask8 __U,
336 __m128h __A) {
337 return (__m128h)__builtin_ia32_selectps_128(
338 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
339}
340
341static __inline__ __m128h __DEFAULT_FN_ATTRS128
342_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
343 return (__m128h)__builtin_ia32_selectps_128(
344 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
345}
346
/* Calls __builtin_ia32_cmpph256_mask on A and B with predicate P and an
 * all-ones (__mmask16)-1 mask operand; yields a __mmask16. */
#define _mm256_cmp_ph_mask(A, B, P) \
  ((__mmask16)__builtin_ia32_cmpph256_mask((__v16hf)(__m256h)(A), \
                                           (__v16hf)(__m256h)(B), \
                                           (int)(P), (__mmask16)-1))
350
/* Calls __builtin_ia32_cmpph256_mask on A and B with predicate P and the
 * caller-supplied mask operand M; yields a __mmask16. */
#define _mm256_mask_cmp_ph_mask(M, A, B, P) \
  ((__mmask16)__builtin_ia32_cmpph256_mask((__v16hf)(__m256h)(A), \
                                           (__v16hf)(__m256h)(B), \
                                           (int)(P), (__mmask16)(M)))
354
/* Calls __builtin_ia32_cmpph128_mask on A and B with predicate P and an
 * all-ones (__mmask8)-1 mask operand; yields a __mmask8. */
#define _mm_cmp_ph_mask(A, B, P) \
  ((__mmask8)__builtin_ia32_cmpph128_mask((__v8hf)(__m128h)(A), \
                                          (__v8hf)(__m128h)(B), \
                                          (int)(P), (__mmask8)-1))
358
/* Calls __builtin_ia32_cmpph128_mask on A and B with predicate P and the
 * caller-supplied mask operand M; yields a __mmask8. */
#define _mm_mask_cmp_ph_mask(M, A, B, P) \
  ((__mmask8)__builtin_ia32_cmpph128_mask((__v8hf)(__m128h)(A), \
                                          (__v8hf)(__m128h)(B), \
                                          (int)(P), (__mmask8)(M)))
362
363static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
364 return (__m256h)__builtin_ia32_rcpph256_mask(
365 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
366}
367
368static __inline__ __m256h __DEFAULT_FN_ATTRS256
369_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
370 return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
371 (__mmask16)__U);
372}
373
374static __inline__ __m256h __DEFAULT_FN_ATTRS256
375_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
376 return (__m256h)__builtin_ia32_rcpph256_mask(
377 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
378}
379
380static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
381 return (__m128h)__builtin_ia32_rcpph128_mask(
382 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
383}
384
385static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
386 __mmask8 __U,
387 __m128h __A) {
388 return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
389 (__mmask8)__U);
390}
391
392static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
393 __m128h __A) {
394 return (__m128h)__builtin_ia32_rcpph128_mask(
395 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
396}
397
398static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
399 return (__m256h)__builtin_ia32_rsqrtph256_mask(
400 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
401}
402
403static __inline__ __m256h __DEFAULT_FN_ATTRS256
404_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
405 return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
406 (__mmask16)__U);
407}
408
409static __inline__ __m256h __DEFAULT_FN_ATTRS256
410_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
411 return (__m256h)__builtin_ia32_rsqrtph256_mask(
412 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
413}
414
415static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
416 return (__m128h)__builtin_ia32_rsqrtph128_mask(
417 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
418}
419
420static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
421 __mmask8 __U,
422 __m128h __A) {
423 return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
424 (__mmask8)__U);
425}
426
427static __inline__ __m128h __DEFAULT_FN_ATTRS128
428_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
429 return (__m128h)__builtin_ia32_rsqrtph128_mask(
430 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
431}
432
433static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
434 return (__m128h)__builtin_ia32_getexpph128_mask(
435 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
436}
437
438static __inline__ __m128h __DEFAULT_FN_ATTRS128
439_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
440 return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
441 (__mmask8)__U);
442}
443
444static __inline__ __m128h __DEFAULT_FN_ATTRS128
445_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
446 return (__m128h)__builtin_ia32_getexpph128_mask(
447 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
448}
449
450static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
451 return (__m256h)__builtin_ia32_getexpph256_mask(
452 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
453}
454
455static __inline__ __m256h __DEFAULT_FN_ATTRS256
456_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
457 return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
458 (__mmask16)__U);
459}
460
461static __inline__ __m256h __DEFAULT_FN_ATTRS256
462_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
463 return (__m256h)__builtin_ia32_getexpph256_mask(
464 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
465}
466
/* Calls __builtin_ia32_getmantph128_mask on a; the immediate is built as
 * (c << 2) | b. Uses a zero pass-through vector and an all-ones mask. */
#define _mm_getmant_ph(a, b, c) \
  ((__m128h)__builtin_ia32_getmantph128_mask((__v8hf)(__m128h)(a), \
                                             (int)(((c) << 2) | (b)), \
                                             (__v8hf)_mm_setzero_ph(), \
                                             (__mmask8)-1))
471
/* Calls __builtin_ia32_getmantph128_mask on a; the immediate is built as
 * (c << 2) | b. Uses w as the pass-through vector and u as the mask. */
#define _mm_mask_getmant_ph(w, u, a, b, c) \
  ((__m128h)__builtin_ia32_getmantph128_mask((__v8hf)(__m128h)(a), \
                                             (int)(((c) << 2) | (b)), \
                                             (__v8hf)(__m128h)(w), \
                                             (__mmask8)(u)))
476
/* Calls __builtin_ia32_getmantph128_mask on a; the immediate is built as
 * (c << 2) | b. Uses a zero pass-through vector and u as the mask. */
#define _mm_maskz_getmant_ph(u, a, b, c) \
  ((__m128h)__builtin_ia32_getmantph128_mask((__v8hf)(__m128h)(a), \
                                             (int)(((c) << 2) | (b)), \
                                             (__v8hf)_mm_setzero_ph(), \
                                             (__mmask8)(u)))
481
/* Calls __builtin_ia32_getmantph256_mask on a; the immediate is built as
 * (c << 2) | b. Uses a zero pass-through vector and an all-ones mask. */
#define _mm256_getmant_ph(a, b, c) \
  ((__m256h)__builtin_ia32_getmantph256_mask((__v16hf)(__m256h)(a), \
                                             (int)(((c) << 2) | (b)), \
                                             (__v16hf)_mm256_setzero_ph(), \
                                             (__mmask16)-1))
486
/* Calls __builtin_ia32_getmantph256_mask on a; the immediate is built as
 * (c << 2) | b. Uses w as the pass-through vector and u as the mask. */
#define _mm256_mask_getmant_ph(w, u, a, b, c) \
  ((__m256h)__builtin_ia32_getmantph256_mask((__v16hf)(__m256h)(a), \
                                             (int)(((c) << 2) | (b)), \
                                             (__v16hf)(__m256h)(w), \
                                             (__mmask16)(u)))
491
/* Calls __builtin_ia32_getmantph256_mask on a; the immediate is built as
 * (c << 2) | b. Uses a zero pass-through vector and u as the mask. */
#define _mm256_maskz_getmant_ph(u, a, b, c) \
  ((__m256h)__builtin_ia32_getmantph256_mask((__v16hf)(__m256h)(a), \
                                             (int)(((c) << 2) | (b)), \
                                             (__v16hf)_mm256_setzero_ph(), \
                                             (__mmask16)(u)))
496
497static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
498 __m128h __B) {
499 return (__m128h)__builtin_ia32_scalefph128_mask(
500 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
501}
502
503static __inline__ __m128h __DEFAULT_FN_ATTRS128
504_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
505 return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
506 (__v8hf)__W, (__mmask8)__U);
507}
508
509static __inline__ __m128h __DEFAULT_FN_ATTRS128
510_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
511 return (__m128h)__builtin_ia32_scalefph128_mask(
512 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
513}
514
515static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
516 __m256h __B) {
517 return (__m256h)__builtin_ia32_scalefph256_mask(
518 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
519}
520
521static __inline__ __m256h __DEFAULT_FN_ATTRS256
522_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
523 return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
524 (__v16hf)__W, (__mmask16)__U);
525}
526
527static __inline__ __m256h __DEFAULT_FN_ATTRS256
528_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
529 return (__m256h)__builtin_ia32_scalefph256_mask(
530 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
531}
532
/* Calls __builtin_ia32_rndscaleph_128_mask on a with immediate i, a zero
 * pass-through vector and an all-ones mask. */
#define _mm_roundscale_ph(a, i) \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask((__v8hf)(__m128h)(a), \
                                               (int)(i), \
                                               (__v8hf)_mm_setzero_ph(), \
                                               (__mmask8)-1))
537
/* Calls __builtin_ia32_rndscaleph_128_mask on a with immediate i, w as the
 * pass-through vector and u as the mask. */
#define _mm_mask_roundscale_ph(w, u, a, i) \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask((__v8hf)(__m128h)(a), \
                                               (int)(i), \
                                               (__v8hf)(__m128h)(w), \
                                               (__mmask8)(u)))
541
/* Calls __builtin_ia32_rndscaleph_128_mask on a with immediate i, a zero
 * pass-through vector and u as the mask. */
#define _mm_maskz_roundscale_ph(u, a, i) \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask((__v8hf)(__m128h)(a), \
                                               (int)(i), \
                                               (__v8hf)_mm_setzero_ph(), \
                                               (__mmask8)(u)))
546
/* Calls __builtin_ia32_rndscaleph_256_mask on a with immediate i, a zero
 * pass-through vector and an all-ones mask. */
#define _mm256_roundscale_ph(a, i) \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask((__v16hf)(__m256h)(a), \
                                               (int)(i), \
                                               (__v16hf)_mm256_setzero_ph(), \
                                               (__mmask16)-1))
551
/* Calls __builtin_ia32_rndscaleph_256_mask on a with immediate i, w as the
 * pass-through vector and u as the mask. */
#define _mm256_mask_roundscale_ph(w, u, a, i) \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask((__v16hf)(__m256h)(a), \
                                               (int)(i), \
                                               (__v16hf)(__m256h)(w), \
                                               (__mmask16)(u)))
556
/* Calls __builtin_ia32_rndscaleph_256_mask on a with immediate i, a zero
 * pass-through vector and u as the mask. */
#define _mm256_maskz_roundscale_ph(u, a, i) \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask((__v16hf)(__m256h)(a), \
                                               (int)(i), \
                                               (__v16hf)_mm256_setzero_ph(), \
                                               (__mmask16)(u)))
561
/* Calls __builtin_ia32_reduceph128_mask on a with immediate i, a zero
 * pass-through vector and an all-ones mask. */
#define _mm_reduce_ph(a, i) \
  ((__m128h)__builtin_ia32_reduceph128_mask( \
      (__v8hf)(__m128h)(a), (int)(i), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1))
566
/* Calls __builtin_ia32_reduceph128_mask on a with immediate i, w as the
 * pass-through vector and u as the mask. */
#define _mm_mask_reduce_ph(w, u, a, i) \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(a), \
                                            (int)(i), \
                                            (__v8hf)(__m128h)(w), \
                                            (__mmask8)(u)))
570
/* Calls __builtin_ia32_reduceph128_mask on a with immediate i, a zero
 * pass-through vector and u as the mask. */
#define _mm_maskz_reduce_ph(u, a, i) \
  ((__m128h)__builtin_ia32_reduceph128_mask( \
      (__v8hf)(__m128h)(a), (int)(i), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(u)))
575
/* Calls __builtin_ia32_reduceph256_mask on a with immediate i, a zero
 * pass-through vector and an all-ones mask. */
#define _mm256_reduce_ph(a, i) \
  ((__m256h)__builtin_ia32_reduceph256_mask( \
      (__v16hf)(__m256h)(a), (int)(i), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)-1))
580
/* Calls __builtin_ia32_reduceph256_mask on a with immediate i, w as the
 * pass-through vector and u as the mask. */
#define _mm256_mask_reduce_ph(w, u, a, i) \
  ((__m256h)__builtin_ia32_reduceph256_mask( \
      (__v16hf)(__m256h)(a), (int)(i), (__v16hf)(__m256h)(w), \
      (__mmask16)(u)))
585
/* Calls __builtin_ia32_reduceph256_mask on a with immediate i, a zero
 * pass-through vector and u as the mask. */
#define _mm256_maskz_reduce_ph(u, a, i) \
  ((__m256h)__builtin_ia32_reduceph256_mask( \
      (__v16hf)(__m256h)(a), (int)(i), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(u)))
590
591static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
592 return __builtin_ia32_sqrtph((__v8hf)__a);
593}
594
595static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
596 __mmask8 __U,
597 __m128h __A) {
598 return (__m128h)__builtin_ia32_selectph_128(
599 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
600}
601
602static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
603 __m128h __A) {
604 return (__m128h)__builtin_ia32_selectph_128(
605 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
606}
607
608static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
609 return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
610}
611
612static __inline__ __m256h __DEFAULT_FN_ATTRS256
613_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
614 return (__m256h)__builtin_ia32_selectph_256(
615 (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
616}
617
618static __inline__ __m256h __DEFAULT_FN_ATTRS256
619_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
620 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
621 (__v16hf)_mm256_sqrt_ph(__A),
622 (__v16hf)_mm256_setzero_ph());
623}
624
/* Calls __builtin_ia32_fpclassph128_mask on a with immediate i and mask
 * operand u; yields a __mmask8. */
#define _mm_mask_fpclass_ph_mask(u, a, i) \
  ((__mmask8)__builtin_ia32_fpclassph128_mask( \
      (__v8hf)(__m128h)(a), (int)(i), (__mmask8)(u)))
628
/* Calls __builtin_ia32_fpclassph128_mask on a with immediate i and an
 * all-ones mask operand; yields a __mmask8. */
#define _mm_fpclass_ph_mask(a, i) \
  ((__mmask8)__builtin_ia32_fpclassph128_mask( \
      (__v8hf)(__m128h)(a), (int)(i), (__mmask8)-1))
632
/* Calls __builtin_ia32_fpclassph256_mask on a with immediate i and mask
 * operand u; yields a __mmask16. */
#define _mm256_mask_fpclass_ph_mask(u, a, i) \
  ((__mmask16)__builtin_ia32_fpclassph256_mask( \
      (__v16hf)(__m256h)(a), (int)(i), (__mmask16)(u)))
636
/* Calls __builtin_ia32_fpclassph256_mask on a with immediate i and an
 * all-ones mask operand; yields a __mmask16. */
#define _mm256_fpclass_ph_mask(a, i) \
  ((__mmask16)__builtin_ia32_fpclassph256_mask( \
      (__v16hf)(__m256h)(a), (int)(i), (__mmask16)-1))
640
641static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
642 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
643 (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
644}
645
646static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
647 __mmask8 __U,
648 __m128d __A) {
649 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
650 (__mmask8)__U);
651}
652
653static __inline__ __m128h __DEFAULT_FN_ATTRS128
654_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
655 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
656 (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
657}
658
659static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
660 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
661 (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
662}
663
664static __inline__ __m128h __DEFAULT_FN_ATTRS256
665_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
666 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
667 (__mmask8)__U);
668}
669
670static __inline__ __m128h __DEFAULT_FN_ATTRS256
671_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
672 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
673 (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
674}
675
676static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
677 return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
678 (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
679}
680
681static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
682 __mmask8 __U,
683 __m128h __A) {
684 return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
685 (__mmask8)__U);
686}
687
688static __inline__ __m128d __DEFAULT_FN_ATTRS128
689_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
690 return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
691 (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
692}
693
694static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
695 return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
696 (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
697}
698
699static __inline__ __m256d __DEFAULT_FN_ATTRS256
700_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
701 return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
702 (__mmask8)__U);
703}
704
705static __inline__ __m256d __DEFAULT_FN_ATTRS256
706_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
707 return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
708 (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
709}
710
711static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
712 return (__m128i)__builtin_ia32_vcvtph2w128_mask(
713 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
714}
715
716static __inline__ __m128i __DEFAULT_FN_ATTRS128
717_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
718 return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
719 (__mmask8)__U);
720}
721
722static __inline__ __m128i __DEFAULT_FN_ATTRS128
723_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
724 return (__m128i)__builtin_ia32_vcvtph2w128_mask(
725 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
726}
727
728static __inline__ __m256i __DEFAULT_FN_ATTRS256
729_mm256_cvtph_epi16(__m256h __A) {
730 return (__m256i)__builtin_ia32_vcvtph2w256_mask(
731 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
732}
733
734static __inline__ __m256i __DEFAULT_FN_ATTRS256
735_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
736 return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
737 (__mmask16)__U);
738}
739
740static __inline__ __m256i __DEFAULT_FN_ATTRS256
741_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
742 return (__m256i)__builtin_ia32_vcvtph2w256_mask(
743 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
744}
745
746static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
747 return (__m128i)__builtin_ia32_vcvttph2w128_mask(
748 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
749}
750
751static __inline__ __m128i __DEFAULT_FN_ATTRS128
752_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
753 return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
754 (__mmask8)__U);
755}
756
757static __inline__ __m128i __DEFAULT_FN_ATTRS128
758_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
759 return (__m128i)__builtin_ia32_vcvttph2w128_mask(
760 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
761}
762
763static __inline__ __m256i __DEFAULT_FN_ATTRS256
764_mm256_cvttph_epi16(__m256h __A) {
765 return (__m256i)__builtin_ia32_vcvttph2w256_mask(
766 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
767}
768
769static __inline__ __m256i __DEFAULT_FN_ATTRS256
770_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
771 return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
772 (__mmask16)__U);
773}
774
775static __inline__ __m256i __DEFAULT_FN_ATTRS256
776_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
777 return (__m256i)__builtin_ia32_vcvttph2w256_mask(
778 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
779}
780
781static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
782 return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
783}
784
785static __inline__ __m128h __DEFAULT_FN_ATTRS128
786_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
787 return (__m128h)__builtin_ia32_selectph_128(
788 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
789}
790
791static __inline__ __m128h __DEFAULT_FN_ATTRS128
792_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
793 return (__m128h)__builtin_ia32_selectph_128(
794 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
795}
796
797static __inline__ __m256h __DEFAULT_FN_ATTRS256
798_mm256_cvtepi16_ph(__m256i __A) {
799 return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
800}
801
802static __inline__ __m256h __DEFAULT_FN_ATTRS256
803_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
804 return (__m256h)__builtin_ia32_selectph_256(
805 (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
806}
807
808static __inline__ __m256h __DEFAULT_FN_ATTRS256
809_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
810 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
811 (__v16hf)_mm256_cvtepi16_ph(__A),
812 (__v16hf)_mm256_setzero_ph());
813}
814
815static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
816 return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
817 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
818}
819
820static __inline__ __m128i __DEFAULT_FN_ATTRS128
821_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
822 return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
823 (__mmask8)__U);
824}
825
826static __inline__ __m128i __DEFAULT_FN_ATTRS128
827_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
828 return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
829 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
830}
831
832static __inline__ __m256i __DEFAULT_FN_ATTRS256
833_mm256_cvtph_epu16(__m256h __A) {
834 return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
835 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
836}
837
838static __inline__ __m256i __DEFAULT_FN_ATTRS256
839_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
840 return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
841 (__mmask16)__U);
842}
843
844static __inline__ __m256i __DEFAULT_FN_ATTRS256
845_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
846 return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
847 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
848}
849
850static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
851 return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
852 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
853}
854
855static __inline__ __m128i __DEFAULT_FN_ATTRS128
856_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
857 return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
858 (__mmask8)__U);
859}
860
861static __inline__ __m128i __DEFAULT_FN_ATTRS128
862_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
863 return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
864 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
865}
866
867static __inline__ __m256i __DEFAULT_FN_ATTRS256
868_mm256_cvttph_epu16(__m256h __A) {
869 return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
870 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
871}
872
873static __inline__ __m256i __DEFAULT_FN_ATTRS256
874_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
875 return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
876 (__mmask16)__U);
877}
878
879static __inline__ __m256i __DEFAULT_FN_ATTRS256
880_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
881 return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
882 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
883}
884
885static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
886 return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
887}
888
889static __inline__ __m128h __DEFAULT_FN_ATTRS128
890_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
891 return (__m128h)__builtin_ia32_selectph_128(
892 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
893}
894
895static __inline__ __m128h __DEFAULT_FN_ATTRS128
896_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
897 return (__m128h)__builtin_ia32_selectph_128(
898 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
899}
900
901static __inline__ __m256h __DEFAULT_FN_ATTRS256
902_mm256_cvtepu16_ph(__m256i __A) {
903 return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
904}
905
906static __inline__ __m256h __DEFAULT_FN_ATTRS256
907_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
908 return (__m256h)__builtin_ia32_selectph_256(
909 (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
910}
911
912static __inline__ __m256h __DEFAULT_FN_ATTRS256
913_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
914 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
915 (__v16hf)_mm256_cvtepu16_ph(__A),
916 (__v16hf)_mm256_setzero_ph());
917}
918
919static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
920 return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
921 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
922}
923
924static __inline__ __m128i __DEFAULT_FN_ATTRS128
925_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
926 return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
927 (__mmask8)__U);
928}
929
930static __inline__ __m128i __DEFAULT_FN_ATTRS128
931_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
932 return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
933 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
934}
935
936static __inline__ __m256i __DEFAULT_FN_ATTRS256
937_mm256_cvtph_epi32(__m128h __A) {
938 return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
939 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
940}
941
942static __inline__ __m256i __DEFAULT_FN_ATTRS256
943_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
944 return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
945 (__mmask8)__U);
946}
947
948static __inline__ __m256i __DEFAULT_FN_ATTRS256
949_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
950 return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
951 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
952}
953
954static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
955 return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
956 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
957}
958
959static __inline__ __m128i __DEFAULT_FN_ATTRS128
960_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
961 return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
962 (__mmask8)__U);
963}
964
965static __inline__ __m128i __DEFAULT_FN_ATTRS128
966_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
967 return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
968 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
969}
970
971static __inline__ __m256i __DEFAULT_FN_ATTRS256
972_mm256_cvtph_epu32(__m128h __A) {
973 return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
974 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
975}
976
977static __inline__ __m256i __DEFAULT_FN_ATTRS256
978_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
979 return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
980 (__mmask8)__U);
981}
982
983static __inline__ __m256i __DEFAULT_FN_ATTRS256
984_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
985 return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
986 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
987}
988
989static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
990 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
991 (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
992}
993
994static __inline__ __m128h __DEFAULT_FN_ATTRS128
995_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
996 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
997 (__mmask8)__U);
998}
999
1000static __inline__ __m128h __DEFAULT_FN_ATTRS128
1001_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
1002 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
1003 (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1004}
1005
1006static __inline__ __m128h __DEFAULT_FN_ATTRS256
1007_mm256_cvtepi32_ph(__m256i __A) {
1008 return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
1009}
1010
1011static __inline__ __m128h __DEFAULT_FN_ATTRS256
1012_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1013 return (__m128h)__builtin_ia32_selectph_128(
1014 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
1015}
1016
1017static __inline__ __m128h __DEFAULT_FN_ATTRS256
1018_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
1019 return (__m128h)__builtin_ia32_selectph_128(
1020 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
1021}
1022
1023static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
1024 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
1025 (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1026}
1027
1028static __inline__ __m128h __DEFAULT_FN_ATTRS128
1029_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1030 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
1031 (__mmask8)__U);
1032}
1033
1034static __inline__ __m128h __DEFAULT_FN_ATTRS128
1035_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
1036 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
1037 (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1038}
1039
1040static __inline__ __m128h __DEFAULT_FN_ATTRS256
1041_mm256_cvtepu32_ph(__m256i __A) {
1042 return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
1043}
1044
1045static __inline__ __m128h __DEFAULT_FN_ATTRS256
1046_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1047 return (__m128h)__builtin_ia32_selectph_128(
1048 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
1049}
1050
1051static __inline__ __m128h __DEFAULT_FN_ATTRS256
1052_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
1053 return (__m128h)__builtin_ia32_selectph_128(
1054 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
1055}
1056
1057static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
1058 return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
1059 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
1060}
1061
1062static __inline__ __m128i __DEFAULT_FN_ATTRS128
1063_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
1064 return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
1065 (__mmask8)__U);
1066}
1067
1068static __inline__ __m128i __DEFAULT_FN_ATTRS128
1069_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
1070 return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
1071 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
1072}
1073
1074static __inline__ __m256i __DEFAULT_FN_ATTRS256
1075_mm256_cvttph_epi32(__m128h __A) {
1076 return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
1077 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
1078}
1079
1080static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
1082 return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
1083 (__mmask8)__U);
1084}
1085
1086static __inline__ __m256i __DEFAULT_FN_ATTRS256
1087_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
1088 return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
1089 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
1090}
1091
1092static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
1093 return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
1094 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
1095}
1096
1097static __inline__ __m128i __DEFAULT_FN_ATTRS128
1098_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
1099 return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
1100 (__mmask8)__U);
1101}
1102
1103static __inline__ __m128i __DEFAULT_FN_ATTRS128
1104_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
1105 return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
1106 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
1107}
1108
1109static __inline__ __m256i __DEFAULT_FN_ATTRS256
1110_mm256_cvttph_epu32(__m128h __A) {
1111 return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
1112 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
1113}
1114
1115static __inline__ __m256i __DEFAULT_FN_ATTRS256
1116_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
1117 return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
1118 (__mmask8)__U);
1119}
1120
1121static __inline__ __m256i __DEFAULT_FN_ATTRS256
1122_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
1123 return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
1124 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
1125}
1126
1127static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
1128 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
1129 (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1130}
1131
1132static __inline__ __m128h __DEFAULT_FN_ATTRS128
1133_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1134 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
1135 (__mmask8)__U);
1136}
1137
1138static __inline__ __m128h __DEFAULT_FN_ATTRS128
1139_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
1140 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
1141 (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1142}
1143
1144static __inline__ __m128h __DEFAULT_FN_ATTRS256
1145_mm256_cvtepi64_ph(__m256i __A) {
1146 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
1147 (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1148}
1149
1150static __inline__ __m128h __DEFAULT_FN_ATTRS256
1151_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1152 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
1153 (__mmask8)__U);
1154}
1155
1156static __inline__ __m128h __DEFAULT_FN_ATTRS256
1157_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
1158 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
1159 (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1160}
1161
1162static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
1163 return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
1164 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
1165}
1166
1167static __inline__ __m128i __DEFAULT_FN_ATTRS128
1168_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
1169 return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
1170 (__mmask8)__U);
1171}
1172
1173static __inline__ __m128i __DEFAULT_FN_ATTRS128
1174_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
1175 return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
1176 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
1177}
1178
1179static __inline__ __m256i __DEFAULT_FN_ATTRS256
1180_mm256_cvtph_epi64(__m128h __A) {
1181 return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
1182 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
1183}
1184
1185static __inline__ __m256i __DEFAULT_FN_ATTRS256
1186_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
1187 return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
1188 (__mmask8)__U);
1189}
1190
1191static __inline__ __m256i __DEFAULT_FN_ATTRS256
1192_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
1193 return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
1194 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
1195}
1196
1197static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
1198 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
1199 (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1200}
1201
1202static __inline__ __m128h __DEFAULT_FN_ATTRS128
1203_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1204 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
1205 (__mmask8)__U);
1206}
1207
1208static __inline__ __m128h __DEFAULT_FN_ATTRS128
1209_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
1210 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
1211 (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1212}
1213
1214static __inline__ __m128h __DEFAULT_FN_ATTRS256
1215_mm256_cvtepu64_ph(__m256i __A) {
1216 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
1217 (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1218}
1219
1220static __inline__ __m128h __DEFAULT_FN_ATTRS256
1221_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1222 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
1223 (__mmask8)__U);
1224}
1225
1226static __inline__ __m128h __DEFAULT_FN_ATTRS256
1227_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
1228 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
1229 (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1230}
1231
1232static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
1233 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
1234 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
1235}
1236
1237static __inline__ __m128i __DEFAULT_FN_ATTRS128
1238_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
1239 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
1240 (__mmask8)__U);
1241}
1242
1243static __inline__ __m128i __DEFAULT_FN_ATTRS128
1244_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
1245 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
1246 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
1247}
1248
1249static __inline__ __m256i __DEFAULT_FN_ATTRS256
1250_mm256_cvtph_epu64(__m128h __A) {
1251 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
1252 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
1253}
1254
1255static __inline__ __m256i __DEFAULT_FN_ATTRS256
1256_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
1257 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
1258 (__mmask8)__U);
1259}
1260
1261static __inline__ __m256i __DEFAULT_FN_ATTRS256
1262_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
1263 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
1264 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
1265}
1266
1267static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
1268 return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
1269 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
1270}
1271
1272static __inline__ __m128i __DEFAULT_FN_ATTRS128
1273_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
1274 return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
1275 (__mmask8)__U);
1276}
1277
1278static __inline__ __m128i __DEFAULT_FN_ATTRS128
1279_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
1280 return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
1281 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
1282}
1283
1284static __inline__ __m256i __DEFAULT_FN_ATTRS256
1285_mm256_cvttph_epi64(__m128h __A) {
1286 return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
1287 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
1288}
1289
1290static __inline__ __m256i __DEFAULT_FN_ATTRS256
1291_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
1292 return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
1293 (__mmask8)__U);
1294}
1295
1296static __inline__ __m256i __DEFAULT_FN_ATTRS256
1297_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
1298 return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
1299 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
1300}
1301
1302static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
1303 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
1304 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
1305}
1306
1307static __inline__ __m128i __DEFAULT_FN_ATTRS128
1308_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
1309 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
1310 (__mmask8)__U);
1311}
1312
1313static __inline__ __m128i __DEFAULT_FN_ATTRS128
1314_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
1315 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
1316 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
1317}
1318
1319static __inline__ __m256i __DEFAULT_FN_ATTRS256
1320_mm256_cvttph_epu64(__m128h __A) {
1321 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
1322 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
1323}
1324
1325static __inline__ __m256i __DEFAULT_FN_ATTRS256
1326_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
1327 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
1328 (__mmask8)__U);
1329}
1330
1331static __inline__ __m256i __DEFAULT_FN_ATTRS256
1332_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
1333 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
1334 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
1335}
1336
1337static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
1338 return (__m128)__builtin_ia32_vcvtph2psx128_mask(
1339 (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
1340}
1341
1342static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
1343 __mmask8 __U,
1344 __m128h __A) {
1345 return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
1346 (__mmask8)__U);
1347}
1348
1349static __inline__ __m128 __DEFAULT_FN_ATTRS128
1350_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
1351 return (__m128)__builtin_ia32_vcvtph2psx128_mask(
1352 (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
1353}
1354
1355static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
1356 return (__m256)__builtin_ia32_vcvtph2psx256_mask(
1357 (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
1358}
1359
1360static __inline__ __m256 __DEFAULT_FN_ATTRS256
1361_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
1362 return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
1363 (__mmask8)__U);
1364}
1365
1366static __inline__ __m256 __DEFAULT_FN_ATTRS256
1367_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
1368 return (__m256)__builtin_ia32_vcvtph2psx256_mask(
1369 (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
1370}
1371
1372static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
1373 return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1374 (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1375}
1376
1377static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
1378 __mmask8 __U,
1379 __m128 __A) {
1380 return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
1381 (__mmask8)__U);
1382}
1383
1384static __inline__ __m128h __DEFAULT_FN_ATTRS128
1385_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
1386 return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1387 (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1388}
1389
1390static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
1391 return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
1392 (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1393}
1394
1395static __inline__ __m128h __DEFAULT_FN_ATTRS256
1396_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
1397 return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
1398 (__mmask8)__U);
1399}
1400
1401static __inline__ __m128h __DEFAULT_FN_ATTRS256
1402_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
1403 return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
1404 (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1405}
1406
1407static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
1408 __m128h __B,
1409 __m128h __C) {
1410 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
1411 (__v8hf)__C);
1412}
1413
1414static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
1415 __mmask8 __U,
1416 __m128h __B,
1417 __m128h __C) {
1418 return (__m128h)__builtin_ia32_selectph_128(
1419 (__mmask8)__U,
1420 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1421 (__v8hf)__A);
1422}
1423
1424static __inline__ __m128h __DEFAULT_FN_ATTRS128
1425_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1426 return (__m128h)__builtin_ia32_selectph_128(
1427 (__mmask8)__U,
1428 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1429 (__v8hf)__C);
1430}
1431
1432static __inline__ __m128h __DEFAULT_FN_ATTRS128
1433_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1434 return (__m128h)__builtin_ia32_selectph_128(
1435 (__mmask8)__U,
1436 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1437 (__v8hf)_mm_setzero_ph());
1438}
1439
1440static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
1441 __m128h __B,
1442 __m128h __C) {
1443 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
1444 -(__v8hf)__C);
1445}
1446
1447static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
1448 __mmask8 __U,
1449 __m128h __B,
1450 __m128h __C) {
1451 return (__m128h)__builtin_ia32_selectph_128(
1452 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1453 (__v8hf)__A);
1454}
1455
1456static __inline__ __m128h __DEFAULT_FN_ATTRS128
1457_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1458 return (__m128h)__builtin_ia32_selectph_128(
1459 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1460 (__v8hf)_mm_setzero_ph());
1461}
1462
1463static __inline__ __m128h __DEFAULT_FN_ATTRS128
1464_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1465 return (__m128h)__builtin_ia32_selectph_128(
1466 (__mmask8)__U,
1467 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1468 (__v8hf)__C);
1469}
1470
1471static __inline__ __m128h __DEFAULT_FN_ATTRS128
1472_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1473 return (__m128h)__builtin_ia32_selectph_128(
1474 (__mmask8)__U,
1475 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1476 (__v8hf)_mm_setzero_ph());
1477}
1478
1479static __inline__ __m128h __DEFAULT_FN_ATTRS128
1480_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1481 return (__m128h)__builtin_ia32_selectph_128(
1482 (__mmask8)__U,
1483 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1484 (__v8hf)_mm_setzero_ph());
1485}
1486
1487static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
1488 __m256h __B,
1489 __m256h __C) {
1490 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
1491 (__v16hf)__C);
1492}
1493
1494static __inline__ __m256h __DEFAULT_FN_ATTRS256
1495_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1496 return (__m256h)__builtin_ia32_selectph_256(
1497 (__mmask16)__U,
1498 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1499 (__v16hf)__A);
1500}
1501
1502static __inline__ __m256h __DEFAULT_FN_ATTRS256
1503_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1504 return (__m256h)__builtin_ia32_selectph_256(
1505 (__mmask16)__U,
1506 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1507 (__v16hf)__C);
1508}
1509
1510static __inline__ __m256h __DEFAULT_FN_ATTRS256
1511_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1512 return (__m256h)__builtin_ia32_selectph_256(
1513 (__mmask16)__U,
1514 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1515 (__v16hf)_mm256_setzero_ph());
1516}
1517
1518static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
1519 __m256h __B,
1520 __m256h __C) {
1521 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
1522 -(__v16hf)__C);
1523}
1524
1525static __inline__ __m256h __DEFAULT_FN_ATTRS256
1526_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1527 return (__m256h)__builtin_ia32_selectph_256(
1528 (__mmask16)__U,
1529 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1530 (__v16hf)__A);
1531}
1532
1533static __inline__ __m256h __DEFAULT_FN_ATTRS256
1534_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1535 return (__m256h)__builtin_ia32_selectph_256(
1536 (__mmask16)__U,
1537 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1538 (__v16hf)_mm256_setzero_ph());
1539}
1540
1541static __inline__ __m256h __DEFAULT_FN_ATTRS256
1542_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1543 return (__m256h)__builtin_ia32_selectph_256(
1544 (__mmask16)__U,
1545 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1546 (__v16hf)__C);
1547}
1548
1549static __inline__ __m256h __DEFAULT_FN_ATTRS256
1550_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1551 return (__m256h)__builtin_ia32_selectph_256(
1552 (__mmask16)__U,
1553 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1554 (__v16hf)_mm256_setzero_ph());
1555}
1556
1557static __inline__ __m256h __DEFAULT_FN_ATTRS256
1558_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1559 return (__m256h)__builtin_ia32_selectph_256(
1560 (__mmask16)__U,
1561 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1562 (__v16hf)_mm256_setzero_ph());
1563}
1564
1565static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
1566 __m128h __B,
1567 __m128h __C) {
1568 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
1569 (__v8hf)__C);
1570}
1571
1572static __inline__ __m128h __DEFAULT_FN_ATTRS128
1573_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1574 return (__m128h)__builtin_ia32_selectph_128(
1575 (__mmask8)__U,
1576 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1577 (__v8hf)__A);
1578}
1579
1580static __inline__ __m128h __DEFAULT_FN_ATTRS128
1581_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1582 return (__m128h)__builtin_ia32_selectph_128(
1583 (__mmask8)__U,
1584 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1585 (__v8hf)__C);
1586}
1587
1588static __inline__ __m128h __DEFAULT_FN_ATTRS128
1589_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1590 return (__m128h)__builtin_ia32_selectph_128(
1591 (__mmask8)__U,
1592 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1593 (__v8hf)_mm_setzero_ph());
1594}
1595
1596static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
1597 __m128h __B,
1598 __m128h __C) {
1599 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
1600 -(__v8hf)__C);
1601}
1602
1603static __inline__ __m128h __DEFAULT_FN_ATTRS128
1604_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1605 return (__m128h)__builtin_ia32_selectph_128(
1606 (__mmask8)__U,
1607 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1608 (__v8hf)__A);
1609}
1610
1611static __inline__ __m128h __DEFAULT_FN_ATTRS128
1612_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1613 return (__m128h)__builtin_ia32_selectph_128(
1614 (__mmask8)__U,
1615 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1616 (__v8hf)_mm_setzero_ph());
1617}
1618
1619static __inline__ __m256h __DEFAULT_FN_ATTRS256
1620_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
1621 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
1622 (__v16hf)__C);
1623}
1624
1625static __inline__ __m256h __DEFAULT_FN_ATTRS256
1626_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1627 return (__m256h)__builtin_ia32_selectph_256(
1628 (__mmask16)__U,
1629 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1630 (__v16hf)__A);
1631}
1632
1633static __inline__ __m256h __DEFAULT_FN_ATTRS256
1634_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1635 return (__m256h)__builtin_ia32_selectph_256(
1636 (__mmask16)__U,
1637 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1638 (__v16hf)__C);
1639}
1640
1641static __inline__ __m256h __DEFAULT_FN_ATTRS256
1642_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1643 return (__m256h)__builtin_ia32_selectph_256(
1644 (__mmask16)__U,
1645 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1646 (__v16hf)_mm256_setzero_ph());
1647}
1648
1649static __inline__ __m256h __DEFAULT_FN_ATTRS256
1650_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
1651 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
1652 -(__v16hf)__C);
1653}
1654
1655static __inline__ __m256h __DEFAULT_FN_ATTRS256
1656_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1657 return (__m256h)__builtin_ia32_selectph_256(
1658 (__mmask16)__U,
1659 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1660 (__v16hf)__A);
1661}
1662
1663static __inline__ __m256h __DEFAULT_FN_ATTRS256
1664_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1665 return (__m256h)__builtin_ia32_selectph_256(
1666 (__mmask16)__U,
1667 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1668 (__v16hf)_mm256_setzero_ph());
1669}
1670
1671static __inline__ __m128h __DEFAULT_FN_ATTRS128
1672_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1673 return (__m128h)__builtin_ia32_selectph_128(
1674 (__mmask8)__U,
1675 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1676 (__v8hf)__C);
1677}
1678
1679static __inline__ __m256h __DEFAULT_FN_ATTRS256
1680_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1681 return (__m256h)__builtin_ia32_selectph_256(
1682 (__mmask16)__U,
1683 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1684 (__v16hf)__C);
1685}
1686
1687static __inline__ __m128h __DEFAULT_FN_ATTRS128
1688_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1689 return (__m128h)__builtin_ia32_selectph_128(
1690 (__mmask8)__U,
1691 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1692 (__v8hf)__C);
1693}
1694
1695static __inline__ __m256h __DEFAULT_FN_ATTRS256
1696_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1697 return (__m256h)__builtin_ia32_selectph_256(
1698 (__mmask16)__U,
1699 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1700 (__v16hf)__C);
1701}
1702
1703static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
1704 __m128h __B,
1705 __m128h __C) {
1706 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
1707 (__v8hf)__C);
1708}
1709
1710static __inline__ __m128h __DEFAULT_FN_ATTRS128
1711_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1712 return (__m128h)__builtin_ia32_selectph_128(
1713 (__mmask8)__U,
1714 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
1715 (__v8hf)__A);
1716}
1717
1718static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
1719 __m256h __B,
1720 __m256h __C) {
1721 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
1722 (__v16hf)__C);
1723}
1724
1725static __inline__ __m256h __DEFAULT_FN_ATTRS256
1726_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1727 return (__m256h)__builtin_ia32_selectph_256(
1728 (__mmask16)__U,
1729 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
1730 (__v16hf)__A);
1731}
1732
1733static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
1734 __m128h __B,
1735 __m128h __C) {
1736 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
1737 -(__v8hf)__C);
1738}
1739
1740static __inline__ __m128h __DEFAULT_FN_ATTRS128
1741_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1742 return (__m128h)__builtin_ia32_selectph_128(
1743 (__mmask8)__U,
1744 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
1745 (__v8hf)__A);
1746}
1747
1748static __inline__ __m128h __DEFAULT_FN_ATTRS128
1749_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1750 return (__m128h)__builtin_ia32_selectph_128(
1751 (__mmask8)__U,
1752 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
1753 (__v8hf)__C);
1754}
1755
1756static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
1757 __m256h __B,
1758 __m256h __C) {
1759 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
1760 -(__v16hf)__C);
1761}
1762
1763static __inline__ __m256h __DEFAULT_FN_ATTRS256
1764_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1765 return (__m256h)__builtin_ia32_selectph_256(
1766 (__mmask16)__U,
1767 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
1768 (__v16hf)__A);
1769}
1770
1771static __inline__ __m256h __DEFAULT_FN_ATTRS256
1772_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1773 return (__m256h)__builtin_ia32_selectph_256(
1774 (__mmask16)__U,
1775 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
1776 (__v16hf)__C);
1777}
1778
1779static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
1780 __m128h __B) {
1781 return (__m128h)__builtin_ia32_vfcmulcph128_mask(
1782 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
1783}
1784
1785static __inline__ __m128h __DEFAULT_FN_ATTRS128
1786_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1787 return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
1788 (__v4sf)__W, (__mmask8)__U);
1789}
1790
1791static __inline__ __m128h __DEFAULT_FN_ATTRS128
1792_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
1793 return (__m128h)__builtin_ia32_vfcmulcph128_mask(
1794 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
1795}
1796
1797static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A,
1798 __m256h __B) {
1799 return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1800 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1801}
1802
1803static __inline__ __m256h __DEFAULT_FN_ATTRS256
1804_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
1805 return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
1806 (__v8sf)__W, (__mmask8)__U);
1807}
1808
1809static __inline__ __m256h __DEFAULT_FN_ATTRS256
1810_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
1811 return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1812 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
1813}
1814
1815static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
1816 __m128h __B,
1817 __m128h __C) {
1818 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
1819 (__v4sf)__B, (__mmask8)-1);
1820}
1821
1822static __inline__ __m128h __DEFAULT_FN_ATTRS128
1823_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1824 return (__m128h)__builtin_ia32_selectps_128(
1825 __U,
1826 __builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)(__m128h)__A,
1827 (__v4sf)__B, (__mmask8)__U),
1828 (__v4sf)__A);
1829}
1830
1831static __inline__ __m128h __DEFAULT_FN_ATTRS128
1832_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1833 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
1834 (__v4sf)__B, (__mmask8)__U);
1835}
1836
1837static __inline__ __m128h __DEFAULT_FN_ATTRS128
1838_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1839 return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
1840 (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U);
1841}
1842
1843static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
1844 __m256h __B,
1845 __m256h __C) {
1846 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
1847 (__v8sf)__B, (__mmask8)-1);
1848}
1849
1850static __inline__ __m256h __DEFAULT_FN_ATTRS256
1851_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
1852 return (__m256h)__builtin_ia32_selectps_256(
1853 __U,
1854 __builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B,
1855 (__mmask8)__U),
1856 (__v8sf)__A);
1857}
1858
1859static __inline__ __m256h __DEFAULT_FN_ATTRS256
1860_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
1861 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
1862 (__v8sf)__B, (__mmask8)__U);
1863}
1864
1865static __inline__ __m256h __DEFAULT_FN_ATTRS256
1866_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
1867 return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
1868 (__v8sf)__C, (__v8sf)__A, (__v8sf)__B, (__mmask8)__U);
1869}
1870
1871static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
1872 __m128h __B) {
1873 return (__m128h)__builtin_ia32_vfmulcph128_mask(
1874 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
1875}
1876
1877static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
1878 __mmask8 __U,
1879 __m128h __A,
1880 __m128h __B) {
1881 return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
1882 (__v4sf)__W, (__mmask8)__U);
1883}
1884
1885static __inline__ __m128h __DEFAULT_FN_ATTRS128
1886_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
1887 return (__m128h)__builtin_ia32_vfmulcph128_mask(
1888 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
1889}
1890
1891static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
1892 __m256h __B) {
1893 return (__m256h)__builtin_ia32_vfmulcph256_mask(
1894 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1895}
1896
1897static __inline__ __m256h __DEFAULT_FN_ATTRS256
1898_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
1899 return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
1900 (__v8sf)__W, (__mmask8)__U);
1901}
1902
1903static __inline__ __m256h __DEFAULT_FN_ATTRS256
1904_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
1905 return (__m256h)__builtin_ia32_vfmulcph256_mask(
1906 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
1907}
1908
1909static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
1910 __m128h __B,
1911 __m128h __C) {
1912 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
1913 (__v4sf)__B, (__mmask8)-1);
1914}
1915
1916static __inline__ __m128h __DEFAULT_FN_ATTRS128
1917_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1918 return (__m128h)__builtin_ia32_selectps_128(
1919 __U,
1920 __builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
1921 (__mmask8)__U),
1922 (__v4sf)__A);
1923}
1924
1925static __inline__ __m128h __DEFAULT_FN_ATTRS128
1926_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1927 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A,
1928 (__v4sf)__B, (__mmask8)__U);
1929}
1930
1931static __inline__ __m128h __DEFAULT_FN_ATTRS128
1932_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1933 return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__C, (__v4sf)__A,
1934 (__v4sf)__B, (__mmask8)__U);
1935}
1936
1937static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
1938 __m256h __B,
1939 __m256h __C) {
1940 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
1941 (__v8sf)__B, (__mmask8)-1);
1942}
1943
1944static __inline__ __m256h __DEFAULT_FN_ATTRS256
1945_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
1946 return (__m256h)__builtin_ia32_selectps_256(
1947 __U,
1948 __builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B,
1949 (__mmask8)__U),
1950 (__v8sf)__A);
1951}
1952
1953static __inline__ __m256h __DEFAULT_FN_ATTRS256
1954_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
1955 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A,
1956 (__v8sf)__B, (__mmask8)__U);
1957}
1958
1959static __inline__ __m256h __DEFAULT_FN_ATTRS256
1960_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
1961 return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__C, (__v8sf)__A,
1962 (__v8sf)__B, (__mmask8)__U);
1963}
1964
1965static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
1966 __m128h __A,
1967 __m128h __W) {
1968 return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
1969 (__v8hf)__A);
1970}
1971
1972static __inline__ __m256h __DEFAULT_FN_ATTRS256
1973_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
1974 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
1975 (__v16hf)__A);
1976}
1977
1978static __inline__ __m128h __DEFAULT_FN_ATTRS128
1979_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
1980 return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
1981 (__v8hi)__B);
1982}
1983
1984static __inline__ __m256h __DEFAULT_FN_ATTRS256
1985_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
1986 return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
1987 (__v16hi)__B);
1988}
1989
1990static __inline__ __m128h __DEFAULT_FN_ATTRS128
1991_mm_permutexvar_ph(__m128i __A, __m128h __B) {
1992 return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
1993}
1994
1995static __inline__ __m256h __DEFAULT_FN_ATTRS256
1996_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
1997 return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
1998}
1999
2000static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2001_mm256_reduce_add_ph(__m256h __W) {
2002 return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
2003}
2004
2005static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2006_mm256_reduce_mul_ph(__m256h __W) {
2007 return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
2008}
2009
2010static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2011_mm256_reduce_max_ph(__m256h __V) {
2012 return __builtin_ia32_reduce_fmax_ph256(__V);
2013}
2014
2015static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2016_mm256_reduce_min_ph(__m256h __V) {
2017 return __builtin_ia32_reduce_fmin_ph256(__V);
2018}
2019
2020static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2021_mm_reduce_add_ph(__m128h __W) {
2022 return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
2023}
2024
2025static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2026_mm_reduce_mul_ph(__m128h __W) {
2027 return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
2028}
2029
2030static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2031_mm_reduce_max_ph(__m128h __V) {
2032 return __builtin_ia32_reduce_fmax_ph128(__V);
2033}
2034
2035static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2036_mm_reduce_min_ph(__m128h __V) {
2037 return __builtin_ia32_reduce_fmin_ph128(__V);
2038}
2039
2040#undef __DEFAULT_FN_ATTRS128
2041#undef __DEFAULT_FN_ATTRS256
2042
2043#endif