/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

/* Define the _Float16 vector types used by the intrinsics in this file. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

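/* Note (illustrative, not part of the original header): the __m512h, __m256h
 * and __m128h types carry the natural alignment of the vector; the *_u
 * variants are byte-aligned so the loadu/storeu intrinsics below can express
 * unaligned accesses without undefined behavior:
 *
 *   _Float16 buf[8];                  // alignment unknown
 *   __m128h v = _mm_loadu_ph(buf);    // OK at any alignment
 *   __m128h w = *(__m128h *)buf;      // OK only if buf is 16-byte aligned
 */
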
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))

static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))

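/* Usage sketch (illustrative, not part of the original header):
 * _mm512_set_ph takes its 32 arguments from the highest element (31) down to
 * element 0, while _mm512_setr_ph takes them in memory order, so after
 *
 *   __m512h v = _mm512_setr_ph(h1, h2, ..., h32);   // 32 arguments
 *
 * element 0 of v is h1 and _mm512_cvtsh_h(v) == h1.
 */
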
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

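/* Usage sketch (illustrative, not part of the original header): the comi*
 * helpers below use signaling predicates (_CMP_*_OS / _CMP_NEQ_US), which
 * raise an invalid-operation exception on a QNaN input, while the ucomi*
 * helpers use the quiet variants (_CMP_*_OQ / _CMP_NEQ_UQ):
 *
 *   _Float16 x, y;
 *   int lt = _mm_comilt_sh(_mm_load_sh(&x), _mm_load_sh(&y));  // x < y ?
 */
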
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
                                                            __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

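/* Usage sketch (illustrative, not part of the original header): every 512-bit
 * arithmetic op comes in plain, mask (merge from W) and maskz (zero-fill)
 * forms, plus *_round variants taking an explicit rounding mode instead of
 * MXCSR:
 *
 *   __m512h r0 = _mm512_add_ph(a, b);                      // all 32 lanes
 *   __m512h r1 = _mm512_mask_add_ph(w, 0x0000FFFF, a, b);  // upper from w
 *   __m512h r2 = _mm512_maskz_add_ph(0x0000FFFF, a, b);    // upper zeroed
 *   __m512h r3 = _mm512_add_round_ph(
 *       a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */
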
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

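/* _mm512_abs_ph clears the sign bit of each half-precision lane; the
 * 0x7FFF7FFF constant masks two adjacent 16-bit elements per 32-bit AND lane.
 * Illustrative:
 *
 *   __m512h a = _mm512_set1_ph((_Float16)-2.5);
 *   __m512h r = _mm512_abs_ph(a);   // every lane == 2.5
 */
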
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

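/* The *_pch intrinsics treat each 32-bit lane as one complex number made of
 * two halves (real in bits 15:0, imaginary in bits 31:16). Conjugation only
 * flips the sign bit of the imaginary half, which is why _mm512_conj_pch is
 * an XOR with -0.0f per 32-bit lane and why its masks are 16 bits wide (one
 * bit per complex pair). Illustrative:
 *
 *   __m512h zc = _mm512_conj_pch(z);   // conjugate all 16 half pairs of z
 */
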
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

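/* Usage sketch (illustrative, not part of the original header): _mm_load_sh
 * reads one _Float16 from an arbitrarily aligned address and zeroes elements
 * 1..7, mirroring _mm_load_ss for float:
 *
 *   _Float16 x = (_Float16)3.0;
 *   __m128h v = _mm_load_sh(&x);   // v = {3.0, 0, 0, 0, 0, 0, 0, 0}
 */
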
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

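/* Usage sketch (illustrative, not part of the original header): the plain
 * load/store forms require the vector type's natural alignment (64, 32 or 16
 * bytes); the loadu/storeu forms accept any alignment via the packed
 * __m512h_u/__m256h_u/__m128h_u wrappers:
 *
 *   _Float16 dst[32];                            // alignment unknown
 *   _mm512_storeu_ph(dst, _mm512_setzero_ph());  // safe
 */
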
// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

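/* Usage sketch (illustrative, not part of the original header): the (B, C)
 * arguments of the getmant family are the _MM_MANTISSA_NORM_ENUM interval and
 * _MM_MANTISSA_SIGN_ENUM sign control from <avx512fintrin.h>, encoded as
 * ((C) << 2) | (B):
 *
 *   __m512h m = _mm512_getmant_ph(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */
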
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))

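/* Usage sketch (illustrative, not part of the original header): the
 * roundscale immediate keeps imm[7:4] fraction bits and takes an _MM_FROUND
 * rounding override in its low bits, so rounding every lane to the nearest
 * integer is:
 *
 *   __m512h r =
 *       _mm512_roundscale_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */
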
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}

#define _mm_sqrt_round_sh(A, B, R)                                             \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))

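/* Usage sketch (illustrative, not part of the original header): the fpclass
 * immediate ORs together the VFPCLASS category bits (0x01 QNaN, 0x02 +0,
 * 0x04 -0, 0x08 +Inf, 0x10 -Inf, 0x20 denormal, 0x40 finite negative,
 * 0x80 SNaN), so a full NaN test is:
 *
 *   __mmask32 nan_lanes = _mm512_fpclass_ph_mask(a, 0x01 | 0x80);
 */
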
#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtph_pd(__m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

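/* Scalar conversions between the low FP16 element and single/double
   precision; the upper elements of the result are copied from the first
   source operand. */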
#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)_mm_undefined_ps(), (__mmask8)(-1), \
      (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), \
      (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtsh_ss(__m128 __A, __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtsh_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_ss(__mmask8 __U, __m128 __A, __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), \
      (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), \
      (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtss_sh(__m128h __A, __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtss_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sh(__mmask8 __U, __m128h __A, __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), (__v2df)(B), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), \
      (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), (__v2df)(B), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), \
      (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtsd_sh(__m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtsd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_sd(A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), (__v8hf)(B), (__v2df)_mm_undefined_pd(), (__mmask8)(-1), \
      (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), (__v8hf)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U), \
      (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtsh_sd(__m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtsh_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

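/* Conversions between packed FP16 values and packed signed/unsigned 16-bit
   integers. The _cvtt_ variants truncate toward zero instead of honoring
   the rounding mode. */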
#define _mm512_cvt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), (__v32hi)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask( \
      (__v32hf)(A), (__v32hi)_mm512_setzero_epi32(), (__mmask32)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), (__v32hi)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
      (__v32hf)(A), (__v32hi)_mm512_setzero_epi32(), (__mmask32)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundepi16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)_mm512_undefined_ph(), (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), (__v32hu)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_setzero_epi32(), (__mmask32)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundph_epu16(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), (__v32hu)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
      (__v32hf)(A), (__v32hu)_mm512_setzero_epi32(), (__mmask32)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundepu16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)_mm512_undefined_ph(), (__mmask32)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

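/* Conversions between packed FP16 values and packed signed/unsigned 32-bit
   integers (16 elements per operation). */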
#define _mm512_cvt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), (__v16si)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_setzero_epi32(), (__mmask16)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), (__v16su)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundph_epi32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), (__v16si)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
      (__v16hf)(A), (__v16si)_mm512_setzero_epi32(), (__mmask16)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundph_epu32(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), (__v16su)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

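/* Conversions between packed FP16 values and packed signed/unsigned 64-bit
   integers (8 elements per operation). */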
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

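/* Scalar conversions between the low FP16 element and 32/64-bit integers;
   the 64-bit forms are only available in 64-bit mode. */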
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__

#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi642sh( \
      (__v8hf)(A), (unsigned long long)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}
#endif

#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvti32_sh(__m128h __A, int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundi64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvti64_sh(__m128h __A, long long __B) {
  __A[0] = __B;
  return __A;
}
#endif

#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif

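/* Packed conversions between FP16 and single precision. */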
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), (__v16sf)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), (__v16hf)(W), (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

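/* Packed fused multiply-add: fmadd computes A*B+C in a single rounding
   step; fmsub negates C, fnmadd negates the product, and fnmsub negates
   both. _mask blends with the first source, _mask3 with the third, and
   _maskz zeroes masked-off elements. */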
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, -(__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(
      -(__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, -(__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

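/* Alternating multiply-add/subtract: fmaddsub subtracts C in the
   even-indexed elements and adds it in the odd-indexed ones; fmsubadd
   does the opposite. */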
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, -(__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, -(__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(
      -(__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

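/* Scalar FP16 fused multiply-add on the low element; the upper elements
   of the result are taken from the first source operand. */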
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask(
      (__v8hf)__W, (__v8hf)__A, (__v8hf)__B, (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask(
      (__v8hf)__W, (__v8hf)__A, (__v8hf)__B, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3(
      (__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask(
      (__v8hf)__W, (__v8hf)__A, -(__v8hf)__B, (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask(
      (__v8hf)__W, (__v8hf)__A, -(__v8hf)__B, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsh3_maskz(
      (__v8hf)__A, (__v8hf)__B, -(__v8hf)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

2823static __inline__ __m128h __DEFAULT_FN_ATTRS128
2824_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2825 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2826 (__mmask8)__U,
2827 _MM_FROUND_CUR_DIRECTION);
2828}
2829
2830#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2831 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2832 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2833 (__mmask8)(U), (int)(R)))
2834
2835static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2836 __m128h __A,
2837 __m128h __B) {
2838 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2839 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2840}
2841
2842static __inline__ __m128h __DEFAULT_FN_ATTRS128
2843_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2844 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2845 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2846}
2847
2848#define _mm_fnmadd_round_sh(A, B, C, R) \
2849 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2850 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2851 (__mmask8)-1, (int)(R)))
2852
2853#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2854 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2855 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2856 (__mmask8)(U), (int)(R)))
2857
2858static __inline__ __m128h __DEFAULT_FN_ATTRS128
2859_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2860 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2861 (__mmask8)__U,
2862 _MM_FROUND_CUR_DIRECTION);
2863}
2864
2865#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2866 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2867 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2868 (__mmask8)(U), (int)(R)))
2869
2870static __inline__ __m128h __DEFAULT_FN_ATTRS128
2871_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2872 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2873 (__mmask8)__U,
2874 _MM_FROUND_CUR_DIRECTION);
2875}
2876
2877#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2878 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2879 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2880 (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
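
/* Editor's sketch: _mm_fnmsub_sh negates both the product and the addend,
 * i.e. -(a * b) - c in the low element. With the fnmadd sketch's values
 * (a = 2, b = 3, c = 10) the low result is -16.0f16. */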

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A,
                                                 (__v4sf)__B, (__mmask8)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
                                     (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__C, (__v4sf)__A,
                                                  (__v4sf)__B, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)_mm_move_ss((__m128)__C,
                              (__m128)__builtin_ia32_vfcmaddcsh_mask(
                                  (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, __U,
                                  _MM_FROUND_CUR_DIRECTION));
}

#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
      (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
  ((__m128h)__builtin_ia32_selectps_128(                                       \
      (__mmask8)(U & 1),                                                       \
      __builtin_ia32_vfcmaddcsh_mask(                                          \
          (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),    \
          (__mmask8)(U), (int)(R)),                                            \
      (__v4sf)(__m128h)(A)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
      (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
  ((__m128h)_mm_move_ss((__m128)(C),                                           \
                        (__m128)__builtin_ia32_vfcmaddcsh_mask(                \
                            (__v4sf)(C), (__v4sf)(A), (__v4sf)(B), (U), (R))))
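
/* Editor's sketch: the _sch forms treat the low 32 bits as one complex
 * number, real part in _Float16 element 0 and imaginary part in element 1.
 * On the usual reading of the VFCMADDCSH semantics, _mm_fcmadd_sch(a, b, c)
 * yields a * conj(b) + c in that slot. For a = 1+2i, b = 3+4i, c = 0:
 *
 *   __m128h a = _mm_setr_ph(1.0f16, 2.0f16, 0.0f16, 0.0f16,
 *                           0.0f16, 0.0f16, 0.0f16, 0.0f16);
 *   __m128h b = _mm_setr_ph(3.0f16, 4.0f16, 0.0f16, 0.0f16,
 *                           0.0f16, 0.0f16, 0.0f16, 0.0f16);
 *   __m128h r = _mm_fcmadd_sch(a, b, _mm_setzero_ph()); // low pair: 11 + 2i
 */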

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A,
                                                (__v4sf)__B, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
                                    (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__C, (__v4sf)__A,
                                                 (__v4sf)__B, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sch(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
      (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
  ((__m128h)__builtin_ia32_selectps_128(                                       \
      (__mmask8)(U & 1),                                                       \
      __builtin_ia32_vfmaddcsh_mask(                                           \
          (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),    \
          (__mmask8)(U), (int)(R)),                                            \
      (__v4sf)(__m128h)(A)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
      (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))
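
/* Editor's sketch: _mm_fmadd_sch is the non-conjugating counterpart of
 * _mm_fcmadd_sch: it returns a * b + c in the low complex slot. With the
 * same a = 1+2i and b = 3+4i as above, a * b = -5 + 10i. */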

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
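
/* Editor's sketch: _mm_fmul_sch multiplies the low complex numbers
 * (a * b), while _mm_fcmul_sch conjugates the second operand (a * conj(b)).
 * Reusing a = 1+2i and b = 3+4i from the fcmadd sketch above:
 *
 *   __m128h p  = _mm_fmul_sch(a, b);  // low pair: -5 + 10i
 *   __m128h pc = _mm_fcmul_sch(a, b); // low pair: 11 +  2i
 */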

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
                                                                __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
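
/* Editor's sketch: the 512-bit _pch forms apply the complex multiply to all
 * 16 complex numbers in a vector (32 _Float16 elements, real parts at even
 * indices); each bit of the __mmask16 covers one complex pair, which is why
 * the builtins take float32 (__v16sf) views and a 16-bit mask. One handy
 * identity, for a caller-provided __m512h z:
 *
 *   __m512h n = _mm512_fcmul_pch(z, z); // z * conj(z): |z|^2 in each real
 *                                       // slot, 0 in each imaginary slot
 */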

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
                                                                  __m512h __B,
                                                                  __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
                                                    (__v16sf)__B, (__mmask16)-1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_selectps_512(
      __U,
      __builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
                                        (__v16sf)__B, (__mmask16)__U,
                                        _MM_FROUND_CUR_DIRECTION),
      (__v16sf)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_selectps_512(                                       \
      (__mmask16)(U),                                                          \
      __builtin_ia32_vfcmaddcph512_mask(                                       \
          (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
          (__mmask16)(U), (int)(R)),                                           \
      (__v16sf)(__m512h)(A)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),     \
      (__mmask16)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
                                                   (__v16sf)__B, (__mmask16)-1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_selectps_512(
      __U,
      __builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, (__v16sf)__B,
                                       (__mmask16)__U,
                                       _MM_FROUND_CUR_DIRECTION),
      (__v16sf)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
                                                   (__v16sf)__B, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_selectps_512(                                       \
      (__mmask16)(U),                                                          \
      __builtin_ia32_vfmaddcph512_mask(                                        \
          (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
          (__mmask16)(U), (int)(R)),                                           \
      (__v16sf)(__m512h)(A)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),     \
      (__mmask16)(U), (int)(R)))
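
/* Editor's sketch: the packed complex FMAs accumulate a * b + c (or
 * a * conj(b) + c in the fcmadd forms) across all 16 complex lanes, and the
 * mask_/maskz_ variants select whole complex pairs. A dot-product style
 * accumulation, assuming caller-provided __m512h chunks x and y:
 *
 *   __m512h acc = _mm512_setzero_ph();
 *   acc = _mm512_fmadd_pch(x, y, acc); // acc += x * y, per complex lane
 */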

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
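
/* Editor's sketch: the reduce_* helpers fold all 32 _Float16 elements into
 * one scalar; reduce_add starts from -0.0 and reduce_mul from 1.0, the
 * identity elements of the respective operations. For example:
 *
 *   _Float16 s = _mm512_reduce_add_ph(_mm512_set1_ph(1.0f16)); // s == 32.0
 */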

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
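
/* Editor's sketch: these lane-rearranging helpers reuse the 16-bit integer
 * permute/select builtins, since _Float16 elements are 16 bits wide. In
 * _mm512_permutexvar_ph the index vector picks source elements, so an
 * all-zero index broadcasts element 0; in _mm512_mask_blend_ph each set
 * mask bit takes the corresponding element from the last operand (__W).
 * For a caller-provided __m512h v:
 *
 *   __m512h first = _mm512_permutexvar_ph(_mm512_setzero_si512(), v);
 *   // every element of `first` is v[0]
 */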

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512

#endif