/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

/* Define the half-precision vector types used in this file. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(128)))

static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \
                       h25, h26, h27, h28, h29, h30, h31, h32) \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \
                (h5), (h4), (h3), (h2), (h1))

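/* Editor's illustration (not part of the original header): _mm512_set_ph
 * lists elements from the highest lane down to lane 0, while _mm512_setr_ph
 * lists them in memory order, lane 0 first. A minimal sketch:
 *
 *   __m512h a = _mm512_setr_ph(1.0f16, 2.0f16, ..., 32.0f16);
 *   _Float16 lo = _mm512_cvtsh_h(a); // lo == 1.0, i.e. lane 0
 */
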
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
}

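/* Editor's note (illustrative, assuming the usual x86 layout): a
 * _Float16 _Complex value occupies 32 bits -- the real part in the low 16
 * bits, the imaginary part in the high 16 -- so bit-casting it to float and
 * broadcasting with _mm512_set1_ps replicates the (re, im) pair into all 16
 * complex lanes:
 *
 *   #include <complex.h>
 *   _Float16 _Complex z = 1.0f16 + 2.0f16 * I;
 *   __m512h v = _mm512_set1_pch(z); // 16 copies of (1.0, 2.0)
 */
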
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

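/* Editor's note (illustrative): the casts above are zero-cost
 * reinterpretations. The widening casts leave the upper elements undefined
 * (the -1 shuffle indices); when the upper lanes must be zero, use the
 * _mm*_zextph* intrinsics defined below instead:
 *
 *   __m128h x = _mm_setzero_ph();
 *   __m512h u = _mm512_castph128_ph512(x); // upper 384 bits undefined
 *   __m512h z = _mm512_zextph128_ph512(x); // upper 384 bits zeroed
 */
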
/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

#define _mm_comi_round_sh(A, B, P, R) \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred) \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

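/* Editor's illustration (hedged, not from the original header): the comi
 * helpers return an int (0 or 1) from a scalar compare of element 0. The
 * _mm_comi*_sh wrappers below use ordered, signaling predicates, while the
 * _mm_ucomi*_sh wrappers use their quiet counterparts:
 *
 *   _Float16 x = 1.0f16, y = 2.0f16;
 *   __m128h a = _mm_load_sh(&x), b = _mm_load_sh(&y);
 *   int lt = _mm_comi_sh(a, b, _CMP_LT_OS); // 1, since x < y (ordered)
 */
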
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
                                                            __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

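/* Editor's illustration (hedged): the *_round_* macros take a literal
 * rounding-mode immediate; overriding the MXCSR rounding mode requires
 * combining the mode with _MM_FROUND_NO_EXC, e.g.:
 *
 *   __m512h r = _mm512_add_round_ph(
 *       a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */
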
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

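/* Editor's note (illustrative): _mm512_abs_ph clears the sign bit of each
 * 16-bit element by ANDing with 0x7FFF (0x7FFF7FFF covers the two half
 * elements in each 32-bit lane), which is IEEE-754 absolute value:
 *
 *   __m512h v = _mm512_set1_ph(-3.5f16);
 *   __m512h a = _mm512_abs_ph(v); // every element == 3.5
 */
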
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}

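/* Editor's note (illustrative): each complex half-precision element occupies
 * one 32-bit lane, real part in bits 0-15 and imaginary part in bits 16-31,
 * so XORing with the float -0.0f pattern flips only the imaginary sign bit,
 * i.e. computes the complex conjugate:
 *
 *   __m512h z = _mm512_set1_pch(1.0f16 + 2.0f16 * I); // I from <complex.h>
 *   __m512h c = _mm512_conj_pch(z); // every element == (1.0, -2.0)
 */
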
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

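/* Editor's note (illustrative): for the scalar *_sh operations only bit 0 of
 * the mask matters; elements 1-7 of the result pass through from __A. When
 * bit 0 is clear, element 0 comes from __W (mask form) or is zeroed (maskz
 * form):
 *
 *   __m128h r = _mm_maskz_add_sh(0x0, a, b); // r[0] == 0, r[1..7] == a[1..7]
 */
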
#define _mm_add_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))
// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

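/* Editor's note (illustrative): _mm_load_sh reads a single _Float16 through
 * a packed, may_alias struct, so the access is byte-aligned and exempt from
 * strict-aliasing rules, then zero-fills elements 1-7:
 *
 *   _Float16 x = 1.5f16;
 *   __m128h v = _mm_load_sh(&x); // {1.5, 0, 0, 0, 0, 0, 0, 0}
 */
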
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

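/* Editor's note (illustrative): the _mm*_load_ph intrinsics require
 * naturally aligned pointers (16/32/64 bytes), while the _mm*_loadu_ph
 * variants use the same packed-struct idiom as _mm_load_sh to permit any
 * alignment:
 *
 *   _Float16 buf[32];                 // possibly unaligned
 *   __m512h v = _mm512_loadu_ph(buf); // safe for any alignment
 *   // _mm512_load_ph(buf) would require 64-byte alignment.
 */
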
// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

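/* Editor's note (illustrative): in the getmant macros above, (C << 2) | B
 * packs the sign control (C, e.g. _MM_MANT_SIGN_src) and the normalization
 * interval (B, e.g. _MM_MANT_NORM_1_2) into the immediate byte expected by
 * vgetmantph:
 *
 *   __m512h m = _mm512_getmant_ph(v, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */
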
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

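/* Editor's note (illustrative): scalef computes A * 2^floor(B) per element
 * (vscalefph), which gives a fast ldexp-style scaling:
 *
 *   __m512h x = _mm512_set1_ph(3.0f16);
 *   __m512h p = _mm512_set1_ph(4.0f16);
 *   __m512h r = _mm512_scalef_ph(x, p); // every element == 3.0 * 2^4 == 48.0
 */
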
#define _mm512_roundscale_ph(A, B) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
                                           (__v32hf)(__m512h)(A), \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
                                           (__v32hf)_mm512_setzero_ph(), \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                           (__v32hf)_mm512_undefined_ph(), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}

#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}
1441
1442#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
1443 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1444 (int)(imm), (__mmask32)(U)))
1445
1446#define _mm512_fpclass_ph_mask(A, imm) \
1447 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1448 (int)(imm), (__mmask32)-1))
1449
1450#define _mm_fpclass_sh_mask(A, imm) \
1451 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
1452 (__mmask8)-1))
1453
1454#define _mm_mask_fpclass_sh_mask(U, A, imm) \
1455 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
1456 (__mmask8)(U)))
1457
1458#define _mm512_cvt_roundpd_ph(A, R) \
1459 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1460 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1461
1462#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1463 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1464 (__mmask8)(U), (int)(R)))
1465
1466#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1467 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1468 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1469
1470static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1471 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1472 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1473 _MM_FROUND_CUR_DIRECTION);
1474}
1475
1476static __inline__ __m128h __DEFAULT_FN_ATTRS512
1477_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1478 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1479 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1480}
1481
1482static __inline__ __m128h __DEFAULT_FN_ATTRS512
1483_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1484 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1485 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1486 _MM_FROUND_CUR_DIRECTION);
1487}
1488
1489#define _mm512_cvt_roundph_pd(A, R) \
1490 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1491 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1492
1493#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1494 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1495 (__mmask8)(U), (int)(R)))
1496
1497#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1498 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1499 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1500
1501static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1502 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1503 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1504 _MM_FROUND_CUR_DIRECTION);
1505}
1506
1507static __inline__ __m512d __DEFAULT_FN_ATTRS512
1508_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1509 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1510 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1511}
1512
1513static __inline__ __m512d __DEFAULT_FN_ATTRS512
1514_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1515 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1516 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1517 _MM_FROUND_CUR_DIRECTION);
1518}
1519
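/* Scalar half <-> float conversions. _mm_cvtsh_ss converts the low half of
 * __B into element 0 of the result and copies elements 1..3 from __A; the
 * widening direction is always exact, so the rounding mode only matters
 * for _mm_cvtss_sh. Sketch, for some __m128h h:
 *
 *   __m128 f = _mm_cvtsh_ss(_mm_setzero_ps(), h);
 */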
1520#define _mm_cvt_roundsh_ss(A, B, R) \
1521 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1522 (__v4sf)_mm_undefined_ps(), \
1523 (__mmask8)(-1), (int)(R)))
1524
1525#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1526 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1527 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1528
1529#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1530 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1531 (__v4sf)_mm_setzero_ps(), \
1532 (__mmask8)(U), (int)(R)))
1533
1534static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1535 __m128h __B) {
1536 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1537 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1538 _MM_FROUND_CUR_DIRECTION);
1539}
1540
1541static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1542 __mmask8 __U,
1543 __m128 __A,
1544 __m128h __B) {
1545 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1546 (__v4sf)__W, (__mmask8)__U,
1547 _MM_FROUND_CUR_DIRECTION);
1548}
1549
1550static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1551 __m128 __A,
1552 __m128h __B) {
1553 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1554 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1555 _MM_FROUND_CUR_DIRECTION);
1556}
1557
1558#define _mm_cvt_roundss_sh(A, B, R) \
1559 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1560 (__v8hf)_mm_undefined_ph(), \
1561 (__mmask8)(-1), (int)(R)))
1562
1563#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1564 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1565 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1566
1567#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1568 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1569 (__v8hf)_mm_setzero_ph(), \
1570 (__mmask8)(U), (int)(R)))
1571
1572static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1573 __m128 __B) {
1574 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1575 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1576 _MM_FROUND_CUR_DIRECTION);
1577}
1578
1579static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1580 __mmask8 __U,
1581 __m128h __A,
1582 __m128 __B) {
1583 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1584 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1585 _MM_FROUND_CUR_DIRECTION);
1586}
1587
1588static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1589 __m128h __A,
1590 __m128 __B) {
1591 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1592 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1593 _MM_FROUND_CUR_DIRECTION);
1594}
1595
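/* Scalar half <-> double conversions, with the same low-element and
 * pass-through convention as the sh/ss pair above: _mm_cvtsd_sh narrows
 * (rounding applies), _mm_cvtsh_sd widens exactly.
 *
 *   __m128d d = _mm_cvtsh_sd(_mm_setzero_pd(), h);  // for some __m128h h
 */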
1596#define _mm_cvt_roundsd_sh(A, B, R) \
1597 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1598 (__v8hf)_mm_undefined_ph(), \
1599 (__mmask8)(-1), (int)(R)))
1600
1601#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1602 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1603 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1604
1605#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1606 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1607 (__v8hf)_mm_setzero_ph(), \
1608 (__mmask8)(U), (int)(R)))
1609
1610static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1611 __m128d __B) {
1612 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1613 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1614 _MM_FROUND_CUR_DIRECTION);
1615}
1616
1617static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1618 __mmask8 __U,
1619 __m128h __A,
1620 __m128d __B) {
1621 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1622 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1623 _MM_FROUND_CUR_DIRECTION);
1624}
1625
1626static __inline__ __m128h __DEFAULT_FN_ATTRS128
1627_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1628 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1629 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1630 _MM_FROUND_CUR_DIRECTION);
1631}
1632
1633#define _mm_cvt_roundsh_sd(A, B, R) \
1634 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1635 (__v2df)_mm_undefined_pd(), \
1636 (__mmask8)(-1), (int)(R)))
1637
1638#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1639 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1640 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1641
1642#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1643 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1644 (__v2df)_mm_setzero_pd(), \
1645 (__mmask8)(U), (int)(R)))
1646
1647static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1648 __m128h __B) {
1649 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1650 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1651 _MM_FROUND_CUR_DIRECTION);
1652}
1653
1654static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1655 __mmask8 __U,
1656 __m128d __A,
1657 __m128h __B) {
1658 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1659 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1660 _MM_FROUND_CUR_DIRECTION);
1661}
1662
1663static __inline__ __m128d __DEFAULT_FN_ATTRS128
1664_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1665 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1666 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1667 _MM_FROUND_CUR_DIRECTION);
1668}
1669
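/* Packed half <-> 16-bit integer conversions. The cvt forms round using R
 * (or MXCSR); the cvtt forms always truncate toward zero. Out-of-range
 * inputs yield the integer indefinite value. Sketch, for some __m512h v:
 *
 *   __m512i i = _mm512_cvt_roundph_epi16(
 *       v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */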
1670#define _mm512_cvt_roundph_epi16(A, R) \
1671 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1672 (__v32hi)_mm512_undefined_epi32(), \
1673 (__mmask32)(-1), (int)(R)))
1674
1675#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1676 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1677 (__mmask32)(U), (int)(R)))
1678
1679#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1680 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1681 (__v32hi)_mm512_setzero_epi32(), \
1682 (__mmask32)(U), (int)(R)))
1683
1684static __inline__ __m512i __DEFAULT_FN_ATTRS512
1685_mm512_cvtph_epi16(__m512h __A) {
1686 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1687 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1688 _MM_FROUND_CUR_DIRECTION);
1689}
1690
1691static __inline__ __m512i __DEFAULT_FN_ATTRS512
1692_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1693 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1694 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1695}
1696
1697static __inline__ __m512i __DEFAULT_FN_ATTRS512
1698_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1699 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1700 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1701 _MM_FROUND_CUR_DIRECTION);
1702}
1703
1704#define _mm512_cvtt_roundph_epi16(A, R) \
1705 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1706 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1707 (int)(R)))
1708
1709#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1710 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1711 (__mmask32)(U), (int)(R)))
1712
1713#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1714 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1715 (__v32hi)_mm512_setzero_epi32(), \
1716 (__mmask32)(U), (int)(R)))
1717
1718static __inline__ __m512i __DEFAULT_FN_ATTRS512
1719_mm512_cvttph_epi16(__m512h __A) {
1720 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1721 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1722 _MM_FROUND_CUR_DIRECTION);
1723}
1724
1725static __inline__ __m512i __DEFAULT_FN_ATTRS512
1726_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1727 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1728 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1729}
1730
1731static __inline__ __m512i __DEFAULT_FN_ATTRS512
1732_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1733 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1734 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1735 _MM_FROUND_CUR_DIRECTION);
1736}
1737
1738#define _mm512_cvt_roundepi16_ph(A, R) \
1739 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1740 (__v32hf)_mm512_undefined_ph(), \
1741 (__mmask32)(-1), (int)(R)))
1742
1743#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1744 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1745 (__mmask32)(U), (int)(R)))
1746
1747#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1748 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1749 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1750
1751static __inline__ __m512h __DEFAULT_FN_ATTRS512
1752_mm512_cvtepi16_ph(__m512i __A) {
1753 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1754 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1755 _MM_FROUND_CUR_DIRECTION);
1756}
1757
1758static __inline__ __m512h __DEFAULT_FN_ATTRS512
1759_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1760 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1761 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1762}
1763
1764static __inline__ __m512h __DEFAULT_FN_ATTRS512
1765_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1766 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1767 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1768 _MM_FROUND_CUR_DIRECTION);
1769}
1770
1771#define _mm512_cvt_roundph_epu16(A, R) \
1772 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1773 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1774 (int)(R)))
1775
1776#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1777 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1778 (__mmask32)(U), (int)(R)))
1779
1780#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1781 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1782 (__v32hu)_mm512_setzero_epi32(), \
1783 (__mmask32)(U), (int)(R)))
1784
1785static __inline__ __m512i __DEFAULT_FN_ATTRS512
1786_mm512_cvtph_epu16(__m512h __A) {
1787 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1788 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1789 _MM_FROUND_CUR_DIRECTION);
1790}
1791
1792static __inline__ __m512i __DEFAULT_FN_ATTRS512
1793_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1794 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1795 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1796}
1797
1798static __inline__ __m512i __DEFAULT_FN_ATTRS512
1799_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1800 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1801 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1802 _MM_FROUND_CUR_DIRECTION);
1803}
1804
1805#define _mm512_cvtt_roundph_epu16(A, R) \
1806 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1807 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1808 (int)(R)))
1809
1810#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1811 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1812 (__mmask32)(U), (int)(R)))
1813
1814#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1815 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1816 (__v32hu)_mm512_setzero_epi32(), \
1817 (__mmask32)(U), (int)(R)))
1818
1819static __inline__ __m512i __DEFAULT_FN_ATTRS512
1820_mm512_cvttph_epu16(__m512h __A) {
1821 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1822 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1823 _MM_FROUND_CUR_DIRECTION);
1824}
1825
1826static __inline__ __m512i __DEFAULT_FN_ATTRS512
1827_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1828 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1829 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1830}
1831
1832static __inline__ __m512i __DEFAULT_FN_ATTRS512
1833_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1834 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1835 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1836 _MM_FROUND_CUR_DIRECTION);
1837}
1838
1839#define _mm512_cvt_roundepu16_ph(A, R) \
1840 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1841 (__v32hf)_mm512_undefined_ph(), \
1842 (__mmask32)(-1), (int)(R)))
1843
1844#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1845 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1846 (__mmask32)(U), (int)(R)))
1847
1848#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1849 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1850 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1851
1852static __inline__ __m512h __DEFAULT_FN_ATTRS512
1853_mm512_cvtepu16_ph(__m512i __A) {
1854 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1855 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1856 _MM_FROUND_CUR_DIRECTION);
1857}
1858
1859static __inline__ __m512h __DEFAULT_FN_ATTRS512
1860_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1861 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1862 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1863}
1864
1865static __inline__ __m512h __DEFAULT_FN_ATTRS512
1866_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1867 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1868 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1869 _MM_FROUND_CUR_DIRECTION);
1870}
1871
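/* Packed half <-> 32-bit integer conversions. Note the width change: the
 * sixteen halves of a __m256h widen into the sixteen dwords of a __m512i,
 * and _mm512_cvtepi32_ph narrows a __m512i back into a __m256h.
 */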
1872#define _mm512_cvt_roundph_epi32(A, R) \
1873 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1874 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1875 (int)(R)))
1876
1877#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1878 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1879 (__mmask16)(U), (int)(R)))
1880
1881#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1882 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1883 (__v16si)_mm512_setzero_epi32(), \
1884 (__mmask16)(U), (int)(R)))
1885
1886static __inline__ __m512i __DEFAULT_FN_ATTRS512
1887_mm512_cvtph_epi32(__m256h __A) {
1888 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1889 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1890 _MM_FROUND_CUR_DIRECTION);
1891}
1892
1893static __inline__ __m512i __DEFAULT_FN_ATTRS512
1894_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1895 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1896 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1897}
1898
1899static __inline__ __m512i __DEFAULT_FN_ATTRS512
1900_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1901 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1902 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1903 _MM_FROUND_CUR_DIRECTION);
1904}
1905
1906#define _mm512_cvt_roundph_epu32(A, R) \
1907 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1908 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1909 (int)(R)))
1910
1911#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1912 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1913 (__mmask16)(U), (int)(R)))
1914
1915#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1916 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1917 (__v16su)_mm512_setzero_epi32(), \
1918 (__mmask16)(U), (int)(R)))
1919
1920static __inline__ __m512i __DEFAULT_FN_ATTRS512
1921_mm512_cvtph_epu32(__m256h __A) {
1922 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1923 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1924 _MM_FROUND_CUR_DIRECTION);
1925}
1926
1927static __inline__ __m512i __DEFAULT_FN_ATTRS512
1928_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1929 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1930 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1931}
1932
1933static __inline__ __m512i __DEFAULT_FN_ATTRS512
1934_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1935 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1936 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1937 _MM_FROUND_CUR_DIRECTION);
1938}
1939
1940#define _mm512_cvt_roundepi32_ph(A, R) \
1941 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1942 (__v16hf)_mm256_undefined_ph(), \
1943 (__mmask16)(-1), (int)(R)))
1944
1945#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1946 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1947 (__mmask16)(U), (int)(R)))
1948
1949#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1950 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1951 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1952
1953static __inline__ __m256h __DEFAULT_FN_ATTRS512
1954_mm512_cvtepi32_ph(__m512i __A) {
1955 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1956 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1957 _MM_FROUND_CUR_DIRECTION);
1958}
1959
1960static __inline__ __m256h __DEFAULT_FN_ATTRS512
1961_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1962 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1963 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1964}
1965
1966static __inline__ __m256h __DEFAULT_FN_ATTRS512
1967_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1968 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1969 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1970 _MM_FROUND_CUR_DIRECTION);
1971}
1972
1973#define _mm512_cvt_roundepu32_ph(A, R) \
1974 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1975 (__v16hf)_mm256_undefined_ph(), \
1976 (__mmask16)(-1), (int)(R)))
1977
1978#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1979 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1980 (__mmask16)(U), (int)(R)))
1981
1982#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1983 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1984 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1985
1986static __inline__ __m256h __DEFAULT_FN_ATTRS512
1987_mm512_cvtepu32_ph(__m512i __A) {
1988 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1989 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1990 _MM_FROUND_CUR_DIRECTION);
1991}
1992
1993static __inline__ __m256h __DEFAULT_FN_ATTRS512
1994_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1995 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1996 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1997}
1998
1999static __inline__ __m256h __DEFAULT_FN_ATTRS512
2000_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2001 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2002 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2003 _MM_FROUND_CUR_DIRECTION);
2004}
2005
2006#define _mm512_cvtt_roundph_epi32(A, R) \
2007 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2008 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2009 (int)(R)))
2010
2011#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2012 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2013 (__mmask16)(U), (int)(R)))
2014
2015#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2016 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2017 (__v16si)_mm512_setzero_epi32(), \
2018 (__mmask16)(U), (int)(R)))
2019
2020static __inline__ __m512i __DEFAULT_FN_ATTRS512
2021_mm512_cvttph_epi32(__m256h __A) {
2022 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2023 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2024 _MM_FROUND_CUR_DIRECTION);
2025}
2026
2027static __inline__ __m512i __DEFAULT_FN_ATTRS512
2028_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2029 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2030 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2031}
2032
2033static __inline__ __m512i __DEFAULT_FN_ATTRS512
2034_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2035 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2036 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2037 _MM_FROUND_CUR_DIRECTION);
2038}
2039
2040#define _mm512_cvtt_roundph_epu32(A, R) \
2041 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2042 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2043 (int)(R)))
2044
2045#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2046 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2047 (__mmask16)(U), (int)(R)))
2048
2049#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2050 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2051 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2052 (int)(R)))
2053
2054static __inline__ __m512i __DEFAULT_FN_ATTRS512
2055_mm512_cvttph_epu32(__m256h __A) {
2056 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2057 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2058 _MM_FROUND_CUR_DIRECTION);
2059}
2060
2061static __inline__ __m512i __DEFAULT_FN_ATTRS512
2062_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2063 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2064 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2065}
2066
2067static __inline__ __m512i __DEFAULT_FN_ATTRS512
2068_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2069 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2070 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2071 _MM_FROUND_CUR_DIRECTION);
2072}
2073
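/* Packed half <-> 64-bit integer conversions: the eight halves of a
 * __m128h pair with the eight qwords of a __m512i. As above, cvt honors
 * the rounding mode and cvtt truncates toward zero.
 */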
2074#define _mm512_cvt_roundepi64_ph(A, R) \
2075 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2076 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2077
2078#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2079 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2080 (__mmask8)(U), (int)(R)))
2081
2082#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2083 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2084 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2085
2086static __inline__ __m128h __DEFAULT_FN_ATTRS512
2087_mm512_cvtepi64_ph(__m512i __A) {
2088 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2089 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2090 _MM_FROUND_CUR_DIRECTION);
2091}
2092
2093static __inline__ __m128h __DEFAULT_FN_ATTRS512
2094_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2095 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2096 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2097}
2098
2099static __inline__ __m128h __DEFAULT_FN_ATTRS512
2100_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2101 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2102 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2103 _MM_FROUND_CUR_DIRECTION);
2104}
2105
2106#define _mm512_cvt_roundph_epi64(A, R) \
2107 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2108 (__v8di)_mm512_undefined_epi32(), \
2109 (__mmask8)(-1), (int)(R)))
2110
2111#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2112 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2113 (__mmask8)(U), (int)(R)))
2114
2115#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2116 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2117 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2118
2119static __inline__ __m512i __DEFAULT_FN_ATTRS512
2120_mm512_cvtph_epi64(__m128h __A) {
2121 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2122 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2123 _MM_FROUND_CUR_DIRECTION);
2124}
2125
2126static __inline__ __m512i __DEFAULT_FN_ATTRS512
2127_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2128 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2129 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2130}
2131
2132static __inline__ __m512i __DEFAULT_FN_ATTRS512
2133_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2134 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2135 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2136 _MM_FROUND_CUR_DIRECTION);
2137}
2138
2139#define _mm512_cvt_roundepu64_ph(A, R) \
2140 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2141 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2142
2143#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2144 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2145 (__mmask8)(U), (int)(R)))
2146
2147#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2148 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2149 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2150
2151static __inline__ __m128h __DEFAULT_FN_ATTRS512
2152_mm512_cvtepu64_ph(__m512i __A) {
2153 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2154 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2155 _MM_FROUND_CUR_DIRECTION);
2156}
2157
2158static __inline__ __m128h __DEFAULT_FN_ATTRS512
2159_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2160 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2161 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2162}
2163
2164static __inline__ __m128h __DEFAULT_FN_ATTRS512
2165_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2166 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2167 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2168 _MM_FROUND_CUR_DIRECTION);
2169}
2170
2171#define _mm512_cvt_roundph_epu64(A, R) \
2172 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2173 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2174 (int)(R)))
2175
2176#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2177 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2178 (__mmask8)(U), (int)(R)))
2179
2180#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2181 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2182 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2183
2184static __inline__ __m512i __DEFAULT_FN_ATTRS512
2185_mm512_cvtph_epu64(__m128h __A) {
2186 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2187 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2188 _MM_FROUND_CUR_DIRECTION);
2189}
2190
2191static __inline__ __m512i __DEFAULT_FN_ATTRS512
2192_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2193 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2194 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2195}
2196
2197static __inline__ __m512i __DEFAULT_FN_ATTRS512
2198_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2199 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2200 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2201 _MM_FROUND_CUR_DIRECTION);
2202}
2203
2204#define _mm512_cvtt_roundph_epi64(A, R) \
2205 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2206 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2207 (int)(R)))
2208
2209#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2210 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2211 (__mmask8)(U), (int)(R)))
2212
2213#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2214 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2215 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2216
2217static __inline__ __m512i __DEFAULT_FN_ATTRS512
2218_mm512_cvttph_epi64(__m128h __A) {
2219 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2220 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2221 _MM_FROUND_CUR_DIRECTION);
2222}
2223
2224static __inline__ __m512i __DEFAULT_FN_ATTRS512
2225_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2226 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2227 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2228}
2229
2230static __inline__ __m512i __DEFAULT_FN_ATTRS512
2231_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2232 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2233 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2234 _MM_FROUND_CUR_DIRECTION);
2235}
2236
2237#define _mm512_cvtt_roundph_epu64(A, R) \
2238 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2239 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2240 (int)(R)))
2241
2242#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2243 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2244 (__mmask8)(U), (int)(R)))
2245
2246#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2247 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2248 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2249
2250static __inline__ __m512i __DEFAULT_FN_ATTRS512
2251_mm512_cvttph_epu64(__m128h __A) {
2252 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2253 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2254 _MM_FROUND_CUR_DIRECTION);
2255}
2256
2257static __inline__ __m512i __DEFAULT_FN_ATTRS512
2258_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2259 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2260 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2261}
2262
2263static __inline__ __m512i __DEFAULT_FN_ATTRS512
2264_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2265 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2266 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2267 _MM_FROUND_CUR_DIRECTION);
2268}
2269
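/* Scalar half <-> integer conversions. R in the _round_ macros must be a
 * compile-time constant; the 64-bit forms are only available on x86-64.
 * The integer -> half helpers simply overwrite element 0 and let the
 * compiler select vcvtsi2sh/vcvtusi2sh. Sketch, for some __m128h h:
 *
 *   int i = _mm_cvt_roundsh_i32(h, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 */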
2270#define _mm_cvt_roundsh_i32(A, R) \
2271 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2272
2273static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2274 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2275}
2276
2277#define _mm_cvt_roundsh_u32(A, R) \
2278 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2279
2280static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2281_mm_cvtsh_u32(__m128h __A) {
2282 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2283 _MM_FROUND_CUR_DIRECTION);
2284}
2285
2286#ifdef __x86_64__
2287#define _mm_cvt_roundsh_i64(A, R) \
2288 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2289
2290static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2291 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2292 _MM_FROUND_CUR_DIRECTION);
2293}
2294
2295#define _mm_cvt_roundsh_u64(A, R) \
2296 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2297
2298static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2299_mm_cvtsh_u64(__m128h __A) {
2300 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2301 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2302}
2303#endif // __x86_64__
2304
2305#define _mm_cvt_roundu32_sh(A, B, R) \
2306 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2307
2308static __inline__ __m128h __DEFAULT_FN_ATTRS128
2309_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2310 __A[0] = __B;
2311 return __A;
2312}
2313
2314#ifdef __x86_64__
2315#define _mm_cvt_roundu64_sh(A, B, R) \
2316 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2317 (int)(R)))
2318
2319static __inline__ __m128h __DEFAULT_FN_ATTRS128
2320_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2321 __A[0] = __B;
2322 return __A;
2323}
2324#endif
2325
2326#define _mm_cvt_roundi32_sh(A, B, R) \
2327 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2328
2329static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2330 int __B) {
2331 __A[0] = __B;
2332 return __A;
2333}
2334
2335#ifdef __x86_64__
2336#define _mm_cvt_roundi64_sh(A, B, R) \
2337 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2338
2339static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2340 long long __B) {
2341 __A[0] = __B;
2342 return __A;
2343}
2344#endif
2345
2346#define _mm_cvtt_roundsh_i32(A, R) \
2347 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2348
2349static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2350 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2351 _MM_FROUND_CUR_DIRECTION);
2352}
2353
2354#ifdef __x86_64__
2355#define _mm_cvtt_roundsh_i64(A, R) \
2356 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2357
2358static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2359 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2360 _MM_FROUND_CUR_DIRECTION);
2361}
2362#endif
2363
2364#define _mm_cvtt_roundsh_u32(A, R) \
2365 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2366
2367static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2368_mm_cvttsh_u32(__m128h __A) {
2369 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2370 _MM_FROUND_CUR_DIRECTION);
2371}
2372
2373#ifdef __x86_64__
2374#define _mm_cvtt_roundsh_u64(A, R) \
2375 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2376
2377static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2378_mm_cvttsh_u64(__m128h __A) {
2379 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2380 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2381}
2382#endif
2383
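/* The "x"-suffixed conversions are the AVX512-FP16 half <-> single forms
 * that operate on _Float16 vector types directly (the pre-existing
 * _mm512_cvtph_ps path takes an integer-typed __m256i source instead).
 */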
2384#define _mm512_cvtx_roundph_ps(A, R) \
2385 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2386 (__v16sf)_mm512_undefined_ps(), \
2387 (__mmask16)(-1), (int)(R)))
2388
2389#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2390 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2391 (__mmask16)(U), (int)(R)))
2392
2393#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2394 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2395 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2396
2397static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2398 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2399 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2400 _MM_FROUND_CUR_DIRECTION);
2401}
2402
2403static __inline__ __m512 __DEFAULT_FN_ATTRS512
2404_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2405 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2406 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2407}
2408
2409static __inline__ __m512 __DEFAULT_FN_ATTRS512
2410_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2411 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2412 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2413 _MM_FROUND_CUR_DIRECTION);
2414}
2415
2416#define _mm512_cvtx_roundps_ph(A, R) \
2417 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2418 (__v16hf)_mm256_undefined_ph(), \
2419 (__mmask16)(-1), (int)(R)))
2420
2421#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2422 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2423 (__mmask16)(U), (int)(R)))
2424
2425#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2426 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2427 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2428
2429static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2430 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2431 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2432 _MM_FROUND_CUR_DIRECTION);
2433}
2434
2435static __inline__ __m256h __DEFAULT_FN_ATTRS512
2436_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2437 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2438 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2439}
2440
2441static __inline__ __m256h __DEFAULT_FN_ATTRS512
2442_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2443 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2444 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2445 _MM_FROUND_CUR_DIRECTION);
2446}
2447
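/* Packed FP16 fused multiply-add family. Every variant maps onto the same
 * vfmaddph builtins with operands negated as needed: fmsub negates C,
 * fnmadd negates the product, fnmsub negates both. The masked forms differ
 * only in where inactive lanes come from: mask keeps A, mask3 keeps C, and
 * maskz zeroes them. Sketch, for vectors a, b, c and a mask m:
 *
 *   __m512h r = _mm512_mask3_fmadd_ph(a, b, c, m);  // !m lanes keep c
 */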
2448#define _mm512_fmadd_round_ph(A, B, C, R) \
2449 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2450 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2451 (__mmask32)-1, (int)(R)))
2452
2453#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2454 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2455 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2456 (__mmask32)(U), (int)(R)))
2457
2458#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2459 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2460 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2461 (__mmask32)(U), (int)(R)))
2462
2463#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2464 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2465 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2466 (__mmask32)(U), (int)(R)))
2467
2468#define _mm512_fmsub_round_ph(A, B, C, R) \
2469 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2470 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2471 (__mmask32)-1, (int)(R)))
2472
2473#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2474 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2475 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2476 (__mmask32)(U), (int)(R)))
2477
2478#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2479 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2480 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2481 (__mmask32)(U), (int)(R)))
2482
2483#define _mm512_fnmadd_round_ph(A, B, C, R) \
2484 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2485 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2486 (__mmask32)-1, (int)(R)))
2487
2488#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2489 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2490 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2491 (__mmask32)(U), (int)(R)))
2492
2493#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2494 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2495 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2496 (__mmask32)(U), (int)(R)))
2497
2498#define _mm512_fnmsub_round_ph(A, B, C, R) \
2499 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2500 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2501 (__mmask32)-1, (int)(R)))
2502
2503#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2504 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2505 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2506 (__mmask32)(U), (int)(R)))
2507
2508static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2509 __m512h __B,
2510 __m512h __C) {
2511 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2512 (__v32hf)__C, (__mmask32)-1,
2513 _MM_FROUND_CUR_DIRECTION);
2514}
2515
2516static __inline__ __m512h __DEFAULT_FN_ATTRS512
2517_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2518 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2519 (__v32hf)__C, (__mmask32)__U,
2520 _MM_FROUND_CUR_DIRECTION);
2521}
2522
2523static __inline__ __m512h __DEFAULT_FN_ATTRS512
2524_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2525 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2526 (__v32hf)__C, (__mmask32)__U,
2527 _MM_FROUND_CUR_DIRECTION);
2528}
2529
2530static __inline__ __m512h __DEFAULT_FN_ATTRS512
2531_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2532 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2533 (__v32hf)__C, (__mmask32)__U,
2534 _MM_FROUND_CUR_DIRECTION);
2535}
2536
2537static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2538 __m512h __B,
2539 __m512h __C) {
2540 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2541 -(__v32hf)__C, (__mmask32)-1,
2542 _MM_FROUND_CUR_DIRECTION);
2543}
2544
2545static __inline__ __m512h __DEFAULT_FN_ATTRS512
2546_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2547 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2548 -(__v32hf)__C, (__mmask32)__U,
2549 _MM_FROUND_CUR_DIRECTION);
2550}
2551
2552static __inline__ __m512h __DEFAULT_FN_ATTRS512
2553_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2554 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2555 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2556 _MM_FROUND_CUR_DIRECTION);
2557}
2558
2559static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2560 __m512h __B,
2561 __m512h __C) {
2562 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2563 (__v32hf)__C, (__mmask32)-1,
2564 _MM_FROUND_CUR_DIRECTION);
2565}
2566
2567static __inline__ __m512h __DEFAULT_FN_ATTRS512
2568_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2569 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2570 (__v32hf)__C, (__mmask32)__U,
2571 _MM_FROUND_CUR_DIRECTION);
2572}
2573
2574static __inline__ __m512h __DEFAULT_FN_ATTRS512
2575_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2576 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2577 (__v32hf)__C, (__mmask32)__U,
2578 _MM_FROUND_CUR_DIRECTION);
2579}
2580
2581static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2582 __m512h __B,
2583 __m512h __C) {
2584 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2585 -(__v32hf)__C, (__mmask32)-1,
2586 _MM_FROUND_CUR_DIRECTION);
2587}
2588
2589static __inline__ __m512h __DEFAULT_FN_ATTRS512
2590_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2591 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2592 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2593 _MM_FROUND_CUR_DIRECTION);
2594}
2595
2596#define _mm512_fmaddsub_round_ph(A, B, C, R) \
2597 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2598 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2599 (__mmask32)-1, (int)(R)))
2600
2601#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2602 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2603 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2604 (__mmask32)(U), (int)(R)))
2605
2606#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2607 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2608 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2609 (__mmask32)(U), (int)(R)))
2610
2611#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2612 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2613 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2614 (__mmask32)(U), (int)(R)))
2615
2616#define _mm512_fmsubadd_round_ph(A, B, C, R) \
2617 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2618 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2619 (__mmask32)-1, (int)(R)))
2620
2621#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2622 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2623 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2624 (__mmask32)(U), (int)(R)))
2625
2626#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2627 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2628 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2629 (__mmask32)(U), (int)(R)))
2630
2631static __inline__ __m512h __DEFAULT_FN_ATTRS512
2632_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2633 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2634 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2635 _MM_FROUND_CUR_DIRECTION);
2636}
2637
2638static __inline__ __m512h __DEFAULT_FN_ATTRS512
2639_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2640 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2641 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2642 _MM_FROUND_CUR_DIRECTION);
2643}
2644
2645static __inline__ __m512h __DEFAULT_FN_ATTRS512
2646_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2647 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2648 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2649 _MM_FROUND_CUR_DIRECTION);
2650}
2651
2652static __inline__ __m512h __DEFAULT_FN_ATTRS512
2653_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2654 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2655 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2656 _MM_FROUND_CUR_DIRECTION);
2657}
2658
2659static __inline__ __m512h __DEFAULT_FN_ATTRS512
2660_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2661 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2662 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2663 _MM_FROUND_CUR_DIRECTION);
2664}
2665
2666static __inline__ __m512h __DEFAULT_FN_ATTRS512
2667_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2668 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2669 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2670 _MM_FROUND_CUR_DIRECTION);
2671}
2672
2673static __inline__ __m512h __DEFAULT_FN_ATTRS512
2674_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2675 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2676 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2677 _MM_FROUND_CUR_DIRECTION);
2678}
2679
2680#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2681 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2682 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2683 (__mmask32)(U), (int)(R)))
2684
2685static __inline__ __m512h __DEFAULT_FN_ATTRS512
2686_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2687 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2688 (__v32hf)__C, (__mmask32)__U,
2689 _MM_FROUND_CUR_DIRECTION);
2690}
2691
2692#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2693 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2694 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2695 (__mmask32)(U), (int)(R)))
2696
2697static __inline__ __m512h __DEFAULT_FN_ATTRS512
2698_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2699 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2700 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2701 _MM_FROUND_CUR_DIRECTION);
2702}
2703
2704#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2705 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2706 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2707 (__mmask32)(U), (int)(R)))
2708
2709static __inline__ __m512h __DEFAULT_FN_ATTRS512
2710_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2711 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2712 (__v32hf)__C, (__mmask32)__U,
2713 _MM_FROUND_CUR_DIRECTION);
2714}
2715
2716#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2717 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2718 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2719 (__mmask32)(U), (int)(R)))
2720
2721#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2722 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2723 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2724 (__mmask32)(U), (int)(R)))
2725
2726static __inline__ __m512h __DEFAULT_FN_ATTRS512
2727_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2728 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2729 -(__v32hf)__C, (__mmask32)__U,
2730 _MM_FROUND_CUR_DIRECTION);
2731}
2732
2733static __inline__ __m512h __DEFAULT_FN_ATTRS512
2734_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2735 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2736 (__v32hf)__C, (__mmask32)__U,
2737 _MM_FROUND_CUR_DIRECTION);
2738}
2739
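/* Scalar FP16 FMA. _mm_fmadd_sh(__W, __A, __B) computes __W[0] * __A[0] +
 * __B[0] in element 0 and passes __W[1..7] through; the masked variants
 * follow the same mask/mask3/maskz conventions as the packed family above.
 */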
2740static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2741 __m128h __A,
2742 __m128h __B) {
2743 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2744 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2745}
2746
2747static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2748 __mmask8 __U,
2749 __m128h __A,
2750 __m128h __B) {
2751 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2752 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2753}
2754
2755#define _mm_fmadd_round_sh(A, B, C, R) \
2756 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2757 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2758 (__mmask8)-1, (int)(R)))
2759
2760#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2761 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2762 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2763 (__mmask8)(U), (int)(R)))
2764
2765static __inline__ __m128h __DEFAULT_FN_ATTRS128
2766_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2767 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2768 (__mmask8)__U,
2769 _MM_FROUND_CUR_DIRECTION);
2770}
2771
2772#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2773 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2774 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2775 (__mmask8)(U), (int)(R)))
2776
2777static __inline__ __m128h __DEFAULT_FN_ATTRS128
2778_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2779 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2780 (__mmask8)__U,
2781 _MM_FROUND_CUR_DIRECTION);
2782}
2783
2784#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2785 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2786 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2787 (__mmask8)(U), (int)(R)))
2788
2789static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2790 __m128h __A,
2791 __m128h __B) {
2792 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2793 -(__v8hf)__B, (__mmask8)-1,
2794 _MM_FROUND_CUR_DIRECTION);
2795}
2796
2797static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2798 __mmask8 __U,
2799 __m128h __A,
2800 __m128h __B) {
2801 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2802 -(__v8hf)__B, (__mmask8)__U,
2803 _MM_FROUND_CUR_DIRECTION);
2804}
2805
2806#define _mm_fmsub_round_sh(A, B, C, R) \
2807 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2808 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2809 (__mmask8)-1, (int)(R)))
2810
2811#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2812 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2813 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2814 (__mmask8)(U), (int)(R)))
2815
2816static __inline__ __m128h __DEFAULT_FN_ATTRS128
2817_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2818 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2819 -(__v8hf)__C, (__mmask8)__U,
2820 _MM_FROUND_CUR_DIRECTION);
2821}
2822
2823#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2824 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2825 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2826 (__mmask8)(U), (int)(R)))
2827
2828static __inline__ __m128h __DEFAULT_FN_ATTRS128
2829_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2830 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2831 (__mmask8)__U,
2832 _MM_FROUND_CUR_DIRECTION);
2833}
2834
2835#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2836 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2837 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2838 (__mmask8)(U), (int)(R)))
2839
2840static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2841 __m128h __A,
2842 __m128h __B) {
2843 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2844 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2845}
2846
2847static __inline__ __m128h __DEFAULT_FN_ATTRS128
2848_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2849 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2850 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2851}
2852
2853#define _mm_fnmadd_round_sh(A, B, C, R) \
2854 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2855 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2856 (__mmask8)-1, (int)(R)))
2857
2858#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2859 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2860 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2861 (__mmask8)(U), (int)(R)))
2862
2863static __inline__ __m128h __DEFAULT_FN_ATTRS128
2864_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2865 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2866 (__mmask8)__U,
2867 _MM_FROUND_CUR_DIRECTION);
2868}
2869
2870#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2871 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2872 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2873 (__mmask8)(U), (int)(R)))
2874
2875static __inline__ __m128h __DEFAULT_FN_ATTRS128
2876_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2877 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2878 (__mmask8)__U,
2879 _MM_FROUND_CUR_DIRECTION);
2880}
2881
2882#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2883 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2884 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2885 (__mmask8)(U), (int)(R)))
2886
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))

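/* Note (editorial, not in the upstream header): the *_sch/*_pch intrinsics
 * below treat each pair of adjacent _Float16 elements as one complex number,
 * with the real part in the even element and the imaginary part in the odd
 * element. That is why the operands are cast to __v4sf/__v16sf: each 32-bit
 * lane carries one complex half-precision value. The "fc" forms multiply by
 * the complex conjugate of the second operand; the plain "f" forms perform an
 * ordinary complex multiply.
 */
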
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__C, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

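/* Illustrative sketch (editorial): multiply (1 + 2i) by the conjugate of
 * (3 + 4i) and add (5 + 6i) in the low complex slot. Expected low pair:
 * (1*3 + 2*4) + 5 = 16 real, (2*3 - 1*4) + 6 = 8 imaginary.
 *
 *   __m128h __a = _mm_setr_ph((_Float16)1.0, (_Float16)2.0, 0, 0, 0, 0, 0, 0);
 *   __m128h __b = _mm_setr_ph((_Float16)3.0, (_Float16)4.0, 0, 0, 0, 0, 0, 0);
 *   __m128h __c = _mm_setr_ph((_Float16)5.0, (_Float16)6.0, 0, 0, 0, 0, 0, 0);
 *   __m128h __r = _mm_fcmadd_sch(__a, __b, __c); // low pair {16.0, 8.0}
 *
 * _mm_fmadd_sch (below) is identical except that __B is not conjugated.
 */
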
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__C, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sch(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

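/* Illustrative sketch (editorial): complex products in the low slot.
 * (1 + 2i) * (3 + 4i) = -5 + 10i, while (1 + 2i) * conj(3 + 4i) = 11 + 2i.
 *
 *   __m128h __a = _mm_setr_ph((_Float16)1.0, (_Float16)2.0, 0, 0, 0, 0, 0, 0);
 *   __m128h __b = _mm_setr_ph((_Float16)3.0, (_Float16)4.0, 0, 0, 0, 0, 0, 0);
 *   __m128h __p  = _mm_fmul_sch(__a, __b);  // low pair {-5.0, 10.0}
 *   __m128h __cp = _mm_fcmul_sch(__a, __b); // low pair {11.0, 2.0}
 */
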
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
                                                                __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))

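/* Illustrative sketch (editorial): the packed forms apply the same complex
 * product to all 16 complex pairs, and each bit of the __mmask16 governs one
 * complex pair (two _Float16 elements), not one scalar element.
 *
 *   __m512h __a = _mm512_set1_ph((_Float16)1.0); // sixteen copies of 1 + 1i
 *   __m512h __b = _mm512_set1_ph((_Float16)2.0); // sixteen copies of 2 + 2i
 *   __m512h __p = _mm512_fmul_pch(__a, __b); // each pair: (1+1i)(2+2i) = 0+4i
 */
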
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
                                                                  __m512h __B,
                                                                  __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
                                                    (__v16sf)__C, (__mmask16)-1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__C, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

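/* Illustrative sketch (editorial): a complex dot-product step. Accumulating
 * a[i] * conj(b[i]) across a buffer is the usual building block for complex
 * correlation; the packed conjugate FMA handles 16 such pairs per operation.
 * The helper below is hypothetical, not part of this header.
 *
 *   static inline __m512h acc_corr(__m512h __acc, __m512h __a, __m512h __b) {
 *     return _mm512_fcmadd_pch(__a, __b, __acc); // acc += a * conj(b)
 *   }
 */
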
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}

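/* Illustrative sketch (editorial): the horizontal reductions collapse all 32
 * half-precision lanes to one scalar. -0.0f16 is the identity for fadd (so a
 * vector of +0.0 still reduces to +0.0), and 1.0f16 is the identity for fmul.
 *
 *   __m512h __v = _mm512_set1_ph((_Float16)1.0);
 *   _Float16 __sum = _mm512_reduce_add_ph(__v); // 32.0
 *   _Float16 __max = _mm512_reduce_max_ph(__v); // 1.0
 */
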
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}

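/* Illustrative sketch (editorial): for each lane i, _mm512_mask_blend_ph
 * picks __W[i] where mask bit i is set and __A[i] where it is clear;
 * _mm512_permutexvar_ph gathers __B[idx[i]] using 16-bit indices in __A.
 *
 *   __m512h __a = _mm512_set1_ph((_Float16)1.0);
 *   __m512h __w = _mm512_set1_ph((_Float16)2.0);
 *   // even lanes from __w, odd lanes from __a
 *   __m512h __r = _mm512_mask_blend_ph(0x55555555, __a, __w);
 *   // broadcast lane 0 of __w to all lanes
 *   __m512h __p = _mm512_permutexvar_ph(_mm512_setzero_si512(), __w);
 */
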
// The intrinsics below are aliases for the corresponding f*mul_*ch
// intrinsics above.
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512

#endif