/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVX512VLFP16INTRIN_H
15#define __AVX512VLFP16INTRIN_H
16
/* Default attributes for the functions in this file: always inlined, no debug
 * info, compiled for the avx512fp16 + avx512vl targets, and tagged with the
 * minimum vector width (256 or 128 bits) the function operates on. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16, avx512vl"),                           \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16, avx512vl"),                           \
                 __min_vector_width__(128)))
26
27static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
28 return __a[0];
29}
30
31static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
32 return __a[0];
33}
34
35static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
36 return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
37}
38
39static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
40 return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
41}
42
43static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
44 return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
45 __h, __h, __h, __h, __h, __h, __h, __h};
46}
47
48static __inline __m128h __DEFAULT_FN_ATTRS128
49_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
50 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
51 return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
52}
53
54static __inline __m256h __DEFAULT_FN_ATTRS256
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -080055_mm256_set1_pch(_Float16 _Complex h) {
56 return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
57}
58
59static __inline __m128h __DEFAULT_FN_ATTRS128
60_mm_set1_pch(_Float16 _Complex h) {
61 return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
62}
63
64static __inline __m256h __DEFAULT_FN_ATTRS256
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -080065_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
66 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
67 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
68 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
69 return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
70 __h10, __h9, __h8, __h7, __h6, __h5,
71 __h4, __h3, __h2, __h1};
72}
73
/* Reversed-order variant of _mm_set_ph: arguments are listed lowest lane
 * first, so h1 lands in lane 0 and h8 in lane 7. */
#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  _mm_set_ph((h8), (h7), (h6), (h5),                                           \
             (h4), (h3), (h2), (h1))
76
/* Reversed-order variant of _mm256_set_ph: arguments are listed lowest lane
 * first, so h1 lands in lane 0 and h16 in lane 15. */
#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16)                                          \
  _mm256_set_ph((h16), (h15), (h14), (h13),                                    \
                (h12), (h11), (h10), (h9),                                     \
                (h8), (h7), (h6), (h5),                                        \
                (h4), (h3), (h2), (h1))
81
82static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
83 __m256h __B) {
84 return (__m256h)((__v16hf)__A + (__v16hf)__B);
85}
86
87static __inline__ __m256h __DEFAULT_FN_ATTRS256
88_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
89 return (__m256h)__builtin_ia32_selectph_256(
90 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
91}
92
93static __inline__ __m256h __DEFAULT_FN_ATTRS256
94_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
95 return (__m256h)__builtin_ia32_selectph_256(
96 __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
97}
98
99static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
100 __m128h __B) {
101 return (__m128h)((__v8hf)__A + (__v8hf)__B);
102}
103
104static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
105 __mmask8 __U,
106 __m128h __A,
107 __m128h __B) {
108 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
109 (__v8hf)__W);
110}
111
112static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
113 __m128h __A,
114 __m128h __B) {
115 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
116 (__v8hf)_mm_setzero_ph());
117}
118
119static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
120 __m256h __B) {
121 return (__m256h)((__v16hf)__A - (__v16hf)__B);
122}
123
124static __inline__ __m256h __DEFAULT_FN_ATTRS256
125_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
126 return (__m256h)__builtin_ia32_selectph_256(
127 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
128}
129
130static __inline__ __m256h __DEFAULT_FN_ATTRS256
131_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
132 return (__m256h)__builtin_ia32_selectph_256(
133 __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
134}
135
136static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
137 __m128h __B) {
138 return (__m128h)((__v8hf)__A - (__v8hf)__B);
139}
140
141static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
142 __mmask8 __U,
143 __m128h __A,
144 __m128h __B) {
145 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
146 (__v8hf)__W);
147}
148
149static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
150 __m128h __A,
151 __m128h __B) {
152 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
153 (__v8hf)_mm_setzero_ph());
154}
155
156static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
157 __m256h __B) {
158 return (__m256h)((__v16hf)__A * (__v16hf)__B);
159}
160
161static __inline__ __m256h __DEFAULT_FN_ATTRS256
162_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
163 return (__m256h)__builtin_ia32_selectph_256(
164 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
165}
166
167static __inline__ __m256h __DEFAULT_FN_ATTRS256
168_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
169 return (__m256h)__builtin_ia32_selectph_256(
170 __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
171}
172
173static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
174 __m128h __B) {
175 return (__m128h)((__v8hf)__A * (__v8hf)__B);
176}
177
178static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
179 __mmask8 __U,
180 __m128h __A,
181 __m128h __B) {
182 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
183 (__v8hf)__W);
184}
185
186static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
187 __m128h __A,
188 __m128h __B) {
189 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
190 (__v8hf)_mm_setzero_ph());
191}
192
193static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
194 __m256h __B) {
195 return (__m256h)((__v16hf)__A / (__v16hf)__B);
196}
197
198static __inline__ __m256h __DEFAULT_FN_ATTRS256
199_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
200 return (__m256h)__builtin_ia32_selectph_256(
201 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
202}
203
204static __inline__ __m256h __DEFAULT_FN_ATTRS256
205_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
206 return (__m256h)__builtin_ia32_selectph_256(
207 __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
208}
209
210static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
211 __m128h __B) {
212 return (__m128h)((__v8hf)__A / (__v8hf)__B);
213}
214
215static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
216 __mmask8 __U,
217 __m128h __A,
218 __m128h __B) {
219 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
220 (__v8hf)__W);
221}
222
223static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
224 __m128h __A,
225 __m128h __B) {
226 return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
227 (__v8hf)_mm_setzero_ph());
228}
229
230static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
231 __m256h __B) {
232 return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
233}
234
235static __inline__ __m256h __DEFAULT_FN_ATTRS256
236_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
237 return (__m256h)__builtin_ia32_selectph_256(
238 (__mmask16)__U,
239 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
240 (__v16hf)__W);
241}
242
243static __inline__ __m256h __DEFAULT_FN_ATTRS256
244_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
245 return (__m256h)__builtin_ia32_selectph_256(
246 (__mmask16)__U,
247 (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
248 (__v16hf)_mm256_setzero_ph());
249}
250
251static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
252 __m128h __B) {
253 return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
254}
255
256static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
257 __mmask8 __U,
258 __m128h __A,
259 __m128h __B) {
260 return (__m128h)__builtin_ia32_selectph_128(
261 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
262 (__v8hf)__W);
263}
264
265static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
266 __m128h __A,
267 __m128h __B) {
268 return (__m128h)__builtin_ia32_selectph_128(
269 (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
270 (__v8hf)_mm_setzero_ph());
271}
272
273static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
274 __m256h __B) {
275 return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
276}
277
278static __inline__ __m256h __DEFAULT_FN_ATTRS256
279_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
280 return (__m256h)__builtin_ia32_selectph_256(
281 (__mmask16)__U,
282 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
283 (__v16hf)__W);
284}
285
286static __inline__ __m256h __DEFAULT_FN_ATTRS256
287_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
288 return (__m256h)__builtin_ia32_selectph_256(
289 (__mmask16)__U,
290 (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
291 (__v16hf)_mm256_setzero_ph());
292}
293
294static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
295 __m128h __B) {
296 return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
297}
298
299static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
300 __mmask8 __U,
301 __m128h __A,
302 __m128h __B) {
303 return (__m128h)__builtin_ia32_selectph_128(
304 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
305 (__v8hf)__W);
306}
307
308static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
309 __m128h __A,
310 __m128h __B) {
311 return (__m128h)__builtin_ia32_selectph_128(
312 (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
313 (__v8hf)_mm_setzero_ph());
314}
315
316static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
317 return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
318}
319
320static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
321 return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
322}
323
324static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
325 return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
326}
327
328static __inline__ __m256h __DEFAULT_FN_ATTRS256
329_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
330 return (__m256h)__builtin_ia32_selectps_256(
331 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
332}
333
334static __inline__ __m256h __DEFAULT_FN_ATTRS256
335_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
336 return (__m256h)__builtin_ia32_selectps_256(
337 (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
338}
339
340static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
341 return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
342}
343
344static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
345 __mmask8 __U,
346 __m128h __A) {
347 return (__m128h)__builtin_ia32_selectps_128(
348 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
349}
350
351static __inline__ __m128h __DEFAULT_FN_ATTRS128
352_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
353 return (__m128h)__builtin_ia32_selectps_128(
354 (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
355}
356
/* Compares the FP16 lanes of a and b using predicate p and returns the result
 * as a 16-bit mask.  All lanes take part (input mask is all ones).  Kept as a
 * macro so p is passed through to the builtin as written. */
#define _mm256_cmp_ph_mask(a, b, p)                                            \
  ((__mmask16)__builtin_ia32_cmpph256_mask((__v16hf)(__m256h)(a),              \
                                           (__v16hf)(__m256h)(b), (int)(p),    \
                                           (__mmask16)-1))
360
/* Compares the FP16 lanes of a and b using predicate p under input mask m and
 * returns the result as a 16-bit mask. */
#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
  ((__mmask16)__builtin_ia32_cmpph256_mask((__v16hf)(__m256h)(a),              \
                                           (__v16hf)(__m256h)(b), (int)(p),    \
                                           (__mmask16)(m)))
364
/* Compares the FP16 lanes of a and b using predicate p and returns the result
 * as an 8-bit mask.  All lanes take part (input mask is all ones). */
#define _mm_cmp_ph_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpph128_mask((__v8hf)(__m128h)(a),                \
                                          (__v8hf)(__m128h)(b), (int)(p),      \
                                          (__mmask8)-1))
368
/* Compares the FP16 lanes of a and b using predicate p under input mask m and
 * returns the result as an 8-bit mask. */
#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpph128_mask((__v8hf)(__m128h)(a),                \
                                          (__v8hf)(__m128h)(b), (int)(p),      \
                                          (__mmask8)(m)))
372
373static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
374 return (__m256h)__builtin_ia32_rcpph256_mask(
375 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
376}
377
378static __inline__ __m256h __DEFAULT_FN_ATTRS256
379_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
380 return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
381 (__mmask16)__U);
382}
383
384static __inline__ __m256h __DEFAULT_FN_ATTRS256
385_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
386 return (__m256h)__builtin_ia32_rcpph256_mask(
387 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
388}
389
390static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
391 return (__m128h)__builtin_ia32_rcpph128_mask(
392 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
393}
394
395static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
396 __mmask8 __U,
397 __m128h __A) {
398 return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
399 (__mmask8)__U);
400}
401
402static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
403 __m128h __A) {
404 return (__m128h)__builtin_ia32_rcpph128_mask(
405 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
406}
407
408static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
409 return (__m256h)__builtin_ia32_rsqrtph256_mask(
410 (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
411}
412
413static __inline__ __m256h __DEFAULT_FN_ATTRS256
414_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
415 return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
416 (__mmask16)__U);
417}
418
419static __inline__ __m256h __DEFAULT_FN_ATTRS256
420_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
421 return (__m256h)__builtin_ia32_rsqrtph256_mask(
422 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
423}
424
425static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
426 return (__m128h)__builtin_ia32_rsqrtph128_mask(
427 (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
428}
429
430static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
431 __mmask8 __U,
432 __m128h __A) {
433 return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
434 (__mmask8)__U);
435}
436
437static __inline__ __m128h __DEFAULT_FN_ATTRS128
438_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
439 return (__m128h)__builtin_ia32_rsqrtph128_mask(
440 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
441}
442
443static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
444 return (__m128h)__builtin_ia32_getexpph128_mask(
445 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
446}
447
448static __inline__ __m128h __DEFAULT_FN_ATTRS128
449_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
450 return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
451 (__mmask8)__U);
452}
453
454static __inline__ __m128h __DEFAULT_FN_ATTRS128
455_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
456 return (__m128h)__builtin_ia32_getexpph128_mask(
457 (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
458}
459
460static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
461 return (__m256h)__builtin_ia32_getexpph256_mask(
462 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
463}
464
465static __inline__ __m256h __DEFAULT_FN_ATTRS256
466_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
467 return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
468 (__mmask16)__U);
469}
470
471static __inline__ __m256h __DEFAULT_FN_ATTRS256
472_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
473 return (__m256h)__builtin_ia32_getexpph256_mask(
474 (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
475}
476
/* Mantissa extraction for the FP16 lanes of A.  B and C are packed into one
 * immediate as (C << 2) | B; all lanes are enabled and the pass-through vector
 * is zero. */
#define _mm_getmant_ph(A, B, C)                                                \
  ((__m128h)__builtin_ia32_getmantph128_mask((__v8hf)(__m128h)(A),             \
                                             (int)(((C) << 2) | (B)),          \
                                             (__v8hf)_mm_setzero_ph(),         \
                                             (__mmask8)-1))
481
/* Merge-masked mantissa extraction.  B and C are packed into one immediate as
 * (C << 2) | B; W is the pass-through vector and U the lane mask. */
#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
  ((__m128h)__builtin_ia32_getmantph128_mask((__v8hf)(__m128h)(A),             \
                                             (int)(((C) << 2) | (B)),          \
                                             (__v8hf)(__m128h)(W),             \
                                             (__mmask8)(U)))
486
/* Zero-masked mantissa extraction.  B and C are packed into one immediate as
 * (C << 2) | B; the pass-through vector is zero and U is the lane mask. */
#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
  ((__m128h)__builtin_ia32_getmantph128_mask((__v8hf)(__m128h)(A),             \
                                             (int)(((C) << 2) | (B)),          \
                                             (__v8hf)_mm_setzero_ph(),         \
                                             (__mmask8)(U)))
491
/* Mantissa extraction for the FP16 lanes of A.  B and C are packed into one
 * immediate as (C << 2) | B; all lanes are enabled and the pass-through vector
 * is zero. */
#define _mm256_getmant_ph(A, B, C)                                             \
  ((__m256h)__builtin_ia32_getmantph256_mask((__v16hf)(__m256h)(A),            \
                                             (int)(((C) << 2) | (B)),          \
                                             (__v16hf)_mm256_setzero_ph(),     \
                                             (__mmask16)-1))
496
/* Merge-masked mantissa extraction.  B and C are packed into one immediate as
 * (C << 2) | B; W is the pass-through vector and U the lane mask. */
#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m256h)__builtin_ia32_getmantph256_mask((__v16hf)(__m256h)(A),            \
                                             (int)(((C) << 2) | (B)),          \
                                             (__v16hf)(__m256h)(W),            \
                                             (__mmask16)(U)))
501
/* Zero-masked mantissa extraction.  B and C are packed into one immediate as
 * (C << 2) | B; the pass-through vector is zero and U is the lane mask. */
#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m256h)__builtin_ia32_getmantph256_mask((__v16hf)(__m256h)(A),            \
                                             (int)(((C) << 2) | (B)),          \
                                             (__v16hf)_mm256_setzero_ph(),     \
                                             (__mmask16)(U)))
506
507static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
508 __m128h __B) {
509 return (__m128h)__builtin_ia32_scalefph128_mask(
510 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
511}
512
513static __inline__ __m128h __DEFAULT_FN_ATTRS128
514_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
515 return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
516 (__v8hf)__W, (__mmask8)__U);
517}
518
519static __inline__ __m128h __DEFAULT_FN_ATTRS128
520_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
521 return (__m128h)__builtin_ia32_scalefph128_mask(
522 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
523}
524
525static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
526 __m256h __B) {
527 return (__m256h)__builtin_ia32_scalefph256_mask(
528 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
529}
530
531static __inline__ __m256h __DEFAULT_FN_ATTRS256
532_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
533 return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
534 (__v16hf)__W, (__mmask16)__U);
535}
536
537static __inline__ __m256h __DEFAULT_FN_ATTRS256
538_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
539 return (__m256h)__builtin_ia32_scalefph256_mask(
540 (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
541}
542
/* Round-scale of the FP16 lanes of A controlled by immediate imm; all lanes
 * enabled, zero pass-through vector. */
#define _mm_roundscale_ph(A, imm)                                              \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask((__v8hf)(__m128h)(A),           \
                                               (int)(imm),                     \
                                               (__v8hf)_mm_setzero_ph(),       \
                                               (__mmask8)-1))
547
/* Merge-masked round-scale controlled by immediate imm; W is the pass-through
 * vector and U the lane mask. */
#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask((__v8hf)(__m128h)(A),           \
                                               (int)(imm),                     \
                                               (__v8hf)(__m128h)(W),           \
                                               (__mmask8)(U)))
551
/* Zero-masked round-scale controlled by immediate imm; the pass-through vector
 * is zero and U is the lane mask. */
#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask((__v8hf)(__m128h)(A),           \
                                               (int)(imm),                     \
                                               (__v8hf)_mm_setzero_ph(),       \
                                               (__mmask8)(U)))
556
/* Round-scale of the FP16 lanes of A controlled by immediate imm; all lanes
 * enabled, zero pass-through vector. */
#define _mm256_roundscale_ph(A, imm)                                           \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm),                     \
                                               (__v16hf)_mm256_setzero_ph(),   \
                                               (__mmask16)-1))
561
/* Merge-masked round-scale controlled by immediate imm; W is the pass-through
 * vector and U the lane mask. */
#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm),                     \
                                               (__v16hf)(__m256h)(W),          \
                                               (__mmask16)(U)))
566
/* Zero-masked round-scale controlled by immediate imm; the pass-through vector
 * is zero and U is the lane mask. */
#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm),                     \
                                               (__v16hf)_mm256_setzero_ph(),   \
                                               (__mmask16)(U)))
571
/* Reduce operation on the FP16 lanes of A controlled by immediate imm (see
 * __builtin_ia32_reduceph128_mask); all lanes enabled, zero pass-through. */
#define _mm_reduce_ph(A, imm)                                                  \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)-1))
576
/* Merge-masked reduce controlled by immediate imm; W is the pass-through
 * vector and U the lane mask. */
#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)(__m128h)(W),              \
                                            (__mmask8)(U)))
580
/* Zero-masked reduce controlled by immediate imm; the pass-through vector is
 * zero and U is the lane mask. */
#define _mm_maskz_reduce_ph(U, A, imm)                                         \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)(U)))
585
/* Reduce operation on the FP16 lanes of A controlled by immediate imm (see
 * __builtin_ia32_reduceph256_mask); all lanes enabled, zero pass-through. */
#define _mm256_reduce_ph(A, imm)                                               \
  ((__m256h)__builtin_ia32_reduceph256_mask(                                   \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)-1))
590
/* Merge-masked reduce controlled by immediate imm; W is the pass-through
 * vector and U the lane mask. */
#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m256h)__builtin_ia32_reduceph256_mask(                                   \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
      (__mmask16)(U)))
595
/* Zero-masked reduce controlled by immediate imm; the pass-through vector is
 * zero and U is the lane mask. */
#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
  ((__m256h)__builtin_ia32_reduceph256_mask(                                   \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)(U)))
600
601static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
602 return __builtin_ia32_sqrtph((__v8hf)__a);
603}
604
605static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
606 __mmask8 __U,
607 __m128h __A) {
608 return (__m128h)__builtin_ia32_selectph_128(
609 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
610}
611
612static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
613 __m128h __A) {
614 return (__m128h)__builtin_ia32_selectph_128(
615 (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
616}
617
618static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
619 return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
620}
621
622static __inline__ __m256h __DEFAULT_FN_ATTRS256
623_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
624 return (__m256h)__builtin_ia32_selectph_256(
625 (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
626}
627
628static __inline__ __m256h __DEFAULT_FN_ATTRS256
629_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
630 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
631 (__v16hf)_mm256_sqrt_ph(__A),
632 (__v16hf)_mm256_setzero_ph());
633}
634
/* Classifies the FP16 lanes of A against the categories selected by imm and
 * returns an 8-bit mask; U is passed to the builtin as the input lane mask. */
#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclassph128_mask(                                 \
      (__v8hf)(__m128h)(A), (int)(imm), (__mmask8)(U)))
638
/* Classifies the FP16 lanes of A against the categories selected by imm and
 * returns an 8-bit mask; all lanes are enabled. */
#define _mm_fpclass_ph_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclassph128_mask(                                 \
      (__v8hf)(__m128h)(A), (int)(imm), (__mmask8)-1))
642
/* Classifies the FP16 lanes of A against the categories selected by imm and
 * returns a 16-bit mask; U is passed to the builtin as the input lane mask. */
#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask16)__builtin_ia32_fpclassph256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__mmask16)(U)))
646
/* Classifies the FP16 lanes of A against the categories selected by imm and
 * returns a 16-bit mask; all lanes are enabled. */
#define _mm256_fpclass_ph_mask(A, imm)                                         \
  ((__mmask16)__builtin_ia32_fpclassph256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__mmask16)-1))
650
651static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
652 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
653 (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
654}
655
656static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
657 __mmask8 __U,
658 __m128d __A) {
659 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
660 (__mmask8)__U);
661}
662
663static __inline__ __m128h __DEFAULT_FN_ATTRS128
664_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
665 return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
666 (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
667}
668
669static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
670 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
671 (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
672}
673
674static __inline__ __m128h __DEFAULT_FN_ATTRS256
675_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
676 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
677 (__mmask8)__U);
678}
679
680static __inline__ __m128h __DEFAULT_FN_ATTRS256
681_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
682 return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
683 (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
684}
685
686static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
687 return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
688 (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
689}
690
691static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
692 __mmask8 __U,
693 __m128h __A) {
694 return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
695 (__mmask8)__U);
696}
697
698static __inline__ __m128d __DEFAULT_FN_ATTRS128
699_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
700 return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
701 (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
702}
703
704static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
705 return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
706 (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
707}
708
709static __inline__ __m256d __DEFAULT_FN_ATTRS256
710_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
711 return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
712 (__mmask8)__U);
713}
714
715static __inline__ __m256d __DEFAULT_FN_ATTRS256
716_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
717 return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
718 (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
719}
720
721static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
722 return (__m128i)__builtin_ia32_vcvtph2w128_mask(
723 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
724}
725
726static __inline__ __m128i __DEFAULT_FN_ATTRS128
727_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
728 return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
729 (__mmask8)__U);
730}
731
732static __inline__ __m128i __DEFAULT_FN_ATTRS128
733_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
734 return (__m128i)__builtin_ia32_vcvtph2w128_mask(
735 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
736}
737
738static __inline__ __m256i __DEFAULT_FN_ATTRS256
739_mm256_cvtph_epi16(__m256h __A) {
740 return (__m256i)__builtin_ia32_vcvtph2w256_mask(
741 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
742}
743
744static __inline__ __m256i __DEFAULT_FN_ATTRS256
745_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
746 return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
747 (__mmask16)__U);
748}
749
750static __inline__ __m256i __DEFAULT_FN_ATTRS256
751_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
752 return (__m256i)__builtin_ia32_vcvtph2w256_mask(
753 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
754}
755
756static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
757 return (__m128i)__builtin_ia32_vcvttph2w128_mask(
758 (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
759}
760
761static __inline__ __m128i __DEFAULT_FN_ATTRS128
762_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
763 return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
764 (__mmask8)__U);
765}
766
767static __inline__ __m128i __DEFAULT_FN_ATTRS128
768_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
769 return (__m128i)__builtin_ia32_vcvttph2w128_mask(
770 (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
771}
772
773static __inline__ __m256i __DEFAULT_FN_ATTRS256
774_mm256_cvttph_epi16(__m256h __A) {
775 return (__m256i)__builtin_ia32_vcvttph2w256_mask(
776 (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
777}
778
779static __inline__ __m256i __DEFAULT_FN_ATTRS256
780_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
781 return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
782 (__mmask16)__U);
783}
784
785static __inline__ __m256i __DEFAULT_FN_ATTRS256
786_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
787 return (__m256i)__builtin_ia32_vcvttph2w256_mask(
788 (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
789}
790
791static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
792 return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
793}
794
795static __inline__ __m128h __DEFAULT_FN_ATTRS128
796_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
797 return (__m128h)__builtin_ia32_selectph_128(
798 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
799}
800
801static __inline__ __m128h __DEFAULT_FN_ATTRS128
802_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
803 return (__m128h)__builtin_ia32_selectph_128(
804 (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
805}
806
807static __inline__ __m256h __DEFAULT_FN_ATTRS256
808_mm256_cvtepi16_ph(__m256i __A) {
809 return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
810}
811
812static __inline__ __m256h __DEFAULT_FN_ATTRS256
813_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
814 return (__m256h)__builtin_ia32_selectph_256(
815 (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
816}
817
818static __inline__ __m256h __DEFAULT_FN_ATTRS256
819_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
820 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
821 (__v16hf)_mm256_cvtepi16_ph(__A),
822 (__v16hf)_mm256_setzero_ph());
823}
824
825static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
826 return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
827 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
828}
829
830static __inline__ __m128i __DEFAULT_FN_ATTRS128
831_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
832 return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
833 (__mmask8)__U);
834}
835
836static __inline__ __m128i __DEFAULT_FN_ATTRS128
837_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
838 return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
839 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
840}
841
842static __inline__ __m256i __DEFAULT_FN_ATTRS256
843_mm256_cvtph_epu16(__m256h __A) {
844 return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
845 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
846}
847
848static __inline__ __m256i __DEFAULT_FN_ATTRS256
849_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
850 return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
851 (__mmask16)__U);
852}
853
854static __inline__ __m256i __DEFAULT_FN_ATTRS256
855_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
856 return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
857 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
858}
859
860static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
861 return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
862 (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
863}
864
865static __inline__ __m128i __DEFAULT_FN_ATTRS128
866_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
867 return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
868 (__mmask8)__U);
869}
870
871static __inline__ __m128i __DEFAULT_FN_ATTRS128
872_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
873 return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
874 (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
875}
876
877static __inline__ __m256i __DEFAULT_FN_ATTRS256
878_mm256_cvttph_epu16(__m256h __A) {
879 return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
880 (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
881}
882
883static __inline__ __m256i __DEFAULT_FN_ATTRS256
884_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
885 return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
886 (__mmask16)__U);
887}
888
889static __inline__ __m256i __DEFAULT_FN_ATTRS256
890_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
891 return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
892 (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
893}
894
895static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
896 return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
897}
898
899static __inline__ __m128h __DEFAULT_FN_ATTRS128
900_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
901 return (__m128h)__builtin_ia32_selectph_128(
902 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
903}
904
905static __inline__ __m128h __DEFAULT_FN_ATTRS128
906_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
907 return (__m128h)__builtin_ia32_selectph_128(
908 (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
909}
910
911static __inline__ __m256h __DEFAULT_FN_ATTRS256
912_mm256_cvtepu16_ph(__m256i __A) {
913 return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
914}
915
916static __inline__ __m256h __DEFAULT_FN_ATTRS256
917_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
918 return (__m256h)__builtin_ia32_selectph_256(
919 (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
920}
921
922static __inline__ __m256h __DEFAULT_FN_ATTRS256
923_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
924 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
925 (__v16hf)_mm256_cvtepu16_ph(__A),
926 (__v16hf)_mm256_setzero_ph());
927}
928
929static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
930 return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
931 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
932}
933
934static __inline__ __m128i __DEFAULT_FN_ATTRS128
935_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
936 return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
937 (__mmask8)__U);
938}
939
940static __inline__ __m128i __DEFAULT_FN_ATTRS128
941_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
942 return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
943 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
944}
945
946static __inline__ __m256i __DEFAULT_FN_ATTRS256
947_mm256_cvtph_epi32(__m128h __A) {
948 return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
949 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
950}
951
952static __inline__ __m256i __DEFAULT_FN_ATTRS256
953_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
954 return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
955 (__mmask8)__U);
956}
957
958static __inline__ __m256i __DEFAULT_FN_ATTRS256
959_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
960 return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
961 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
962}
963
964static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
965 return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
966 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
967}
968
969static __inline__ __m128i __DEFAULT_FN_ATTRS128
970_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
971 return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
972 (__mmask8)__U);
973}
974
975static __inline__ __m128i __DEFAULT_FN_ATTRS128
976_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
977 return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
978 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
979}
980
981static __inline__ __m256i __DEFAULT_FN_ATTRS256
982_mm256_cvtph_epu32(__m128h __A) {
983 return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
984 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
985}
986
987static __inline__ __m256i __DEFAULT_FN_ATTRS256
988_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
989 return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
990 (__mmask8)__U);
991}
992
993static __inline__ __m256i __DEFAULT_FN_ATTRS256
994_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
995 return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
996 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
997}
998
999static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
1000 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
1001 (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1002}
1003
1004static __inline__ __m128h __DEFAULT_FN_ATTRS128
1005_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1006 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
1007 (__mmask8)__U);
1008}
1009
1010static __inline__ __m128h __DEFAULT_FN_ATTRS128
1011_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
1012 return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
1013 (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1014}
1015
1016static __inline__ __m128h __DEFAULT_FN_ATTRS256
1017_mm256_cvtepi32_ph(__m256i __A) {
1018 return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
1019}
1020
1021static __inline__ __m128h __DEFAULT_FN_ATTRS256
1022_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1023 return (__m128h)__builtin_ia32_selectph_128(
1024 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
1025}
1026
1027static __inline__ __m128h __DEFAULT_FN_ATTRS256
1028_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
1029 return (__m128h)__builtin_ia32_selectph_128(
1030 (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
1031}
1032
1033static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
1034 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
1035 (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1036}
1037
1038static __inline__ __m128h __DEFAULT_FN_ATTRS128
1039_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1040 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
1041 (__mmask8)__U);
1042}
1043
1044static __inline__ __m128h __DEFAULT_FN_ATTRS128
1045_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
1046 return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
1047 (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1048}
1049
1050static __inline__ __m128h __DEFAULT_FN_ATTRS256
1051_mm256_cvtepu32_ph(__m256i __A) {
1052 return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
1053}
1054
1055static __inline__ __m128h __DEFAULT_FN_ATTRS256
1056_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1057 return (__m128h)__builtin_ia32_selectph_128(
1058 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
1059}
1060
1061static __inline__ __m128h __DEFAULT_FN_ATTRS256
1062_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
1063 return (__m128h)__builtin_ia32_selectph_128(
1064 (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
1065}
1066
1067static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
1068 return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
1069 (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
1070}
1071
1072static __inline__ __m128i __DEFAULT_FN_ATTRS128
1073_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
1074 return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
1075 (__mmask8)__U);
1076}
1077
1078static __inline__ __m128i __DEFAULT_FN_ATTRS128
1079_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
1080 return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
1081 (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
1082}
1083
1084static __inline__ __m256i __DEFAULT_FN_ATTRS256
1085_mm256_cvttph_epi32(__m128h __A) {
1086 return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
1087 (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
1088}
1089
1090static __inline__ __m256i __DEFAULT_FN_ATTRS256
1091_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
1092 return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
1093 (__mmask8)__U);
1094}
1095
1096static __inline__ __m256i __DEFAULT_FN_ATTRS256
1097_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
1098 return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
1099 (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
1100}
1101
1102static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
1103 return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
1104 (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
1105}
1106
1107static __inline__ __m128i __DEFAULT_FN_ATTRS128
1108_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
1109 return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
1110 (__mmask8)__U);
1111}
1112
1113static __inline__ __m128i __DEFAULT_FN_ATTRS128
1114_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
1115 return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
1116 (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
1117}
1118
1119static __inline__ __m256i __DEFAULT_FN_ATTRS256
1120_mm256_cvttph_epu32(__m128h __A) {
1121 return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
1122 (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
1123}
1124
1125static __inline__ __m256i __DEFAULT_FN_ATTRS256
1126_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
1127 return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
1128 (__mmask8)__U);
1129}
1130
1131static __inline__ __m256i __DEFAULT_FN_ATTRS256
1132_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
1133 return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
1134 (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
1135}
1136
1137static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
1138 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
1139 (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1140}
1141
1142static __inline__ __m128h __DEFAULT_FN_ATTRS128
1143_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1144 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
1145 (__mmask8)__U);
1146}
1147
1148static __inline__ __m128h __DEFAULT_FN_ATTRS128
1149_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
1150 return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
1151 (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1152}
1153
1154static __inline__ __m128h __DEFAULT_FN_ATTRS256
1155_mm256_cvtepi64_ph(__m256i __A) {
1156 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
1157 (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1158}
1159
1160static __inline__ __m128h __DEFAULT_FN_ATTRS256
1161_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1162 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
1163 (__mmask8)__U);
1164}
1165
1166static __inline__ __m128h __DEFAULT_FN_ATTRS256
1167_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
1168 return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
1169 (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1170}
1171
1172static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
1173 return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
1174 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
1175}
1176
1177static __inline__ __m128i __DEFAULT_FN_ATTRS128
1178_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
1179 return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
1180 (__mmask8)__U);
1181}
1182
1183static __inline__ __m128i __DEFAULT_FN_ATTRS128
1184_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
1185 return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
1186 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
1187}
1188
1189static __inline__ __m256i __DEFAULT_FN_ATTRS256
1190_mm256_cvtph_epi64(__m128h __A) {
1191 return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
1192 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
1193}
1194
1195static __inline__ __m256i __DEFAULT_FN_ATTRS256
1196_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
1197 return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
1198 (__mmask8)__U);
1199}
1200
1201static __inline__ __m256i __DEFAULT_FN_ATTRS256
1202_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
1203 return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
1204 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
1205}
1206
1207static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
1208 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
1209 (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1210}
1211
1212static __inline__ __m128h __DEFAULT_FN_ATTRS128
1213_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1214 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
1215 (__mmask8)__U);
1216}
1217
1218static __inline__ __m128h __DEFAULT_FN_ATTRS128
1219_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
1220 return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
1221 (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1222}
1223
1224static __inline__ __m128h __DEFAULT_FN_ATTRS256
1225_mm256_cvtepu64_ph(__m256i __A) {
1226 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
1227 (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1228}
1229
1230static __inline__ __m128h __DEFAULT_FN_ATTRS256
1231_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1232 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
1233 (__mmask8)__U);
1234}
1235
1236static __inline__ __m128h __DEFAULT_FN_ATTRS256
1237_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
1238 return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
1239 (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1240}
1241
1242static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
1243 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
1244 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
1245}
1246
1247static __inline__ __m128i __DEFAULT_FN_ATTRS128
1248_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
1249 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
1250 (__mmask8)__U);
1251}
1252
1253static __inline__ __m128i __DEFAULT_FN_ATTRS128
1254_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
1255 return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
1256 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
1257}
1258
1259static __inline__ __m256i __DEFAULT_FN_ATTRS256
1260_mm256_cvtph_epu64(__m128h __A) {
1261 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
1262 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
1263}
1264
1265static __inline__ __m256i __DEFAULT_FN_ATTRS256
1266_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
1267 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
1268 (__mmask8)__U);
1269}
1270
1271static __inline__ __m256i __DEFAULT_FN_ATTRS256
1272_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
1273 return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
1274 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
1275}
1276
1277static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
1278 return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
1279 (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
1280}
1281
1282static __inline__ __m128i __DEFAULT_FN_ATTRS128
1283_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
1284 return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
1285 (__mmask8)__U);
1286}
1287
1288static __inline__ __m128i __DEFAULT_FN_ATTRS128
1289_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
1290 return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
1291 (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
1292}
1293
1294static __inline__ __m256i __DEFAULT_FN_ATTRS256
1295_mm256_cvttph_epi64(__m128h __A) {
1296 return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
1297 (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
1298}
1299
1300static __inline__ __m256i __DEFAULT_FN_ATTRS256
1301_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
1302 return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
1303 (__mmask8)__U);
1304}
1305
1306static __inline__ __m256i __DEFAULT_FN_ATTRS256
1307_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
1308 return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
1309 (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
1310}
1311
1312static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
1313 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
1314 (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
1315}
1316
1317static __inline__ __m128i __DEFAULT_FN_ATTRS128
1318_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
1319 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
1320 (__mmask8)__U);
1321}
1322
1323static __inline__ __m128i __DEFAULT_FN_ATTRS128
1324_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
1325 return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
1326 (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
1327}
1328
1329static __inline__ __m256i __DEFAULT_FN_ATTRS256
1330_mm256_cvttph_epu64(__m128h __A) {
1331 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
1332 (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
1333}
1334
1335static __inline__ __m256i __DEFAULT_FN_ATTRS256
1336_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
1337 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
1338 (__mmask8)__U);
1339}
1340
1341static __inline__ __m256i __DEFAULT_FN_ATTRS256
1342_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
1343 return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
1344 (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
1345}
1346
1347static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
1348 return (__m128)__builtin_ia32_vcvtph2psx128_mask(
1349 (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
1350}
1351
1352static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
1353 __mmask8 __U,
1354 __m128h __A) {
1355 return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
1356 (__mmask8)__U);
1357}
1358
1359static __inline__ __m128 __DEFAULT_FN_ATTRS128
1360_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
1361 return (__m128)__builtin_ia32_vcvtph2psx128_mask(
1362 (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
1363}
1364
1365static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
1366 return (__m256)__builtin_ia32_vcvtph2psx256_mask(
1367 (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
1368}
1369
1370static __inline__ __m256 __DEFAULT_FN_ATTRS256
1371_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
1372 return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
1373 (__mmask8)__U);
1374}
1375
1376static __inline__ __m256 __DEFAULT_FN_ATTRS256
1377_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
1378 return (__m256)__builtin_ia32_vcvtph2psx256_mask(
1379 (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
1380}
1381
1382static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
1383 return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1384 (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1385}
1386
1387static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
1388 __mmask8 __U,
1389 __m128 __A) {
1390 return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
1391 (__mmask8)__U);
1392}
1393
1394static __inline__ __m128h __DEFAULT_FN_ATTRS128
1395_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
1396 return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1397 (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1398}
1399
1400static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
1401 return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
1402 (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1403}
1404
1405static __inline__ __m128h __DEFAULT_FN_ATTRS256
1406_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
1407 return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
1408 (__mmask8)__U);
1409}
1410
1411static __inline__ __m128h __DEFAULT_FN_ATTRS256
1412_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
1413 return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
1414 (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1415}
1416
1417static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
1418 __m128h __B,
1419 __m128h __C) {
1420 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
1421 (__v8hf)__C);
1422}
1423
1424static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
1425 __mmask8 __U,
1426 __m128h __B,
1427 __m128h __C) {
1428 return (__m128h)__builtin_ia32_selectph_128(
1429 (__mmask8)__U,
1430 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1431 (__v8hf)__A);
1432}
1433
1434static __inline__ __m128h __DEFAULT_FN_ATTRS128
1435_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1436 return (__m128h)__builtin_ia32_selectph_128(
1437 (__mmask8)__U,
1438 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1439 (__v8hf)__C);
1440}
1441
1442static __inline__ __m128h __DEFAULT_FN_ATTRS128
1443_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1444 return (__m128h)__builtin_ia32_selectph_128(
1445 (__mmask8)__U,
1446 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1447 (__v8hf)_mm_setzero_ph());
1448}
1449
1450static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
1451 __m128h __B,
1452 __m128h __C) {
1453 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
1454 -(__v8hf)__C);
1455}
1456
1457static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
1458 __mmask8 __U,
1459 __m128h __B,
1460 __m128h __C) {
1461 return (__m128h)__builtin_ia32_selectph_128(
1462 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1463 (__v8hf)__A);
1464}
1465
1466static __inline__ __m128h __DEFAULT_FN_ATTRS128
1467_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1468 return (__m128h)__builtin_ia32_selectph_128(
1469 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1470 (__v8hf)_mm_setzero_ph());
1471}
1472
1473static __inline__ __m128h __DEFAULT_FN_ATTRS128
1474_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1475 return (__m128h)__builtin_ia32_selectph_128(
1476 (__mmask8)__U,
1477 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1478 (__v8hf)__C);
1479}
1480
1481static __inline__ __m128h __DEFAULT_FN_ATTRS128
1482_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1483 return (__m128h)__builtin_ia32_selectph_128(
1484 (__mmask8)__U,
1485 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1486 (__v8hf)_mm_setzero_ph());
1487}
1488
1489static __inline__ __m128h __DEFAULT_FN_ATTRS128
1490_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1491 return (__m128h)__builtin_ia32_selectph_128(
1492 (__mmask8)__U,
1493 __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1494 (__v8hf)_mm_setzero_ph());
1495}
1496
1497static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
1498 __m256h __B,
1499 __m256h __C) {
1500 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
1501 (__v16hf)__C);
1502}
1503
1504static __inline__ __m256h __DEFAULT_FN_ATTRS256
1505_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1506 return (__m256h)__builtin_ia32_selectph_256(
1507 (__mmask16)__U,
1508 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1509 (__v16hf)__A);
1510}
1511
1512static __inline__ __m256h __DEFAULT_FN_ATTRS256
1513_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1514 return (__m256h)__builtin_ia32_selectph_256(
1515 (__mmask16)__U,
1516 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1517 (__v16hf)__C);
1518}
1519
1520static __inline__ __m256h __DEFAULT_FN_ATTRS256
1521_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1522 return (__m256h)__builtin_ia32_selectph_256(
1523 (__mmask16)__U,
1524 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1525 (__v16hf)_mm256_setzero_ph());
1526}
1527
1528static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
1529 __m256h __B,
1530 __m256h __C) {
1531 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
1532 -(__v16hf)__C);
1533}
1534
1535static __inline__ __m256h __DEFAULT_FN_ATTRS256
1536_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1537 return (__m256h)__builtin_ia32_selectph_256(
1538 (__mmask16)__U,
1539 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1540 (__v16hf)__A);
1541}
1542
1543static __inline__ __m256h __DEFAULT_FN_ATTRS256
1544_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1545 return (__m256h)__builtin_ia32_selectph_256(
1546 (__mmask16)__U,
1547 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1548 (__v16hf)_mm256_setzero_ph());
1549}
1550
1551static __inline__ __m256h __DEFAULT_FN_ATTRS256
1552_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1553 return (__m256h)__builtin_ia32_selectph_256(
1554 (__mmask16)__U,
1555 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1556 (__v16hf)__C);
1557}
1558
1559static __inline__ __m256h __DEFAULT_FN_ATTRS256
1560_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1561 return (__m256h)__builtin_ia32_selectph_256(
1562 (__mmask16)__U,
1563 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1564 (__v16hf)_mm256_setzero_ph());
1565}
1566
1567static __inline__ __m256h __DEFAULT_FN_ATTRS256
1568_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1569 return (__m256h)__builtin_ia32_selectph_256(
1570 (__mmask16)__U,
1571 __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1572 (__v16hf)_mm256_setzero_ph());
1573}
1574
1575static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
1576 __m128h __B,
1577 __m128h __C) {
1578 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
1579 (__v8hf)__C);
1580}
1581
1582static __inline__ __m128h __DEFAULT_FN_ATTRS128
1583_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1584 return (__m128h)__builtin_ia32_selectph_128(
1585 (__mmask8)__U,
1586 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1587 (__v8hf)__A);
1588}
1589
1590static __inline__ __m128h __DEFAULT_FN_ATTRS128
1591_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1592 return (__m128h)__builtin_ia32_selectph_128(
1593 (__mmask8)__U,
1594 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1595 (__v8hf)__C);
1596}
1597
1598static __inline__ __m128h __DEFAULT_FN_ATTRS128
1599_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1600 return (__m128h)__builtin_ia32_selectph_128(
1601 (__mmask8)__U,
1602 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1603 (__v8hf)_mm_setzero_ph());
1604}
1605
1606static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
1607 __m128h __B,
1608 __m128h __C) {
1609 return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
1610 -(__v8hf)__C);
1611}
1612
1613static __inline__ __m128h __DEFAULT_FN_ATTRS128
1614_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1615 return (__m128h)__builtin_ia32_selectph_128(
1616 (__mmask8)__U,
1617 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1618 (__v8hf)__A);
1619}
1620
1621static __inline__ __m128h __DEFAULT_FN_ATTRS128
1622_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1623 return (__m128h)__builtin_ia32_selectph_128(
1624 (__mmask8)__U,
1625 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1626 (__v8hf)_mm_setzero_ph());
1627}
1628
1629static __inline__ __m256h __DEFAULT_FN_ATTRS256
1630_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
1631 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
1632 (__v16hf)__C);
1633}
1634
1635static __inline__ __m256h __DEFAULT_FN_ATTRS256
1636_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1637 return (__m256h)__builtin_ia32_selectph_256(
1638 (__mmask16)__U,
1639 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1640 (__v16hf)__A);
1641}
1642
1643static __inline__ __m256h __DEFAULT_FN_ATTRS256
1644_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1645 return (__m256h)__builtin_ia32_selectph_256(
1646 (__mmask16)__U,
1647 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1648 (__v16hf)__C);
1649}
1650
1651static __inline__ __m256h __DEFAULT_FN_ATTRS256
1652_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1653 return (__m256h)__builtin_ia32_selectph_256(
1654 (__mmask16)__U,
1655 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1656 (__v16hf)_mm256_setzero_ph());
1657}
1658
1659static __inline__ __m256h __DEFAULT_FN_ATTRS256
1660_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
1661 return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
1662 -(__v16hf)__C);
1663}
1664
1665static __inline__ __m256h __DEFAULT_FN_ATTRS256
1666_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1667 return (__m256h)__builtin_ia32_selectph_256(
1668 (__mmask16)__U,
1669 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1670 (__v16hf)__A);
1671}
1672
1673static __inline__ __m256h __DEFAULT_FN_ATTRS256
1674_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1675 return (__m256h)__builtin_ia32_selectph_256(
1676 (__mmask16)__U,
1677 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1678 (__v16hf)_mm256_setzero_ph());
1679}
1680
1681static __inline__ __m128h __DEFAULT_FN_ATTRS128
1682_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1683 return (__m128h)__builtin_ia32_selectph_128(
1684 (__mmask8)__U,
1685 __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1686 (__v8hf)__C);
1687}
1688
1689static __inline__ __m256h __DEFAULT_FN_ATTRS256
1690_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1691 return (__m256h)__builtin_ia32_selectph_256(
1692 (__mmask16)__U,
1693 __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1694 (__v16hf)__C);
1695}
1696
1697static __inline__ __m128h __DEFAULT_FN_ATTRS128
1698_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1699 return (__m128h)__builtin_ia32_selectph_128(
1700 (__mmask8)__U,
1701 __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1702 (__v8hf)__C);
1703}
1704
1705static __inline__ __m256h __DEFAULT_FN_ATTRS256
1706_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1707 return (__m256h)__builtin_ia32_selectph_256(
1708 (__mmask16)__U,
1709 __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1710 (__v16hf)__C);
1711}
1712
1713static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
1714 __m128h __B,
1715 __m128h __C) {
1716 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
1717 (__v8hf)__C);
1718}
1719
1720static __inline__ __m128h __DEFAULT_FN_ATTRS128
1721_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1722 return (__m128h)__builtin_ia32_selectph_128(
1723 (__mmask8)__U,
1724 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
1725 (__v8hf)__A);
1726}
1727
1728static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
1729 __m256h __B,
1730 __m256h __C) {
1731 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
1732 (__v16hf)__C);
1733}
1734
1735static __inline__ __m256h __DEFAULT_FN_ATTRS256
1736_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1737 return (__m256h)__builtin_ia32_selectph_256(
1738 (__mmask16)__U,
1739 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
1740 (__v16hf)__A);
1741}
1742
1743static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
1744 __m128h __B,
1745 __m128h __C) {
1746 return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
1747 -(__v8hf)__C);
1748}
1749
1750static __inline__ __m128h __DEFAULT_FN_ATTRS128
1751_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1752 return (__m128h)__builtin_ia32_selectph_128(
1753 (__mmask8)__U,
1754 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
1755 (__v8hf)__A);
1756}
1757
1758static __inline__ __m128h __DEFAULT_FN_ATTRS128
1759_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1760 return (__m128h)__builtin_ia32_selectph_128(
1761 (__mmask8)__U,
1762 __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
1763 (__v8hf)__C);
1764}
1765
1766static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
1767 __m256h __B,
1768 __m256h __C) {
1769 return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
1770 -(__v16hf)__C);
1771}
1772
1773static __inline__ __m256h __DEFAULT_FN_ATTRS256
1774_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1775 return (__m256h)__builtin_ia32_selectph_256(
1776 (__mmask16)__U,
1777 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
1778 (__v16hf)__A);
1779}
1780
1781static __inline__ __m256h __DEFAULT_FN_ATTRS256
1782_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1783 return (__m256h)__builtin_ia32_selectph_256(
1784 (__mmask16)__U,
1785 __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
1786 (__v16hf)__C);
1787}
1788
1789static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
1790 __m128h __B) {
1791 return (__m128h)__builtin_ia32_vfcmulcph128_mask(
1792 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
1793}
1794
1795static __inline__ __m128h __DEFAULT_FN_ATTRS128
1796_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1797 return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
1798 (__v4sf)__W, (__mmask8)__U);
1799}
1800
1801static __inline__ __m128h __DEFAULT_FN_ATTRS128
1802_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
1803 return (__m128h)__builtin_ia32_vfcmulcph128_mask(
1804 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
1805}
1806
1807static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A,
1808 __m256h __B) {
1809 return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1810 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1811}
1812
1813static __inline__ __m256h __DEFAULT_FN_ATTRS256
1814_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
1815 return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
1816 (__v8sf)__W, (__mmask8)__U);
1817}
1818
1819static __inline__ __m256h __DEFAULT_FN_ATTRS256
1820_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
1821 return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1822 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
1823}
1824
1825static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
1826 __m128h __B,
1827 __m128h __C) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001828 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1829 (__v4sf)__C, (__mmask8)-1);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001830}
1831
1832static __inline__ __m128h __DEFAULT_FN_ATTRS128
1833_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1834 return (__m128h)__builtin_ia32_selectps_128(
1835 __U,
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001836 __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
1837 (__v4sf)__C, (__mmask8)__U),
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001838 (__v4sf)__A);
1839}
1840
1841static __inline__ __m128h __DEFAULT_FN_ATTRS128
1842_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001843 return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1844 (__v4sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001845}
1846
1847static __inline__ __m128h __DEFAULT_FN_ATTRS128
1848_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1849 return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001850 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001851}
1852
1853static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
1854 __m256h __B,
1855 __m256h __C) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001856 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1857 (__v8sf)__C, (__mmask8)-1);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001858}
1859
1860static __inline__ __m256h __DEFAULT_FN_ATTRS256
1861_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
1862 return (__m256h)__builtin_ia32_selectps_256(
1863 __U,
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001864 __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001865 (__mmask8)__U),
1866 (__v8sf)__A);
1867}
1868
1869static __inline__ __m256h __DEFAULT_FN_ATTRS256
1870_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001871 return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1872 (__v8sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001873}
1874
1875static __inline__ __m256h __DEFAULT_FN_ATTRS256
1876_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
1877 return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001878 (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001879}
1880
1881static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
1882 __m128h __B) {
1883 return (__m128h)__builtin_ia32_vfmulcph128_mask(
1884 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
1885}
1886
1887static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
1888 __mmask8 __U,
1889 __m128h __A,
1890 __m128h __B) {
1891 return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
1892 (__v4sf)__W, (__mmask8)__U);
1893}
1894
1895static __inline__ __m128h __DEFAULT_FN_ATTRS128
1896_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
1897 return (__m128h)__builtin_ia32_vfmulcph128_mask(
1898 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
1899}
1900
1901static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
1902 __m256h __B) {
1903 return (__m256h)__builtin_ia32_vfmulcph256_mask(
1904 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1905}
1906
1907static __inline__ __m256h __DEFAULT_FN_ATTRS256
1908_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
1909 return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
1910 (__v8sf)__W, (__mmask8)__U);
1911}
1912
1913static __inline__ __m256h __DEFAULT_FN_ATTRS256
1914_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
1915 return (__m256h)__builtin_ia32_vfmulcph256_mask(
1916 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
1917}
1918
1919static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
1920 __m128h __B,
1921 __m128h __C) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001922 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1923 (__v4sf)__C, (__mmask8)-1);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001924}
1925
1926static __inline__ __m128h __DEFAULT_FN_ATTRS128
1927_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1928 return (__m128h)__builtin_ia32_selectps_128(
1929 __U,
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001930 __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001931 (__mmask8)__U),
1932 (__v4sf)__A);
1933}
1934
1935static __inline__ __m128h __DEFAULT_FN_ATTRS128
1936_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001937 return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1938 (__v4sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001939}
1940
1941static __inline__ __m128h __DEFAULT_FN_ATTRS128
1942_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001943 return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
1944 (__v4sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001945}
1946
1947static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
1948 __m256h __B,
1949 __m256h __C) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001950 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1951 (__v8sf)__C, (__mmask8)-1);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001952}
1953
1954static __inline__ __m256h __DEFAULT_FN_ATTRS256
1955_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
1956 return (__m256h)__builtin_ia32_selectps_256(
1957 __U,
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001958 __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001959 (__mmask8)__U),
1960 (__v8sf)__A);
1961}
1962
1963static __inline__ __m256h __DEFAULT_FN_ATTRS256
1964_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001965 return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1966 (__v8sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001967}
1968
1969static __inline__ __m256h __DEFAULT_FN_ATTRS256
1970_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -08001971 return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
1972 (__v8sf)__C, (__mmask8)__U);
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -08001973}
1974
1975static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
1976 __m128h __A,
1977 __m128h __W) {
1978 return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
1979 (__v8hf)__A);
1980}
1981
1982static __inline__ __m256h __DEFAULT_FN_ATTRS256
1983_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
1984 return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
1985 (__v16hf)__A);
1986}
1987
1988static __inline__ __m128h __DEFAULT_FN_ATTRS128
1989_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
1990 return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
1991 (__v8hi)__B);
1992}
1993
1994static __inline__ __m256h __DEFAULT_FN_ATTRS256
1995_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
1996 return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
1997 (__v16hi)__B);
1998}
1999
2000static __inline__ __m128h __DEFAULT_FN_ATTRS128
2001_mm_permutexvar_ph(__m128i __A, __m128h __B) {
2002 return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
2003}
2004
2005static __inline__ __m256h __DEFAULT_FN_ATTRS256
2006_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
2007 return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
2008}
2009
2010static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2011_mm256_reduce_add_ph(__m256h __W) {
2012 return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
2013}
2014
2015static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2016_mm256_reduce_mul_ph(__m256h __W) {
2017 return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
2018}
2019
2020static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2021_mm256_reduce_max_ph(__m256h __V) {
2022 return __builtin_ia32_reduce_fmax_ph256(__V);
2023}
2024
2025static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2026_mm256_reduce_min_ph(__m256h __V) {
2027 return __builtin_ia32_reduce_fmin_ph256(__V);
2028}
2029
2030static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2031_mm_reduce_add_ph(__m128h __W) {
2032 return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
2033}
2034
2035static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2036_mm_reduce_mul_ph(__m128h __W) {
2037 return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
2038}
2039
2040static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2041_mm_reduce_max_ph(__m128h __V) {
2042 return __builtin_ia32_reduce_fmax_ph128(__V);
2043}
2044
2045static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2046_mm_reduce_min_ph(__m128h __V) {
2047 return __builtin_ia32_reduce_fmin_ph128(__V);
2048}
2049
// The macros below are aliases. Each *_mul_pch name forwards its arguments
// unchanged to the matching *_fmul_pch function, and each *_cmul_pch name
// forwards them to the matching *_fcmul_pch function.
#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)

#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
2064
/* The default-attribute helper macros are defined for the functions in this
 * file only; remove them so they do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __AVX512VLFP16INTRIN_H */