Blame - linux-x86/lib64/clang/14.0.1/include/avx512vlbf16intrin.h - platform/prebuilts/clang-tools

blob: 1b1a744bcdbfd11576743332353c11eb2d8aedc9 [file] [log] [blame]

Logan Chien	df4f766	2019-09-04 16:45:23 -0700	[diff] [blame]	1	/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
				2	*
				3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	* See https://llvm.org/LICENSE.txt for license information.
				5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				6	*
				7	*===-----------------------------------------------------------------------===
				8	*/
				9	#ifndef __IMMINTRIN_H
				10	#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
				11	#endif
				12
				13	#ifndef __AVX512VLBF16INTRIN_H
				14	#define __AVX512VLBF16INTRIN_H
				15
				16	typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
				17
				18	#define __DEFAULT_FN_ATTRS128 \
				19	__attribute__((__always_inline__, __nodebug__, \
				20	__target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
				21	#define __DEFAULT_FN_ATTRS256 \
				22	__attribute__((__always_inline__, __nodebug__, \
				23	__target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
				24
				25	/// Convert Two Packed Single Data to One Packed BF16 Data.
				26	///
				27	/// \headerfile <x86intrin.h>
				28	///
				29	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
				30	///
				31	/// \param __A
				32	/// A 128-bit vector of [4 x float].
				33	/// \param __B
				34	/// A 128-bit vector of [4 x float].
				35	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
				36	/// conversion of __B, and higher 64 bits come from conversion of __A.
				37	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
				38	_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
				39	return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
				40	(__v4sf) __B);
				41	}
				42
				43	/// Convert Two Packed Single Data to One Packed BF16 Data.
				44	///
				45	/// \headerfile <x86intrin.h>
				46	///
				47	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
				48	///
				49	/// \param __A
				50	/// A 128-bit vector of [4 x float].
				51	/// \param __B
				52	/// A 128-bit vector of [4 x float].
				53	/// \param __W
				54	/// A 128-bit vector of [8 x bfloat].
				55	/// \param __U
				56	/// A 8-bit mask value specifying what is chosen for each element.
				57	/// A 1 means conversion of __A or __B. A 0 means element from __W.
				58	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
				59	/// conversion of __B, and higher 64 bits come from conversion of __A.
				60	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
				61	_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
				62	return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
				63	(__v8hi)_mm_cvtne2ps_pbh(__A, __B),
				64	(__v8hi)__W);
				65	}
				66
				67	/// Convert Two Packed Single Data to One Packed BF16 Data.
				68	///
				69	/// \headerfile <x86intrin.h>
				70	///
				71	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
				72	///
				73	/// \param __A
				74	/// A 128-bit vector of [4 x float].
				75	/// \param __B
				76	/// A 128-bit vector of [4 x float].
				77	/// \param __U
				78	/// A 8-bit mask value specifying what is chosen for each element.
				79	/// A 1 means conversion of __A or __B. A 0 means element is zero.
				80	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
				81	/// conversion of __B, and higher 64 bits come from conversion of __A.
				82	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
				83	_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
				84	return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
				85	(__v8hi)_mm_cvtne2ps_pbh(__A, __B),
				86	(__v8hi)_mm_setzero_si128());
				87	}
				88
				89	/// Convert Two Packed Single Data to One Packed BF16 Data.
				90	///
				91	/// \headerfile <x86intrin.h>
				92	///
				93	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
				94	///
				95	/// \param __A
				96	/// A 256-bit vector of [8 x float].
				97	/// \param __B
				98	/// A 256-bit vector of [8 x float].
				99	/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
				100	/// conversion of __B, and higher 128 bits come from conversion of __A.
				101	static __inline__ __m256bh __DEFAULT_FN_ATTRS256
				102	_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
				103	return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
				104	(__v8sf) __B);
				105	}
				106
				107	/// Convert Two Packed Single Data to One Packed BF16 Data.
				108	///
				109	/// \headerfile <x86intrin.h>
				110	///
				111	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
				112	///
				113	/// \param __A
				114	/// A 256-bit vector of [8 x float].
				115	/// \param __B
				116	/// A 256-bit vector of [8 x float].
				117	/// \param __W
				118	/// A 256-bit vector of [16 x bfloat].
				119	/// \param __U
				120	/// A 16-bit mask value specifying what is chosen for each element.
				121	/// A 1 means conversion of __A or __B. A 0 means element from __W.
				122	/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
				123	/// conversion of __B, and higher 128 bits come from conversion of __A.
				124	static __inline__ __m256bh __DEFAULT_FN_ATTRS256
				125	_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
				126	return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
				127	(__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
				128	(__v16hi)__W);
				129	}
				130
				131	/// Convert Two Packed Single Data to One Packed BF16 Data.
				132	///
				133	/// \headerfile <x86intrin.h>
				134	///
				135	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
				136	///
				137	/// \param __A
				138	/// A 256-bit vector of [8 x float].
				139	/// \param __B
				140	/// A 256-bit vector of [8 x float].
				141	/// \param __U
				142	/// A 16-bit mask value specifying what is chosen for each element.
				143	/// A 1 means conversion of __A or __B. A 0 means element is zero.
				144	/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
				145	/// conversion of __B, and higher 128 bits come from conversion of __A.
				146	static __inline__ __m256bh __DEFAULT_FN_ATTRS256
				147	_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
				148	return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
				149	(__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
				150	(__v16hi)_mm256_setzero_si256());
				151	}
				152
				153	/// Convert Packed Single Data to Packed BF16 Data.
				154	///
				155	/// \headerfile <x86intrin.h>
				156	///
				157	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				158	///
				159	/// \param __A
				160	/// A 128-bit vector of [4 x float].
				161	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
				162	/// conversion of __A, and higher 64 bits are 0.
				163	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
				164	_mm_cvtneps_pbh(__m128 __A) {
				165	return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
				166	(__v8hi)_mm_undefined_si128(),
				167	(__mmask8)-1);
				168	}
				169
				170	/// Convert Packed Single Data to Packed BF16 Data.
				171	///
				172	/// \headerfile <x86intrin.h>
				173	///
				174	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				175	///
				176	/// \param __A
				177	/// A 128-bit vector of [4 x float].
				178	/// \param __W
				179	/// A 128-bit vector of [8 x bfloat].
				180	/// \param __U
				181	/// A 4-bit mask value specifying what is chosen for each element.
				182	/// A 1 means conversion of __A. A 0 means element from __W.
				183	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
				184	/// conversion of __A, and higher 64 bits are 0.
				185	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
				186	_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
				187	return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
				188	(__v8hi)__W,
				189	(__mmask8)__U);
				190	}
				191
				192	/// Convert Packed Single Data to Packed BF16 Data.
				193	///
				194	/// \headerfile <x86intrin.h>
				195	///
				196	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				197	///
				198	/// \param __A
				199	/// A 128-bit vector of [4 x float].
				200	/// \param __U
				201	/// A 4-bit mask value specifying what is chosen for each element.
				202	/// A 1 means conversion of __A. A 0 means element is zero.
				203	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
				204	/// conversion of __A, and higher 64 bits are 0.
				205	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
				206	_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
				207	return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
				208	(__v8hi)_mm_setzero_si128(),
				209	(__mmask8)__U);
				210	}
				211
				212	/// Convert Packed Single Data to Packed BF16 Data.
				213	///
				214	/// \headerfile <x86intrin.h>
				215	///
				216	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				217	///
				218	/// \param __A
				219	/// A 256-bit vector of [8 x float].
				220	/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
				221	static __inline__ __m128bh __DEFAULT_FN_ATTRS256
				222	_mm256_cvtneps_pbh(__m256 __A) {
				223	return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
				224	(__v8hi)_mm_undefined_si128(),
				225	(__mmask8)-1);
				226	}
				227
				228	/// Convert Packed Single Data to Packed BF16 Data.
				229	///
				230	/// \headerfile <x86intrin.h>
				231	///
				232	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				233	///
				234	/// \param __A
				235	/// A 256-bit vector of [8 x float].
				236	/// \param __W
				237	/// A 256-bit vector of [8 x bfloat].
				238	/// \param __U
				239	/// A 8-bit mask value specifying what is chosen for each element.
				240	/// A 1 means conversion of __A. A 0 means element from __W.
				241	/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
				242	static __inline__ __m128bh __DEFAULT_FN_ATTRS256
				243	_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
				244	return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
				245	(__v8hi)__W,
				246	(__mmask8)__U);
				247	}
				248
				249	/// Convert Packed Single Data to Packed BF16 Data.
				250	///
				251	/// \headerfile <x86intrin.h>
				252	///
				253	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				254	///
				255	/// \param __A
				256	/// A 256-bit vector of [8 x float].
				257	/// \param __U
				258	/// A 8-bit mask value specifying what is chosen for each element.
				259	/// A 1 means conversion of __A. A 0 means element is zero.
				260	/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
				261	static __inline__ __m128bh __DEFAULT_FN_ATTRS256
				262	_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
				263	return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
				264	(__v8hi)_mm_setzero_si128(),
				265	(__mmask8)__U);
				266	}
				267
				268	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
				269	///
				270	/// \headerfile <x86intrin.h>
				271	///
				272	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
				273	///
				274	/// \param __A
				275	/// A 128-bit vector of [8 x bfloat].
				276	/// \param __B
				277	/// A 128-bit vector of [8 x bfloat].
				278	/// \param __D
				279	/// A 128-bit vector of [4 x float].
				280	/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
				281	/// __A, __B and __D
				282	static __inline__ __m128 __DEFAULT_FN_ATTRS128
				283	_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
				284	return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
				285	(__v4si)__A,
				286	(__v4si)__B);
				287	}
				288
				289	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
				290	///
				291	/// \headerfile <x86intrin.h>
				292	///
				293	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
				294	///
				295	/// \param __A
				296	/// A 128-bit vector of [8 x bfloat].
				297	/// \param __B
				298	/// A 128-bit vector of [8 x bfloat].
				299	/// \param __D
				300	/// A 128-bit vector of [4 x float].
				301	/// \param __U
				302	/// A 8-bit mask value specifying what is chosen for each element.
				303	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
				304	/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
				305	/// __A, __B and __D
				306	static __inline__ __m128 __DEFAULT_FN_ATTRS128
				307	_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
				308	return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
				309	(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
				310	(__v4sf)__D);
				311	}
				312
				313	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
				314	///
				315	/// \headerfile <x86intrin.h>
				316	///
				317	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
				318	///
				319	/// \param __A
				320	/// A 128-bit vector of [8 x bfloat].
				321	/// \param __B
				322	/// A 128-bit vector of [8 x bfloat].
				323	/// \param __D
				324	/// A 128-bit vector of [4 x float].
				325	/// \param __U
				326	/// A 8-bit mask value specifying what is chosen for each element.
				327	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
				328	/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
				329	/// __A, __B and __D
				330	static __inline__ __m128 __DEFAULT_FN_ATTRS128
				331	_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
				332	return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
				333	(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
				334	(__v4sf)_mm_setzero_si128());
				335	}
				336
				337	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
				338	///
				339	/// \headerfile <x86intrin.h>
				340	///
				341	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
				342	///
				343	/// \param __A
				344	/// A 256-bit vector of [16 x bfloat].
				345	/// \param __B
				346	/// A 256-bit vector of [16 x bfloat].
				347	/// \param __D
				348	/// A 256-bit vector of [8 x float].
				349	/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
				350	/// __A, __B and __D
				351	static __inline__ __m256 __DEFAULT_FN_ATTRS256
				352	_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
				353	return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
				354	(__v8si)__A,
				355	(__v8si)__B);
				356	}
				357
				358	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
				359	///
				360	/// \headerfile <x86intrin.h>
				361	///
				362	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
				363	///
				364	/// \param __A
				365	/// A 256-bit vector of [16 x bfloat].
				366	/// \param __B
				367	/// A 256-bit vector of [16 x bfloat].
				368	/// \param __D
				369	/// A 256-bit vector of [8 x float].
				370	/// \param __U
				371	/// A 16-bit mask value specifying what is chosen for each element.
				372	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
				373	/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
				374	/// __A, __B and __D
				375	static __inline__ __m256 __DEFAULT_FN_ATTRS256
				376	_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
				377	return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
				378	(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
				379	(__v8sf)__D);
				380	}
				381
				382	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
				383	///
				384	/// \headerfile <x86intrin.h>
				385	///
				386	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
				387	///
				388	/// \param __A
				389	/// A 256-bit vector of [16 x bfloat].
				390	/// \param __B
				391	/// A 256-bit vector of [16 x bfloat].
				392	/// \param __D
				393	/// A 256-bit vector of [8 x float].
				394	/// \param __U
				395	/// A 8-bit mask value specifying what is chosen for each element.
				396	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
				397	/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
				398	/// __A, __B and __D
				399	static __inline__ __m256 __DEFAULT_FN_ATTRS256
				400	_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
				401	return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
				402	(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
				403	(__v8sf)_mm256_setzero_si256());
				404	}
				405
				406	/// Convert One Single float Data to One BF16 Data.
				407	///
				408	/// \headerfile <x86intrin.h>
				409	///
				410	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
				411	///
				412	/// \param __A
				413	/// A float data.
				414	/// \returns A bf16 data whose sign field and exponent field keep unchanged,
				415	/// and fraction field is truncated to 7 bits.
				416	static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
				417	__v4sf __V = {__A, 0, 0, 0};
				418	__v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
				419	(__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
				420	return __R[0];
				421	}
				422
				423	/// Convert Packed BF16 Data to Packed float Data.
				424	///
				425	/// \headerfile <x86intrin.h>
				426	///
				427	/// \param __A
				428	/// A 128-bit vector of [8 x bfloat].
				429	/// \returns A 256-bit vector of [8 x float] come from convertion of __A
				430	static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
				431	return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
				432	(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
				433	}
				434
				435	/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
				436	///
				437	/// \headerfile <x86intrin.h>
				438	///
				439	/// \param __U
				440	/// A 8-bit mask. Elements are zeroed out when the corresponding mask
				441	/// bit is not set.
				442	/// \param __A
				443	/// A 128-bit vector of [8 x bfloat].
				444	/// \returns A 256-bit vector of [8 x float] come from convertion of __A
				445	static __inline__ __m256 __DEFAULT_FN_ATTRS256
				446	_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
				447	return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
				448	(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
				449	}
				450
				451	/// Convert Packed BF16 Data to Packed float Data using merging mask.
				452	///
				453	/// \headerfile <x86intrin.h>
				454	///
				455	/// \param __S
				456	/// A 256-bit vector of [8 x float]. Elements are copied from __S when
				457	/// the corresponding mask bit is not set.
				458	/// \param __U
				459	/// A 8-bit mask. Elements are zeroed out when the corresponding mask
				460	/// bit is not set.
				461	/// \param __A
				462	/// A 128-bit vector of [8 x bfloat].
				463	/// \returns A 256-bit vector of [8 x float] come from convertion of __A
				464	static __inline__ __m256 __DEFAULT_FN_ATTRS256
				465	_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
				466	return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
				467	(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
				468	16));
				469	}
				470
				471	#undef __DEFAULT_FN_ATTRS128
				472	#undef __DEFAULT_FN_ATTRS256
				473
				474	#endif