Blame - darwin-x86/lib64/clang/14.0.0/include/avx512vlvnniintrin.h - platform/prebuilts/clang-tools

blob: 0fb29af262f710f107d733eaea3d63a85a20f379 [file] [log] [blame]

Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	1	/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
				2	*
				3	*
				4	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				5	* See https://llvm.org/LICENSE.txt for license information.
				6	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				7	*
				8	*===-----------------------------------------------------------------------===
				9	*/
				10	#ifndef __IMMINTRIN_H
				11	#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
				12	#endif
				13
				14	#ifndef __AVX512VLVNNIINTRIN_H
				15	#define __AVX512VLVNNIINTRIN_H
				16
				17	/* Define the default attributes for the functions in this file:
					 * always_inline, nodebug, the "avx512vl,avx512vnni" target features,
					 * and a minimum vector width of 128 or 256 bits (one macro per
					 * operand width). */
				18	#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
				19	#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
				20
				21	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
				22	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
				23	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				24	/// in \a S, and store the packed 32-bit results in DST.
				25	///
				26	/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
				27	///
				28	/// \operation
				29	/// FOR j := 0 to 7
				30	/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
				31	/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
				32	/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
				33	/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
				34	/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
				35	/// ENDFOR
				36	/// DST[MAX:256] := 0
				37	/// \endoperation
				38	#define _mm256_dpbusd_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	39	((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	40
				41	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
				42	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
				43	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				44	/// in \a S using signed saturation, and store the packed 32-bit results in DST.
				45	///
				46	/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
				47	///
				48	/// \operation
				49	/// FOR j := 0 to 7
				50	/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
				51	/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
				52	/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
				53	/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
				54	/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
				55	/// ENDFOR
				56	/// DST[MAX:256] := 0
				57	/// \endoperation
				58	#define _mm256_dpbusds_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	59	((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	60
				61	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
				62	/// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed
				63	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				64	/// \a S, and store the packed 32-bit results in DST.
				65	///
				66	/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
				67	///
				68	/// \operation
				69	/// FOR j := 0 to 7
				70	/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
				71	/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
				72	/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
				73	/// ENDFOR
				74	/// DST[MAX:256] := 0
				75	/// \endoperation
				76	#define _mm256_dpwssd_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	77	((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	78
				79	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
				80	/// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed
				81	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				82	/// \a S using signed saturation, and store the packed 32-bit results in DST.
				83	///
				84	/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
				85	///
				86	/// \operation
				87	/// FOR j := 0 to 7
				88	/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
				89	/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
				90	/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
				91	/// ENDFOR
				92	/// DST[MAX:256] := 0
				93	/// \endoperation
				94	#define _mm256_dpwssds_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	95	((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	96
				97	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
				98	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
				99	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				100	/// in \a S, and store the packed 32-bit results in DST.
				101	///
				102	/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
				103	///
				104	/// \operation
				105	/// FOR j := 0 to 3
				106	/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
				107	/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
				108	/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
				109	/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
				110	/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
				111	/// ENDFOR
				112	/// DST[MAX:128] := 0
				113	/// \endoperation
				114	#define _mm_dpbusd_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	115	((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	116
				117	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
				118	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
				119	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				120	/// in \a S using signed saturation, and store the packed 32-bit results in DST.
				121	///
				122	/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
				123	///
				124	/// \operation
				125	/// FOR j := 0 to 3
				126	/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
				127	/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
				128	/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
				129	/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
				130	/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
				131	/// ENDFOR
				132	/// DST[MAX:128] := 0
				133	/// \endoperation
				134	#define _mm_dpbusds_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	135	((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	136
				137	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
				138	/// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed
				139	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				140	/// \a S, and store the packed 32-bit results in DST.
				141	///
				142	/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
				143	///
				144	/// \operation
				145	/// FOR j := 0 to 3
				146	/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
				147	/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
				148	/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
				149	/// ENDFOR
				150	/// DST[MAX:128] := 0
				151	/// \endoperation
				152	#define _mm_dpwssd_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	153	((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	154
				155	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
				156	/// corresponding signed 16-bit integers in \a B, producing 2 intermediate signed
				157	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				158	/// \a S using signed saturation, and store the packed 32-bit results in DST.
				159	///
				160	/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
				161	///
				162	/// \operation
				163	/// FOR j := 0 to 3
				164	/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
				165	/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
				166	/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
				167	/// ENDFOR
				168	/// DST[MAX:128] := 0
				169	/// \endoperation
				170	#define _mm_dpwssds_epi32(S, A, B) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	171	((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	172
				173	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				174	_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
				175	{
					/* Merge-masked form of _mm256_dpbusd_epi32 (VPDPBUSD): the dot-product
					 * result and the accumulator __S are passed to the select builtin
					 * under mask __U. Per the Intel intrinsic definition, a 32-bit lane
					 * takes the new result when its bit in __U is set and keeps the
					 * value from __S otherwise. */
				176	return (__m256i)__builtin_ia32_selectd_256(__U,
				177	(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
				178	(__v8si)__S);
				179	}
				180
				181	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				182	_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
				183	{
					/* Zero-masked form of _mm256_dpbusd_epi32 (VPDPBUSD): the dot-product
					 * result and an all-zero vector are passed to the select builtin
					 * under mask __U. Per the Intel intrinsic definition, a 32-bit lane
					 * takes the new result when its bit in __U is set and is zeroed
					 * otherwise. Note that the mask __U is the first parameter here. */
				184	return (__m256i)__builtin_ia32_selectd_256(__U,
				185	(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
				186	(__v8si)_mm256_setzero_si256());
				187	}
				188
				189	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				190	_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
				191	{
					/* Merge-masked form of _mm256_dpbusds_epi32 (VPDPBUSDS, the signed
					 * saturating variant): the result and the accumulator __S are passed
					 * to the select builtin under mask __U. Per the Intel intrinsic
					 * definition, a 32-bit lane takes the new result when its bit in
					 * __U is set and keeps the value from __S otherwise. */
				192	return (__m256i)__builtin_ia32_selectd_256(__U,
				193	(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
				194	(__v8si)__S);
				195	}
				196
				197	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				198	_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
				199	{
					/* Zero-masked form of _mm256_dpbusds_epi32 (VPDPBUSDS, the signed
					 * saturating variant): the result and an all-zero vector are passed
					 * to the select builtin under mask __U. Per the Intel intrinsic
					 * definition, a 32-bit lane takes the new result when its bit in
					 * __U is set and is zeroed otherwise. Note that the mask __U is the
					 * first parameter here. */
				200	return (__m256i)__builtin_ia32_selectd_256(__U,
				201	(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
				202	(__v8si)_mm256_setzero_si256());
				203	}
				204
				205	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				206	_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
				207	{
					/* Merge-masked form of _mm256_dpwssd_epi32 (VPDPWSSD): the dot-product
					 * result and the accumulator __S are passed to the select builtin
					 * under mask __U. Per the Intel intrinsic definition, a 32-bit lane
					 * takes the new result when its bit in __U is set and keeps the
					 * value from __S otherwise. */
				208	return (__m256i)__builtin_ia32_selectd_256(__U,
				209	(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
				210	(__v8si)__S);
				211	}
				212
				213	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				214	_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
				215	{
					/* Zero-masked form of _mm256_dpwssd_epi32 (VPDPWSSD): the dot-product
					 * result and an all-zero vector are passed to the select builtin
					 * under mask __U. Per the Intel intrinsic definition, a 32-bit lane
					 * takes the new result when its bit in __U is set and is zeroed
					 * otherwise. Note that the mask __U is the first parameter here. */
				216	return (__m256i)__builtin_ia32_selectd_256(__U,
				217	(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
				218	(__v8si)_mm256_setzero_si256());
				219	}
				220
				221	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				222	_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
				223	{
					/* Merge-masked form of _mm256_dpwssds_epi32 (VPDPWSSDS, the signed
					 * saturating variant): the result and the accumulator __S are passed
					 * to the select builtin under mask __U. Per the Intel intrinsic
					 * definition, a 32-bit lane takes the new result when its bit in
					 * __U is set and keeps the value from __S otherwise. */
				224	return (__m256i)__builtin_ia32_selectd_256(__U,
				225	(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
				226	(__v8si)__S);
				227	}
				228
				229	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				230	_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
				231	{
					/* Zero-masked form of _mm256_dpwssds_epi32 (VPDPWSSDS, the signed
					 * saturating variant): the result and an all-zero vector are passed
					 * to the select builtin under mask __U. Per the Intel intrinsic
					 * definition, a 32-bit lane takes the new result when its bit in
					 * __U is set and is zeroed otherwise. Note that the mask __U is the
					 * first parameter here. */
				232	return (__m256i)__builtin_ia32_selectd_256(__U,
				233	(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
				234	(__v8si)_mm256_setzero_si256());
				235	}
				236
				237	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				238	_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
				239	{
					/* Merge-masked form of _mm_dpbusd_epi32 (VPDPBUSD, 128-bit): the
					 * dot-product result and the accumulator __S are passed to the
					 * select builtin under mask __U. Per the Intel intrinsic definition,
					 * a 32-bit lane takes the new result when its bit in __U is set and
					 * keeps the value from __S otherwise. */
				240	return (__m128i)__builtin_ia32_selectd_128(__U,
				241	(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
				242	(__v4si)__S);
				243	}
				244
				245	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				246	_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
				247	{
					/* Zero-masked form of _mm_dpbusd_epi32 (VPDPBUSD, 128-bit): the
					 * dot-product result and an all-zero vector are passed to the select
					 * builtin under mask __U. Per the Intel intrinsic definition, a
					 * 32-bit lane takes the new result when its bit in __U is set and is
					 * zeroed otherwise. Note that the mask __U is the first parameter
					 * here. */
				248	return (__m128i)__builtin_ia32_selectd_128(__U,
				249	(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
				250	(__v4si)_mm_setzero_si128());
				251	}
				252
				253	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				254	_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
				255	{
					/* Merge-masked form of _mm_dpbusds_epi32 (VPDPBUSDS, the signed
					 * saturating variant, 128-bit): the result and the accumulator __S
					 * are passed to the select builtin under mask __U. Per the Intel
					 * intrinsic definition, a 32-bit lane takes the new result when its
					 * bit in __U is set and keeps the value from __S otherwise. */
				256	return (__m128i)__builtin_ia32_selectd_128(__U,
				257	(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
				258	(__v4si)__S);
				259	}
				260
				261	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				262	_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
				263	{
					/* Zero-masked form of _mm_dpbusds_epi32 (VPDPBUSDS, the signed
					 * saturating variant, 128-bit): the result and an all-zero vector
					 * are passed to the select builtin under mask __U. Per the Intel
					 * intrinsic definition, a 32-bit lane takes the new result when its
					 * bit in __U is set and is zeroed otherwise. Note that the mask __U
					 * is the first parameter here. */
				264	return (__m128i)__builtin_ia32_selectd_128(__U,
				265	(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
				266	(__v4si)_mm_setzero_si128());
				267	}
				268
				269	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				270	_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
				271	{
					/* Merge-masked form of _mm_dpwssd_epi32 (VPDPWSSD, 128-bit): the
					 * dot-product result and the accumulator __S are passed to the
					 * select builtin under mask __U. Per the Intel intrinsic definition,
					 * a 32-bit lane takes the new result when its bit in __U is set and
					 * keeps the value from __S otherwise. */
				272	return (__m128i)__builtin_ia32_selectd_128(__U,
				273	(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
				274	(__v4si)__S);
				275	}
				276
				277	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				278	_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
				279	{
					/* Zero-masked form of _mm_dpwssd_epi32 (VPDPWSSD, 128-bit): the
					 * dot-product result and an all-zero vector are passed to the select
					 * builtin under mask __U. Per the Intel intrinsic definition, a
					 * 32-bit lane takes the new result when its bit in __U is set and is
					 * zeroed otherwise. Note that the mask __U is the first parameter
					 * here. */
				280	return (__m128i)__builtin_ia32_selectd_128(__U,
				281	(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
				282	(__v4si)_mm_setzero_si128());
				283	}
				284
				285	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				286	_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
				287	{
					/* Merge-masked form of _mm_dpwssds_epi32 (VPDPWSSDS, the signed
					 * saturating variant, 128-bit): the result and the accumulator __S
					 * are passed to the select builtin under mask __U. Per the Intel
					 * intrinsic definition, a 32-bit lane takes the new result when its
					 * bit in __U is set and keeps the value from __S otherwise. */
				288	return (__m128i)__builtin_ia32_selectd_128(__U,
				289	(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
				290	(__v4si)__S);
				291	}
				292
				293	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				294	_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
				295	{
					/* Zero-masked form of _mm_dpwssds_epi32 (VPDPWSSDS, the signed
					 * saturating variant, 128-bit): the result and an all-zero vector
					 * are passed to the select builtin under mask __U. Per the Intel
					 * intrinsic definition, a 32-bit lane takes the new result when its
					 * bit in __U is set and is zeroed otherwise. Note that the mask __U
					 * is the first parameter here. */
				296	return (__m128i)__builtin_ia32_selectd_128(__U,
				297	(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
				298	(__v4si)_mm_setzero_si128());
				299	}
				300
				301	#undef __DEFAULT_FN_ATTRS128
				302	#undef __DEFAULT_FN_ATTRS256
				303
				304	#endif