Blame - darwin-x86/lib64/clang/14.0.1/include/avxvnniintrin.h - platform/prebuilts/clang-tools

blob: ad45cb7962e5451626f193830bd003b1451eaeab [file] [log] [blame]

Pirama Arumuga Nainar	986b880	2021-06-03 16:00:34 -0700	[diff] [blame]	1	/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
				2	*
				3	*
				4	* Permission is hereby granted, free of charge, to any person obtaining a copy
				5	* of this software and associated documentation files (the "Software"), to deal
				6	* in the Software without restriction, including without limitation the rights
				7	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				8	* copies of the Software, and to permit persons to whom the Software is
				9	* furnished to do so, subject to the following conditions:
				10	*
				11	* The above copyright notice and this permission notice shall be included in
				12	* all copies or substantial portions of the Software.
				13	*
				14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				17	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				18	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				19	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				20	* THE SOFTWARE.
				21	*
				22	*===-----------------------------------------------------------------------===
				23	*/
				24	#ifndef __IMMINTRIN_H
				25	#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
				26	#endif
				27
				28	#ifndef __AVXVNNIINTRIN_H
				29	#define __AVXVNNIINTRIN_H
				30
				31	/* The following intrinsics, defined in avx512vlvnniintrin.h, can be used for AVXVNNI: */
				32	/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
				33	/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
				34	/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
				35	/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
				36	/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
				37	/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
				38	/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
				39	/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
				40
				41	/* Intrinsics with _avx_ in their names (e.g. _mm256_dpbusd_avx_epi32) are for compatibility with MSVC. */
				42	/* Define the default attributes for the functions in this file: always inlined, no debug info, "avxvnni" target feature, and a minimum vector width of 256 or 128 bits respectively. */
				43	#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
				44	#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
				45
				46	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
				47	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
				48	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				49	/// in \a __S, and store the packed 32-bit results in DST.
				50	///
				51	/// This intrinsic corresponds to the <c> VPDPBUSD </c> instruction.
				52	///
				53	/// \operation
				54	///    FOR j := 0 to 7
				55	///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
				56	///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
				57	///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
				58	///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
				59	///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
				60	///    ENDFOR
				61	///    DST[MAX:256] := 0
				62	/// \endoperation
				63	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				64	_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
				65	{
				66	return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
				67	}
				68
				69	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
				70	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
				71	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				72	/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
				73	///
				74	/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instruction.
				75	///
				76	/// \operation
				77	///    FOR j := 0 to 7
				78	///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
				79	///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
				80	///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
				81	///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
				82	///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
				83	///    ENDFOR
				84	///    DST[MAX:256] := 0
				85	/// \endoperation
				86	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				87	_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
				88	{
				89	return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
				90	}
				91
				92	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
				93	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate signed
				94	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				95	/// \a __S, and store the packed 32-bit results in DST.
				96	///
				97	/// This intrinsic corresponds to the <c> VPDPWSSD </c> instruction.
				98	///
				99	/// \operation
				100	///    FOR j := 0 to 7
				101	///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
				102	///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
				103	///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
				104	///    ENDFOR
				105	///    DST[MAX:256] := 0
				106	/// \endoperation
				107	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				108	_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
				109	{
				110	return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
				111	}
				112
				113	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
				114	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate signed
				115	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				116	/// \a __S using signed saturation, and store the packed 32-bit results in DST.
				117	///
				118	/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instruction.
				119	///
				120	/// \operation
				121	///    FOR j := 0 to 7
				122	///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
				123	///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
				124	///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
				125	///    ENDFOR
				126	///    DST[MAX:256] := 0
				127	/// \endoperation
				128	static __inline__ __m256i __DEFAULT_FN_ATTRS256
				129	_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
				130	{
				131	return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
				132	}
				133
				134	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
				135	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
				136	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				137	/// in \a __S, and store the packed 32-bit results in DST.
				138	///
				139	/// This intrinsic corresponds to the <c> VPDPBUSD </c> instruction.
				140	///
				141	/// \operation
				142	///    FOR j := 0 to 3
				143	///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
				144	///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
				145	///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
				146	///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
				147	///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
				148	///    ENDFOR
				149	///    DST[MAX:128] := 0
				150	/// \endoperation
				151	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				152	_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
				153	{
				154	return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
				155	}
				156
				157	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
				158	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
				159	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
				160	/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
				161	///
				162	/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instruction.
				163	///
				164	/// \operation
				165	///    FOR j := 0 to 3
				166	///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
				167	///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
				168	///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
				169	///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
				170	///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
				171	///    ENDFOR
				172	///    DST[MAX:128] := 0
				173	/// \endoperation
				174	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				175	_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
				176	{
				177	return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
				178	}
				179
				180	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
				181	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate signed
				182	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				183	/// \a __S, and store the packed 32-bit results in DST.
				184	///
				185	/// This intrinsic corresponds to the <c> VPDPWSSD </c> instruction.
				186	///
				187	/// \operation
				188	///    FOR j := 0 to 3
				189	///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
				190	///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
				191	///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
				192	///    ENDFOR
				193	///    DST[MAX:128] := 0
				194	/// \endoperation
				195	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				196	_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
				197	{
				198	return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
				199	}
				200
				201	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
				202	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate signed
				203	/// 32-bit results. Sum these 2 results with the corresponding 32-bit integer in
				204	/// \a __S using signed saturation, and store the packed 32-bit results in DST.
				205	///
				206	/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instruction.
				207	///
				208	/// \operation
				209	///    FOR j := 0 to 3
				210	///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
				211	///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
				212	///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
				213	///    ENDFOR
				214	///    DST[MAX:128] := 0
				215	/// \endoperation
				216	static __inline__ __m128i __DEFAULT_FN_ATTRS128
				217	_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
				218	{
				219	return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
				220	}
				221
				222	#undef __DEFAULT_FN_ATTRS128
				223	#undef __DEFAULT_FN_ATTRS256
				224
				225	#endif // __AVXVNNIINTRIN_H