Blame - darwin-x86/lib64/clang/14.0.2/include/avxintrin.h - platform/prebuilts/clang-tools

blob: 17fe636911772b0df10fa0922c4052303cf4210e [file] [log] [blame]

Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
				2	*
Logan Chien	df4f766	2019-09-04 16:45:23 -0700	[diff] [blame]	3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	* See https://llvm.org/LICENSE.txt for license information.
				5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	6	*
				7	*===-----------------------------------------------------------------------===
				8	*/
				9
				10	#ifndef __IMMINTRIN_H
				11	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
				12	#endif
				13
				14	#ifndef __AVXINTRIN_H
				15	#define __AVXINTRIN_H
				16
				17	typedef double __v4df __attribute__ ((__vector_size__ (32)));
				18	typedef float __v8sf __attribute__ ((__vector_size__ (32)));
				19	typedef long long __v4di __attribute__ ((__vector_size__ (32)));
				20	typedef int __v8si __attribute__ ((__vector_size__ (32)));
				21	typedef short __v16hi __attribute__ ((__vector_size__ (32)));
				22	typedef char __v32qi __attribute__ ((__vector_size__ (32)));
				23
				24	/* Unsigned types */
				25	typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
				26	typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
				27	typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
				28	typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
				29
				30	/* We need an explicitly signed variant for char. Note that this shouldn't
				31	* appear in the interface though. */
				32	typedef signed char __v32qs __attribute__((__vector_size__(32)));
				33
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	34	typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
				35	typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
				36	typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
				37
				38	typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
				39	typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
				40	typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	41
				42	/* Define the default attributes for the functions in this file. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	43	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
				44	#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	45
				46	/* Arithmetic */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	47	/// Adds two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	48	///
				49	/// \headerfile <x86intrin.h>
				50	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	51	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	52	///
				53	/// \param __a
				54	/// A 256-bit vector of [4 x double] containing one of the source operands.
				55	/// \param __b
				56	/// A 256-bit vector of [4 x double] containing one of the source operands.
				57	/// \returns A 256-bit vector of [4 x double] containing the sums of both
				58	/// operands.
				59	static __inline __m256d __DEFAULT_FN_ATTRS
				60	_mm256_add_pd(__m256d __a, __m256d __b)
				61	{
				62	return (__m256d)((__v4df)__a+(__v4df)__b);
				63	}
				64
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	65	/// Adds two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	66	///
				67	/// \headerfile <x86intrin.h>
				68	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	69	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	70	///
				71	/// \param __a
				72	/// A 256-bit vector of [8 x float] containing one of the source operands.
				73	/// \param __b
				74	/// A 256-bit vector of [8 x float] containing one of the source operands.
				75	/// \returns A 256-bit vector of [8 x float] containing the sums of both
				76	/// operands.
				77	static __inline __m256 __DEFAULT_FN_ATTRS
				78	_mm256_add_ps(__m256 __a, __m256 __b)
				79	{
				80	return (__m256)((__v8sf)__a+(__v8sf)__b);
				81	}
				82
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	83	/// Subtracts two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	84	///
				85	/// \headerfile <x86intrin.h>
				86	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	87	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	88	///
				89	/// \param __a
				90	/// A 256-bit vector of [4 x double] containing the minuend.
				91	/// \param __b
				92	/// A 256-bit vector of [4 x double] containing the subtrahend.
				93	/// \returns A 256-bit vector of [4 x double] containing the differences between
				94	/// both operands.
				95	static __inline __m256d __DEFAULT_FN_ATTRS
				96	_mm256_sub_pd(__m256d __a, __m256d __b)
				97	{
				98	return (__m256d)((__v4df)__a-(__v4df)__b);
				99	}
				100
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	101	/// Subtracts two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	102	///
				103	/// \headerfile <x86intrin.h>
				104	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	105	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	106	///
				107	/// \param __a
				108	/// A 256-bit vector of [8 x float] containing the minuend.
				109	/// \param __b
				110	/// A 256-bit vector of [8 x float] containing the subtrahend.
				111	/// \returns A 256-bit vector of [8 x float] containing the differences between
				112	/// both operands.
				113	static __inline __m256 __DEFAULT_FN_ATTRS
				114	_mm256_sub_ps(__m256 __a, __m256 __b)
				115	{
				116	return (__m256)((__v8sf)__a-(__v8sf)__b);
				117	}
				118
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	119	/// Adds the even-indexed values and subtracts the odd-indexed values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	120	/// two 256-bit vectors of [4 x double].
				121	///
				122	/// \headerfile <x86intrin.h>
				123	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	124	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	125	///
				126	/// \param __a
				127	/// A 256-bit vector of [4 x double] containing the left source operand.
				128	/// \param __b
				129	/// A 256-bit vector of [4 x double] containing the right source operand.
				130	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
				131	/// and differences between both operands.
				132	static __inline __m256d __DEFAULT_FN_ATTRS
				133	_mm256_addsub_pd(__m256d __a, __m256d __b)
				134	{
				135	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
				136	}
				137
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	138	/// Adds the even-indexed values and subtracts the odd-indexed values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	139	/// two 256-bit vectors of [8 x float].
				140	///
				141	/// \headerfile <x86intrin.h>
				142	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	143	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	144	///
				145	/// \param __a
				146	/// A 256-bit vector of [8 x float] containing the left source operand.
				147	/// \param __b
				148	/// A 256-bit vector of [8 x float] containing the right source operand.
				149	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
				150	/// differences between both operands.
				151	static __inline __m256 __DEFAULT_FN_ATTRS
				152	_mm256_addsub_ps(__m256 __a, __m256 __b)
				153	{
				154	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
				155	}
				156
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	157	/// Divides two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	158	///
				159	/// \headerfile <x86intrin.h>
				160	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	161	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	162	///
				163	/// \param __a
				164	/// A 256-bit vector of [4 x double] containing the dividend.
				165	/// \param __b
				166	/// A 256-bit vector of [4 x double] containing the divisor.
				167	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
				168	/// operands.
				169	static __inline __m256d __DEFAULT_FN_ATTRS
				170	_mm256_div_pd(__m256d __a, __m256d __b)
				171	{
				172	return (__m256d)((__v4df)__a/(__v4df)__b);
				173	}
				174
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	175	/// Divides two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	176	///
				177	/// \headerfile <x86intrin.h>
				178	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	179	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	180	///
				181	/// \param __a
				182	/// A 256-bit vector of [8 x float] containing the dividend.
				183	/// \param __b
				184	/// A 256-bit vector of [8 x float] containing the divisor.
				185	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
				186	/// operands.
				187	static __inline __m256 __DEFAULT_FN_ATTRS
				188	_mm256_div_ps(__m256 __a, __m256 __b)
				189	{
				190	return (__m256)((__v8sf)__a/(__v8sf)__b);
				191	}
				192
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	193	/// Compares two 256-bit vectors of [4 x double] and returns the greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	194	/// of each pair of values.
				195	///
				196	/// \headerfile <x86intrin.h>
				197	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	198	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	199	///
				200	/// \param __a
				201	/// A 256-bit vector of [4 x double] containing one of the operands.
				202	/// \param __b
				203	/// A 256-bit vector of [4 x double] containing one of the operands.
				204	/// \returns A 256-bit vector of [4 x double] containing the maximum values
				205	/// between both operands.
				206	static __inline __m256d __DEFAULT_FN_ATTRS
				207	_mm256_max_pd(__m256d __a, __m256d __b)
				208	{
				209	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
				210	}
				211
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	212	/// Compares two 256-bit vectors of [8 x float] and returns the greater
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	213	/// of each pair of values.
				214	///
				215	/// \headerfile <x86intrin.h>
				216	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	217	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	218	///
				219	/// \param __a
				220	/// A 256-bit vector of [8 x float] containing one of the operands.
				221	/// \param __b
				222	/// A 256-bit vector of [8 x float] containing one of the operands.
				223	/// \returns A 256-bit vector of [8 x float] containing the maximum values
				224	/// between both operands.
				225	static __inline __m256 __DEFAULT_FN_ATTRS
				226	_mm256_max_ps(__m256 __a, __m256 __b)
				227	{
				228	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
				229	}
				230
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	231	/// Compares two 256-bit vectors of [4 x double] and returns the lesser
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	232	/// of each pair of values.
				233	///
				234	/// \headerfile <x86intrin.h>
				235	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	236	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	237	///
				238	/// \param __a
				239	/// A 256-bit vector of [4 x double] containing one of the operands.
				240	/// \param __b
				241	/// A 256-bit vector of [4 x double] containing one of the operands.
				242	/// \returns A 256-bit vector of [4 x double] containing the minimum values
				243	/// between both operands.
				244	static __inline __m256d __DEFAULT_FN_ATTRS
				245	_mm256_min_pd(__m256d __a, __m256d __b)
				246	{
				247	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
				248	}
				249
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	250	/// Compares two 256-bit vectors of [8 x float] and returns the lesser
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	251	/// of each pair of values.
				252	///
				253	/// \headerfile <x86intrin.h>
				254	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	255	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	256	///
				257	/// \param __a
				258	/// A 256-bit vector of [8 x float] containing one of the operands.
				259	/// \param __b
				260	/// A 256-bit vector of [8 x float] containing one of the operands.
				261	/// \returns A 256-bit vector of [8 x float] containing the minimum values
				262	/// between both operands.
				263	static __inline __m256 __DEFAULT_FN_ATTRS
				264	_mm256_min_ps(__m256 __a, __m256 __b)
				265	{
				266	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
				267	}
				268
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	269	/// Multiplies two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	270	///
				271	/// \headerfile <x86intrin.h>
				272	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	273	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	274	///
				275	/// \param __a
				276	/// A 256-bit vector of [4 x double] containing one of the operands.
				277	/// \param __b
				278	/// A 256-bit vector of [4 x double] containing one of the operands.
				279	/// \returns A 256-bit vector of [4 x double] containing the products of both
				280	/// operands.
				281	static __inline __m256d __DEFAULT_FN_ATTRS
				282	_mm256_mul_pd(__m256d __a, __m256d __b)
				283	{
				284	return (__m256d)((__v4df)__a * (__v4df)__b);
				285	}
				286
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	287	/// Multiplies two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	288	///
				289	/// \headerfile <x86intrin.h>
				290	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	291	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	292	///
				293	/// \param __a
				294	/// A 256-bit vector of [8 x float] containing one of the operands.
				295	/// \param __b
				296	/// A 256-bit vector of [8 x float] containing one of the operands.
				297	/// \returns A 256-bit vector of [8 x float] containing the products of both
				298	/// operands.
				299	static __inline __m256 __DEFAULT_FN_ATTRS
				300	_mm256_mul_ps(__m256 __a, __m256 __b)
				301	{
				302	return (__m256)((__v8sf)__a * (__v8sf)__b);
				303	}
				304
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	305	/// Calculates the square roots of the values in a 256-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	306	/// [4 x double].
				307	///
				308	/// \headerfile <x86intrin.h>
				309	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	310	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	311	///
				312	/// \param __a
				313	/// A 256-bit vector of [4 x double].
				314	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
				315	/// values in the operand.
				316	static __inline __m256d __DEFAULT_FN_ATTRS
				317	_mm256_sqrt_pd(__m256d __a)
				318	{
				319	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
				320	}
				321
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	322	/// Calculates the square roots of the values in a 256-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	323	/// [8 x float].
				324	///
				325	/// \headerfile <x86intrin.h>
				326	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	327	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	328	///
				329	/// \param __a
				330	/// A 256-bit vector of [8 x float].
				331	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
				332	/// values in the operand.
				333	static __inline __m256 __DEFAULT_FN_ATTRS
				334	_mm256_sqrt_ps(__m256 __a)
				335	{
				336	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
				337	}
				338
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	339	/// Calculates the reciprocal square roots of the values in a 256-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	340	/// vector of [8 x float].
				341	///
				342	/// \headerfile <x86intrin.h>
				343	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	344	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	345	///
				346	/// \param __a
				347	/// A 256-bit vector of [8 x float].
				348	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
				349	/// roots of the values in the operand.
				350	static __inline __m256 __DEFAULT_FN_ATTRS
				351	_mm256_rsqrt_ps(__m256 __a)
				352	{
				353	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
				354	}
				355
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	356	/// Calculates the reciprocals of the values in a 256-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	357	/// [8 x float].
				358	///
				359	/// \headerfile <x86intrin.h>
				360	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	361	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	362	///
				363	/// \param __a
				364	/// A 256-bit vector of [8 x float].
				365	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
				366	/// values in the operand.
				367	static __inline __m256 __DEFAULT_FN_ATTRS
				368	_mm256_rcp_ps(__m256 __a)
				369	{
				370	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
				371	}
				372
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	404
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	436
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
				453
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
				471
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
				488
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
				505
				506	/* Logical */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	507	/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	508	///
				509	/// \headerfile <x86intrin.h>
				510	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	511	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	512	///
				513	/// \param __a
				514	/// A 256-bit vector of [4 x double] containing one of the source operands.
				515	/// \param __b
				516	/// A 256-bit vector of [4 x double] containing one of the source operands.
				517	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				518	/// values between both operands.
				519	static __inline __m256d __DEFAULT_FN_ATTRS
				520	_mm256_and_pd(__m256d __a, __m256d __b)
				521	{
				522	return (__m256d)((__v4du)__a & (__v4du)__b);
				523	}
				524
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	525	/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	526	///
				527	/// \headerfile <x86intrin.h>
				528	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	529	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	530	///
				531	/// \param __a
				532	/// A 256-bit vector of [8 x float] containing one of the source operands.
				533	/// \param __b
				534	/// A 256-bit vector of [8 x float] containing one of the source operands.
				535	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				536	/// values between both operands.
				537	static __inline __m256 __DEFAULT_FN_ATTRS
				538	_mm256_and_ps(__m256 __a, __m256 __b)
				539	{
				540	return (__m256)((__v8su)__a & (__v8su)__b);
				541	}
				542
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	543	/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	544	/// the one's complement of the values contained in the first source operand.
				545	///
				546	/// \headerfile <x86intrin.h>
				547	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	548	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	549	///
				550	/// \param __a
				551	/// A 256-bit vector of [4 x double] containing the left source operand. The
				552	/// one's complement of this value is used in the bitwise AND.
				553	/// \param __b
				554	/// A 256-bit vector of [4 x double] containing the right source operand.
				555	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
				556	/// values of the second operand and the one's complement of the first
				557	/// operand.
				558	static __inline __m256d __DEFAULT_FN_ATTRS
				559	_mm256_andnot_pd(__m256d __a, __m256d __b)
				560	{
				561	return (__m256d)(~(__v4du)__a & (__v4du)__b);
				562	}
				563
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	564	/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	565	/// the one's complement of the values contained in the first source operand.
				566	///
				567	/// \headerfile <x86intrin.h>
				568	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	569	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	570	///
				571	/// \param __a
				572	/// A 256-bit vector of [8 x float] containing the left source operand. The
				573	/// one's complement of this value is used in the bitwise AND.
				574	/// \param __b
				575	/// A 256-bit vector of [8 x float] containing the right source operand.
				576	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
				577	/// values of the second operand and the one's complement of the first
				578	/// operand.
				579	static __inline __m256 __DEFAULT_FN_ATTRS
				580	_mm256_andnot_ps(__m256 __a, __m256 __b)
				581	{
				582	return (__m256)(~(__v8su)__a & (__v8su)__b);
				583	}
				584
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	585	/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	586	///
				587	/// \headerfile <x86intrin.h>
				588	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	589	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	590	///
				591	/// \param __a
				592	/// A 256-bit vector of [4 x double] containing one of the source operands.
				593	/// \param __b
				594	/// A 256-bit vector of [4 x double] containing one of the source operands.
				595	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
				596	/// values between both operands.
				597	static __inline __m256d __DEFAULT_FN_ATTRS
				598	_mm256_or_pd(__m256d __a, __m256d __b)
				599	{
				600	return (__m256d)((__v4du)__a | (__v4du)__b);
				601	}
				602
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	603	/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	604	///
				605	/// \headerfile <x86intrin.h>
				606	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	607	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	608	///
				609	/// \param __a
				610	/// A 256-bit vector of [8 x float] containing one of the source operands.
				611	/// \param __b
				612	/// A 256-bit vector of [8 x float] containing one of the source operands.
				613	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
				614	/// values between both operands.
				615	static __inline __m256 __DEFAULT_FN_ATTRS
				616	_mm256_or_ps(__m256 __a, __m256 __b)
				617	{
				618	return (__m256)((__v8su)__a | (__v8su)__b);
				619	}
				620
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	621	/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	622	///
				623	/// \headerfile <x86intrin.h>
				624	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	625	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	626	///
				627	/// \param __a
				628	/// A 256-bit vector of [4 x double] containing one of the source operands.
				629	/// \param __b
				630	/// A 256-bit vector of [4 x double] containing one of the source operands.
				631	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
				632	/// values between both operands.
				633	static __inline __m256d __DEFAULT_FN_ATTRS
				634	_mm256_xor_pd(__m256d __a, __m256d __b)
				635	{
				636	return (__m256d)((__v4du)__a ^ (__v4du)__b);
				637	}
				638
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	639	/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	640	///
				641	/// \headerfile <x86intrin.h>
				642	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	643	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	644	///
				645	/// \param __a
				646	/// A 256-bit vector of [8 x float] containing one of the source operands.
				647	/// \param __b
				648	/// A 256-bit vector of [8 x float] containing one of the source operands.
				649	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
				650	/// values between both operands.
				651	static __inline __m256 __DEFAULT_FN_ATTRS
				652	_mm256_xor_ps(__m256 __a, __m256 __b)
				653	{
				654	return (__m256)((__v8su)__a ^ (__v8su)__b);
				655	}
				656
				657	/* Horizontal arithmetic */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	658	/// Horizontally adds the adjacent pairs of values contained in two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	659	/// 256-bit vectors of [4 x double].
				660	///
				661	/// \headerfile <x86intrin.h>
				662	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	663	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	664	///
				665	/// \param __a
				666	/// A 256-bit vector of [4 x double] containing one of the source operands.
				667	/// The horizontal sums of the values are returned in the even-indexed
				668	/// elements of a vector of [4 x double].
				669	/// \param __b
				670	/// A 256-bit vector of [4 x double] containing one of the source operands.
				671	/// The horizontal sums of the values are returned in the odd-indexed
				672	/// elements of a vector of [4 x double].
				673	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
				674	/// both operands.
				675	static __inline __m256d __DEFAULT_FN_ATTRS
				676	_mm256_hadd_pd(__m256d __a, __m256d __b)
				677	{
				678	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
				679	}
				680
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	681	/// Horizontally adds the adjacent pairs of values contained in two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	682	/// 256-bit vectors of [8 x float].
				683	///
				684	/// \headerfile <x86intrin.h>
				685	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	686	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	687	///
				688	/// \param __a
				689	/// A 256-bit vector of [8 x float] containing one of the source operands.
				690	/// The horizontal sums of the values are returned in the elements with
				691	/// index 0, 1, 4, 5 of a vector of [8 x float].
				692	/// \param __b
				693	/// A 256-bit vector of [8 x float] containing one of the source operands.
				694	/// The horizontal sums of the values are returned in the elements with
				695	/// index 2, 3, 6, 7 of a vector of [8 x float].
				696	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
				697	/// both operands.
				698	static __inline __m256 __DEFAULT_FN_ATTRS
				699	_mm256_hadd_ps(__m256 __a, __m256 __b)
				700	{
				701	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
				702	}
				703
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	704	/// Horizontally subtracts the adjacent pairs of values contained in two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	705	/// 256-bit vectors of [4 x double].
				706	///
				707	/// \headerfile <x86intrin.h>
				708	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	709	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	710	///
				711	/// \param __a
				712	/// A 256-bit vector of [4 x double] containing one of the source operands.
				713	/// The horizontal differences between the values are returned in the
				714	/// even-indexed elements of a vector of [4 x double].
				715	/// \param __b
				716	/// A 256-bit vector of [4 x double] containing one of the source operands.
				717	/// The horizontal differences between the values are returned in the
				718	/// odd-indexed elements of a vector of [4 x double].
				719	/// \returns A 256-bit vector of [4 x double] containing the horizontal
				720	/// differences of both operands.
				721	static __inline __m256d __DEFAULT_FN_ATTRS
				722	_mm256_hsub_pd(__m256d __a, __m256d __b)
				723	{
				724	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
				725	}
				726
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	727	/// Horizontally subtracts the adjacent pairs of values contained in two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	728	/// 256-bit vectors of [8 x float].
				729	///
				730	/// \headerfile <x86intrin.h>
				731	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	732	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	733	///
				734	/// \param __a
				735	/// A 256-bit vector of [8 x float] containing one of the source operands.
				736	/// The horizontal differences between the values are returned in the
				737	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
				738	/// \param __b
				739	/// A 256-bit vector of [8 x float] containing one of the source operands.
				740	/// The horizontal differences between the values are returned in the
				741	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
				742	/// \returns A 256-bit vector of [8 x float] containing the horizontal
				743	/// differences of both operands.
				744	static __inline __m256 __DEFAULT_FN_ATTRS
				745	_mm256_hsub_ps(__m256 __a, __m256 __b)
				746	{
				747	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
				748	}
				749
				750	/* Vector permutations */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	751	/// Copies the values in a 128-bit vector of [2 x double] as specified
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	752	/// by the 128-bit integer vector operand.
				753	///
				754	/// \headerfile <x86intrin.h>
				755	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	756	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	757	///
				758	/// \param __a
				759	/// A 128-bit vector of [2 x double].
				760	/// \param __c
				761	/// A 128-bit integer vector operand specifying how the values are to be
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	762	/// copied. \n
				763	/// Bit [1]: \n
				764	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				765	/// vector. \n
				766	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				767	/// returned vector. \n
				768	/// Bit [65]: \n
				769	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				770	/// returned vector. \n
				771	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				772	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	773	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	774	static __inline __m128d __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	775	_mm_permutevar_pd(__m128d __a, __m128i __c)
				776	{
				777	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
				778	}
				779
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	780	/// Copies the values in a 256-bit vector of [4 x double] as specified
				781	/// by the 256-bit integer vector operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	782	///
				783	/// \headerfile <x86intrin.h>
				784	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	785	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	786	///
				787	/// \param __a
				788	/// A 256-bit vector of [4 x double].
				789	/// \param __c
				790	/// A 256-bit integer vector operand specifying how the values are to be
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	791	/// copied. \n
				792	/// Bit [1]: \n
				793	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				794	/// vector. \n
				795	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				796	/// returned vector. \n
				797	/// Bit [65]: \n
				798	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				799	/// returned vector. \n
				800	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				801	/// returned vector. \n
				802	/// Bit [129]: \n
				803	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				804	/// returned vector. \n
				805	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				806	/// returned vector. \n
				807	/// Bit [193]: \n
				808	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				809	/// returned vector. \n
				810	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	811	/// returned vector.
				812	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				813	static __inline __m256d __DEFAULT_FN_ATTRS
				814	_mm256_permutevar_pd(__m256d __a, __m256i __c)
				815	{
				816	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
				817	}
				818
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	819	/// Copies the values stored in a 128-bit vector of [4 x float] as
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	820	/// specified by the 128-bit integer vector operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	821	/// \headerfile <x86intrin.h>
				822	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	823	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	824	///
				825	/// \param __a
				826	/// A 128-bit vector of [4 x float].
				827	/// \param __c
				828	/// A 128-bit integer vector operand specifying how the values are to be
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	829	/// copied. \n
				830	/// Bits [1:0]: \n
				831	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				832	/// returned vector. \n
				833	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				834	/// returned vector. \n
				835	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				836	/// returned vector. \n
				837	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				838	/// returned vector. \n
				839	/// Bits [33:32]: \n
				840	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				841	/// returned vector. \n
				842	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				843	/// returned vector. \n
				844	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				845	/// returned vector. \n
				846	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				847	/// returned vector. \n
				848	/// Bits [65:64]: \n
				849	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				850	/// returned vector. \n
				851	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				852	/// returned vector. \n
				853	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				854	/// returned vector. \n
				855	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				856	/// returned vector. \n
				857	/// Bits [97:96]: \n
				858	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				859	/// returned vector. \n
				860	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				861	/// returned vector. \n
				862	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				863	/// returned vector. \n
				864	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				865	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	866	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	867	static __inline __m128 __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	868	_mm_permutevar_ps(__m128 __a, __m128i __c)
				869	{
				870	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
				871	}
				872
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	873	/// Copies the values stored in a 256-bit vector of [8 x float] as
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	874	/// specified by the 256-bit integer vector operand.
				875	///
				876	/// \headerfile <x86intrin.h>
				877	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	878	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	879	///
				880	/// \param __a
				881	/// A 256-bit vector of [8 x float].
				882	/// \param __c
				883	/// A 256-bit integer vector operand specifying how the values are to be
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	884	/// copied. \n
				885	/// Bits [1:0]: \n
				886	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				887	/// returned vector. \n
				888	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				889	/// returned vector. \n
				890	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				891	/// returned vector. \n
				892	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				893	/// returned vector. \n
				894	/// Bits [33:32]: \n
				895	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				896	/// returned vector. \n
				897	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				898	/// returned vector. \n
				899	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				900	/// returned vector. \n
				901	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				902	/// returned vector. \n
				903	/// Bits [65:64]: \n
				904	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				905	/// returned vector. \n
				906	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				907	/// returned vector. \n
				908	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				909	/// returned vector. \n
				910	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				911	/// returned vector. \n
				912	/// Bits [97:96]: \n
				913	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				914	/// returned vector. \n
				915	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				916	/// returned vector. \n
				917	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				918	/// returned vector. \n
				919	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				920	/// returned vector. \n
				921	/// Bits [129:128]: \n
				922	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				923	/// returned vector. \n
				924	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				925	/// returned vector. \n
				926	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				927	/// returned vector. \n
				928	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				929	/// returned vector. \n
				930	/// Bits [161:160]: \n
				931	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				932	/// returned vector. \n
				933	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				934	/// returned vector. \n
				935	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				936	/// returned vector. \n
				937	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				938	/// returned vector. \n
				939	/// Bits [193:192]: \n
				940	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				941	/// returned vector. \n
				942	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				943	/// returned vector. \n
				944	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				945	/// returned vector. \n
				946	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				947	/// returned vector. \n
				948	/// Bits [225:224]: \n
				949	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				950	/// returned vector. \n
				951	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				952	/// returned vector. \n
				953	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				954	/// returned vector. \n
				955	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				956	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	957	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				958	static __inline __m256 __DEFAULT_FN_ATTRS
				959	_mm256_permutevar_ps(__m256 __a, __m256i __c)
				960	{
				961	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
				962	}
				963
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	964	/// Copies the values in a 128-bit vector of [2 x double] as specified
				965	/// by the immediate integer operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	966	///
				967	/// \headerfile <x86intrin.h>
				968	///
				969	/// \code
				970	/// __m128d _mm_permute_pd(__m128d A, const int C);
				971	/// \endcode
				972	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	973	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	974	///
				975	/// \param A
				976	/// A 128-bit vector of [2 x double].
				977	/// \param C
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	978	/// An immediate integer operand specifying how the values are to be
				979	/// copied. \n
				980	/// Bit [0]: \n
				981	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				982	/// vector. \n
				983	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				984	/// returned vector. \n
				985	/// Bit [1]: \n
				986	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				987	/// returned vector. \n
				988	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				989	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	990	/// \returns A 128-bit vector of [2 x double] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	991	#define _mm_permute_pd(A, C) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	992	((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	993
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	994	/// Copies the values in a 256-bit vector of [4 x double] as specified by
				995	/// the immediate integer operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	996	///
				997	/// \headerfile <x86intrin.h>
				998	///
				999	/// \code
				1000	/// __m256d _mm256_permute_pd(__m256d A, const int C);
				1001	/// \endcode
				1002	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1003	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1004	///
				1005	/// \param A
				1006	/// A 256-bit vector of [4 x double].
				1007	/// \param C
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1008	/// An immediate integer operand specifying how the values are to be
				1009	/// copied. \n
				1010	/// Bit [0]: \n
				1011	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
				1012	/// vector. \n
				1013	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
				1014	/// returned vector. \n
				1015	/// Bit [1]: \n
				1016	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
				1017	/// returned vector. \n
				1018	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
				1019	/// returned vector. \n
				1020	/// Bit [2]: \n
				1021	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
				1022	/// returned vector. \n
				1023	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
				1024	/// returned vector. \n
				1025	/// Bit [3]: \n
				1026	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
				1027	/// returned vector. \n
				1028	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
				1029	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1030	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1031	#define _mm256_permute_pd(A, C) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1032	((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1033
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1034	/// Copies the values in a 128-bit vector of [4 x float] as specified by
				1035	/// the immediate integer operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1036	///
				1037	/// \headerfile <x86intrin.h>
				1038	///
				1039	/// \code
				1040	/// __m128 _mm_permute_ps(__m128 A, const int C);
				1041	/// \endcode
				1042	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1043	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1044	///
				1045	/// \param A
				1046	/// A 128-bit vector of [4 x float].
				1047	/// \param C
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1048	/// An immediate integer operand specifying how the values are to be
				1049	/// copied. \n
				1050	/// Bits [1:0]: \n
				1051	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1052	/// returned vector. \n
				1053	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1054	/// returned vector. \n
				1055	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1056	/// returned vector. \n
				1057	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1058	/// returned vector. \n
				1059	/// Bits [3:2]: \n
				1060	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1061	/// returned vector. \n
				1062	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1063	/// returned vector. \n
				1064	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1065	/// returned vector. \n
				1066	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1067	/// returned vector. \n
				1068	/// Bits [5:4]: \n
				1069	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1070	/// returned vector. \n
				1071	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1072	/// returned vector. \n
				1073	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1074	/// returned vector. \n
				1075	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1076	/// returned vector. \n
				1077	/// Bits [7:6]: \n
				1078	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1079	/// returned vector. \n
				1080	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1081	/// returned vector. \n
				1082	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1083	/// returned vector. \n
				1084	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1085	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1086	/// \returns A 128-bit vector of [4 x float] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1087	#define _mm_permute_ps(A, C) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1088	((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1089
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1090	/// Copies the values in a 256-bit vector of [8 x float] as specified by
				1091	/// the immediate integer operand.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1092	///
				1093	/// \headerfile <x86intrin.h>
				1094	///
				1095	/// \code
				1096	/// __m256 _mm256_permute_ps(__m256 A, const int C);
				1097	/// \endcode
				1098	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1099	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1100	///
				1101	/// \param A
				1102	/// A 256-bit vector of [8 x float].
				1103	/// \param C
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1104	/// An immediate integer operand specifying how the values are to be
				1105	/// copied. \n
				1106	/// Bits [1:0]: \n
				1107	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
				1108	/// returned vector. \n
				1109	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
				1110	/// returned vector. \n
				1111	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
				1112	/// returned vector. \n
				1113	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
				1114	/// returned vector. \n
				1115	/// Bits [3:2]: \n
				1116	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
				1117	/// returned vector. \n
				1118	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
				1119	/// returned vector. \n
				1120	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
				1121	/// returned vector. \n
				1122	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
				1123	/// returned vector. \n
				1124	/// Bits [5:4]: \n
				1125	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
				1126	/// returned vector. \n
				1127	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
				1128	/// returned vector. \n
				1129	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
				1130	/// returned vector. \n
				1131	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
				1132	/// returned vector. \n
				1133	/// Bits [7:6]: \n
				1134	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
				1135	/// returned vector. \n
				1136	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
				1137	/// returned vector. \n
				1138	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
				1139	/// returned vector. \n
				1140	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
				1141	/// returned vector. \n
				1142	/// Bits [1:0] (reused for the upper 128-bit lane): \n
				1143	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
				1144	/// returned vector. \n
				1145	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
				1146	/// returned vector. \n
				1147	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
				1148	/// returned vector. \n
				1149	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
				1150	/// returned vector. \n
				1151	/// Bits [3:2] (reused for the upper 128-bit lane): \n
				1152	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
				1153	/// returned vector. \n
				1154	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
				1155	/// returned vector. \n
				1156	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
				1157	/// returned vector. \n
				1158	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
				1159	/// returned vector. \n
				1160	/// Bits [5:4] (reused for the upper 128-bit lane): \n
				1161	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
				1162	/// returned vector. \n
				1163	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
				1164	/// returned vector. \n
				1165	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
				1166	/// returned vector. \n
				1167	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
				1168	/// returned vector. \n
				1169	/// Bits [7:6]: \n
				1170	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
				1171	/// returned vector. \n
				1172	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
				1173	/// returned vector. \n
				1174	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
				1175	/// returned vector. \n
				1176	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
				1177	/// returned vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1178	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1179	#define _mm256_permute_ps(A, C) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1180	((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1181
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1182	/// Permutes 128-bit data values stored in two 256-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1183	/// [4 x double], as specified by the immediate integer operand.
				1184	///
				1185	/// \headerfile <x86intrin.h>
				1186	///
				1187	/// \code
				1188	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
				1189	/// \endcode
				1190	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1191	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1192	///
				1193	/// \param V1
				1194	/// A 256-bit vector of [4 x double].
				1195	/// \param V2
				1196	/// A 256-bit vector of [4 x double].
				1197	/// \param M
				1198	/// An immediate integer operand specifying how the values are to be
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1199	/// permuted. \n
				1200	/// Bits [1:0]: \n
				1201	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1202	/// destination. \n
				1203	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1204	/// destination. \n
				1205	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1206	/// destination. \n
				1207	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1208	/// destination. \n
				1209	/// Bits [5:4]: \n
				1210	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1211	/// destination. \n
				1212	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1213	/// destination. \n
				1214	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1215	/// destination. \n
				1216	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
				1217	/// destination.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1218	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1219	#define _mm256_permute2f128_pd(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1220	((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
				1221	(__v4df)(__m256d)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1222
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1223	/// Permutes 128-bit data values stored in two 256-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1224	/// [8 x float], as specified by the immediate integer operand.
				1225	///
				1226	/// \headerfile <x86intrin.h>
				1227	///
				1228	/// \code
				1229	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
				1230	/// \endcode
				1231	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1232	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1233	///
				1234	/// \param V1
				1235	/// A 256-bit vector of [8 x float].
				1236	/// \param V2
				1237	/// A 256-bit vector of [8 x float].
				1238	/// \param M
				1239	/// An immediate integer operand specifying how the values are to be
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1240	/// permuted. \n
				1241	/// Bits [1:0]: \n
				1242	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1243	/// destination. \n
				1244	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1245	/// destination. \n
				1246	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1247	/// destination. \n
				1248	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1249	/// destination. \n
				1250	/// Bits [5:4]: \n
				1251	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1252	/// destination. \n
				1253	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1254	/// destination. \n
				1255	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1256	/// destination. \n
				1257	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1258	/// destination.
				1259	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1260	#define _mm256_permute2f128_ps(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1261	((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
				1262	(__v8sf)(__m256)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1263
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1264	/// Permutes 128-bit data values stored in two 256-bit integer vectors,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1265	/// as specified by the immediate integer operand.
				1266	///
				1267	/// \headerfile <x86intrin.h>
				1268	///
				1269	/// \code
				1270	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
				1271	/// \endcode
				1272	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1273	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1274	///
				1275	/// \param V1
				1276	/// A 256-bit integer vector.
				1277	/// \param V2
				1278	/// A 256-bit integer vector.
				1279	/// \param M
				1280	/// An immediate integer operand specifying how the values are to be copied. \n
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1281	/// Bits [1:0]: \n
				1282	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
				1283	/// destination. \n
				1284	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
				1285	/// destination. \n
				1286	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
				1287	/// destination. \n
				1288	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
				1289	/// destination. \n
				1290	/// Bits [5:4]: \n
				1291	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
				1292	/// destination. \n
				1293	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
				1294	/// destination. \n
				1295	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
				1296	/// destination. \n
				1297	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1298	/// destination.
				1299	/// \returns A 256-bit integer vector containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1300	#define _mm256_permute2f128_si256(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1301	((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
				1302	(__v8si)(__m256i)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1303
				1304	/* Vector Blend */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1305	/// Merges 64-bit double-precision data values stored in either of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1306	/// two 256-bit vectors of [4 x double], as specified by the immediate
				1307	/// integer operand.
				1308	///
				1309	/// \headerfile <x86intrin.h>
				1310	///
				1311	/// \code
				1312	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
				1313	/// \endcode
				1314	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1315	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1316	///
				1317	/// \param V1
				1318	/// A 256-bit vector of [4 x double].
				1319	/// \param V2
				1320	/// A 256-bit vector of [4 x double].
				1321	/// \param M
				1322	/// An immediate integer operand, with mask bits [3:0] specifying how the
				1323	/// values are to be copied. The position of the mask bit corresponds to the
				1324	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1325	/// element in operand \a V1 is copied to the same position in the
				1326	/// destination. When a mask bit is 1, the corresponding 64-bit element in
				1327	/// operand \a V2 is copied to the same position in the destination.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1328	/// \returns A 256-bit vector of [4 x double] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1329	#define _mm256_blend_pd(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1330	((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
				1331	(__v4df)(__m256d)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1332
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1333	/// Merges 32-bit single-precision data values stored in either of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1334	/// two 256-bit vectors of [8 x float], as specified by the immediate
				1335	/// integer operand.
				1336	///
				1337	/// \headerfile <x86intrin.h>
				1338	///
				1339	/// \code
				1340	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
				1341	/// \endcode
				1342	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1343	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1344	///
				1345	/// \param V1
				1346	/// A 256-bit vector of [8 x float].
				1347	/// \param V2
				1348	/// A 256-bit vector of [8 x float].
				1349	/// \param M
				1350	/// An immediate integer operand, with mask bits [7:0] specifying how the
				1351	/// values are to be copied. The position of the mask bit corresponds to the
				1352	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1353	/// element in operand \a V1 is copied to the same position in the
				1354	/// destination. When a mask bit is 1, the corresponding 32-bit element in
				1355	/// operand \a V2 is copied to the same position in the destination.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1356	/// \returns A 256-bit vector of [8 x float] containing the copied values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1357	#define _mm256_blend_ps(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1358	((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
				1359	(__v8sf)(__m256)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1360
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1361	/// Merges 64-bit double-precision data values stored in either of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1362	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
				1363	/// operand.
				1364	///
				1365	/// \headerfile <x86intrin.h>
				1366	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1367	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1368	///
				1369	/// \param __a
				1370	/// A 256-bit vector of [4 x double].
				1371	/// \param __b
				1372	/// A 256-bit vector of [4 x double].
				1373	/// \param __c
				1374	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
				1375	/// how the values are to be copied. The position of the mask bit corresponds
				1376	/// to the most significant bit of a copied value. When a mask bit is 0, the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1377	/// corresponding 64-bit element in operand \a __a is copied to the same
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1378	/// position in the destination. When a mask bit is 1, the corresponding
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1379	/// 64-bit element in operand \a __b is copied to the same position in the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1380	/// destination.
				1381	/// \returns A 256-bit vector of [4 x double] containing the copied values.
				1382	static __inline __m256d __DEFAULT_FN_ATTRS
				1383	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
				1384	{
				1385	return (__m256d)__builtin_ia32_blendvpd256(
				1386	(__v4df)__a, (__v4df)__b, (__v4df)__c);
				1387	}
				1388
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1389	/// Merges 32-bit single-precision data values stored in either of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1390	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
				1391	/// operand.
				1392	///
				1393	/// \headerfile <x86intrin.h>
				1394	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1395	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1396	///
				1397	/// \param __a
				1398	/// A 256-bit vector of [8 x float].
				1399	/// \param __b
				1400	/// A 256-bit vector of [8 x float].
				1401	/// \param __c
				1402	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
				1403	/// and 31 specifying how the values are to be copied. The position of the
				1404	/// mask bit corresponds to the most significant bit of a copied value. When
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1405	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1406	/// copied to the same position in the destination. When a mask bit is 1, the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1407	/// corresponding 32-bit element in operand \a __b is copied to the same
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1408	/// position in the destination.
				1409	/// \returns A 256-bit vector of [8 x float] containing the copied values.
				1410	static __inline __m256 __DEFAULT_FN_ATTRS
				1411	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
				1412	{
				1413	return (__m256)__builtin_ia32_blendvps256(
				1414	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
				1415	}
				1416
				1417	/* Vector Dot Product */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1418	/// Computes two dot products in parallel, using the lower and upper
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1419	/// halves of two [8 x float] vectors as input to the two computations, and
				1420	/// returning the two dot products in the lower and upper halves of the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1421	/// [8 x float] result.
				1422	///
				1423	/// The immediate integer operand controls which input elements will
				1424	/// contribute to the dot product, and where the final results are returned.
				1425	/// In general, for each dot product, the four corresponding elements of the
				1426	/// input vectors are multiplied; the first two and second two products are
				1427	/// summed, then the two sums are added to form the final result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1428	///
				1429	/// \headerfile <x86intrin.h>
				1430	///
				1431	/// \code
				1432	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
				1433	/// \endcode
				1434	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1435	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1436	///
				1437	/// \param V1
				1438	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1439	/// \param V2
				1440	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
				1441	/// \param M
				1442	/// An immediate integer argument. Bits [7:4] determine which elements of
				1443	/// the input vectors are used, with bit [4] corresponding to the lowest
				1444	/// element and bit [7] corresponding to the highest element of each [4 x
				1445	/// float] subvector. If a bit is set, the corresponding elements from the
				1446	/// two input vectors are used as an input for dot product; otherwise that
				1447	/// input is treated as zero. Bits [3:0] determine which elements of the
				1448	/// result will receive a copy of the final dot product, with bit [0]
				1449	/// corresponding to the lowest element and bit [3] corresponding to the
				1450	/// highest element of each [4 x float] subvector. If a bit is set, the dot
				1451	/// product is returned in the corresponding element; otherwise that element
				1452	/// is set to zero. The bitmask is applied in the same way to each of the
				1453	/// two parallel dot product computations.
				1454	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1455	#define _mm256_dp_ps(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1456	((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
				1457	(__v8sf)(__m256)(V2), (M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1458
				1459	/* Vector shuffle */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1460	/// Selects 8 float values from the 256-bit operands of [8 x float], as
				1461	/// specified by the immediate value operand.
				1462	///
				1463	/// The four selected elements in each operand are copied to the destination
				1464	/// according to the bits specified in the immediate operand. The selected
				1465	/// elements from the first 256-bit operand are copied to bits [63:0] and
				1466	/// bits [191:128] of the destination, and the selected elements from the
				1467	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
				1468	/// the destination. For example, if bits [7:0] of the immediate operand
				1469	/// contain a value of 0xFF, the 256-bit destination vector would contain the
				1470	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1471	///
				1472	/// \headerfile <x86intrin.h>
				1473	///
				1474	/// \code
				1475	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
				1476	/// \endcode
				1477	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1478	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1479	///
				1480	/// \param a
				1481	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1482	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
				1483	/// according to the bits specified in the immediate operand.
				1484	/// \param b
				1485	/// A 256-bit vector of [8 x float]. The four selected elements in this
				1486	/// operand are copied to bits [127:64] and bits [255:192] in the
				1487	/// destination, according to the bits specified in the immediate operand.
				1488	/// \param mask
				1489	/// An immediate value containing an 8-bit value specifying which elements to
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1490	/// copy from \a a and \a b. \n
				1491	/// Bits [3:0] specify the values copied from operand \a a. \n
				1492	/// Bits [7:4] specify the values copied from operand \a b. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1493	/// The destinations within the 256-bit destination are assigned values as
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1494	/// follows, according to the bit value assignments described below: \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1495	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1496	/// destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1497	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1498	/// destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1499	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1500	/// destination. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1501	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1502	/// the destination. \n
				1503	/// Bit value assignments: \n
				1504	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
				1505	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
				1506	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1507	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
				1508	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1509	#define _mm256_shuffle_ps(a, b, mask) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1510	((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
				1511	(__v8sf)(__m256)(b), (int)(mask)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1512
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1513	/// Selects four double-precision values from the 256-bit operands of
				1514	/// [4 x double], as specified by the immediate value operand.
				1515	///
				1516	/// The selected elements from the first 256-bit operand are copied to bits
				1517	/// [63:0] and bits [191:128] in the destination, and the selected elements
				1518	/// from the second 256-bit operand are copied to bits [127:64] and bits
				1519	/// [255:192] in the destination. For example, if bits [3:0] of the immediate
				1520	/// operand contain a value of 0xF, the 256-bit destination vector would
				1521	/// contain the following values: b[3], a[3], b[1], a[1].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1522	///
				1523	/// \headerfile <x86intrin.h>
				1524	///
				1525	/// \code
				1526	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
				1527	/// \endcode
				1528	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1529	/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1530	///
				1531	/// \param a
				1532	/// A 256-bit vector of [4 x double].
				1533	/// \param b
				1534	/// A 256-bit vector of [4 x double].
				1535	/// \param mask
				1536	/// An immediate value whose bits [3:0] specify which elements to
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1537	/// copy from \a a and \a b: \n
				1538	/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
				1539	/// destination. \n
				1540	/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
				1541	/// destination. \n
				1542	/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
				1543	/// destination. \n
				1544	/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
				1545	/// destination. \n
				1546	/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
				1547	/// destination. \n
				1548	/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
				1549	/// destination. \n
				1550	/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
				1551	/// destination. \n
				1552	/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1553	/// destination.
				1554	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1555	#define _mm256_shuffle_pd(a, b, mask) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1556	((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
				1557	(__v4df)(__m256d)(b), (int)(mask)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1558
				1559	/* Compare: predicate values (0x00-0x1f) for the immediate operand of the compare intrinsics, e.g. the c argument of _mm_cmp_pd */
				1560	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
				1561	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
				1562	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
				1563	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
				1564	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
				1565	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
				1566	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1567	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1568	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1569	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1570	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
				1571	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
				1572	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
				1573	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
				1574	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
				1575	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
				1576	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
				1577	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
				1578	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
				1579	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
				1580	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
				1581	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1582	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1583	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
				1584	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1585	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1586	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
				1587	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
				1588	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
				1589	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
				1590	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
				1591	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
				1592
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1593	/// Compares each of the corresponding double-precision values of two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1594	/// 128-bit vectors of [2 x double], using the operation specified by the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1595	/// immediate integer operand.
				1596	///
				1597	/// Returns a [2 x double] vector consisting of two doubles corresponding to
				1598	/// the two comparison results: zero if the comparison is false, and all 1's
				1599	/// if the comparison is true.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1600	///
				1601	/// \headerfile <x86intrin.h>
				1602	///
				1603	/// \code
				1604	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
				1605	/// \endcode
				1606	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1607	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1608	///
				1609	/// \param a
				1610	/// A 128-bit vector of [2 x double].
				1611	/// \param b
				1612	/// A 128-bit vector of [2 x double].
				1613	/// \param c
				1614	/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1615	/// operation to use: \n
				1616	/// 0x00: Equal (ordered, non-signaling) \n
				1617	/// 0x01: Less-than (ordered, signaling) \n
				1618	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1619	/// 0x03: Unordered (non-signaling) \n
				1620	/// 0x04: Not-equal (unordered, non-signaling) \n
				1621	/// 0x05: Not-less-than (unordered, signaling) \n
				1622	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1623	/// 0x07: Ordered (non-signaling) \n
				1624	/// 0x08: Equal (unordered, non-signaling) \n
				1625	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1626	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1627	/// 0x0B: False (ordered, non-signaling) \n
				1628	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1629	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1630	/// 0x0E: Greater-than (ordered, signaling) \n
				1631	/// 0x0F: True (unordered, non-signaling) \n
				1632	/// 0x10: Equal (ordered, signaling) \n
				1633	/// 0x11: Less-than (ordered, non-signaling) \n
				1634	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1635	/// 0x13: Unordered (signaling) \n
				1636	/// 0x14: Not-equal (unordered, signaling) \n
				1637	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1638	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1639	/// 0x17: Ordered (signaling) \n
				1640	/// 0x18: Equal (unordered, signaling) \n
				1641	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1642	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1643	/// 0x1B: False (ordered, signaling) \n
				1644	/// 0x1C: Not-equal (ordered, signaling) \n
				1645	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1646	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1647	/// 0x1F: True (unordered, signaling)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1648	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1649	#define _mm_cmp_pd(a, b, c) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1650	((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
				1651	(__v2df)(__m128d)(b), (c)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1652
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1653	/// Compares each of the corresponding values of two 128-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1654	/// [4 x float], using the operation specified by the immediate integer
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1655	/// operand \a c.
				1656	///
				1657	/// Each of the four elements of the returned [4 x float] vector holds one
				1658	/// comparison result: all bits clear if the comparison is false, and all
				1659	/// bits set if the comparison is true.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1660	///
				1661	/// \headerfile <x86intrin.h>
				1662	///
				1663	/// \code
				1664	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
				1665	/// \endcode
				1666	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1667	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1668	///
				1669	/// \param a
				1670	/// A 128-bit vector of [4 x float].
				1671	/// \param b
				1672	/// A 128-bit vector of [4 x float].
				1673	/// \param c
				1674	/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1675	/// operation to use: \n
				1676	/// 0x00: Equal (ordered, non-signaling) \n
				1677	/// 0x01: Less-than (ordered, signaling) \n
				1678	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1679	/// 0x03: Unordered (non-signaling) \n
				1680	/// 0x04: Not-equal (unordered, non-signaling) \n
				1681	/// 0x05: Not-less-than (unordered, signaling) \n
				1682	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1683	/// 0x07: Ordered (non-signaling) \n
				1684	/// 0x08: Equal (unordered, non-signaling) \n
				1685	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1686	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1687	/// 0x0B: False (ordered, non-signaling) \n
				1688	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1689	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1690	/// 0x0E: Greater-than (ordered, signaling) \n
				1691	/// 0x0F: True (unordered, non-signaling) \n
				1692	/// 0x10: Equal (ordered, signaling) \n
				1693	/// 0x11: Less-than (ordered, non-signaling) \n
				1694	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1695	/// 0x13: Unordered (signaling) \n
				1696	/// 0x14: Not-equal (unordered, signaling) \n
				1697	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1698	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1699	/// 0x17: Ordered (signaling) \n
				1700	/// 0x18: Equal (unordered, signaling) \n
				1701	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1702	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1703	/// 0x1B: False (ordered, signaling) \n
				1704	/// 0x1C: Not-equal (ordered, signaling) \n
				1705	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1706	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1707	/// 0x1F: True (unordered, signaling)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1708	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1709	#define _mm_cmp_ps(a, b, c) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1710	((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
				1711	(__v4sf)(__m128)(b), (c)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1712
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1713	/// Compares each of the corresponding double-precision values of two
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1714	/// 256-bit vectors of [4 x double], using the operation specified by the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1715	/// immediate integer operand \a c.
				1716	///
				1717	/// Each of the four elements of the returned [4 x double] vector holds one
				1718	/// comparison result: all bits clear if the comparison is false, and all
				1719	/// bits set if the comparison is true.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1720	///
				1721	/// \headerfile <x86intrin.h>
				1722	///
				1723	/// \code
				1724	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
				1725	/// \endcode
				1726	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1727	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1728	///
				1729	/// \param a
				1730	/// A 256-bit vector of [4 x double].
				1731	/// \param b
				1732	/// A 256-bit vector of [4 x double].
				1733	/// \param c
				1734	/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1735	/// operation to use: \n
				1736	/// 0x00: Equal (ordered, non-signaling) \n
				1737	/// 0x01: Less-than (ordered, signaling) \n
				1738	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1739	/// 0x03: Unordered (non-signaling) \n
				1740	/// 0x04: Not-equal (unordered, non-signaling) \n
				1741	/// 0x05: Not-less-than (unordered, signaling) \n
				1742	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1743	/// 0x07: Ordered (non-signaling) \n
				1744	/// 0x08: Equal (unordered, non-signaling) \n
				1745	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1746	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1747	/// 0x0B: False (ordered, non-signaling) \n
				1748	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1749	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1750	/// 0x0E: Greater-than (ordered, signaling) \n
				1751	/// 0x0F: True (unordered, non-signaling) \n
				1752	/// 0x10: Equal (ordered, signaling) \n
				1753	/// 0x11: Less-than (ordered, non-signaling) \n
				1754	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1755	/// 0x13: Unordered (signaling) \n
				1756	/// 0x14: Not-equal (unordered, signaling) \n
				1757	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1758	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1759	/// 0x17: Ordered (signaling) \n
				1760	/// 0x18: Equal (unordered, signaling) \n
				1761	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1762	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1763	/// 0x1B: False (ordered, signaling) \n
				1764	/// 0x1C: Not-equal (ordered, signaling) \n
				1765	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1766	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1767	/// 0x1F: True (unordered, signaling)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1768	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1769	#define _mm256_cmp_pd(a, b, c) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1770	((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
				1771	(__v4df)(__m256d)(b), (c)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1772
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1773	/// Compares each of the corresponding values of two 256-bit vectors of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1774	/// [8 x float], using the operation specified by the immediate integer
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1775	/// operand \a c.
				1776	///
				1777	/// Each of the eight elements of the returned [8 x float] vector holds one
				1778	/// comparison result: all bits clear if the comparison is false, and all
				1779	/// bits set if the comparison is true.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1780	///
				1781	/// \headerfile <x86intrin.h>
				1782	///
				1783	/// \code
				1784	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
				1785	/// \endcode
				1786	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1787	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1788	///
				1789	/// \param a
				1790	/// A 256-bit vector of [8 x float].
				1791	/// \param b
				1792	/// A 256-bit vector of [8 x float].
				1793	/// \param c
				1794	/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1795	/// operation to use: \n
				1796	/// 0x00: Equal (ordered, non-signaling) \n
				1797	/// 0x01: Less-than (ordered, signaling) \n
				1798	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1799	/// 0x03: Unordered (non-signaling) \n
				1800	/// 0x04: Not-equal (unordered, non-signaling) \n
				1801	/// 0x05: Not-less-than (unordered, signaling) \n
				1802	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1803	/// 0x07: Ordered (non-signaling) \n
				1804	/// 0x08: Equal (unordered, non-signaling) \n
				1805	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1806	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1807	/// 0x0B: False (ordered, non-signaling) \n
				1808	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1809	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1810	/// 0x0E: Greater-than (ordered, signaling) \n
				1811	/// 0x0F: True (unordered, non-signaling) \n
				1812	/// 0x10: Equal (ordered, signaling) \n
				1813	/// 0x11: Less-than (ordered, non-signaling) \n
				1814	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1815	/// 0x13: Unordered (signaling) \n
				1816	/// 0x14: Not-equal (unordered, signaling) \n
				1817	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1818	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1819	/// 0x17: Ordered (signaling) \n
				1820	/// 0x18: Equal (unordered, signaling) \n
				1821	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1822	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1823	/// 0x1B: False (ordered, signaling) \n
				1824	/// 0x1C: Not-equal (ordered, signaling) \n
				1825	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1826	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1827	/// 0x1F: True (unordered, signaling)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1828	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1829	#define _mm256_cmp_ps(a, b, c) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1830	((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
				1831	(__v8sf)(__m256)(b), (c)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1832
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1833	/// Compares each of the corresponding scalar double-precision values of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1834	/// two 128-bit vectors of [2 x double], using the operation specified by the
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1835	/// immediate integer operand \a c.
				1836	///
				1837	/// If the comparison is true, all 64 bits of the destination vector are set;
				1838	/// otherwise they are cleared.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1839	///
				1840	/// \headerfile <x86intrin.h>
				1841	///
				1842	/// \code
				1843	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
				1844	/// \endcode
				1845	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1846	/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1847	///
				1848	/// \param a
				1849	/// A 128-bit vector of [2 x double].
				1850	/// \param b
				1851	/// A 128-bit vector of [2 x double].
				1852	/// \param c
				1853	/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1854	/// operation to use: \n
				1855	/// 0x00: Equal (ordered, non-signaling) \n
				1856	/// 0x01: Less-than (ordered, signaling) \n
				1857	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1858	/// 0x03: Unordered (non-signaling) \n
				1859	/// 0x04: Not-equal (unordered, non-signaling) \n
				1860	/// 0x05: Not-less-than (unordered, signaling) \n
				1861	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1862	/// 0x07: Ordered (non-signaling) \n
				1863	/// 0x08: Equal (unordered, non-signaling) \n
				1864	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1865	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1866	/// 0x0B: False (ordered, non-signaling) \n
				1867	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1868	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1869	/// 0x0E: Greater-than (ordered, signaling) \n
				1870	/// 0x0F: True (unordered, non-signaling) \n
				1871	/// 0x10: Equal (ordered, signaling) \n
				1872	/// 0x11: Less-than (ordered, non-signaling) \n
				1873	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1874	/// 0x13: Unordered (signaling) \n
				1875	/// 0x14: Not-equal (unordered, signaling) \n
				1876	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1877	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1878	/// 0x17: Ordered (signaling) \n
				1879	/// 0x18: Equal (unordered, signaling) \n
				1880	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1881	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1882	/// 0x1B: False (ordered, signaling) \n
				1883	/// 0x1C: Not-equal (ordered, signaling) \n
				1884	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1885	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1886	/// 0x1F: True (unordered, signaling)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1887	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1888	#define _mm_cmp_sd(a, b, c) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1889	((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
				1890	(__v2df)(__m128d)(b), (c)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1891
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1892	/// Compares each of the corresponding scalar values of two 128-bit
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1893	/// vectors of [4 x float], using the operation specified by the immediate
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1894	/// integer operand \a c.
				1895	///
				1896	/// If the comparison is true, all 32 bits of the destination vector are set;
				1897	/// otherwise they are cleared.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1898	///
				1899	/// \headerfile <x86intrin.h>
				1900	///
				1901	/// \code
				1902	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
				1903	/// \endcode
				1904	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1905	/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1906	///
				1907	/// \param a
				1908	/// A 128-bit vector of [4 x float].
				1909	/// \param b
				1910	/// A 128-bit vector of [4 x float].
				1911	/// \param c
				1912	/// An immediate integer operand, with bits [4:0] specifying which comparison
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1913	/// operation to use: \n
				1914	/// 0x00: Equal (ordered, non-signaling) \n
				1915	/// 0x01: Less-than (ordered, signaling) \n
				1916	/// 0x02: Less-than-or-equal (ordered, signaling) \n
				1917	/// 0x03: Unordered (non-signaling) \n
				1918	/// 0x04: Not-equal (unordered, non-signaling) \n
				1919	/// 0x05: Not-less-than (unordered, signaling) \n
				1920	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
				1921	/// 0x07: Ordered (non-signaling) \n
				1922	/// 0x08: Equal (unordered, non-signaling) \n
				1923	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
				1924	/// 0x0A: Not-greater-than (unordered, signaling) \n
				1925	/// 0x0B: False (ordered, non-signaling) \n
				1926	/// 0x0C: Not-equal (ordered, non-signaling) \n
				1927	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
				1928	/// 0x0E: Greater-than (ordered, signaling) \n
				1929	/// 0x0F: True (unordered, non-signaling) \n
				1930	/// 0x10: Equal (ordered, signaling) \n
				1931	/// 0x11: Less-than (ordered, non-signaling) \n
				1932	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
				1933	/// 0x13: Unordered (signaling) \n
				1934	/// 0x14: Not-equal (unordered, signaling) \n
				1935	/// 0x15: Not-less-than (unordered, non-signaling) \n
				1936	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
				1937	/// 0x17: Ordered (signaling) \n
				1938	/// 0x18: Equal (unordered, signaling) \n
				1939	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
				1940	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
				1941	/// 0x1B: False (ordered, signaling) \n
				1942	/// 0x1C: Not-equal (ordered, signaling) \n
				1943	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
				1944	/// 0x1E: Greater-than (ordered, non-signaling) \n
				1945	/// 0x1F: True (unordered, signaling)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1946	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1947	#define _mm_cmp_ss(a, b, c) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1948	((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
				1949	(__v4sf)(__m128)(b), (c)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1950
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1951	/// Takes an [8 x i32] vector \a X and returns the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1952	/// indexed by the immediate constant operand \a N.
				1953	///
				1954	/// \headerfile <x86intrin.h>
				1955	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1956	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				1957	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1958	///
				1959	/// \param X
				1960	/// A 256-bit vector of [8 x i32].
				1961	/// \param N
				1962	/// An immediate integer operand with bits [2:0] determining which vector
				1963	/// element is extracted and returned.
				1964	/// \returns A 32-bit integer containing the extracted 32-bit vector
				1965	/// element.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1966	#define _mm256_extract_epi32(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1967	((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1968
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1969	/// Takes a [16 x i16] vector \a X and returns the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1970	/// indexed by the immediate constant operand \a N.
				1971	///
				1972	/// \headerfile <x86intrin.h>
				1973	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1974	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				1975	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1976	///
				1977	/// \param X
				1978	/// A 256-bit integer vector of [16 x i16].
				1979	/// \param N
				1980	/// An immediate integer operand with bits [3:0] determining which vector
				1981	/// element is extracted and returned.
				1982	/// \returns A 32-bit integer containing the extracted 16-bit element,
				1983	/// zero-extended to 32 bits.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1984	#define _mm256_extract_epi16(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	1985	((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
				1986	(int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1987
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1988	/// Takes a [32 x i8] vector \a X and returns the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1989	/// indexed by the immediate constant operand \a N.
				1990	///
				1991	/// \headerfile <x86intrin.h>
				1992	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	1993	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				1994	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	1995	///
				1996	/// \param X
				1997	/// A 256-bit integer vector of [32 x i8].
				1998	/// \param N
				1999	/// An immediate integer operand with bits [4:0] determining which vector
				2000	/// element is extracted and returned.
				2001	/// \returns A 32-bit integer containing the extracted 8-bit element,
				2002	/// zero-extended to 32 bits.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2003	#define _mm256_extract_epi8(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2004	((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
				2005	(int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2006
				2007	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2008	/// Takes a [4 x i64] vector \a X and returns the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2009	/// indexed by the immediate constant operand \a N.
				2010	///
				2011	/// \headerfile <x86intrin.h>
				2012	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2013	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
				2014	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2015	///
				2016	/// \param X
				2017	/// A 256-bit integer vector of [4 x i64].
				2018	/// \param N
				2019	/// An immediate integer operand with bits [1:0] determining which vector
				2020	/// element is extracted and returned.
				2021	/// \returns A 64-bit integer containing the extracted 64-bit vector
				2022	/// element.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2023	#define _mm256_extract_epi64(X, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2024	((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2025	#endif
				2026
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2027	/// Takes an [8 x i32] vector and replaces the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2028	/// indexed by the immediate constant operand with a new value. Returns the
				2029	/// modified vector.
				2030	///
				2031	/// \headerfile <x86intrin.h>
				2032	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2033	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2034	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2035	///
				2036	/// \param X
				2037	/// A vector of [8 x i32] to be used by the insert operation.
				2038	/// \param I
				2039	/// An integer value. The replacement value for the insert operation.
				2040	/// \param N
				2041	/// An immediate integer specifying the index of the vector element to be
				2042	/// replaced.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2043	/// \returns A copy of vector \a X, after replacing its element indexed by
				2044	/// \a N with \a I.
				2045	#define _mm256_insert_epi32(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2046	((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
				2047	(int)(I), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2048
				2049
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2050	/// Takes a [16 x i16] vector and replaces the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2051	/// indexed by the immediate constant operand with a new value. Returns the
				2052	/// modified vector.
				2053	///
				2054	/// \headerfile <x86intrin.h>
				2055	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2056	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2057	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2058	///
				2059	/// \param X
				2060	/// A vector of [16 x i16] to be used by the insert operation.
				2061	/// \param I
				2062	/// An i16 integer value. The replacement value for the insert operation.
				2063	/// \param N
				2064	/// An immediate integer specifying the index of the vector element to be
				2065	/// replaced.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2066	/// \returns A copy of vector \a X, after replacing its element indexed by
				2067	/// \a N with \a I.
				2068	#define _mm256_insert_epi16(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2069	((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
				2070	(int)(I), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2071
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2072	/// Takes a [32 x i8] vector and replaces the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2073	/// indexed by the immediate constant operand with a new value. Returns the
				2074	/// modified vector.
				2075	///
				2076	/// \headerfile <x86intrin.h>
				2077	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2078	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2079	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2080	///
				2081	/// \param X
				2082	/// A vector of [32 x i8] to be used by the insert operation.
				2083	/// \param I
				2084	/// An i8 integer value. The replacement value for the insert operation.
				2085	/// \param N
				2086	/// An immediate integer specifying the index of the vector element to be
				2087	/// replaced.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2088	/// \returns A copy of vector \a X, after replacing its element indexed by
				2089	/// \a N with \a I.
				2090	#define _mm256_insert_epi8(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2091	((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
				2092	(int)(I), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2093
				2094	#ifdef __x86_64__
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2095	/// Takes a [4 x i64] vector and replaces the vector element value
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2096	/// indexed by the immediate constant operand with a new value. Returns the
				2097	/// modified vector.
				2098	///
				2099	/// \headerfile <x86intrin.h>
				2100	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2101	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
				2102	/// instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2103	///
				2104	/// \param X
				2105	/// A vector of [4 x i64] to be used by the insert operation.
				2106	/// \param I
				2107	/// A 64-bit integer value. The replacement value for the insert operation.
				2108	/// \param N
				2109	/// An immediate integer specifying the index of the vector element to be
				2110	/// replaced.
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2111	/// \returns A copy of vector \a X, after replacing its element indexed by
				2112	/// \a N with \a I.
				2113	#define _mm256_insert_epi64(X, I, N) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	2114	((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
				2115	(long long)(I), (int)(N)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2116	#endif
				2117
				2118	/* Conversion */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2119	/// Converts each element of the [4 x i32] vector \a __a to double precision.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2120	///
				2121	/// \headerfile <x86intrin.h>
				2122	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2123	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2124	///
				2125	/// \param __a
				2126	/// A 128-bit integer vector of [4 x i32].
				2127	/// \returns A 256-bit vector of [4 x double] containing the converted values.
				2128	static __inline __m256d __DEFAULT_FN_ATTRS
				2129	_mm256_cvtepi32_pd(__m128i __a)
				2130	{
				2131	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
				2132	}
				2133
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2134	/// Converts each element of the [8 x i32] vector \a __a to single precision.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2135	///
				2136	/// \headerfile <x86intrin.h>
				2137	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2138	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2139	///
				2140	/// \param __a
				2141	/// A 256-bit integer vector of [8 x i32].
				2142	/// \returns A 256-bit vector of [8 x float] containing the converted values.
				2143	static __inline __m256 __DEFAULT_FN_ATTRS
				2144	_mm256_cvtepi32_ps(__m256i __a)
				2145	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2146	return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2147	}
				2148
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2149	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2150	/// [4 x float], narrowing each element to single precision.
				2151	///
				2152	/// \headerfile <x86intrin.h>
				2153	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2154	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2155	///
				2156	/// \param __a
				2157	/// A 256-bit vector of [4 x double].
				2158	/// \returns A 128-bit vector of [4 x float] containing the converted values.
				2159	static __inline __m128 __DEFAULT_FN_ATTRS
				2160	_mm256_cvtpd_ps(__m256d __a)
				2161	{
				2162	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
				2163	}
				2164
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2165	/// Converts a 256-bit vector of [8 x float] into a 256-bit vector of [8 x i32].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2166	///
				2167	/// \headerfile <x86intrin.h>
				2168	///
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2169	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2170	///
				2171	/// \param __a
				2172	/// A 256-bit vector of [8 x float].
				2173	/// \returns A 256-bit integer vector of [8 x i32] containing the converted values.
				2174	static __inline __m256i __DEFAULT_FN_ATTRS
				2175	_mm256_cvtps_epi32(__m256 __a)
				2176	{
				2177	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
				2178	}
				2179
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2180	/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
				2181	/// [4 x double].
				2182	///
				2183	/// \headerfile <x86intrin.h>
				2184	///
				2185	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
				2186	///
				2187	/// \param __a
				2188	/// A 128-bit vector of [4 x float].
				2189	/// \returns A 256-bit vector of [4 x double] containing the converted values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2190	static __inline __m256d __DEFAULT_FN_ATTRS
				2191	_mm256_cvtps_pd(__m128 __a)
				2192	{
				2193	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
				2194	}
				2195
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2196	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2197	/// [4 x i32], truncating the result by rounding towards zero when it is
				2198	/// inexact.
				2199	///
				2200	/// \headerfile <x86intrin.h>
				2201	///
				2202	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
				2203	///
				2204	/// \param __a
				2205	/// A 256-bit vector of [4 x double].
				2206	/// \returns A 128-bit integer vector of [4 x i32] containing the converted values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2207	static __inline __m128i __DEFAULT_FN_ATTRS
				2208	_mm256_cvttpd_epi32(__m256d __a)
				2209	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2210	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2211	}
				2212
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2213	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
				2214	/// [4 x i32]. When a conversion is inexact, the value returned is rounded
				2215	/// according to the rounding control bits in the MXCSR register.
				2216	///
				2217	/// \headerfile <x86intrin.h>
				2218	///
				2219	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
				2220	///
				2221	/// \param __a
				2222	/// A 256-bit vector of [4 x double].
				2223	/// \returns A 128-bit integer vector of [4 x i32] containing the converted values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2224	static __inline __m128i __DEFAULT_FN_ATTRS
				2225	_mm256_cvtpd_epi32(__m256d __a)
				2226	{
				2227	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
				2228	}
				2229
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2230	/// Converts a 256-bit vector of [8 x float] into a 256-bit vector of [8 x i32],
				2231	/// truncating the result by rounding towards zero when it is inexact.
				2232	///
				2233	/// \headerfile <x86intrin.h>
				2234	///
				2235	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
				2236	///
				2237	/// \param __a
				2238	/// A 256-bit vector of [8 x float].
				2239	/// \returns A 256-bit integer vector of [8 x i32] containing the converted values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2240	static __inline __m256i __DEFAULT_FN_ATTRS
				2241	_mm256_cvttps_epi32(__m256 __a)
				2242	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2243	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2244	}
				2245
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2246	/// Returns the first element (element 0) of the input vector of [4 x double].
				2247	///
Pirama Arumuga Nainar	7e1f839	2021-08-16 17:30:48 -0700	[diff] [blame]	2248	/// \headerfile <x86intrin.h>
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2249	///
				2250	/// This intrinsic is a utility function and does not correspond to a specific
				2251	/// instruction.
				2252	///
				2253	/// \param __a
				2254	/// A 256-bit vector of [4 x double].
				2255	/// \returns A 64-bit double containing the first element of the input vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2256	static __inline double __DEFAULT_FN_ATTRS
				2257	_mm256_cvtsd_f64(__m256d __a)
				2258	{
				2259	return __a[0];
				2260	}
				2261
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2262	/// Returns the first element (element 0) of the input vector of [8 x i32].
				2263	///
Pirama Arumuga Nainar	7e1f839	2021-08-16 17:30:48 -0700	[diff] [blame]	2264	/// \headerfile <x86intrin.h>
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2265	///
				2266	/// This intrinsic is a utility function and does not correspond to a specific
				2267	/// instruction.
				2268	///
				2269	/// \param __a
				2270	/// A 256-bit integer vector, interpreted as [8 x i32].
				2271	/// \returns A 32-bit integer containing the first element of the input vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2272	static __inline int __DEFAULT_FN_ATTRS
				2273	_mm256_cvtsi256_si32(__m256i __a)
				2274	{
				2275	__v8si __b = (__v8si)__a;
				2276	return __b[0];
				2277	}
				2278
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2279	/// Returns the first element (element 0) of the input vector of [8 x float].
				2280	///
Pirama Arumuga Nainar	7e1f839	2021-08-16 17:30:48 -0700	[diff] [blame]	2281	/// \headerfile <x86intrin.h>
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2282	///
				2283	/// This intrinsic is a utility function and does not correspond to a specific
				2284	/// instruction.
				2285	///
				2286	/// \param __a
				2287	/// A 256-bit vector of [8 x float].
				2288	/// \returns A 32-bit float containing the first element of the input vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2289	static __inline float __DEFAULT_FN_ATTRS
				2290	_mm256_cvtss_f32(__m256 __a)
				2291	{
				2292	return __a[0];
				2293	}
				2294
				2295	/* Vector replicate */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2296	/// Duplicates each odd-indexed element (1, 3, 5, 7) of a 256-bit vector of
				2297	/// [8 x float] into its own position and the even-indexed position below it.
				2298	///
				2299	/// \headerfile <x86intrin.h>
				2300	///
				2301	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
				2302	///
				2303	/// \param __a
				2304	/// A 256-bit vector of [8 x float]. \n
				2305	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
				2306	/// the return value. \n
				2307	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
				2308	/// the return value. \n
				2309	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
				2310	/// return value. \n
				2311	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
				2312	/// return value.
				2313	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2314	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2315	static __inline __m256 __DEFAULT_FN_ATTRS
				2316	_mm256_movehdup_ps(__m256 __a)
				2317	{
				2318	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
				2319	}
				2320
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2321	/// Duplicates each even-indexed element (0, 2, 4, 6) of a 256-bit vector of
				2322	/// [8 x float] into its own position and the odd-indexed position above it.
				2323	///
				2324	/// \headerfile <x86intrin.h>
				2325	///
				2326	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
				2327	///
				2328	/// \param __a
				2329	/// A 256-bit vector of [8 x float]. \n
				2330	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
				2331	/// the return value. \n
				2332	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
				2333	/// the return value. \n
				2334	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
				2335	/// return value. \n
				2336	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
				2337	/// return value.
				2338	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
				2339	/// values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2340	static __inline __m256 __DEFAULT_FN_ATTRS
				2341	_mm256_moveldup_ps(__m256 __a)
				2342	{
				2343	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
				2344	}
				2345
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2346	/// Duplicates each even-indexed element (0 and 2) of a 256-bit vector of
				2347	/// [4 x double] into its own position and the odd-indexed position above
				2348	/// it, producing a 256-bit vector of [4 x double].
				2349	///
				2350	/// \headerfile <x86intrin.h>
				2351	///
				2352	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
				2353	///
				2354	/// \param __a
				2355	/// A 256-bit vector of [4 x double]. \n
				2356	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
				2357	/// return value. \n
				2358	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
				2359	/// the return value.
				2360	/// \returns A 256-bit vector of [4 x double] containing the moved and
				2361	/// duplicated values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2362	static __inline __m256d __DEFAULT_FN_ATTRS
				2363	_mm256_movedup_pd(__m256d __a)
				2364	{
				2365	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
				2366	}
				2367
				2368	/* Unpack and Interleave */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2369	/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
				2370	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2371	///
				2372	/// \headerfile <x86intrin.h>
				2373	///
				2374	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
				2375	///
				2376	/// \param __a
				2377	/// A 256-bit floating-point vector of [4 x double]. \n
				2378	/// Bits [127:64] are written to bits [63:0] of the return value. \n
				2379	/// Bits [255:192] are written to bits [191:128] of the return value.
				2380	/// \param __b
				2381	/// A 256-bit floating-point vector of [4 x double]. \n
				2382	/// Bits [127:64] are written to bits [127:64] of the return value. \n
				2383	/// Bits [255:192] are written to bits [255:192] of the return value.
				2384	/// \returns A 256-bit vector of [4 x double]: __a[1], __b[1], __a[3], __b[3].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2385	static __inline __m256d __DEFAULT_FN_ATTRS
				2386	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
				2387	{
				2388	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
				2389	}
				2390
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2391	/// Unpacks the even-indexed vector elements from two 256-bit vectors of
				2392	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
				2393	///
				2394	/// \headerfile <x86intrin.h>
				2395	///
				2396	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
				2397	///
				2398	/// \param __a
				2399	/// A 256-bit floating-point vector of [4 x double]. \n
				2400	/// Bits [63:0] are written to bits [63:0] of the return value. \n
				2401	/// Bits [191:128] are written to bits [191:128] of the return value.
				2402	/// \param __b
				2403	/// A 256-bit floating-point vector of [4 x double]. \n
				2404	/// Bits [63:0] are written to bits [127:64] of the return value. \n
				2405	/// Bits [191:128] are written to bits [255:192] of the return value.
				2406	/// \returns A 256-bit vector of [4 x double]: __a[0], __b[0], __a[2], __b[2].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2407	static __inline __m256d __DEFAULT_FN_ATTRS
				2408	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
				2409	{
				2410	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
				2411	}
				2412
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2413	/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
				2414	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2415	/// vector of [8 x float].
				2416	///
				2417	/// \headerfile <x86intrin.h>
				2418	///
				2419	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
				2420	///
				2421	/// \param __a
				2422	/// A 256-bit vector of [8 x float]. \n
				2423	/// Bits [95:64] are written to bits [31:0] of the return value. \n
				2424	/// Bits [127:96] are written to bits [95:64] of the return value. \n
				2425	/// Bits [223:192] are written to bits [159:128] of the return value. \n
				2426	/// Bits [255:224] are written to bits [223:192] of the return value.
				2427	/// \param __b
				2428	/// A 256-bit vector of [8 x float]. \n
				2429	/// Bits [95:64] are written to bits [63:32] of the return value. \n
				2430	/// Bits [127:96] are written to bits [127:96] of the return value. \n
				2431	/// Bits [223:192] are written to bits [191:160] of the return value. \n
				2432	/// Bits [255:224] are written to bits [255:224] of the return value.
				2433	/// \returns A 256-bit vector of [8 x float]: a2, b2, a3, b3, a6, b6, a7, b7.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2434	static __inline __m256 __DEFAULT_FN_ATTRS
				2435	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
				2436	{
				2437	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
				2438	}
				2439
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2440	/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
				2441	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
				2442	/// vector of [8 x float].
				2443	///
				2444	/// \headerfile <x86intrin.h>
				2445	///
				2446	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
				2447	///
				2448	/// \param __a
				2449	/// A 256-bit vector of [8 x float]. \n
				2450	/// Bits [31:0] are written to bits [31:0] of the return value. \n
				2451	/// Bits [63:32] are written to bits [95:64] of the return value. \n
				2452	/// Bits [159:128] are written to bits [159:128] of the return value. \n
				2453	/// Bits [191:160] are written to bits [223:192] of the return value.
				2454	/// \param __b
				2455	/// A 256-bit vector of [8 x float]. \n
				2456	/// Bits [31:0] are written to bits [63:32] of the return value. \n
				2457	/// Bits [63:32] are written to bits [127:96] of the return value. \n
				2458	/// Bits [159:128] are written to bits [191:160] of the return value. \n
				2459	/// Bits [191:160] are written to bits [255:224] of the return value.
				2460	/// \returns A 256-bit vector of [8 x float]: a0, b0, a1, b1, a4, b4, a5, b5.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2461	static __inline __m256 __DEFAULT_FN_ATTRS
				2462	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
				2463	{
				2464	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
				2465	}
				2466
				2467	/* Bit Test */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2468	/// Given two 128-bit floating-point vectors of [2 x double], perform an
				2469	/// element-by-element comparison of the double-precision elements in the
				2470	/// first source vector and the corresponding elements in the second source
				2471	/// vector.
				2472	///
				2473	/// The EFLAGS register is updated as follows: \n
				2474	/// If there is at least one pair of double-precision elements where the
				2475	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2476	/// ZF flag is set to 1. \n
				2477	/// If there is at least one pair of double-precision elements where the
				2478	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2479	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2480	/// This intrinsic returns the value of the ZF flag.
				2481	///
				2482	/// \headerfile <x86intrin.h>
				2483	///
				2484	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
				2485	///
				2486	/// \param __a
				2487	/// A 128-bit vector of [2 x double]; the first source vector.
				2488	/// \param __b
				2489	/// A 128-bit vector of [2 x double]; the second source vector.
				2490	/// \returns The value of the ZF flag in the EFLAGS register.
				2491	static __inline int __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2492	_mm_testz_pd(__m128d __a, __m128d __b)
				2493	{
				2494	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
				2495	}
				2496
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2497	/// Given two 128-bit floating-point vectors of [2 x double], perform an
				2498	/// element-by-element comparison of the double-precision elements in the
				2499	/// first source vector and the corresponding elements in the second source
				2500	/// vector.
				2501	///
				2502	/// The EFLAGS register is updated as follows: \n
				2503	/// If there is at least one pair of double-precision elements where the
				2504	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2505	/// ZF flag is set to 1. \n
				2506	/// If there is at least one pair of double-precision elements where the
				2507	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2508	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2509	/// This intrinsic returns the value of the CF flag.
				2510	///
				2511	/// \headerfile <x86intrin.h>
				2512	///
				2513	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
				2514	///
				2515	/// \param __a
				2516	/// A 128-bit vector of [2 x double]; the first source vector.
				2517	/// \param __b
				2518	/// A 128-bit vector of [2 x double]; the second source vector.
				2519	/// \returns The value of the CF flag in the EFLAGS register.
				2520	static __inline int __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2521	_mm_testc_pd(__m128d __a, __m128d __b)
				2522	{
				2523	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
				2524	}
				2525
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2526	/// Given two 128-bit floating-point vectors of [2 x double], perform an
				2527	/// element-by-element comparison of the double-precision elements in the
				2528	/// first source vector and the corresponding elements in the second source
				2529	/// vector.
				2530	///
				2531	/// The EFLAGS register is updated as follows: \n
				2532	/// If there is at least one pair of double-precision elements where the
				2533	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2534	/// ZF flag is set to 1. \n
				2535	/// If there is at least one pair of double-precision elements where the
				2536	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2537	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2538	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2539	/// otherwise it returns 0.
				2540	///
				2541	/// \headerfile <x86intrin.h>
				2542	///
				2543	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
				2544	///
				2545	/// \param __a
				2546	/// A 128-bit vector of [2 x double]; the first source vector.
				2547	/// \param __b
				2548	/// A 128-bit vector of [2 x double]; the second source vector.
				2549	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
				2550	static __inline int __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2551	_mm_testnzc_pd(__m128d __a, __m128d __b)
				2552	{
				2553	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
				2554	}
				2555
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2556	/// Given two 128-bit floating-point vectors of [4 x float], perform an
				2557	/// element-by-element comparison of the single-precision elements in the
				2558	/// first source vector and the corresponding elements in the second source
				2559	/// vector.
				2560	///
				2561	/// The EFLAGS register is updated as follows: \n
				2562	/// If there is at least one pair of single-precision elements where the
				2563	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2564	/// ZF flag is set to 1. \n
				2565	/// If there is at least one pair of single-precision elements where the
				2566	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2567	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2568	/// This intrinsic returns the value of the ZF flag.
				2569	///
				2570	/// \headerfile <x86intrin.h>
				2571	///
				2572	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
				2573	///
				2574	/// \param __a
				2575	/// A 128-bit vector of [4 x float]; the first source vector.
				2576	/// \param __b
				2577	/// A 128-bit vector of [4 x float]; the second source vector.
				2578	/// \returns The value of the ZF flag in the EFLAGS register.
				2579	static __inline int __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2580	_mm_testz_ps(__m128 __a, __m128 __b)
				2581	{
				2582	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
				2583	}
				2584
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2585	/// Given two 128-bit floating-point vectors of [4 x float], perform an
				2586	/// element-by-element comparison of the single-precision elements in the
				2587	/// first source vector and the corresponding elements in the second source
				2588	/// vector.
				2589	///
				2590	/// The EFLAGS register is updated as follows: \n
				2591	/// If there is at least one pair of single-precision elements where the
				2592	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2593	/// ZF flag is set to 1. \n
				2594	/// If there is at least one pair of single-precision elements where the
				2595	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2596	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2597	/// This intrinsic returns the value of the CF flag.
				2598	///
				2599	/// \headerfile <x86intrin.h>
				2600	///
				2601	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
				2602	///
				2603	/// \param __a
				2604	/// A 128-bit vector of [4 x float]; the first source vector.
				2605	/// \param __b
				2606	/// A 128-bit vector of [4 x float]; the second source vector.
				2607	/// \returns The value of the CF flag in the EFLAGS register.
				2608	static __inline int __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2609	_mm_testc_ps(__m128 __a, __m128 __b)
				2610	{
				2611	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
				2612	}
				2613
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2614	/// Given two 128-bit floating-point vectors of [4 x float], perform an
				2615	/// element-by-element comparison of the single-precision elements in the
				2616	/// first source vector and the corresponding elements in the second source
				2617	/// vector.
				2618	///
				2619	/// The EFLAGS register is updated as follows: \n
				2620	/// If there is at least one pair of single-precision elements where the
				2621	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2622	/// ZF flag is set to 1. \n
				2623	/// If there is at least one pair of single-precision elements where the
				2624	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2625	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2626	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2627	/// otherwise it returns 0.
				2628	///
				2629	/// \headerfile <x86intrin.h>
				2630	///
				2631	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
				2632	///
				2633	/// \param __a
				2634	/// A 128-bit vector of [4 x float]; the first source vector.
				2635	/// \param __b
				2636	/// A 128-bit vector of [4 x float]; the second source vector.
				2637	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
				2638	static __inline int __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2639	_mm_testnzc_ps(__m128 __a, __m128 __b)
				2640	{
				2641	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
				2642	}
				2643
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2644	/// Given two 256-bit floating-point vectors of [4 x double], perform an
				2645	/// element-by-element comparison of the double-precision elements in the
				2646	/// first source vector and the corresponding elements in the second source
				2647	/// vector.
				2648	///
				2649	/// The EFLAGS register is updated as follows: \n
				2650	/// If there is at least one pair of double-precision elements where the
				2651	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2652	/// ZF flag is set to 1. \n
				2653	/// If there is at least one pair of double-precision elements where the
				2654	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2655	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2656	/// This intrinsic returns the value of the ZF flag.
				2657	///
				2658	/// \headerfile <x86intrin.h>
				2659	///
				2660	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
				2661	///
				2662	/// \param __a
				2663	/// A 256-bit vector of [4 x double]; the first source vector.
				2664	/// \param __b
				2665	/// A 256-bit vector of [4 x double]; the second source vector.
				2666	/// \returns The value of the ZF flag in the EFLAGS register.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2667	static __inline int __DEFAULT_FN_ATTRS
				2668	_mm256_testz_pd(__m256d __a, __m256d __b)
				2669	{
				2670	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
				2671	}
				2672
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2673	/// Given two 256-bit floating-point vectors of [4 x double], perform an
				2674	/// element-by-element comparison of the double-precision elements in the
				2675	/// first source vector and the corresponding elements in the second source
				2676	/// vector.
				2677	///
				2678	/// The EFLAGS register is updated as follows: \n
				2679	/// If there is at least one pair of double-precision elements where the
				2680	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2681	/// ZF flag is set to 1. \n
				2682	/// If there is at least one pair of double-precision elements where the
				2683	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2684	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2685	/// This intrinsic returns the value of the CF flag.
				2686	///
				2687	/// \headerfile <x86intrin.h>
				2688	///
				2689	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
				2690	///
				2691	/// \param __a
				2692	/// A 256-bit vector of [4 x double]; the first source vector.
				2693	/// \param __b
				2694	/// A 256-bit vector of [4 x double]; the second source vector.
				2695	/// \returns The value of the CF flag in the EFLAGS register.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2696	static __inline int __DEFAULT_FN_ATTRS
				2697	_mm256_testc_pd(__m256d __a, __m256d __b)
				2698	{
				2699	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
				2700	}
				2701
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2702	/// Given two 256-bit floating-point vectors of [4 x double], perform an
				2703	/// element-by-element comparison of the double-precision elements in the
				2704	/// first source vector and the corresponding elements in the second source
				2705	/// vector.
				2706	///
				2707	/// The EFLAGS register is updated as follows: \n
				2708	/// If there is at least one pair of double-precision elements where the
				2709	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2710	/// ZF flag is set to 1. \n
				2711	/// If there is at least one pair of double-precision elements where the
				2712	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2713	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2714	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2715	/// otherwise it returns 0.
				2716	///
				2717	/// \headerfile <x86intrin.h>
				2718	///
				2719	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
				2720	///
				2721	/// \param __a
				2722	/// A 256-bit vector of [4 x double]; the first source vector.
				2723	/// \param __b
				2724	/// A 256-bit vector of [4 x double]; the second source vector.
				2725	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2726	static __inline int __DEFAULT_FN_ATTRS
				2727	_mm256_testnzc_pd(__m256d __a, __m256d __b)
				2728	{
				2729	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
				2730	}
				2731
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2732	/// Given two 256-bit floating-point vectors of [8 x float], perform an
				2733	/// element-by-element comparison of the single-precision elements in the
				2734	/// first source vector and the corresponding elements in the second source
				2735	/// vector.
				2736	///
				2737	/// The EFLAGS register is updated as follows: \n
				2738	/// If there is at least one pair of single-precision elements where the
				2739	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2740	/// ZF flag is set to 1. \n
				2741	/// If there is at least one pair of single-precision elements where the
				2742	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2743	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2744	/// This intrinsic returns the value of the ZF flag.
				2745	///
				2746	/// \headerfile <x86intrin.h>
				2747	///
				2748	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
				2749	///
				2750	/// \param __a
				2751	/// A 256-bit vector of [8 x float]; the first source vector.
				2752	/// \param __b
				2753	/// A 256-bit vector of [8 x float]; the second source vector.
				2754	/// \returns The value of the ZF flag in the EFLAGS register.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2755	static __inline int __DEFAULT_FN_ATTRS
				2756	_mm256_testz_ps(__m256 __a, __m256 __b)
				2757	{
				2758	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
				2759	}
				2760
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2761	/// Given two 256-bit floating-point vectors of [8 x float], perform an
				2762	/// element-by-element comparison of the single-precision elements in the
				2763	/// first source vector and the corresponding elements in the second source
				2764	/// vector.
				2765	///
				2766	/// The EFLAGS register is updated as follows: \n
				2767	/// If there is at least one pair of single-precision elements where the
				2768	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2769	/// ZF flag is set to 1. \n
				2770	/// If there is at least one pair of single-precision elements where the
				2771	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2772	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2773	/// This intrinsic returns the value of the CF flag.
				2774	///
				2775	/// \headerfile <x86intrin.h>
				2776	///
				2777	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
				2778	///
				2779	/// \param __a
				2780	/// A 256-bit vector of [8 x float]; the first source vector.
				2781	/// \param __b
				2782	/// A 256-bit vector of [8 x float]; the second source vector.
				2783	/// \returns The value of the CF flag in the EFLAGS register.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2784	static __inline int __DEFAULT_FN_ATTRS
				2785	_mm256_testc_ps(__m256 __a, __m256 __b)
				2786	{
				2787	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
				2788	}
				2789
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2790	/// Given two 256-bit floating-point vectors of [8 x float], perform an
				2791	/// element-by-element comparison of the single-precision elements in the
				2792	/// first source vector and the corresponding elements in the second source
				2793	/// vector.
				2794	///
				2795	/// The EFLAGS register is updated as follows: \n
				2796	/// If there is at least one pair of single-precision elements where the
				2797	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
				2798	/// ZF flag is set to 1. \n
				2799	/// If there is at least one pair of single-precision elements where the
				2800	/// sign-bit of the first element is 0 and the sign-bit of the second element
				2801	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
				2802	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2803	/// otherwise it returns 0.
				2804	///
				2805	/// \headerfile <x86intrin.h>
				2806	///
				2807	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
				2808	///
				2809	/// \param __a
				2810	/// A 256-bit vector of [8 x float].
				2811	/// \param __b
				2812	/// A 256-bit vector of [8 x float].
				2813	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2814	static __inline int __DEFAULT_FN_ATTRS
				2815	_mm256_testnzc_ps(__m256 __a, __m256 __b)
				2816	{
				2817	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
				2818	}
				2819
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2820	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2821	/// of the two source vectors.
				2822	///
				2823	/// The EFLAGS register is updated as follows: \n
				2824	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2825	/// is set to 0. Otherwise the ZF flag is set to 1. \n
				2826	/// If there is at least one pair of bits where the bit from the first source
				2827	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2828	/// is set to 0. Otherwise the CF flag is set to 1. \n
				2829	/// This intrinsic returns the value of the ZF flag.
				2830	///
				2831	/// \headerfile <x86intrin.h>
				2832	///
				2833	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
				2834	///
				2835	/// \param __a
				2836	/// A 256-bit integer vector.
				2837	/// \param __b
				2838	/// A 256-bit integer vector.
				2839	/// \returns the ZF flag.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2840	static __inline int __DEFAULT_FN_ATTRS
				2841	_mm256_testz_si256(__m256i __a, __m256i __b)
				2842	{
				2843	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
				2844	}
				2845
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2846	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2847	/// of the two source vectors.
				2848	///
				2849	/// The EFLAGS register is updated as follows: \n
				2850	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2851	/// is set to 0. Otherwise the ZF flag is set to 1. \n
				2852	/// If there is at least one pair of bits where the bit from the first source
				2853	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2854	/// is set to 0. Otherwise the CF flag is set to 1. \n
				2855	/// This intrinsic returns the value of the CF flag.
				2856	///
				2857	/// \headerfile <x86intrin.h>
				2858	///
				2859	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
				2860	///
				2861	/// \param __a
				2862	/// A 256-bit integer vector.
				2863	/// \param __b
				2864	/// A 256-bit integer vector.
				2865	/// \returns the CF flag.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2866	static __inline int __DEFAULT_FN_ATTRS
				2867	_mm256_testc_si256(__m256i __a, __m256i __b)
				2868	{
				2869	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
				2870	}
				2871
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2872	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
				2873	/// of the two source vectors.
				2874	///
				2875	/// The EFLAGS register is updated as follows: \n
				2876	/// If there is at least one pair of bits where both bits are 1, the ZF flag
				2877	/// is set to 0. Otherwise the ZF flag is set to 1. \n
				2878	/// If there is at least one pair of bits where the bit from the first source
				2879	/// vector is 0 and the bit from the second source vector is 1, the CF flag
				2880	/// is set to 0. Otherwise the CF flag is set to 1. \n
				2881	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
				2882	/// otherwise it returns 0.
				2883	///
				2884	/// \headerfile <x86intrin.h>
				2885	///
				2886	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
				2887	///
				2888	/// \param __a
				2889	/// A 256-bit integer vector.
				2890	/// \param __b
				2891	/// A 256-bit integer vector.
				2892	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2893	static __inline int __DEFAULT_FN_ATTRS
				2894	_mm256_testnzc_si256(__m256i __a, __m256i __b)
				2895	{
				2896	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
				2897	}
				2898
				2899	/* Vector extract sign mask */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2900	/// Extracts the sign bits of double-precision floating point elements
				2901	/// in a 256-bit vector of [4 x double] and writes them to the lower order
				2902	/// bits of the return value.
				2903	///
				2904	/// \headerfile <x86intrin.h>
				2905	///
				2906	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
				2907	///
				2908	/// \param __a
				2909	/// A 256-bit vector of [4 x double] containing the double-precision
				2910	/// floating point values with sign bits to be extracted.
				2911	/// \returns The sign bits from the operand, written to bits [3:0].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2912	static __inline int __DEFAULT_FN_ATTRS
				2913	_mm256_movemask_pd(__m256d __a)
				2914	{
				2915	return __builtin_ia32_movmskpd256((__v4df)__a);
				2916	}
				2917
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2918	/// Extracts the sign bits of single-precision floating point elements
				2919	/// in a 256-bit vector of [8 x float] and writes them to the lower order
				2920	/// bits of the return value.
				2921	///
				2922	/// \headerfile <x86intrin.h>
				2923	///
				2924	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
				2925	///
				2926	/// \param __a
				2927	/// A 256-bit vector of [8 x float] containing the single-precision floating
				2928	/// point values with sign bits to be extracted.
				2929	/// \returns The sign bits from the operand, written to bits [7:0].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2930	static __inline int __DEFAULT_FN_ATTRS
				2931	_mm256_movemask_ps(__m256 __a)
				2932	{
				2933	return __builtin_ia32_movmskps256((__v8sf)__a);
				2934	}
				2935
/* Vector zero */
/// Clears the full contents of all XMM or YMM registers to zero.
///
/// Takes no arguments and returns nothing.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
				2947
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// Takes no arguments and returns nothing.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
				2958
				2959	/* Vector load with broadcast */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2960	/// Loads a scalar single-precision floating point value from the
				2961	/// specified address pointed to by \a __a and broadcasts it to the elements
				2962	/// of a [4 x float] vector.
				2963	///
				2964	/// \headerfile <x86intrin.h>
				2965	///
				2966	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
				2967	///
				2968	/// \param __a
				2969	/// The single-precision floating point value to be broadcast.
				2970	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
				2971	/// equal to the broadcast value.
				2972	static __inline __m128 __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2973	_mm_broadcast_ss(float const *__a)
				2974	{
				2975	float __f = *__a;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2976	return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2977	}
				2978
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2979	/// Loads a scalar double-precision floating point value from the
				2980	/// specified address pointed to by \a __a and broadcasts it to the elements
				2981	/// of a [4 x double] vector.
				2982	///
				2983	/// \headerfile <x86intrin.h>
				2984	///
				2985	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
				2986	///
				2987	/// \param __a
				2988	/// The double-precision floating point value to be broadcast.
				2989	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
				2990	/// equal to the broadcast value.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2991	static __inline __m256d __DEFAULT_FN_ATTRS
				2992	_mm256_broadcast_sd(double const *__a)
				2993	{
				2994	double __d = *__a;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2995	return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	2996	}
				2997
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	2998	/// Loads a scalar single-precision floating point value from the
				2999	/// specified address pointed to by \a __a and broadcasts it to the elements
				3000	/// of a [8 x float] vector.
				3001	///
				3002	/// \headerfile <x86intrin.h>
				3003	///
				3004	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
				3005	///
				3006	/// \param __a
				3007	/// The single-precision floating point value to be broadcast.
				3008	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
				3009	/// equal to the broadcast value.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3010	static __inline __m256 __DEFAULT_FN_ATTRS
				3011	_mm256_broadcast_ss(float const *__a)
				3012	{
				3013	float __f = *__a;
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3014	return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3015	}
				3016
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3017	/// Loads the data from a 128-bit vector of [2 x double] from the
				3018	/// specified address pointed to by \a __a and broadcasts it to 128-bit
				3019	/// elements in a 256-bit vector of [4 x double].
				3020	///
				3021	/// \headerfile <x86intrin.h>
				3022	///
				3023	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
				3024	///
				3025	/// \param __a
				3026	/// The 128-bit vector of [2 x double] to be broadcast.
				3027	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
				3028	/// equal to the broadcast value.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3029	static __inline __m256d __DEFAULT_FN_ATTRS
				3030	_mm256_broadcast_pd(__m128d const *__a)
				3031	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3032	__m128d __b = _mm_loadu_pd((const double *)__a);
				3033	return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
				3034	0, 1, 0, 1);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3035	}
				3036
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3037	/// Loads the data from a 128-bit vector of [4 x float] from the
				3038	/// specified address pointed to by \a __a and broadcasts it to 128-bit
				3039	/// elements in a 256-bit vector of [8 x float].
				3040	///
				3041	/// \headerfile <x86intrin.h>
				3042	///
				3043	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
				3044	///
				3045	/// \param __a
				3046	/// The 128-bit vector of [4 x float] to be broadcast.
				3047	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
				3048	/// equal to the broadcast value.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3049	static __inline __m256 __DEFAULT_FN_ATTRS
				3050	_mm256_broadcast_ps(__m128 const *__a)
				3051	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3052	__m128 __b = _mm_loadu_ps((const float *)__a);
				3053	return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
				3054	0, 1, 2, 3, 0, 1, 2, 3);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3055	}
				3056
				3057	/* SIMD load ops */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3058	/// Loads 4 double-precision floating point values from a 32-byte aligned
				3059	/// memory location pointed to by \a __p into a vector of [4 x double].
				3060	///
				3061	/// \headerfile <x86intrin.h>
				3062	///
				3063	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
				3064	///
				3065	/// \param __p
				3066	/// A 32-byte aligned pointer to a memory location containing
				3067	/// double-precision floating point values.
				3068	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3069	static __inline __m256d __DEFAULT_FN_ATTRS
				3070	_mm256_load_pd(double const *__p)
				3071	{
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	3072	return (const __m256d )__p;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3073	}
				3074
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3075	/// Loads 8 single-precision floating point values from a 32-byte aligned
				3076	/// memory location pointed to by \a __p into a vector of [8 x float].
				3077	///
				3078	/// \headerfile <x86intrin.h>
				3079	///
				3080	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
				3081	///
				3082	/// \param __p
				3083	/// A 32-byte aligned pointer to a memory location containing float values.
				3084	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3085	static __inline __m256 __DEFAULT_FN_ATTRS
				3086	_mm256_load_ps(float const *__p)
				3087	{
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	3088	return (const __m256 )__p;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3089	}
				3090
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3091	/// Loads 4 double-precision floating point values from an unaligned
				3092	/// memory location pointed to by \a __p into a vector of [4 x double].
				3093	///
				3094	/// \headerfile <x86intrin.h>
				3095	///
				3096	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
				3097	///
				3098	/// \param __p
				3099	/// A pointer to a memory location containing double-precision floating
				3100	/// point values.
				3101	/// \returns A 256-bit vector of [4 x double] containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3102	static __inline __m256d __DEFAULT_FN_ATTRS
				3103	_mm256_loadu_pd(double const *__p)
				3104	{
				3105	struct __loadu_pd {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3106	__m256d_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3107	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	3108	return ((const struct __loadu_pd*)__p)->__v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3109	}
				3110
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3111	/// Loads 8 single-precision floating point values from an unaligned
				3112	/// memory location pointed to by \a __p into a vector of [8 x float].
				3113	///
				3114	/// \headerfile <x86intrin.h>
				3115	///
				3116	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
				3117	///
				3118	/// \param __p
				3119	/// A pointer to a memory location containing single-precision floating
				3120	/// point values.
				3121	/// \returns A 256-bit vector of [8 x float] containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3122	static __inline __m256 __DEFAULT_FN_ATTRS
				3123	_mm256_loadu_ps(float const *__p)
				3124	{
				3125	struct __loadu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3126	__m256_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3127	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	3128	return ((const struct __loadu_ps*)__p)->__v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3129	}
				3130
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3131	/// Loads 256 bits of integer data from a 32-byte aligned memory
				3132	/// location pointed to by \a __p into elements of a 256-bit integer vector.
				3133	///
				3134	/// \headerfile <x86intrin.h>
				3135	///
				3136	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
				3137	///
				3138	/// \param __p
				3139	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
				3140	/// values.
				3141	/// \returns A 256-bit integer vector containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3142	static __inline __m256i __DEFAULT_FN_ATTRS
				3143	_mm256_load_si256(__m256i const *__p)
				3144	{
				3145	return *__p;
				3146	}
				3147
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3148	/// Loads 256 bits of integer data from an unaligned memory location
				3149	/// pointed to by \a __p into a 256-bit integer vector.
				3150	///
				3151	/// \headerfile <x86intrin.h>
				3152	///
				3153	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
				3154	///
				3155	/// \param __p
				3156	/// A pointer to a 256-bit integer vector containing integer values.
				3157	/// \returns A 256-bit integer vector containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3158	static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3159	_mm256_loadu_si256(__m256i_u const *__p)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3160	{
				3161	struct __loadu_si256 {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3162	__m256i_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3163	} __attribute__((__packed__, __may_alias__));
Sasha Smundak	33d5ddd	2020-05-04 13:37:26 -0700	[diff] [blame]	3164	return ((const struct __loadu_si256*)__p)->__v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3165	}
				3166
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3167	/// Loads 256 bits of integer data from an unaligned memory location
				3168	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
				3169	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
				3170	/// line boundary.
				3171	///
				3172	/// \headerfile <x86intrin.h>
				3173	///
				3174	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
				3175	///
				3176	/// \param __p
				3177	/// A pointer to a 256-bit integer vector containing integer values.
				3178	/// \returns A 256-bit integer vector containing the moved values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3179	static __inline __m256i __DEFAULT_FN_ATTRS
				3180	_mm256_lddqu_si256(__m256i const *__p)
				3181	{
				3182	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
				3183	}
				3184
				3185	/* SIMD store ops */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3186	/// Stores double-precision floating point values from a 256-bit vector
				3187	/// of [4 x double] to a 32-byte aligned memory location pointed to by
				3188	/// \a __p.
				3189	///
				3190	/// \headerfile <x86intrin.h>
				3191	///
				3192	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
				3193	///
				3194	/// \param __p
				3195	/// A 32-byte aligned pointer to a memory location that will receive the
				3196	/// double-precision floaing point values.
				3197	/// \param __a
				3198	/// A 256-bit vector of [4 x double] containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3199	static __inline void __DEFAULT_FN_ATTRS
				3200	_mm256_store_pd(double *__p, __m256d __a)
				3201	{
				3202	(__m256d )__p = __a;
				3203	}
				3204
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3205	/// Stores single-precision floating point values from a 256-bit vector
				3206	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
				3207	///
				3208	/// \headerfile <x86intrin.h>
				3209	///
				3210	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
				3211	///
				3212	/// \param __p
				3213	/// A 32-byte aligned pointer to a memory location that will receive the
				3214	/// float values.
				3215	/// \param __a
				3216	/// A 256-bit vector of [8 x float] containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3217	static __inline void __DEFAULT_FN_ATTRS
				3218	_mm256_store_ps(float *__p, __m256 __a)
				3219	{
				3220	(__m256 )__p = __a;
				3221	}
				3222
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3223	/// Stores double-precision floating point values from a 256-bit vector
				3224	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
				3225	///
				3226	/// \headerfile <x86intrin.h>
				3227	///
				3228	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
				3229	///
				3230	/// \param __p
				3231	/// A pointer to a memory location that will receive the double-precision
				3232	/// floating point values.
				3233	/// \param __a
				3234	/// A 256-bit vector of [4 x double] containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3235	static __inline void __DEFAULT_FN_ATTRS
				3236	_mm256_storeu_pd(double *__p, __m256d __a)
				3237	{
				3238	struct __storeu_pd {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3239	__m256d_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3240	} __attribute__((__packed__, __may_alias__));
				3241	((struct __storeu_pd*)__p)->__v = __a;
				3242	}
				3243
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3244	/// Stores single-precision floating point values from a 256-bit vector
				3245	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
				3246	///
				3247	/// \headerfile <x86intrin.h>
				3248	///
				3249	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
				3250	///
				3251	/// \param __p
				3252	/// A pointer to a memory location that will receive the float values.
				3253	/// \param __a
				3254	/// A 256-bit vector of [8 x float] containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3255	static __inline void __DEFAULT_FN_ATTRS
				3256	_mm256_storeu_ps(float *__p, __m256 __a)
				3257	{
				3258	struct __storeu_ps {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3259	__m256_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3260	} __attribute__((__packed__, __may_alias__));
				3261	((struct __storeu_ps*)__p)->__v = __a;
				3262	}
				3263
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3264	/// Stores integer values from a 256-bit integer vector to a 32-byte
				3265	/// aligned memory location pointed to by \a __p.
				3266	///
				3267	/// \headerfile <x86intrin.h>
				3268	///
				3269	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
				3270	///
				3271	/// \param __p
				3272	/// A 32-byte aligned pointer to a memory location that will receive the
				3273	/// integer values.
				3274	/// \param __a
				3275	/// A 256-bit integer vector containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3276	static __inline void __DEFAULT_FN_ATTRS
				3277	_mm256_store_si256(__m256i *__p, __m256i __a)
				3278	{
				3279	*__p = __a;
				3280	}
				3281
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3282	/// Stores integer values from a 256-bit integer vector to an unaligned
				3283	/// memory location pointed to by \a __p.
				3284	///
				3285	/// \headerfile <x86intrin.h>
				3286	///
				3287	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
				3288	///
				3289	/// \param __p
				3290	/// A pointer to a memory location that will receive the integer values.
				3291	/// \param __a
				3292	/// A 256-bit integer vector containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3293	static __inline void __DEFAULT_FN_ATTRS
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3294	_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3295	{
				3296	struct __storeu_si256 {
Logan Chien	dbcf412	2019-03-21 10:50:25 +0800	[diff] [blame]	3297	__m256i_u __v;
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3298	} __attribute__((__packed__, __may_alias__));
				3299	((struct __storeu_si256*)__p)->__v = __a;
				3300	}
				3301
				3302	/* Conditional load ops */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3303	/// Conditionally loads double-precision floating point elements from a
				3304	/// memory location pointed to by \a __p into a 128-bit vector of
				3305	/// [2 x double], depending on the mask bits associated with each data
				3306	/// element.
				3307	///
				3308	/// \headerfile <x86intrin.h>
				3309	///
				3310	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
				3311	///
				3312	/// \param __p
				3313	/// A pointer to a memory location that contains the double-precision
				3314	/// floating point values.
				3315	/// \param __m
				3316	/// A 128-bit integer vector containing the mask. The most significant bit of
				3317	/// each data element represents the mask bits. If a mask bit is zero, the
				3318	/// corresponding value in the memory location is not loaded and the
				3319	/// corresponding field in the return value is set to zero.
				3320	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
				3321	static __inline __m128d __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3322	_mm_maskload_pd(double const *__p, __m128i __m)
				3323	{
				3324	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
				3325	}
				3326
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3327	/// Conditionally loads double-precision floating point elements from a
				3328	/// memory location pointed to by \a __p into a 256-bit vector of
				3329	/// [4 x double], depending on the mask bits associated with each data
				3330	/// element.
				3331	///
				3332	/// \headerfile <x86intrin.h>
				3333	///
				3334	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
				3335	///
				3336	/// \param __p
				3337	/// A pointer to a memory location that contains the double-precision
				3338	/// floating point values.
				3339	/// \param __m
				3340	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3341	/// significant bit of each quadword element represents the mask bits. If a
				3342	/// mask bit is zero, the corresponding value in the memory location is not
				3343	/// loaded and the corresponding field in the return value is set to zero.
				3344	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3345	static __inline __m256d __DEFAULT_FN_ATTRS
				3346	_mm256_maskload_pd(double const *__p, __m256i __m)
				3347	{
				3348	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
				3349	(__v4di)__m);
				3350	}
				3351
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3352	/// Conditionally loads single-precision floating point elements from a
				3353	/// memory location pointed to by \a __p into a 128-bit vector of
				3354	/// [4 x float], depending on the mask bits associated with each data
				3355	/// element.
				3356	///
				3357	/// \headerfile <x86intrin.h>
				3358	///
				3359	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
				3360	///
				3361	/// \param __p
				3362	/// A pointer to a memory location that contains the single-precision
				3363	/// floating point values.
				3364	/// \param __m
				3365	/// A 128-bit integer vector containing the mask. The most significant bit of
				3366	/// each data element represents the mask bits. If a mask bit is zero, the
				3367	/// corresponding value in the memory location is not loaded and the
				3368	/// corresponding field in the return value is set to zero.
				3369	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
				3370	static __inline __m128 __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3371	_mm_maskload_ps(float const *__p, __m128i __m)
				3372	{
				3373	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
				3374	}
				3375
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3376	/// Conditionally loads single-precision floating-point elements from a
				3377	/// memory location pointed to by \a __p into a 256-bit vector of
				3378	/// [8 x float], depending on the mask bits associated with each data
				3379	/// element.
				3380	///
				3381	/// \headerfile <x86intrin.h>
				3382	///
				3383	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
				3384	///
				3385	/// \param __p
				3386	/// A pointer to a memory location that contains the single-precision
				3387	/// floating-point values.
				3388	/// \param __m
				3389	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3390	/// significant bit of each dword element represents the mask bits. If a mask
				3391	/// bit is zero, the corresponding value in the memory location is not loaded
				3392	/// and the corresponding field in the return value is set to zero.
				3393	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3394	static __inline __m256 __DEFAULT_FN_ATTRS
				3395	_mm256_maskload_ps(float const *__p, __m256i __m)
				3396	{
				3397	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
				3398	}
				3399
				3400	/* Conditional store ops */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3401	/// Moves single-precision floating-point values from a 256-bit vector
				3402	/// of [8 x float] to a memory location pointed to by \a __p, according to
				3403	/// the specified mask.
				3404	///
				3405	/// \headerfile <x86intrin.h>
				3406	///
				3407	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
				3408	///
				3409	/// \param __p
				3410	/// A pointer to a memory location that will receive the single-precision
					/// floating-point values.
				3411	/// \param __m
				3412	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
				3413	/// significant bit of each dword element in the mask vector represents the
				3414	/// mask bits. If a mask bit is zero, the corresponding value from vector
				3415	/// \a __a is not stored and the corresponding field in the memory location
				3416	/// pointed to by \a __p is not changed.
				3417	/// \param __a
				3418	/// A 256-bit vector of [8 x float] containing the values to be stored.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3419	static __inline void __DEFAULT_FN_ATTRS
				3420	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
				3421	{
				3422	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
				3423	}
				3424
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3425	/// Moves double-precision values from a 128-bit vector of [2 x double]
				3426	/// to a memory location pointed to by \a __p, according to the specified
				3427	/// mask.
				3428	///
				3429	/// \headerfile <x86intrin.h>
				3430	///
				3431	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
				3432	///
				3433	/// \param __p
				3434	/// A pointer to a memory location that will receive the double-precision
					/// floating-point values.
				3435	/// \param __m
				3436	/// A 128-bit integer vector of [2 x quadword] containing the mask. The most
				3437	/// significant bit of each quadword element in the mask vector represents
				3438	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3439	/// \a __a is not stored and the corresponding field in the memory location
				3440	/// pointed to by \a __p is not changed.
				3441	/// \param __a
				3442	/// A 128-bit vector of [2 x double] containing the values to be stored.
				3443	static __inline void __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3444	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
				3445	{
				3446	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
				3447	}
				3448
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3449	/// Moves double-precision values from a 256-bit vector of [4 x double]
				3450	/// to a memory location pointed to by \a __p, according to the specified
				3451	/// mask.
				3452	///
				3453	/// \headerfile <x86intrin.h>
				3454	///
				3455	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
				3456	///
				3457	/// \param __p
				3458	/// A pointer to a memory location that will receive the double-precision
					/// floating-point values.
				3459	/// \param __m
				3460	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
				3461	/// significant bit of each quadword element in the mask vector represents
				3462	/// the mask bits. If a mask bit is zero, the corresponding value from vector
				3463	/// \a __a is not stored and the corresponding field in the memory location
				3464	/// pointed to by \a __p is not changed.
				3465	/// \param __a
				3466	/// A 256-bit vector of [4 x double] containing the values to be stored.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3467	static __inline void __DEFAULT_FN_ATTRS
				3468	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
				3469	{
				3470	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
				3471	}
				3472
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3473	/// Moves single-precision floating-point values from a 128-bit vector
				3474	/// of [4 x float] to a memory location pointed to by \a __p, according to
				3475	/// the specified mask.
				3476	///
				3477	/// \headerfile <x86intrin.h>
				3478	///
				3479	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
				3480	///
				3481	/// \param __p
				3482	/// A pointer to a memory location that will receive the single-precision
					/// floating-point values.
				3483	/// \param __m
				3484	/// A 128-bit integer vector containing the mask. The most significant bit of
				3485	/// each field in the mask vector represents the mask bits. If a mask bit is
				3486	/// zero, the corresponding value from vector \a __a is not stored and the
				3487	/// corresponding field in the memory location pointed to by \a __p is not
				3488	/// changed.
				3489	/// \param __a
				3490	/// A 128-bit vector of [4 x float] containing the values to be stored.
				3491	static __inline void __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3492	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
				3493	{
				3494	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
				3495	}
				3496
				3497	/* Cacheability support ops */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3498	/// Moves integer data from a 256-bit integer vector to a 32-byte
				3499	/// aligned memory location. To minimize caching, the data is flagged as
				3500	/// non-temporal (unlikely to be used again soon).
				3501	///
				3502	/// \headerfile <x86intrin.h>
				3503	///
				3504	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
				3505	///
				3506	/// \param __a
				3507	/// A pointer to a 32-byte aligned memory location that will receive the
				3508	/// integer values.
				3509	/// \param __b
				3510	/// A 256-bit integer vector containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3511	static __inline void __DEFAULT_FN_ATTRS
				3512	_mm256_stream_si256(__m256i *__a, __m256i __b)
				3513	{
					/* Local vector type with an explicit 32-byte alignment, matching the
					 * documented alignment requirement on the destination pointer. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3514	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
				3515	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3516	}
				3517
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3518	/// Moves double-precision values from a 256-bit vector of [4 x double]
				3519	/// to a 32-byte aligned memory location. To minimize caching, the data is
				3520	/// flagged as non-temporal (unlikely to be used again soon).
				3521	///
				3522	/// \headerfile <x86intrin.h>
				3523	///
				3524	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
				3525	///
				3526	/// \param __a
				3527	/// A pointer to a 32-byte aligned memory location that will receive the
				3528	/// double-precision floating-point values.
				3529	/// \param __b
				3530	/// A 256-bit vector of [4 x double] containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3531	static __inline void __DEFAULT_FN_ATTRS
				3532	_mm256_stream_pd(double *__a, __m256d __b)
				3533	{
					/* Local vector type with an explicit 32-byte alignment, matching the
					 * documented alignment requirement on the destination pointer. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3534	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
				3535	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3536	}
				3537
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3538	/// Moves single-precision floating-point values from a 256-bit vector
				3539	/// of [8 x float] to a 32-byte aligned memory location. To minimize
				3540	/// caching, the data is flagged as non-temporal (unlikely to be used again
				3541	/// soon).
				3542	///
				3543	/// \headerfile <x86intrin.h>
				3544	///
				3545	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
				3546	///
				3547	/// \param __p
				3548	/// A pointer to a 32-byte aligned memory location that will receive the
				3549	/// single-precision floating-point values.
				3550	/// \param __a
				3551	/// A 256-bit vector of [8 x float] containing the values to be moved.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3552	static __inline void __DEFAULT_FN_ATTRS
				3553	_mm256_stream_ps(float *__p, __m256 __a)
				3554	{
					/* Local vector type with an explicit 32-byte alignment, matching the
					 * documented alignment requirement on the destination pointer. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3555	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
				3556	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3557	}
				3558
				3559	/* Create vectors */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3560	/// Creates a 256-bit vector of [4 x double] with undefined values.
				3561	///
				3562	/// \headerfile <x86intrin.h>
				3563	///
				3564	/// This intrinsic has no corresponding instruction.
				3565	///
				3566	/// \returns A 256-bit vector of [4 x double] containing undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3567	static __inline__ __m256d __DEFAULT_FN_ATTRS
				3568	_mm256_undefined_pd(void)
				3569	{
				3570	return (__m256d)__builtin_ia32_undef256();
				3571	}
				3572
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3573	/// Creates a 256-bit vector of [8 x float] with undefined values.
				3574	///
				3575	/// \headerfile <x86intrin.h>
				3576	///
				3577	/// This intrinsic has no corresponding instruction.
				3578	///
				3579	/// \returns A 256-bit vector of [8 x float] containing undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3580	static __inline__ __m256 __DEFAULT_FN_ATTRS
				3581	_mm256_undefined_ps(void)
				3582	{
				3583	return (__m256)__builtin_ia32_undef256();
				3584	}
				3585
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3586	/// Creates a 256-bit integer vector with undefined values.
				3587	///
				3588	/// \headerfile <x86intrin.h>
				3589	///
				3590	/// This intrinsic has no corresponding instruction.
				3591	///
				3592	/// \returns A 256-bit integer vector containing undefined values.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3593	static __inline__ __m256i __DEFAULT_FN_ATTRS
				3594	_mm256_undefined_si256(void)
				3595	{
				3596	return (__m256i)__builtin_ia32_undef256();
				3597	}
				3598
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3599	/// Constructs a 256-bit floating-point vector of [4 x double]
				3600	/// initialized with the specified double-precision floating-point values.
				3601	///
				3602	/// \headerfile <x86intrin.h>
				3603	///
				3604	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3605	/// instruction.
				3606	///
				3607	/// \param __a
				3608	/// A double-precision floating-point value used to initialize bits [255:192]
				3609	/// of the result.
				3610	/// \param __b
				3611	/// A double-precision floating-point value used to initialize bits [191:128]
				3612	/// of the result.
				3613	/// \param __c
				3614	/// A double-precision floating-point value used to initialize bits [127:64]
				3615	/// of the result.
				3616	/// \param __d
				3617	/// A double-precision floating-point value used to initialize bits [63:0]
				3618	/// of the result.
				3619	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3620	static __inline __m256d __DEFAULT_FN_ATTRS
				3621	_mm256_set_pd(double __a, double __b, double __c, double __d)
				3622	{
					/* The initializer lists element 0 (bits [63:0]) first, so the
					 * arguments appear in the reverse of their parameter order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3623	return __extension__ (__m256d){ __d, __c, __b, __a };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3624	}
				3625
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3626	/// Constructs a 256-bit floating-point vector of [8 x float] initialized
				3627	/// with the specified single-precision floating-point values.
				3628	///
				3629	/// \headerfile <x86intrin.h>
				3630	///
				3631	/// This intrinsic is a utility function and does not correspond to a specific
				3632	/// instruction.
				3633	///
				3634	/// \param __a
				3635	/// A single-precision floating-point value used to initialize bits [255:224]
				3636	/// of the result.
				3637	/// \param __b
				3638	/// A single-precision floating-point value used to initialize bits [223:192]
				3639	/// of the result.
				3640	/// \param __c
				3641	/// A single-precision floating-point value used to initialize bits [191:160]
				3642	/// of the result.
				3643	/// \param __d
				3644	/// A single-precision floating-point value used to initialize bits [159:128]
				3645	/// of the result.
				3646	/// \param __e
				3647	/// A single-precision floating-point value used to initialize bits [127:96]
				3648	/// of the result.
				3649	/// \param __f
				3650	/// A single-precision floating-point value used to initialize bits [95:64]
				3651	/// of the result.
				3652	/// \param __g
				3653	/// A single-precision floating-point value used to initialize bits [63:32]
				3654	/// of the result.
				3655	/// \param __h
				3656	/// A single-precision floating-point value used to initialize bits [31:0]
				3657	/// of the result.
				3658	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3659	static __inline __m256 __DEFAULT_FN_ATTRS
				3660	_mm256_set_ps(float __a, float __b, float __c, float __d,
				3661	float __e, float __f, float __g, float __h)
				3662	{
					/* The initializer lists element 0 (bits [31:0]) first, so the
					 * arguments appear in the reverse of their parameter order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3663	return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3664	}
				3665
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3666	/// Constructs a 256-bit integer vector initialized with the specified
				3667	/// 32-bit integral values.
				3668	///
				3669	/// \headerfile <x86intrin.h>
				3670	///
				3671	/// This intrinsic is a utility function and does not correspond to a specific
				3672	/// instruction.
				3673	///
				3674	/// \param __i0
				3675	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3676	/// \param __i1
				3677	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3678	/// \param __i2
				3679	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3680	/// \param __i3
				3681	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3682	/// \param __i4
				3683	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3684	/// \param __i5
				3685	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3686	/// \param __i6
				3687	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3688	/// \param __i7
				3689	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3690	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3691	static __inline __m256i __DEFAULT_FN_ATTRS
				3692	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
				3693	int __i4, int __i5, int __i6, int __i7)
				3694	{
					/* The initializer lists element 0 (bits [31:0]) first, so the
					 * arguments appear in the reverse of their parameter order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3695	return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3696	}
				3697
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3698	/// Constructs a 256-bit integer vector initialized with the specified
				3699	/// 16-bit integral values.
				3700	///
				3701	/// \headerfile <x86intrin.h>
				3702	///
				3703	/// This intrinsic is a utility function and does not correspond to a specific
				3704	/// instruction.
				3705	///
				3706	/// \param __w15
				3707	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				3708	/// \param __w14
				3709	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				3710	/// \param __w13
				3711	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				3712	/// \param __w12
				3713	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3714	/// \param __w11
				3715	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3716	/// \param __w10
				3717	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3718	/// \param __w09
				3719	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3720	/// \param __w08
				3721	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3722	/// \param __w07
				3723	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3724	/// \param __w06
				3725	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3726	/// \param __w05
				3727	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3728	/// \param __w04
				3729	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3730	/// \param __w03
				3731	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3732	/// \param __w02
				3733	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3734	/// \param __w01
				3735	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3736	/// \param __w00
				3737	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3738	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3739	static __inline __m256i __DEFAULT_FN_ATTRS
				3740	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
				3741	short __w11, short __w10, short __w09, short __w08,
				3742	short __w07, short __w06, short __w05, short __w04,
				3743	short __w03, short __w02, short __w01, short __w00)
				3744	{
					/* The initializer lists element 0 (bits [15:0]) first, so the
					 * arguments appear in the reverse of their parameter order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3745	return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3746	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
				3747	}
				3748
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3749	/// Constructs a 256-bit integer vector initialized with the specified
				3750	/// 8-bit integral values.
				3751	///
				3752	/// \headerfile <x86intrin.h>
				3753	///
				3754	/// This intrinsic is a utility function and does not correspond to a specific
				3755	/// instruction.
				3756	///
				3757	/// \param __b31
				3758	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				3759	/// \param __b30
				3760	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				3761	/// \param __b29
				3762	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				3763	/// \param __b28
				3764	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				3765	/// \param __b27
				3766	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				3767	/// \param __b26
				3768	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				3769	/// \param __b25
				3770	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				3771	/// \param __b24
				3772	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				3773	/// \param __b23
				3774	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				3775	/// \param __b22
				3776	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				3777	/// \param __b21
				3778	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				3779	/// \param __b20
				3780	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				3781	/// \param __b19
				3782	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				3783	/// \param __b18
				3784	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				3785	/// \param __b17
				3786	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				3787	/// \param __b16
				3788	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				3789	/// \param __b15
				3790	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				3791	/// \param __b14
				3792	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				3793	/// \param __b13
				3794	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				3795	/// \param __b12
				3796	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				3797	/// \param __b11
				3798	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				3799	/// \param __b10
				3800	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				3801	/// \param __b09
				3802	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				3803	/// \param __b08
				3804	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				3805	/// \param __b07
				3806	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				3807	/// \param __b06
				3808	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				3809	/// \param __b05
				3810	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				3811	/// \param __b04
				3812	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				3813	/// \param __b03
				3814	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				3815	/// \param __b02
				3816	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				3817	/// \param __b01
				3818	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				3819	/// \param __b00
				3820	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				3821	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3822	static __inline __m256i __DEFAULT_FN_ATTRS
				3823	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
				3824	char __b27, char __b26, char __b25, char __b24,
				3825	char __b23, char __b22, char __b21, char __b20,
				3826	char __b19, char __b18, char __b17, char __b16,
				3827	char __b15, char __b14, char __b13, char __b12,
				3828	char __b11, char __b10, char __b09, char __b08,
				3829	char __b07, char __b06, char __b05, char __b04,
				3830	char __b03, char __b02, char __b01, char __b00)
				3831	{
					/* The initializer lists element 0 (bits [7:0]) first, so the
					 * arguments appear in the reverse of their parameter order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3832	return __extension__ (__m256i)(__v32qi){
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3833	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				3834	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				3835	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				3836	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
				3837	};
				3838	}
				3839
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3840	/// Constructs a 256-bit integer vector initialized with the specified
				3841	/// 64-bit integral values.
				3842	///
				3843	/// \headerfile <x86intrin.h>
				3844	///
				3845	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				3846	/// instruction.
				3847	///
				3848	/// \param __a
				3849	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				3850	/// \param __b
				3851	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				3852	/// \param __c
				3853	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				3854	/// \param __d
				3855	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				3856	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3857	static __inline __m256i __DEFAULT_FN_ATTRS
				3858	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
				3859	{
					/* The initializer lists element 0 (bits [63:0]) first, so the
					 * arguments appear in the reverse of their parameter order. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3860	return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3861	}
				3862
				3863	/* Create vectors with elements in reverse order */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3864	/// Constructs a 256-bit floating-point vector of [4 x double],
				3865	/// initialized in reverse order with the specified double-precision
				3866	/// floating-point values.
				3867	///
				3868	/// \headerfile <x86intrin.h>
				3869	///
				3870	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
				3871	/// instruction.
				3872	///
				3873	/// \param __a
				3874	/// A double-precision floating-point value used to initialize bits [63:0]
				3875	/// of the result.
				3876	/// \param __b
				3877	/// A double-precision floating-point value used to initialize bits [127:64]
				3878	/// of the result.
				3879	/// \param __c
				3880	/// A double-precision floating-point value used to initialize bits [191:128]
				3881	/// of the result.
				3882	/// \param __d
				3883	/// A double-precision floating-point value used to initialize bits [255:192]
				3884	/// of the result.
				3885	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3886	static __inline __m256d __DEFAULT_FN_ATTRS
				3887	_mm256_setr_pd(double __a, double __b, double __c, double __d)
				3888	{
					/* _mm256_set_pd takes the most significant element first, so the
					 * arguments are forwarded back to front. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3889	return _mm256_set_pd(__d, __c, __b, __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3890	}
				3891
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3892	/// Constructs a 256-bit floating-point vector of [8 x float],
				3893	/// initialized in reverse order with the specified single-precision
				3894	/// floating-point values.
				3895	///
				3896	/// \headerfile <x86intrin.h>
				3897	///
				3898	/// This intrinsic is a utility function and does not correspond to a specific
				3899	/// instruction.
				3900	///
				3901	/// \param __a
				3902	/// A single-precision floating-point value used to initialize bits [31:0]
				3903	/// of the result.
				3904	/// \param __b
				3905	/// A single-precision floating-point value used to initialize bits [63:32]
				3906	/// of the result.
				3907	/// \param __c
				3908	/// A single-precision floating-point value used to initialize bits [95:64]
				3909	/// of the result.
				3910	/// \param __d
				3911	/// A single-precision floating-point value used to initialize bits [127:96]
				3912	/// of the result.
				3913	/// \param __e
				3914	/// A single-precision floating-point value used to initialize bits [159:128]
				3915	/// of the result.
				3916	/// \param __f
				3917	/// A single-precision floating-point value used to initialize bits [191:160]
				3918	/// of the result.
				3919	/// \param __g
				3920	/// A single-precision floating-point value used to initialize bits [223:192]
				3921	/// of the result.
				3922	/// \param __h
				3923	/// A single-precision floating-point value used to initialize bits [255:224]
				3924	/// of the result.
				3925	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3926	static __inline __m256 __DEFAULT_FN_ATTRS
				3927	_mm256_setr_ps(float __a, float __b, float __c, float __d,
				3928	float __e, float __f, float __g, float __h)
				3929	{
					/* _mm256_set_ps takes the most significant element first, so the
					 * arguments are forwarded back to front. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3930	return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3931	}
				3932
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3933	/// Constructs a 256-bit integer vector, initialized in reverse order
				3934	/// with the specified 32-bit integral values.
				3935	///
				3936	/// \headerfile <x86intrin.h>
				3937	///
				3938	/// This intrinsic is a utility function and does not correspond to a specific
				3939	/// instruction.
				3940	///
				3941	/// \param __i0
				3942	/// A 32-bit integral value used to initialize bits [31:0] of the result.
				3943	/// \param __i1
				3944	/// A 32-bit integral value used to initialize bits [63:32] of the result.
				3945	/// \param __i2
				3946	/// A 32-bit integral value used to initialize bits [95:64] of the result.
				3947	/// \param __i3
				3948	/// A 32-bit integral value used to initialize bits [127:96] of the result.
				3949	/// \param __i4
				3950	/// A 32-bit integral value used to initialize bits [159:128] of the result.
				3951	/// \param __i5
				3952	/// A 32-bit integral value used to initialize bits [191:160] of the result.
				3953	/// \param __i6
				3954	/// A 32-bit integral value used to initialize bits [223:192] of the result.
				3955	/// \param __i7
				3956	/// A 32-bit integral value used to initialize bits [255:224] of the result.
				3957	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3958	static __inline __m256i __DEFAULT_FN_ATTRS
				3959	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
				3960	int __i4, int __i5, int __i6, int __i7)
				3961	{
					/* _mm256_set_epi32 takes the most significant element first, so the
					 * arguments are forwarded back to front. */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3962	return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	3963	}
				3964
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	3965	/// Constructs a 256-bit integer vector, initialized in reverse order
				3966	/// with the specified 16-bit integral values.
				3967	///
				3968	/// \headerfile <x86intrin.h>
				3969	///
				3970	/// This intrinsic is a utility function and does not correspond to a specific
				3971	/// instruction.
				3972	///
				3973	/// \param __w15
				3974	/// A 16-bit integral value used to initialize bits [15:0] of the result.
				3975	/// \param __w14
				3976	/// A 16-bit integral value used to initialize bits [31:16] of the result.
				3977	/// \param __w13
				3978	/// A 16-bit integral value used to initialize bits [47:32] of the result.
				3979	/// \param __w12
				3980	/// A 16-bit integral value used to initialize bits [63:48] of the result.
				3981	/// \param __w11
				3982	/// A 16-bit integral value used to initialize bits [79:64] of the result.
				3983	/// \param __w10
				3984	/// A 16-bit integral value used to initialize bits [95:80] of the result.
				3985	/// \param __w09
				3986	/// A 16-bit integral value used to initialize bits [111:96] of the result.
				3987	/// \param __w08
				3988	/// A 16-bit integral value used to initialize bits [127:112] of the result.
				3989	/// \param __w07
				3990	/// A 16-bit integral value used to initialize bits [143:128] of the result.
				3991	/// \param __w06
				3992	/// A 16-bit integral value used to initialize bits [159:144] of the result.
				3993	/// \param __w05
				3994	/// A 16-bit integral value used to initialize bits [175:160] of the result.
				3995	/// \param __w04
				3996	/// A 16-bit integral value used to initialize bits [191:176] of the result.
				3997	/// \param __w03
				3998	/// A 16-bit integral value used to initialize bits [207:192] of the result.
				3999	/// \param __w02
				4000	/// A 16-bit integral value used to initialize bits [223:208] of the result.
				4001	/// \param __w01
				4002	/// A 16-bit integral value used to initialize bits [239:224] of the result.
				4003	/// \param __w00
				4004	/// A 16-bit integral value used to initialize bits [255:240] of the result.
				4005	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4006	static __inline __m256i __DEFAULT_FN_ATTRS
				4007	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
				4008	short __w11, short __w10, short __w09, short __w08,
				4009	short __w07, short __w06, short __w05, short __w04,
				4010	short __w03, short __w02, short __w01, short __w00)
				4011	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4012	return _mm256_set_epi16(__w00, __w01, __w02, __w03,
				4013	__w04, __w05, __w06, __w07,
				4014	__w08, __w09, __w10, __w11,
				4015	__w12, __w13, __w14, __w15);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4016	}
				4017
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4018	/// Constructs a 256-bit integer vector, initialized in reverse order
				4019	/// with the specified 8-bit integral values.
				4020	///
				4021	/// \headerfile <x86intrin.h>
				4022	///
				4023	/// This intrinsic is a utility function and does not correspond to a specific
				4024	/// instruction.
				4025	///
				4026	/// \param __b31
				4027	/// An 8-bit integral value used to initialize bits [7:0] of the result.
				4028	/// \param __b30
				4029	/// An 8-bit integral value used to initialize bits [15:8] of the result.
				4030	/// \param __b29
				4031	/// An 8-bit integral value used to initialize bits [23:16] of the result.
				4032	/// \param __b28
				4033	/// An 8-bit integral value used to initialize bits [31:24] of the result.
				4034	/// \param __b27
				4035	/// An 8-bit integral value used to initialize bits [39:32] of the result.
				4036	/// \param __b26
				4037	/// An 8-bit integral value used to initialize bits [47:40] of the result.
				4038	/// \param __b25
				4039	/// An 8-bit integral value used to initialize bits [55:48] of the result.
				4040	/// \param __b24
				4041	/// An 8-bit integral value used to initialize bits [63:56] of the result.
				4042	/// \param __b23
				4043	/// An 8-bit integral value used to initialize bits [71:64] of the result.
				4044	/// \param __b22
				4045	/// An 8-bit integral value used to initialize bits [79:72] of the result.
				4046	/// \param __b21
				4047	/// An 8-bit integral value used to initialize bits [87:80] of the result.
				4048	/// \param __b20
				4049	/// An 8-bit integral value used to initialize bits [95:88] of the result.
				4050	/// \param __b19
				4051	/// An 8-bit integral value used to initialize bits [103:96] of the result.
				4052	/// \param __b18
				4053	/// An 8-bit integral value used to initialize bits [111:104] of the result.
				4054	/// \param __b17
				4055	/// An 8-bit integral value used to initialize bits [119:112] of the result.
				4056	/// \param __b16
				4057	/// An 8-bit integral value used to initialize bits [127:120] of the result.
				4058	/// \param __b15
				4059	/// An 8-bit integral value used to initialize bits [135:128] of the result.
				4060	/// \param __b14
				4061	/// An 8-bit integral value used to initialize bits [143:136] of the result.
				4062	/// \param __b13
				4063	/// An 8-bit integral value used to initialize bits [151:144] of the result.
				4064	/// \param __b12
				4065	/// An 8-bit integral value used to initialize bits [159:152] of the result.
				4066	/// \param __b11
				4067	/// An 8-bit integral value used to initialize bits [167:160] of the result.
				4068	/// \param __b10
				4069	/// An 8-bit integral value used to initialize bits [175:168] of the result.
				4070	/// \param __b09
				4071	/// An 8-bit integral value used to initialize bits [183:176] of the result.
				4072	/// \param __b08
				4073	/// An 8-bit integral value used to initialize bits [191:184] of the result.
				4074	/// \param __b07
				4075	/// An 8-bit integral value used to initialize bits [199:192] of the result.
				4076	/// \param __b06
				4077	/// An 8-bit integral value used to initialize bits [207:200] of the result.
				4078	/// \param __b05
				4079	/// An 8-bit integral value used to initialize bits [215:208] of the result.
				4080	/// \param __b04
				4081	/// An 8-bit integral value used to initialize bits [223:216] of the result.
				4082	/// \param __b03
				4083	/// An 8-bit integral value used to initialize bits [231:224] of the result.
				4084	/// \param __b02
				4085	/// An 8-bit integral value used to initialize bits [239:232] of the result.
				4086	/// \param __b01
				4087	/// An 8-bit integral value used to initialize bits [247:240] of the result.
				4088	/// \param __b00
				4089	/// An 8-bit integral value used to initialize bits [255:248] of the result.
				4090	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4091	static __inline __m256i __DEFAULT_FN_ATTRS
				4092	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
				4093	char __b27, char __b26, char __b25, char __b24,
				4094	char __b23, char __b22, char __b21, char __b20,
				4095	char __b19, char __b18, char __b17, char __b16,
				4096	char __b15, char __b14, char __b13, char __b12,
				4097	char __b11, char __b10, char __b09, char __b08,
				4098	char __b07, char __b06, char __b05, char __b04,
				4099	char __b03, char __b02, char __b01, char __b00)
				4100	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4101	return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
				4102	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
				4103	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
				4104	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4105	}
				4106
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4107	/// Constructs a 256-bit integer vector, initialized in reverse order
				4108	/// with the specified 64-bit integral values.
				4109	///
				4110	/// \headerfile <x86intrin.h>
				4111	///
				4112	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
				4113	/// instruction.
				4114	///
				4115	/// \param __a
				4116	/// A 64-bit integral value used to initialize bits [63:0] of the result.
				4117	/// \param __b
				4118	/// A 64-bit integral value used to initialize bits [127:64] of the result.
				4119	/// \param __c
				4120	/// A 64-bit integral value used to initialize bits [191:128] of the result.
				4121	/// \param __d
				4122	/// A 64-bit integral value used to initialize bits [255:192] of the result.
				4123	/// \returns An initialized 256-bit integer vector.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4124	static __inline __m256i __DEFAULT_FN_ATTRS
				4125	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
				4126	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4127	return _mm256_set_epi64x(__d, __c, __b, __a);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4128	}
				4129
				4130	/* Create vectors with repeated elements */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4131	/// Constructs a 256-bit floating-point vector of [4 x double], with each
				4132	/// of the four double-precision floating-point vector elements set to the
				4133	/// specified double-precision floating-point value.
				4134	///
				4135	/// \headerfile <x86intrin.h>
				4136	///
				4137	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
				4138	///
				4139	/// \param __w
				4140	/// A double-precision floating-point value used to initialize each vector
				4141	/// element of the result.
				4142	/// \returns An initialized 256-bit floating-point vector of [4 x double].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4143	static __inline __m256d __DEFAULT_FN_ATTRS
				4144	_mm256_set1_pd(double __w)
				4145	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4146	return _mm256_set_pd(__w, __w, __w, __w);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4147	}
				4148
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4149	/// Constructs a 256-bit floating-point vector of [8 x float], with each
				4150	/// of the eight single-precision floating-point vector elements set to the
				4151	/// specified single-precision floating-point value.
				4152	///
				4153	/// \headerfile <x86intrin.h>
				4154	///
				4155	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4156	/// instruction.
				4157	///
				4158	/// \param __w
				4159	/// A single-precision floating-point value used to initialize each vector
				4160	/// element of the result.
				4161	/// \returns An initialized 256-bit floating-point vector of [8 x float].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4162	static __inline __m256 __DEFAULT_FN_ATTRS
				4163	_mm256_set1_ps(float __w)
				4164	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4165	return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4166	}
				4167
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4168	/// Constructs a 256-bit integer vector of [8 x i32], with each of the
				4169	/// 32-bit integral vector elements set to the specified 32-bit integral
				4170	/// value.
				4171	///
				4172	/// \headerfile <x86intrin.h>
				4173	///
				4174	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
				4175	/// instruction.
				4176	///
				4177	/// \param __i
				4178	/// A 32-bit integral value used to initialize each vector element of the
				4179	/// result.
				4180	/// \returns An initialized 256-bit integer vector of [8 x i32].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4181	static __inline __m256i __DEFAULT_FN_ATTRS
				4182	_mm256_set1_epi32(int __i)
				4183	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4184	return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4185	}
				4186
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4187	/// Constructs a 256-bit integer vector of [16 x i16], with each of the
				4188	/// 16-bit integral vector elements set to the specified 16-bit integral
				4189	/// value.
				4190	///
				4191	/// \headerfile <x86intrin.h>
				4192	///
				4193	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
				4194	///
				4195	/// \param __w
				4196	/// A 16-bit integral value used to initialize each vector element of the
				4197	/// result.
				4198	/// \returns An initialized 256-bit integer vector of [16 x i16].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4199	static __inline __m256i __DEFAULT_FN_ATTRS
				4200	_mm256_set1_epi16(short __w)
				4201	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4202	return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
				4203	__w, __w, __w, __w, __w, __w, __w, __w);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4204	}
				4205
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4206	/// Constructs a 256-bit integer vector of [32 x i8], with each of the
				4207	/// 8-bit integral vector elements set to the specified 8-bit integral value.
				4208	///
				4209	/// \headerfile <x86intrin.h>
				4210	///
				4211	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
				4212	///
				4213	/// \param __b
				4214	/// An 8-bit integral value used to initialize each vector element of the
				4215	/// result.
				4216	/// \returns An initialized 256-bit integer vector of [32 x i8].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4217	static __inline __m256i __DEFAULT_FN_ATTRS
				4218	_mm256_set1_epi8(char __b)
				4219	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4220	return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
				4221	__b, __b, __b, __b, __b, __b, __b, __b,
				4222	__b, __b, __b, __b, __b, __b, __b, __b,
				4223	__b, __b, __b, __b, __b, __b, __b, __b);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4224	}
				4225
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4226	/// Constructs a 256-bit integer vector of [4 x i64], with each of the
				4227	/// 64-bit integral vector elements set to the specified 64-bit integral
				4228	/// value.
				4229	///
				4230	/// \headerfile <x86intrin.h>
				4231	///
				4232	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
				4233	///
				4234	/// \param __q
				4235	/// A 64-bit integral value used to initialize each vector element of the
				4236	/// result.
				4237	/// \returns An initialized 256-bit integer vector of [4 x i64].
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4238	static __inline __m256i __DEFAULT_FN_ATTRS
				4239	_mm256_set1_epi64x(long long __q)
				4240	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4241	return _mm256_set_epi64x(__q, __q, __q, __q);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4242	}
				4243
				4244	/* Create zeroed vectors */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4245	/// Constructs a 256-bit floating-point vector of [4 x double] with all
				4246	/// vector elements initialized to zero.
				4247	///
				4248	/// \headerfile <x86intrin.h>
				4249	///
				4250	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
				4251	///
				4252	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4253	static __inline __m256d __DEFAULT_FN_ATTRS
				4254	_mm256_setzero_pd(void)
				4255	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4256	return __extension__ (__m256d){ 0, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4257	}
				4258
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4259	/// Constructs a 256-bit floating-point vector of [8 x float] with all
				4260	/// vector elements initialized to zero.
				4261	///
				4262	/// \headerfile <x86intrin.h>
				4263	///
				4264	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
				4265	///
				4266	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4267	static __inline __m256 __DEFAULT_FN_ATTRS
				4268	_mm256_setzero_ps(void)
				4269	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4270	return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4271	}
				4272
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4273	/// Constructs a 256-bit integer vector initialized to zero.
				4274	///
				4275	/// \headerfile <x86intrin.h>
				4276	///
				4277	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
				4278	///
				4279	/// \returns A 256-bit integer vector initialized to zero.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4280	static __inline __m256i __DEFAULT_FN_ATTRS
				4281	_mm256_setzero_si256(void)
				4282	{
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4283	return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4284	}
				4285
				4286	/* Cast between vector types */
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4287	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4288	/// floating-point vector of [8 x float].
				4289	///
				4290	/// \headerfile <x86intrin.h>
				4291	///
				4292	/// This intrinsic has no corresponding instruction.
				4293	///
				4294	/// \param __a
				4295	/// A 256-bit floating-point vector of [4 x double].
				4296	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4297	/// bitwise pattern as the parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4298	static __inline __m256 __DEFAULT_FN_ATTRS
				4299	_mm256_castpd_ps(__m256d __a)
				4300	{
				4301	return (__m256)__a;
				4302	}
				4303
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4304	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
				4305	/// integer vector.
				4306	///
				4307	/// \headerfile <x86intrin.h>
				4308	///
				4309	/// This intrinsic has no corresponding instruction.
				4310	///
				4311	/// \param __a
				4312	/// A 256-bit floating-point vector of [4 x double].
				4313	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4314	/// parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4315	static __inline __m256i __DEFAULT_FN_ATTRS
				4316	_mm256_castpd_si256(__m256d __a)
				4317	{
				4318	return (__m256i)__a;
				4319	}
				4320
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4321	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4322	/// floating-point vector of [4 x double].
				4323	///
				4324	/// \headerfile <x86intrin.h>
				4325	///
				4326	/// This intrinsic has no corresponding instruction.
				4327	///
				4328	/// \param __a
				4329	/// A 256-bit floating-point vector of [8 x float].
				4330	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4331	/// bitwise pattern as the parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4332	static __inline __m256d __DEFAULT_FN_ATTRS
				4333	_mm256_castps_pd(__m256 __a)
				4334	{
				4335	return (__m256d)__a;
				4336	}
				4337
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4338	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
				4339	/// integer vector.
				4340	///
				4341	/// \headerfile <x86intrin.h>
				4342	///
				4343	/// This intrinsic has no corresponding instruction.
				4344	///
				4345	/// \param __a
				4346	/// A 256-bit floating-point vector of [8 x float].
				4347	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
				4348	/// parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4349	static __inline __m256i __DEFAULT_FN_ATTRS
				4350	_mm256_castps_si256(__m256 __a)
				4351	{
				4352	return (__m256i)__a;
				4353	}
				4354
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4355	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
				4356	/// of [8 x float].
				4357	///
				4358	/// \headerfile <x86intrin.h>
				4359	///
				4360	/// This intrinsic has no corresponding instruction.
				4361	///
				4362	/// \param __a
				4363	/// A 256-bit integer vector.
				4364	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
				4365	/// bitwise pattern as the parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4366	static __inline __m256 __DEFAULT_FN_ATTRS
				4367	_mm256_castsi256_ps(__m256i __a)
				4368	{
				4369	return (__m256)__a;
				4370	}
				4371
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4372	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
				4373	/// of [4 x double].
				4374	///
				4375	/// \headerfile <x86intrin.h>
				4376	///
				4377	/// This intrinsic has no corresponding instruction.
				4378	///
				4379	/// \param __a
				4380	/// A 256-bit integer vector.
				4381	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
				4382	/// bitwise pattern as the parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4383	static __inline __m256d __DEFAULT_FN_ATTRS
				4384	_mm256_castsi256_pd(__m256i __a)
				4385	{
				4386	return (__m256d)__a;
				4387	}
				4388
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4389	/// Returns the lower 128 bits of a 256-bit floating-point vector of
				4390	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
				4391	///
				4392	/// \headerfile <x86intrin.h>
				4393	///
				4394	/// This intrinsic has no corresponding instruction.
				4395	///
				4396	/// \param __a
				4397	/// A 256-bit floating-point vector of [4 x double].
				4398	/// \returns A 128-bit floating-point vector of [2 x double] containing the
				4399	/// lower 128 bits of the parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4400	static __inline __m128d __DEFAULT_FN_ATTRS
				4401	_mm256_castpd256_pd128(__m256d __a)
				4402	{
				4403	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
				4404	}
				4405
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4406	/// Returns the lower 128 bits of a 256-bit floating-point vector of
				4407	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
				4408	///
				4409	/// \headerfile <x86intrin.h>
				4410	///
				4411	/// This intrinsic has no corresponding instruction.
				4412	///
				4413	/// \param __a
				4414	/// A 256-bit floating-point vector of [8 x float].
				4415	/// \returns A 128-bit floating-point vector of [4 x float] containing the
				4416	/// lower 128 bits of the parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4417	static __inline __m128 __DEFAULT_FN_ATTRS
				4418	_mm256_castps256_ps128(__m256 __a)
				4419	{
				4420	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
				4421	}
				4422
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4423	/// Truncates a 256-bit integer vector into a 128-bit integer vector.
				4424	///
				4425	/// \headerfile <x86intrin.h>
				4426	///
				4427	/// This intrinsic has no corresponding instruction.
				4428	///
				4429	/// \param __a
				4430	/// A 256-bit integer vector.
				4431	/// \returns A 128-bit integer vector containing the lower 128 bits of the
				4432	/// parameter.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4433	static __inline __m128i __DEFAULT_FN_ATTRS
				4434	_mm256_castsi256_si128(__m256i __a)
				4435	{
				4436	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
				4437	}
				4438
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4439	/// Constructs a 256-bit floating-point vector of [4 x double] from a
				4440	/// 128-bit floating-point vector of [2 x double].
				4441	///
				4442	/// The lower 128 bits contain the value of the source vector. The contents
				4443	/// of the upper 128 bits are undefined.
				4444	///
				4445	/// \headerfile <x86intrin.h>
				4446	///
				4447	/// This intrinsic has no corresponding instruction.
				4448	///
				4449	/// \param __a
				4450	/// A 128-bit vector of [2 x double].
				4451	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4452	/// contain the value of the parameter. The contents of the upper 128 bits
				4453	/// are undefined.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4454	static __inline __m256d __DEFAULT_FN_ATTRS
				4455	_mm256_castpd128_pd256(__m128d __a)
				4456	{
				4457	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
				4458	}
				4459
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4460	/// Constructs a 256-bit floating-point vector of [8 x float] from a
				4461	/// 128-bit floating-point vector of [4 x float].
				4462	///
				4463	/// The lower 128 bits contain the value of the source vector. The contents
				4464	/// of the upper 128 bits are undefined.
				4465	///
				4466	/// \headerfile <x86intrin.h>
				4467	///
				4468	/// This intrinsic has no corresponding instruction.
				4469	///
				4470	/// \param __a
				4471	/// A 128-bit vector of [4 x float].
				4472	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4473	/// contain the value of the parameter. The contents of the upper 128 bits
				4474	/// are undefined.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4475	static __inline __m256 __DEFAULT_FN_ATTRS
				4476	_mm256_castps128_ps256(__m128 __a)
				4477	{
				4478	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
				4479	}
				4480
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4481	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
				4482	///
				4483	/// The lower 128 bits contain the value of the source vector. The contents
				4484	/// of the upper 128 bits are undefined.
				4485	///
				4486	/// \headerfile <x86intrin.h>
				4487	///
				4488	/// This intrinsic has no corresponding instruction.
				4489	///
				4490	/// \param __a
				4491	/// A 128-bit integer vector.
				4492	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4493	/// the parameter. The contents of the upper 128 bits are undefined.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4494	static __inline __m256i __DEFAULT_FN_ATTRS
				4495	_mm256_castsi128_si256(__m128i __a)
				4496	{
				4497	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
				4498	}
				4499
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4500	/// Constructs a 256-bit floating-point vector of [4 x double] from a
				4501	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
				4502	/// contain the value of the source vector. The upper 128 bits are set
				4503	/// to zero.
				4504	///
				4505	/// \headerfile <x86intrin.h>
				4506	///
				4507	/// This intrinsic has no corresponding instruction.
				4508	///
				4509	/// \param __a
				4510	/// A 128-bit vector of [2 x double].
				4511	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
				4512	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4513	static __inline __m256d __DEFAULT_FN_ATTRS
				4514	_mm256_zextpd128_pd256(__m128d __a)
				4515	{
				4516	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
				4517	}
				4518
				4519	/// Constructs a 256-bit floating-point vector of [8 x float] from a
				4520	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
				4521	/// the value of the source vector. The upper 128 bits are set to zero.
				4522	///
				4523	/// \headerfile <x86intrin.h>
				4524	///
				4525	/// This intrinsic has no corresponding instruction.
				4526	///
				4527	/// \param __a
				4528	/// A 128-bit vector of [4 x float].
				4529	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
				4530	/// contain the value of the parameter. The upper 128 bits are set to zero.
				4531	static __inline __m256 __DEFAULT_FN_ATTRS
				4532	_mm256_zextps128_ps256(__m128 __a)
				4533	{
				4534	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
				4535	}
				4536
				4537	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
				4538	/// The lower 128 bits contain the value of the source vector. The upper
				4539	/// 128 bits are set to zero.
				4540	///
				4541	/// \headerfile <x86intrin.h>
				4542	///
				4543	/// This intrinsic has no corresponding instruction.
				4544	///
				4545	/// \param __a
				4546	/// A 128-bit integer vector.
				4547	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
				4548	/// the parameter. The upper 128 bits are set to zero.
				4549	static __inline __m256i __DEFAULT_FN_ATTRS
				4550	_mm256_zextsi128_si256(__m128i __a)
				4551	{
				4552	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
				4553	}
				4554
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4555	/*
				4556	Vector insert.
				4557	We use macros rather than inlines because we only want to accept
				4558	invocations where the immediate M is a constant expression.
				4559	*/
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4560	/// Constructs a new 256-bit vector of [8 x float] by first duplicating
				4561	/// a 256-bit vector of [8 x float] given in the first parameter, and then
				4562	/// replacing either the upper or the lower 128 bits with the contents of a
				4563	/// 128-bit vector of [4 x float] in the second parameter.
				4564	///
				4565	/// The immediate integer parameter determines between the upper or the lower
				4566	/// 128 bits.
				4567	///
				4568	/// \headerfile <x86intrin.h>
				4569	///
				4570	/// \code
				4571	/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
				4572	/// \endcode
				4573	///
				4574	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4575	///
				4576	/// \param V1
				4577	/// A 256-bit vector of [8 x float]. This vector is copied to the result
				4578	/// first, and then either the upper or the lower 128 bits of the result will
				4579	/// be replaced by the contents of \a V2.
				4580	/// \param V2
				4581	/// A 128-bit vector of [4 x float]. The contents of this parameter are
				4582	/// written to either the upper or the lower 128 bits of the result depending
				4583	/// on the value of parameter \a M.
				4584	/// \param M
				4585	/// An immediate integer. The least significant bit determines how the values
				4586	/// from the two parameters are interleaved: \n
				4587	/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
				4588	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
				4589	/// result. \n
				4590	/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
				4591	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
				4592	/// result.
				4593	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
				4594	#define _mm256_insertf128_ps(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	4595	((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
				4596	(__v4sf)(__m128)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4597
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4598	/// Constructs a new 256-bit vector of [4 x double] by first duplicating
				4599	/// a 256-bit vector of [4 x double] given in the first parameter, and then
				4600	/// replacing either the upper or the lower 128 bits with the contents of a
				4601	/// 128-bit vector of [2 x double] in the second parameter.
				4602	///
				4603	/// The immediate integer parameter determines between the upper or the lower
				4604	/// 128 bits.
				4605	///
				4606	/// \headerfile <x86intrin.h>
				4607	///
				4608	/// \code
				4609	/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
				4610	/// \endcode
				4611	///
				4612	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4613	///
				4614	/// \param V1
				4615	/// A 256-bit vector of [4 x double]. This vector is copied to the result
				4616	/// first, and then either the upper or the lower 128 bits of the result will
				4617	/// be replaced by the contents of \a V2.
				4618	/// \param V2
				4619	/// A 128-bit vector of [2 x double]. The contents of this parameter are
				4620	/// written to either the upper or the lower 128 bits of the result depending
				4621	/// on the value of parameter \a M.
				4622	/// \param M
				4623	/// An immediate integer. The least significant bit determines how the values
				4624	/// from the two parameters are interleaved: \n
				4625	/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
				4626	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
				4627	/// result. \n
				4628	/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
				4629	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
				4630	/// result.
				4631	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
				4632	#define _mm256_insertf128_pd(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	4633	((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
				4634	(__v2df)(__m128d)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4635
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4636	/// Constructs a new 256-bit integer vector by first duplicating a
				4637	/// 256-bit integer vector given in the first parameter, and then replacing
				4638	/// either the upper or the lower 128 bits with the contents of a 128-bit
				4639	/// integer vector in the second parameter.
				4640	///
				4641	/// The immediate integer parameter determines between the upper or the lower
				4642	/// 128 bits.
				4643	///
				4644	/// \headerfile <x86intrin.h>
				4645	///
				4646	/// \code
				4647	/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
				4648	/// \endcode
				4649	///
				4650	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4651	///
				4652	/// \param V1
				4653	/// A 256-bit integer vector. This vector is copied to the result first, and
				4654	/// then either the upper or the lower 128 bits of the result will be
				4655	/// replaced by the contents of \a V2.
				4656	/// \param V2
				4657	/// A 128-bit integer vector. The contents of this parameter are written to
				4658	/// either the upper or the lower 128 bits of the result depending on the
				4659	/// value of parameter \a M.
				4660	/// \param M
				4661	/// An immediate integer. The least significant bit determines how the values
				4662	/// from the two parameters are interleaved: \n
				4663	/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
				4664	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
				4665	/// result. \n
				4666	/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
				4667	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
				4668	/// result.
				4669	/// \returns A 256-bit integer vector containing the interleaved values.
				4670	#define _mm256_insertf128_si256(V1, V2, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	4671	((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
				4672	(__v4si)(__m128i)(V2), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4673
				4674	/*
				4675	Vector extract.
				4676	We use macros rather than inlines because we only want to accept
				4677	invocations where the immediate M is a constant expression.
				4678	*/
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4679	/// Extracts either the upper or the lower 128 bits from a 256-bit vector
				4680	/// of [8 x float], as determined by the immediate integer parameter, and
				4681	/// returns the extracted bits as a 128-bit vector of [4 x float].
				4682	///
				4683	/// \headerfile <x86intrin.h>
				4684	///
				4685	/// \code
				4686	/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
				4687	/// \endcode
				4688	///
				4689	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
				4690	///
				4691	/// \param V
				4692	/// A 256-bit vector of [8 x float].
				4693	/// \param M
				4694	/// An immediate integer. The least significant bit determines which bits are
				4695	/// extracted from the first parameter: \n
				4696	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
				4697	/// result. \n
				4698	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
				4699	/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
				4700	#define _mm256_extractf128_ps(V, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	4701	((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4702
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4703	/// Extracts either the upper or the lower 128 bits from a 256-bit vector
				4704	/// of [4 x double], as determined by the immediate integer parameter, and
				4705	/// returns the extracted bits as a 128-bit vector of [2 x double].
				4706	///
				4707	/// \headerfile <x86intrin.h>
				4708	///
				4709	/// \code
				4710	/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
				4711	/// \endcode
				4712	///
				4713	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
				4714	///
				4715	/// \param V
				4716	/// A 256-bit vector of [4 x double].
				4717	/// \param M
				4718	/// An immediate integer. The least significant bit determines which bits are
				4719	/// extracted from the first parameter: \n
				4720	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
				4721	/// result. \n
				4722	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
				4723	/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
				4724	#define _mm256_extractf128_pd(V, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	4725	((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4726
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4727	/// Extracts either the upper or the lower 128 bits from a 256-bit
				4728	/// integer vector, as determined by the immediate integer parameter, and
				4729	/// returns the extracted bits as a 128-bit integer vector.
				4730	///
				4731	/// \headerfile <x86intrin.h>
				4732	///
				4733	/// \code
				4734	/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
				4735	/// \endcode
				4736	///
				4737	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
				4738	///
				4739	/// \param V
				4740	/// A 256-bit integer vector.
				4741	/// \param M
				4742	/// An immediate integer. The least significant bit determines which bits are
				4743	/// extracted from the first parameter: \n
				4744	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
				4745	/// result. \n
				4746	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
				4747	/// \returns A 128-bit integer vector containing the extracted bits.
				4748	#define _mm256_extractf128_si256(V, M) \
Pirama Arumuga Nainar	494f645	2021-12-02 10:42:14 -0800	[diff] [blame]	4749	((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4750
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4751	/// Constructs a 256-bit floating-point vector of [8 x float] by
				4752	/// concatenating two 128-bit floating-point vectors of [4 x float].
				4753	///
				4754	/// \headerfile <x86intrin.h>
				4755	///
				4756	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4757	///
				4758	/// \param __hi
				4759	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4760	/// 128 bits of the result.
				4761	/// \param __lo
				4762	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4763	/// 128 bits of the result.
				4764	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4765	/// concatenated result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4766	static __inline __m256 __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4767	_mm256_set_m128 (__m128 __hi, __m128 __lo)
				4768	{
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4769	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
				4770	}
				4771
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4772	/// Constructs a 256-bit floating-point vector of [4 x double] by
				4773	/// concatenating two 128-bit floating-point vectors of [2 x double].
				4774	///
				4775	/// \headerfile <x86intrin.h>
				4776	///
				4777	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4778	///
				4779	/// \param __hi
				4780	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4781	/// 128 bits of the result.
				4782	/// \param __lo
				4783	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4784	/// 128 bits of the result.
				4785	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4786	/// concatenated result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4787	static __inline __m256d __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4788	_mm256_set_m128d (__m128d __hi, __m128d __lo)
				4789	{
				4790	return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4791	}
				4792
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4793	/// Constructs a 256-bit integer vector by concatenating two 128-bit
				4794	/// integer vectors.
				4795	///
				4796	/// \headerfile <x86intrin.h>
				4797	///
				4798	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4799	///
				4800	/// \param __hi
				4801	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4802	/// result.
				4803	/// \param __lo
				4804	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4805	/// result.
				4806	/// \returns A 256-bit integer vector containing the concatenated result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4807	static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4808	_mm256_set_m128i (__m128i __hi, __m128i __lo)
				4809	{
				4810	return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4811	}
				4812
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4813	/// Constructs a 256-bit floating-point vector of [8 x float] by
				4814	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
				4815	/// similar to _mm256_set_m128, but the order of the input parameters is
				4816	/// swapped.
				4817	///
				4818	/// \headerfile <x86intrin.h>
				4819	///
				4820	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4821	///
				4822	/// \param __lo
				4823	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
				4824	/// 128 bits of the result.
				4825	/// \param __hi
				4826	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
				4827	/// 128 bits of the result.
				4828	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4829	/// concatenated result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4830	static __inline __m256 __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4831	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
				4832	{
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4833	return _mm256_set_m128(__hi, __lo);
				4834	}
				4835
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4836	/// Constructs a 256-bit floating-point vector of [4 x double] by
				4837	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
				4838	/// similar to _mm256_set_m128d, but the order of the input parameters is
				4839	/// swapped.
				4840	///
				4841	/// \headerfile <x86intrin.h>
				4842	///
				4843	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4844	///
				4845	/// \param __lo
				4846	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
				4847	/// 128 bits of the result.
				4848	/// \param __hi
				4849	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
				4850	/// 128 bits of the result.
				4851	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4852	/// concatenated result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4853	static __inline __m256d __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4854	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
				4855	{
				4856	return (__m256d)_mm256_set_m128d(__hi, __lo);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4857	}
				4858
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4859	/// Constructs a 256-bit integer vector by concatenating two 128-bit
				4860	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
				4861	/// the input parameters is swapped.
				4862	///
				4863	/// \headerfile <x86intrin.h>
				4864	///
				4865	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
				4866	///
				4867	/// \param __lo
				4868	/// A 128-bit integer vector to be copied to the lower 128 bits of the
				4869	/// result.
				4870	/// \param __hi
				4871	/// A 128-bit integer vector to be copied to the upper 128 bits of the
				4872	/// result.
				4873	/// \returns A 256-bit integer vector containing the concatenated result.
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4874	static __inline __m256i __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	4875	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
				4876	{
				4877	return (__m256i)_mm256_set_m128i(__hi, __lo);
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	4878	}
				4879
Pirama Arumuga Nainar	ec8c89d	2022-02-23 09:26:16 -0800	[diff] [blame]	4880	/* SIMD load ops (unaligned) */
				4881	/// Loads two 128-bit floating-point vectors of [4 x float] from
				4882	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4883	/// of [8 x float] by concatenating the two 128-bit vectors.
				4884	///
				4885	/// \headerfile <x86intrin.h>
				4886	///
				4887	/// This intrinsic corresponds to load instructions followed by the
				4888	/// <c> VINSERTF128 </c> instruction.
				4889	///
				4890	/// \param __addr_hi
				4891	/// A pointer to a 128-bit memory location containing 4 consecutive
				4892	/// single-precision floating-point values. These values are to be copied to
				4893	/// bits[255:128] of the result. The address of the memory location does not
				4894	/// have to be aligned.
				4895	/// \param __addr_lo
				4896	/// A pointer to a 128-bit memory location containing 4 consecutive
				4897	/// single-precision floating-point values. These values are to be copied to
				4898	/// bits[127:0] of the result. The address of the memory location does not
				4899	/// have to be aligned.
				4900	/// \returns A 256-bit floating-point vector of [8 x float] containing the
				4901	/// concatenated result.
				4902	static __inline __m256 __DEFAULT_FN_ATTRS
				4903	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
				4904	{
				4905	return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
				4906	}
				4907
				4908	/// Loads two 128-bit floating-point vectors of [2 x double] from
				4909	/// unaligned memory locations and constructs a 256-bit floating-point vector
				4910	/// of [4 x double] by concatenating the two 128-bit vectors.
				4911	///
				4912	/// \headerfile <x86intrin.h>
				4913	///
				4914	/// This intrinsic corresponds to load instructions followed by the
				4915	/// <c> VINSERTF128 </c> instruction.
				4916	///
				4917	/// \param __addr_hi
				4918	/// A pointer to a 128-bit memory location containing two consecutive
				4919	/// double-precision floating-point values. These values are to be copied to
				4920	/// bits[255:128] of the result. The address of the memory location does not
				4921	/// have to be aligned.
				4922	/// \param __addr_lo
				4923	/// A pointer to a 128-bit memory location containing two consecutive
				4924	/// double-precision floating-point values. These values are to be copied to
				4925	/// bits[127:0] of the result. The address of the memory location does not
				4926	/// have to be aligned.
				4927	/// \returns A 256-bit floating-point vector of [4 x double] containing the
				4928	/// concatenated result.
				4929	static __inline __m256d __DEFAULT_FN_ATTRS
				4930	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
				4931	{
				4932	return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
				4933	}
				4934
				4935	/// Loads two 128-bit integer vectors from unaligned memory locations and
				4936	/// constructs a 256-bit integer vector by concatenating the two 128-bit
				4937	/// vectors.
				4938	///
				4939	/// \headerfile <x86intrin.h>
				4940	///
				4941	/// This intrinsic corresponds to load instructions followed by the
				4942	/// <c> VINSERTF128 </c> instruction.
				4943	///
				4944	/// \param __addr_hi
				4945	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4946	/// vector. This vector is to be copied to bits[255:128] of the result. The
				4947	/// address of the memory location does not have to be aligned.
				4948	/// \param __addr_lo
				4949	/// A pointer to a 128-bit memory location containing a 128-bit integer
				4950	/// vector. This vector is to be copied to bits[127:0] of the result. The
				4951	/// address of the memory location does not have to be aligned.
				4952	/// \returns A 256-bit integer vector containing the concatenated result.
				4953	static __inline __m256i __DEFAULT_FN_ATTRS
				4954	_mm256_loadu2_m128i(__m128i_u const __addr_hi, __m128i_u const __addr_lo)
				4955	{
				4956	return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
				4957	}
				4958
				4959	/* SIMD store ops (unaligned) */
				4960	/// Stores the upper and lower 128 bits of a 256-bit floating-point
				4961	/// vector of [8 x float] into two different unaligned memory locations.
				4962	///
				4963	/// \headerfile <x86intrin.h>
				4964	///
				4965	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4966	/// store instructions.
				4967	///
				4968	/// \param __addr_hi
				4969	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
				4970	/// copied to this memory location. The address of this memory location does
				4971	/// not have to be aligned.
				4972	/// \param __addr_lo
				4973	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
				4974	/// copied to this memory location. The address of this memory location does
				4975	/// not have to be aligned.
				4976	/// \param __a
				4977	/// A 256-bit floating-point vector of [8 x float].
				4978	static __inline void __DEFAULT_FN_ATTRS
				4979	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
				4980	{
				4981	__m128 __v128;
				4982
				4983	__v128 = _mm256_castps256_ps128(__a);
				4984	_mm_storeu_ps(__addr_lo, __v128);
				4985	__v128 = _mm256_extractf128_ps(__a, 1);
				4986	_mm_storeu_ps(__addr_hi, __v128);
				4987	}
				4988
				4989	/// Stores the upper and lower 128 bits of a 256-bit floating-point
				4990	/// vector of [4 x double] into two different unaligned memory locations.
				4991	///
				4992	/// \headerfile <x86intrin.h>
				4993	///
				4994	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				4995	/// store instructions.
				4996	///
				4997	/// \param __addr_hi
				4998	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
				4999	/// copied to this memory location. The address of this memory location does
				5000	/// not have to be aligned.
				5001	/// \param __addr_lo
				5002	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
				5003	/// copied to this memory location. The address of this memory location does
				5004	/// not have to be aligned.
				5005	/// \param __a
				5006	/// A 256-bit floating-point vector of [4 x double].
				5007	static __inline void __DEFAULT_FN_ATTRS
				5008	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
				5009	{
				5010	__m128d __v128;
				5011
				5012	__v128 = _mm256_castpd256_pd128(__a);
				5013	_mm_storeu_pd(__addr_lo, __v128);
				5014	__v128 = _mm256_extractf128_pd(__a, 1);
				5015	_mm_storeu_pd(__addr_hi, __v128);
				5016	}
				5017
				5018	/// Stores the upper and lower 128 bits of a 256-bit integer vector into
				5019	/// two different unaligned memory locations.
				5020	///
				5021	/// \headerfile <x86intrin.h>
				5022	///
				5023	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
				5024	/// store instructions.
				5025	///
				5026	/// \param __addr_hi
				5027	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
				5028	/// copied to this memory location. The address of this memory location does
				5029	/// not have to be aligned.
				5030	/// \param __addr_lo
				5031	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
				5032	/// copied to this memory location. The address of this memory location does
				5033	/// not have to be aligned.
				5034	/// \param __a
				5035	/// A 256-bit integer vector.
				5036	static __inline void __DEFAULT_FN_ATTRS
				5037	_mm256_storeu2_m128i(__m128i_u __addr_hi, __m128i_u __addr_lo, __m256i __a)
				5038	{
				5039	__m128i __v128;
				5040
				5041	__v128 = _mm256_castsi256_si128(__a);
				5042	_mm_storeu_si128(__addr_lo, __v128);
				5043	__v128 = _mm256_extractf128_si256(__a, 1);
				5044	_mm_storeu_si128(__addr_hi, __v128);
				5045	}
				5046
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	5047	#undef __DEFAULT_FN_ATTRS
Logan Chien	55afb0a	2018-10-15 10:42:14 +0800	[diff] [blame]	5048	#undef __DEFAULT_FN_ATTRS128
Logan Chien	2833ffb	2018-10-09 10:03:24 +0800	[diff] [blame]	5049
				5050	#endif /* __AVXINTRIN_H */