Blame - third_party/llvm-build/Release+Asserts/lib/clang/3.9.0/include/tmmintrin.h - fp2-dev/platform/external/v8

blob: a72796ba4a68b8b1c05ce97a1ca51186eee4c0e4 [file] [log] [blame]

Ben Murdoch	097c5b2	2016-05-18 11:27:45 +0100	[diff] [blame]	1	/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
				2	*
				3	* Permission is hereby granted, free of charge, to any person obtaining a copy
				4	* of this software and associated documentation files (the "Software"), to deal
				5	* in the Software without restriction, including without limitation the rights
				6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				7	* copies of the Software, and to permit persons to whom the Software is
				8	* furnished to do so, subject to the following conditions:
				9	*
				10	* The above copyright notice and this permission notice shall be included in
				11	* all copies or substantial portions of the Software.
				12	*
				13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				19	* THE SOFTWARE.
				20	*
				21	*===-----------------------------------------------------------------------===
				22	*/
				23
				24	#ifndef __TMMINTRIN_H
				25	#define __TMMINTRIN_H
				26
				27	#include <pmmintrin.h>
				28
				29	/* Default attributes for every intrinsic in this file: always inlined, no debug info, built for the "ssse3" target. */
				30	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
				31
				32	/// \brief Computes the absolute value of each of the packed 8-bit signed
				33	/// integers in the source operand and stores the 8-bit unsigned integer
				34	/// results in the destination.
				35	///
				36	/// \headerfile <x86intrin.h>
				37	///
				38	/// This intrinsic corresponds to the \c PABSB instruction.
				39	///
				40	/// \param __a
				41	/// A 64-bit vector of [8 x i8].
				42	/// \returns A 64-bit integer vector containing the absolute values of the
				43	/// elements in the operand.
				44	static __inline__ __m64 __DEFAULT_FN_ATTRS
				45	_mm_abs_pi8(__m64 __a)
				46	{
				47	return (__m64)__builtin_ia32_pabsb((__v8qi)__a); /* view __a as [8 x i8] for the builtin; result is cast back to __m64 */
				48	}
				49
				50	/// \brief Computes the absolute value of each of the packed 8-bit signed
				51	/// integers in the source operand and stores the 8-bit unsigned integer
				52	/// results in the destination.
				53	///
				54	/// \headerfile <x86intrin.h>
				55	///
				56	/// This intrinsic corresponds to the \c VPABSB instruction.
				57	///
				58	/// \param __a
				59	/// A 128-bit vector of [16 x i8].
				60	/// \returns A 128-bit integer vector containing the absolute values of the
				61	/// elements in the operand.
				62	static __inline__ __m128i __DEFAULT_FN_ATTRS
				63	_mm_abs_epi8(__m128i __a)
				64	{
				65	return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a); /* view __a as [16 x i8] for the builtin; result is cast back to __m128i */
				66	}
				67
				68	/// \brief Computes the absolute value of each of the packed 16-bit signed
				69	/// integers in the source operand and stores the 16-bit unsigned integer
				70	/// results in the destination.
				71	///
				72	/// \headerfile <x86intrin.h>
				73	///
				74	/// This intrinsic corresponds to the \c PABSW instruction.
				75	///
				76	/// \param __a
				77	/// A 64-bit vector of [4 x i16].
				78	/// \returns A 64-bit integer vector containing the absolute values of the
				79	/// elements in the operand.
				80	static __inline__ __m64 __DEFAULT_FN_ATTRS
				81	_mm_abs_pi16(__m64 __a)
				82	{
				83	return (__m64)__builtin_ia32_pabsw((__v4hi)__a); /* view __a as [4 x i16] for the builtin; result is cast back to __m64 */
				84	}
				85
				86	/// \brief Computes the absolute value of each of the packed 16-bit signed
				87	/// integers in the source operand and stores the 16-bit unsigned integer
				88	/// results in the destination.
				89	///
				90	/// \headerfile <x86intrin.h>
				91	///
				92	/// This intrinsic corresponds to the \c VPABSW instruction.
				93	///
				94	/// \param __a
				95	/// A 128-bit vector of [8 x i16].
				96	/// \returns A 128-bit integer vector containing the absolute values of the
				97	/// elements in the operand.
				98	static __inline__ __m128i __DEFAULT_FN_ATTRS
				99	_mm_abs_epi16(__m128i __a)
				100	{
				101	return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a); /* view __a as [8 x i16] for the builtin; result is cast back to __m128i */
				102	}
				103
				104	/// \brief Computes the absolute value of each of the packed 32-bit signed
				105	/// integers in the source operand and stores the 32-bit unsigned integer
				106	/// results in the destination.
				107	///
				108	/// \headerfile <x86intrin.h>
				109	///
				110	/// This intrinsic corresponds to the \c PABSD instruction.
				111	///
				112	/// \param __a
				113	/// A 64-bit vector of [2 x i32].
				114	/// \returns A 64-bit integer vector containing the absolute values of the
				115	/// elements in the operand.
				116	static __inline__ __m64 __DEFAULT_FN_ATTRS
				117	_mm_abs_pi32(__m64 __a)
				118	{
				119	return (__m64)__builtin_ia32_pabsd((__v2si)__a); /* view __a as [2 x i32] for the builtin; result is cast back to __m64 */
				120	}
				121
				122	/// \brief Computes the absolute value of each of the packed 32-bit signed
				123	/// integers in the source operand and stores the 32-bit unsigned integer
				124	/// results in the destination.
				125	///
				126	/// \headerfile <x86intrin.h>
				127	///
				128	/// This intrinsic corresponds to the \c VPABSD instruction.
				129	///
				130	/// \param __a
				131	/// A 128-bit vector of [4 x i32].
				132	/// \returns A 128-bit integer vector containing the absolute values of the
				133	/// elements in the operand.
				134	static __inline__ __m128i __DEFAULT_FN_ATTRS
				135	_mm_abs_epi32(__m128i __a)
				136	{
				137	return (__m128i)__builtin_ia32_pabsd128((__v4si)__a); /* view __a as [4 x i32] for the builtin; result is cast back to __m128i */
				138	}
				139
				140	/// \brief Concatenates the two 128-bit integer vector operands, and
				141	/// right-shifts the result by the number of bytes specified in the immediate
				142	/// operand.
				143	///
				144	/// \headerfile <x86intrin.h>
				145	///
				146	/// \code
				147	/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
				148	/// \endcode
				149	///
				150	/// This intrinsic corresponds to the \c PALIGNR instruction.
				151	///
				152	/// \param a
				153	/// A 128-bit vector of [16 x i8] containing one of the source operands.
				154	/// \param b
				155	/// A 128-bit vector of [16 x i8] containing one of the source operands.
				156	/// \param n
				157	/// An immediate operand specifying how many bytes to right-shift the result.
				158	/// \returns A 128-bit integer vector containing the concatenated right-shifted
				159	/// value.
				160	#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
				161	(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
				162	(__v16qi)(__m128i)(b), (n)); }) /* (a) and (b) are coerced to __m128i first; (n) is forwarded unchanged as the immediate byte count */
				163
				164	/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
				165	/// the result by the number of bytes specified in the immediate operand.
				166	///
				167	/// \headerfile <x86intrin.h>
				168	///
				169	/// \code
				170	/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
				171	/// \endcode
				172	///
				173	/// This intrinsic corresponds to the \c PALIGNR instruction.
				174	///
				175	/// \param a
				176	/// A 64-bit vector of [8 x i8] containing one of the source operands.
				177	/// \param b
				178	/// A 64-bit vector of [8 x i8] containing one of the source operands.
				179	/// \param n
				180	/// An immediate operand specifying how many bytes to right-shift the result.
				181	/// \returns A 64-bit integer vector containing the concatenated right-shifted
				182	/// value.
				183	#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
				184	(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); }) /* (a) and (b) are coerced to __m64 first; (n) is forwarded unchanged as the immediate byte count */
				185
				186	/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
				187	/// 128-bit vectors of [8 x i16].
				188	///
				189	/// \headerfile <x86intrin.h>
				190	///
				191	/// This intrinsic corresponds to the \c VPHADDW instruction.
				192	///
				193	/// \param __a
				194	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				195	/// horizontal sums of the values are stored in the lower bits of the
				196	/// destination.
				197	/// \param __b
				198	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				199	/// horizontal sums of the values are stored in the upper bits of the
				200	/// destination.
				201	/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
				202	/// both operands.
				203	static __inline__ __m128i __DEFAULT_FN_ATTRS
				204	_mm_hadd_epi16(__m128i __a, __m128i __b)
				205	{
				206	return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); /* sums from __a go to the lower bits of the result, sums from __b to the upper bits */
				207	}
				208
				209	/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
				210	/// 128-bit vectors of [4 x i32].
				211	///
				212	/// \headerfile <x86intrin.h>
				213	///
				214	/// This intrinsic corresponds to the \c VPHADDD instruction.
				215	///
				216	/// \param __a
				217	/// A 128-bit vector of [4 x i32] containing one of the source operands. The
				218	/// horizontal sums of the values are stored in the lower bits of the
				219	/// destination.
				220	/// \param __b
				221	/// A 128-bit vector of [4 x i32] containing one of the source operands. The
				222	/// horizontal sums of the values are stored in the upper bits of the
				223	/// destination.
				224	/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
				225	/// both operands.
				226	static __inline__ __m128i __DEFAULT_FN_ATTRS
				227	_mm_hadd_epi32(__m128i __a, __m128i __b)
				228	{
				229	return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); /* sums from __a go to the lower bits of the result, sums from __b to the upper bits */
				230	}
				231
				232	/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
				233	/// 64-bit vectors of [4 x i16].
				234	///
				235	/// \headerfile <x86intrin.h>
				236	///
				237	/// This intrinsic corresponds to the \c PHADDW instruction.
				238	///
				239	/// \param __a
				240	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				241	/// horizontal sums of the values are stored in the lower bits of the
				242	/// destination.
				243	/// \param __b
				244	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				245	/// horizontal sums of the values are stored in the upper bits of the
				246	/// destination.
				247	/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
				248	/// operands.
				249	static __inline__ __m64 __DEFAULT_FN_ATTRS
				250	_mm_hadd_pi16(__m64 __a, __m64 __b)
				251	{
				252	return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); /* sums from __a go to the lower bits of the result, sums from __b to the upper bits */
				253	}
				254
				255	/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
				256	/// 64-bit vectors of [2 x i32].
				257	///
				258	/// \headerfile <x86intrin.h>
				259	///
				260	/// This intrinsic corresponds to the \c PHADDD instruction.
				261	///
				262	/// \param __a
				263	/// A 64-bit vector of [2 x i32] containing one of the source operands. The
				264	/// horizontal sums of the values are stored in the lower bits of the
				265	/// destination.
				266	/// \param __b
				267	/// A 64-bit vector of [2 x i32] containing one of the source operands. The
				268	/// horizontal sums of the values are stored in the upper bits of the
				269	/// destination.
				270	/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
				271	/// operands.
				272	static __inline__ __m64 __DEFAULT_FN_ATTRS
				273	_mm_hadd_pi32(__m64 __a, __m64 __b)
				274	{
				275	return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); /* sums from __a go to the lower bits of the result, sums from __b to the upper bits */
				276	}
				277
				278	/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
				279	/// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
				280	/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
				281	///
				282	/// \headerfile <x86intrin.h>
				283	///
				284	/// This intrinsic corresponds to the \c VPHADDSW instruction.
				285	///
				286	/// \param __a
				287	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				288	/// horizontal sums of the values are stored in the lower bits of the
				289	/// destination.
				290	/// \param __b
				291	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				292	/// horizontal sums of the values are stored in the upper bits of the
				293	/// destination.
				294	/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
				295	/// sums of both operands.
				296	static __inline__ __m128i __DEFAULT_FN_ATTRS
				297	_mm_hadds_epi16(__m128i __a, __m128i __b)
				298	{
				299	return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); /* saturating variant: __a's sums fill the lower bits, __b's the upper bits */
				300	}
				301
				302	/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
				303	/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
				304	/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
				305	///
				306	/// \headerfile <x86intrin.h>
				307	///
				308	/// This intrinsic corresponds to the \c PHADDSW instruction.
				309	///
				310	/// \param __a
				311	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				312	/// horizontal sums of the values are stored in the lower bits of the
				313	/// destination.
				314	/// \param __b
				315	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				316	/// horizontal sums of the values are stored in the upper bits of the
				317	/// destination.
				318	/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
				319	/// sums of both operands.
				320	static __inline__ __m64 __DEFAULT_FN_ATTRS
				321	_mm_hadds_pi16(__m64 __a, __m64 __b)
				322	{
				323	return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); /* saturating variant: __a's sums fill the lower bits, __b's the upper bits */
				324	}
				325
				326	/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
				327	/// packed 128-bit vectors of [8 x i16].
				328	///
				329	/// \headerfile <x86intrin.h>
				330	///
				331	/// This intrinsic corresponds to the \c VPHSUBW instruction.
				332	///
				333	/// \param __a
				334	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				335	/// horizontal differences between the values are stored in the lower bits of
				336	/// the destination.
				337	/// \param __b
				338	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				339	/// horizontal differences between the values are stored in the upper bits of
				340	/// the destination.
				341	/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
				342	/// of both operands.
				343	static __inline__ __m128i __DEFAULT_FN_ATTRS
				344	_mm_hsub_epi16(__m128i __a, __m128i __b)
				345	{
				346	return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); /* differences from __a go to the lower bits of the result, those from __b to the upper bits */
				347	}
				348
				349	/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
				350	/// packed 128-bit vectors of [4 x i32].
				351	///
				352	/// \headerfile <x86intrin.h>
				353	///
				354	/// This intrinsic corresponds to the \c VPHSUBD instruction.
				355	///
				356	/// \param __a
				357	/// A 128-bit vector of [4 x i32] containing one of the source operands. The
				358	/// horizontal differences between the values are stored in the lower bits of
				359	/// the destination.
				360	/// \param __b
				361	/// A 128-bit vector of [4 x i32] containing one of the source operands. The
				362	/// horizontal differences between the values are stored in the upper bits of
				363	/// the destination.
				364	/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
				365	/// of both operands.
				366	static __inline__ __m128i __DEFAULT_FN_ATTRS
				367	_mm_hsub_epi32(__m128i __a, __m128i __b)
				368	{
				369	return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); /* differences from __a go to the lower bits of the result, those from __b to the upper bits */
				370	}
				371
				372	/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
				373	/// packed 64-bit vectors of [4 x i16].
				374	///
				375	/// \headerfile <x86intrin.h>
				376	///
				377	/// This intrinsic corresponds to the \c PHSUBW instruction.
				378	///
				379	/// \param __a
				380	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				381	/// horizontal differences between the values are stored in the lower bits of
				382	/// the destination.
				383	/// \param __b
				384	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				385	/// horizontal differences between the values are stored in the upper bits of
				386	/// the destination.
				387	/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
				388	/// of both operands.
				389	static __inline__ __m64 __DEFAULT_FN_ATTRS
				390	_mm_hsub_pi16(__m64 __a, __m64 __b)
				391	{
				392	return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); /* differences from __a go to the lower bits of the result, those from __b to the upper bits */
				393	}
				394
				395	/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
				396	/// packed 64-bit vectors of [2 x i32].
				397	///
				398	/// \headerfile <x86intrin.h>
				399	///
				400	/// This intrinsic corresponds to the \c PHSUBD instruction.
				401	///
				402	/// \param __a
				403	/// A 64-bit vector of [2 x i32] containing one of the source operands. The
				404	/// horizontal differences between the values are stored in the lower bits of
				405	/// the destination.
				406	/// \param __b
				407	/// A 64-bit vector of [2 x i32] containing one of the source operands. The
				408	/// horizontal differences between the values are stored in the upper bits of
				409	/// the destination.
				410	/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
				411	/// of both operands.
				412	static __inline__ __m64 __DEFAULT_FN_ATTRS
				413	_mm_hsub_pi32(__m64 __a, __m64 __b)
				414	{
				415	return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); /* differences from __a go to the lower bits of the result, those from __b to the upper bits */
				416	}
				417
				418	/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
				419	/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
				420	/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
				421	/// saturated to 8000h.
				422	///
				423	/// \headerfile <x86intrin.h>
				424	///
				425	/// This intrinsic corresponds to the \c VPHSUBSW instruction.
				426	///
				427	/// \param __a
				428	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				429	/// horizontal differences between the values are stored in the lower bits of
				430	/// the destination.
				431	/// \param __b
				432	/// A 128-bit vector of [8 x i16] containing one of the source operands. The
				433	/// horizontal differences between the values are stored in the upper bits of
				434	/// the destination.
				435	/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
				436	/// differences of both operands.
				437	static __inline__ __m128i __DEFAULT_FN_ATTRS
				438	_mm_hsubs_epi16(__m128i __a, __m128i __b)
				439	{
				440	return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); /* saturating variant: __a's differences fill the lower bits, __b's the upper bits */
				441	}
				442
				443	/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
				444	/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
				445	/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
				446	/// saturated to 8000h.
				447	///
				448	/// \headerfile <x86intrin.h>
				449	///
				450	/// This intrinsic corresponds to the \c PHSUBSW instruction.
				451	///
				452	/// \param __a
				453	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				454	/// horizontal differences between the values are stored in the lower bits of
				455	/// the destination.
				456	/// \param __b
				457	/// A 64-bit vector of [4 x i16] containing one of the source operands. The
				458	/// horizontal differences between the values are stored in the upper bits of
				459	/// the destination.
				460	/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
				461	/// differences of both operands.
				462	static __inline__ __m64 __DEFAULT_FN_ATTRS
				463	_mm_hsubs_pi16(__m64 __a, __m64 __b)
				464	{
				465	return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); /* saturating variant: __a's differences fill the lower bits, __b's the upper bits */
				466	}
				467
				468	/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
				469	/// values contained in the first source operand and packed 8-bit signed
				470	/// integer values contained in the second source operand, adds pairs of
				471	/// contiguous products with signed saturation, and writes the 16-bit sums to
				472	/// the corresponding bits in the destination. For example, bits [7:0] of
				473	/// both operands are multiplied, bits [15:8] of both operands are
				474	/// multiplied, and the sum of both results is written to bits [15:0] of the
				475	/// destination.
				476	///
				477	/// \headerfile <x86intrin.h>
				478	///
				479	/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
				480	///
				481	/// \param __a
				482	/// A 128-bit integer vector containing the first source operand.
				483	/// \param __b
				484	/// A 128-bit integer vector containing the second source operand.
				485	/// \returns A 128-bit integer vector containing the sums of products of both
				486	/// operands:
				487	/// R0 := (__a0 * __b0) + (__a1 * __b1)
				488	/// R1 := (__a2 * __b2) + (__a3 * __b3)
				489	/// R2 := (__a4 * __b4) + (__a5 * __b5)
				490	/// R3 := (__a6 * __b6) + (__a7 * __b7)
				491	/// R4 := (__a8 * __b8) + (__a9 * __b9)
				492	/// R5 := (__a10 * __b10) + (__a11 * __b11)
				493	/// R6 := (__a12 * __b12) + (__a13 * __b13)
				494	/// R7 := (__a14 * __b14) + (__a15 * __b15)
				495	static __inline__ __m128i __DEFAULT_FN_ATTRS
				496	_mm_maddubs_epi16(__m128i __a, __m128i __b)
				497	{
				498	return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); /* operand order matters: __a holds the unsigned bytes, __b the signed bytes */
				499	}
				500
				501	/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
				502	/// values contained in the first source operand and packed 8-bit signed
				503	/// integer values contained in the second source operand, adds pairs of
				504	/// contiguous products with signed saturation, and writes the 16-bit sums to
				505	/// the corresponding bits in the destination. For example, bits [7:0] of
				506	/// both operands are multiplied, bits [15:8] of both operands are
				507	/// multiplied, and the sum of both results is written to bits [15:0] of the
				508	/// destination.
				509	///
				510	/// \headerfile <x86intrin.h>
				511	///
				512	/// This intrinsic corresponds to the \c PMADDUBSW instruction.
				513	///
				514	/// \param __a
				515	/// A 64-bit integer vector containing the first source operand.
				516	/// \param __b
				517	/// A 64-bit integer vector containing the second source operand.
				518	/// \returns A 64-bit integer vector containing the sums of products of both
				519	/// operands:
				520	/// R0 := (__a0 * __b0) + (__a1 * __b1)
				521	/// R1 := (__a2 * __b2) + (__a3 * __b3)
				522	/// R2 := (__a4 * __b4) + (__a5 * __b5)
				523	/// R3 := (__a6 * __b6) + (__a7 * __b7)
				524	static __inline__ __m64 __DEFAULT_FN_ATTRS
				525	_mm_maddubs_pi16(__m64 __a, __m64 __b)
				526	{
				527	return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); /* operand order matters: __a holds the unsigned bytes, __b the signed bytes */
				528	}
				529
				530	/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
				531	/// products to the 18 most significant bits by right-shifting, rounds the
				532	/// truncated value by adding 1, and writes bits [16:1] to the destination.
				533	///
				534	/// \headerfile <x86intrin.h>
				535	///
				536	/// This intrinsic corresponds to the \c VPMULHRSW instruction.
				537	///
				538	/// \param __a
				539	/// A 128-bit vector of [8 x i16] containing one of the source operands.
				540	/// \param __b
				541	/// A 128-bit vector of [8 x i16] containing one of the source operands.
				542	/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
				543	/// products of both operands.
				544	static __inline__ __m128i __DEFAULT_FN_ATTRS
				545	_mm_mulhrs_epi16(__m128i __a, __m128i __b)
				546	{
				547	return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); /* both operands are passed as [8 x i16]; result is cast back to __m128i */
				548	}
				549
				550	/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
				551	/// products to the 18 most significant bits by right-shifting, rounds the
				552	/// truncated value by adding 1, and writes bits [16:1] to the destination.
				553	///
				554	/// \headerfile <x86intrin.h>
				555	///
				556	/// This intrinsic corresponds to the \c PMULHRSW instruction.
				557	///
				558	/// \param __a
				559	/// A 64-bit vector of [4 x i16] containing one of the source operands.
				560	/// \param __b
				561	/// A 64-bit vector of [4 x i16] containing one of the source operands.
				562	/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
				563	/// products of both operands.
				564	static __inline__ __m64 __DEFAULT_FN_ATTRS
				565	_mm_mulhrs_pi16(__m64 __a, __m64 __b)
				566	{
				567	return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); /* both operands are passed as [4 x i16]; result is cast back to __m64 */
				568	}
				569
				570	/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
				571	/// destination or clears 8-bit values in the destination, as specified by
				572	/// the second source operand.
				573	///
				574	/// \headerfile <x86intrin.h>
				575	///
				576	/// This intrinsic corresponds to the \c VPSHUFB instruction.
				577	///
				578	/// \param __a
				579	/// A 128-bit integer vector containing the values to be copied.
				580	/// \param __b
				581	/// A 128-bit integer vector containing control bytes corresponding to
				582	/// positions in the destination:
				583	/// Bit 7:
				584	/// 1: Clear the corresponding byte in the destination.
				585	/// 0: Copy the selected source byte to the corresponding byte in the
				586	/// destination.
				587	/// Bits [6:4] Reserved.
				588	/// Bits [3:0] select the source byte to be copied.
				589	/// \returns A 128-bit integer vector containing the copied or cleared values.
				590	static __inline__ __m128i __DEFAULT_FN_ATTRS
				591	_mm_shuffle_epi8(__m128i __a, __m128i __b)
				592	{
				593	return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); /* __a supplies the data bytes, __b the per-byte control values */
				594	}
				595
				596	/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
				597	/// destination or clears 8-bit values in the destination, as specified by
				598	/// the second source operand.
				599	///
				600	/// \headerfile <x86intrin.h>
				601	///
				602	/// This intrinsic corresponds to the \c PSHUFB instruction.
				603	///
				604	/// \param __a
				605	/// A 64-bit integer vector containing the values to be copied.
				606	/// \param __b
				607	/// A 64-bit integer vector containing control bytes corresponding to
				608	/// positions in the destination:
				609	/// Bit 7:
				610	/// 1: Clear the corresponding byte in the destination.
				611	/// 0: Copy the selected source byte to the corresponding byte in the
				612	/// destination.
				613	/// Bits [2:0] select the source byte to be copied.
				614	/// \returns A 64-bit integer vector containing the copied or cleared values.
				615	static __inline__ __m64 __DEFAULT_FN_ATTRS
				616	_mm_shuffle_pi8(__m64 __a, __m64 __b)
				617	{
				618	return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); /* __a supplies the data bytes, __b the per-byte control values */
				619	}
				620
				621	/// \brief For each 8-bit integer in the first source operand, perform one of
				622	/// the following actions as specified by the second source operand: If the
				623	/// byte in the second source is negative, calculate the two's complement of
				624	/// the corresponding byte in the first source, and write that value to the
				625	/// destination. If the byte in the second source is positive, copy the
				626	/// corresponding byte from the first source to the destination. If the byte
				627	/// in the second source is zero, clear the corresponding byte in the
				628	/// destination.
				629	///
				630	/// \headerfile <x86intrin.h>
				631	///
				632	/// This intrinsic corresponds to the \c VPSIGNB instruction.
				633	///
				634	/// \param __a
				635	/// A 128-bit integer vector containing the values to be copied.
				636	/// \param __b
				637	/// A 128-bit integer vector containing control bytes corresponding to
				638	/// positions in the destination.
				639	/// \returns A 128-bit integer vector containing the resultant values.
				640	static __inline__ __m128i __DEFAULT_FN_ATTRS
				641	_mm_sign_epi8(__m128i __a, __m128i __b)
				642	{
				643	return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); /* the sign of each __b byte selects negate, copy, or zero for the matching __a byte */
				644	}
				645
				646	/// \brief For each 16-bit integer in the first source operand, perform one of
				647	/// the following actions as specified by the second source operand: If the
				648	/// word in the second source is negative, calculate the two's complement of
				649	/// the corresponding word in the first source, and write that value to the
				650	/// destination. If the word in the second source is positive, copy the
				651	/// corresponding word from the first source to the destination. If the word
				652	/// in the second source is zero, clear the corresponding word in the
				653	/// destination.
				654	///
				655	/// \headerfile <x86intrin.h>
				656	///
				657	/// This intrinsic corresponds to the \c VPSIGNW instruction.
				658	///
				659	/// \param __a
				660	/// A 128-bit integer vector containing the values to be copied.
				661	/// \param __b
				662	/// A 128-bit integer vector containing control words corresponding to
				663	/// positions in the destination.
				664	/// \returns A 128-bit integer vector containing the resultant values.
				665	static __inline__ __m128i __DEFAULT_FN_ATTRS
				666	_mm_sign_epi16(__m128i __a, __m128i __b)
				667	{
				668	return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); /* the sign of each __b word selects negate, copy, or zero for the matching __a word */
				669	}
				670
				671	/// \brief For each 32-bit integer in the first source operand, perform one of
				672	/// the following actions as specified by the second source operand: If the
				673	/// doubleword in the second source is negative, calculate the two's
				674	/// complement of the corresponding doubleword in the first source, and write
				675	/// that value to the destination. If the doubleword in the second source is
				676	/// positive, copy the corresponding doubleword from the first source to the
				677	/// destination. If the doubleword in the second source is zero, clear the
				678	/// corresponding doubleword in the destination.
				679	///
				680	/// \headerfile <x86intrin.h>
				681	///
				682	/// This intrinsic corresponds to the \c VPSIGND instruction.
				683	///
				684	/// \param __a
				685	/// A 128-bit integer vector containing the values to be copied.
				686	/// \param __b
				687	/// A 128-bit integer vector containing control doublewords corresponding to
				688	/// positions in the destination.
				689	/// \returns A 128-bit integer vector containing the resultant values.
				690	static __inline__ __m128i __DEFAULT_FN_ATTRS
				691	_mm_sign_epi32(__m128i __a, __m128i __b)
				692	{
				693	return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); /* the sign of each __b doubleword selects negate, copy, or zero for the matching __a doubleword */
				694	}
				695
				696	/// \brief For each 8-bit integer in the first source operand, perform one of
				697	/// the following actions as specified by the second source operand: If the
				698	/// byte in the second source is negative, calculate the two's complement of
				699	/// the corresponding byte in the first source, and write that value to the
				700	/// destination. If the byte in the second source is positive, copy the
				701	/// corresponding byte from the first source to the destination. If the byte
				702	/// in the second source is zero, clear the corresponding byte in the
				703	/// destination.
				704	///
				705	/// \headerfile <x86intrin.h>
				706	///
				707	/// This intrinsic corresponds to the \c PSIGNB instruction.
				708	///
				709	/// \param __a
				710	/// A 64-bit integer vector containing the values to be copied.
				711	/// \param __b
				712	/// A 64-bit integer vector containing control bytes corresponding to
				713	/// positions in the destination.
				714	/// \returns A 64-bit integer vector containing the resultant values.
				715	static __inline__ __m64 __DEFAULT_FN_ATTRS
				716	_mm_sign_pi8(__m64 __a, __m64 __b)
				717	{
				718	return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); /* the sign of each __b byte selects negate, copy, or zero for the matching __a byte */
				719	}
				720
				721	/// \brief For each 16-bit integer in the first source operand, perform one of
				722	/// the following actions as specified by the second source operand: If the
				723	/// word in the second source is negative, calculate the two's complement of
				724	/// the corresponding word in the first source, and write that value to the
				725	/// destination. If the word in the second source is positive, copy the
				726	/// corresponding word from the first source to the destination. If the word
				727	/// in the second source is zero, clear the corresponding word in the
				728	/// destination.
				729	///
				730	/// \headerfile <x86intrin.h>
				731	///
				732	/// This intrinsic corresponds to the \c PSIGNW instruction.
				733	///
				734	/// \param __a
				735	/// A 64-bit integer vector containing the values to be copied.
				736	/// \param __b
				737	/// A 64-bit integer vector containing control words corresponding to
				738	/// positions in the destination.
				739	/// \returns A 64-bit integer vector containing the resultant values.
				740	static __inline__ __m64 __DEFAULT_FN_ATTRS
				741	_mm_sign_pi16(__m64 __a, __m64 __b)
				742	{
				743	return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); /* the sign of each __b word selects negate, copy, or zero for the matching __a word */
				744	}
				745
				746	/// \brief For each 32-bit integer in the first source operand, perform one of
				747	/// the following actions as specified by the second source operand: If the
				748	/// doubleword in the second source is negative, calculate the two's
				749	/// complement of the corresponding doubleword in the first source, and
				750	/// write that value to the destination. If the doubleword in the second
				751	/// source is positive, copy the corresponding doubleword from the first
				752	/// source to the destination. If the doubleword in the second source is
				753	/// zero, clear the corresponding doubleword in the destination.
				754	///
				755	/// \headerfile <x86intrin.h>
				756	///
				757	/// This intrinsic corresponds to the \c PSIGND instruction.
				758	///
				759	/// \param __a
				760	/// A 64-bit integer vector containing the values to be copied.
				761	/// \param __b
				762	/// A 64-bit integer vector containing two control doublewords corresponding
				763	/// to positions in the destination.
				764	/// \returns A 64-bit integer vector containing the resultant values.
				765	static __inline__ __m64 __DEFAULT_FN_ATTRS
				766	_mm_sign_pi32(__m64 __a, __m64 __b)
				767	{
				768	return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); /* the sign of each __b doubleword selects negate, copy, or zero for the matching __a doubleword */
				769	}
				770
				771	#undef __DEFAULT_FN_ATTRS /* helper macro is private to this header; do not leak it to includers */
				772
				773	#endif /* __TMMINTRIN_H */