Blame - lib/gcc/arm-linux-androideabi/4.8/include/mmintrin.h - fp2-dev/platform/prebuilts/gcc/darwin-x86/arm/arm-linux-androideabi-4.8

blob: 7e0360f823fd72413f1b38ed0e4c31db46405692 [file] [log] [blame]

Ben Cheng	264394c	2013-08-22 22:11:19 -0700	[diff] [blame]	1	/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
				2
				3	This file is part of GCC.
				4
				5	GCC is free software; you can redistribute it and/or modify it
				6	under the terms of the GNU General Public License as published
				7	by the Free Software Foundation; either version 3, or (at your
				8	option) any later version.
				9
				10	GCC is distributed in the hope that it will be useful, but WITHOUT
				11	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
				12	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
				13	License for more details.
				14
				15	Under Section 7 of GPL version 3, you are granted additional
				16	permissions described in the GCC Runtime Library Exception, version
				17	3.1, as published by the Free Software Foundation.
				18
				19	You should have received a copy of the GNU General Public License and
				20	a copy of the GCC Runtime Library Exception along with this program;
				21	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
				22	<http://www.gnu.org/licenses/>. */
				23
				24	#ifndef _MMINTRIN_H_INCLUDED
				25	#define _MMINTRIN_H_INCLUDED
				26
				27	#ifndef __IWMMXT__
				28	#error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2)
				29	#endif
				30
				31
				32	#if defined __cplusplus
				33	extern "C" {
				34	/* Intrinsics use C name-mangling. */
				35	#endif /* __cplusplus */
				36
				37	/* The data type intended for user use. */
				38	typedef unsigned long long __m64, __int64;
				39
				40	/* Internal data types for implementing the intrinsics. */
				41	typedef int __v2si __attribute__ ((vector_size (8)));
				42	typedef short __v4hi __attribute__ ((vector_size (8)));
				43	typedef signed char __v8qi __attribute__ ((vector_size (8)));
				44
				45	/* Provided for source compatibility with MMX. */
				46	extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
				47	_mm_empty (void)
				48	{
				49	}
				50
				51	/* "Convert" __m64 and __int64 into each other. */
				52	static __inline __m64
				53	_mm_cvtsi64_m64 (__int64 __i)
				54	{
				55	return __i;
				56	}
				57
				58	static __inline __int64
				59	_mm_cvtm64_si64 (__m64 __i)
				60	{
				61	return __i;
				62	}
				63
				64	static __inline int
				65	_mm_cvtsi64_si32 (__int64 __i)
				66	{
				67	return __i;
				68	}
				69
				70	static __inline __int64
				71	_mm_cvtsi32_si64 (int __i)
				72	{
				73	return (__i & 0xffffffff);
				74	}
				75
				76	/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
				77	the result, and the four 16-bit values from M2 into the upper four 8-bit
				78	values of the result, all with signed saturation. */
				79	static __inline __m64
				80	_mm_packs_pi16 (__m64 __m1, __m64 __m2)
				81	{
				82	return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2);
				83	}
				84
				85	/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
				86	the result, and the two 32-bit values from M2 into the upper two 16-bit
				87	values of the result, all with signed saturation. */
				88	static __inline __m64
				89	_mm_packs_pi32 (__m64 __m1, __m64 __m2)
				90	{
				91	return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2);
				92	}
				93
				94	/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
				95	the 64-bit value from M2 into the upper 32-bits of the result, all with
				96	signed saturation for values that do not fit exactly into 32-bits. */
				97	static __inline __m64
				98	_mm_packs_pi64 (__m64 __m1, __m64 __m2)
				99	{
				100	return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2);
				101	}
				102
				103	/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
				104	the result, and the four 16-bit values from M2 into the upper four 8-bit
				105	values of the result, all with unsigned saturation. */
				106	static __inline __m64
				107	_mm_packs_pu16 (__m64 __m1, __m64 __m2)
				108	{
				109	return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2);
				110	}
				111
				112	/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
				113	the result, and the two 32-bit values from M2 into the upper two 16-bit
				114	values of the result, all with unsigned saturation. */
				115	static __inline __m64
				116	_mm_packs_pu32 (__m64 __m1, __m64 __m2)
				117	{
				118	return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2);
				119	}
				120
				121	/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
				122	the 64-bit value from M2 into the upper 32-bits of the result, all with
				123	unsigned saturation for values that do not fit exactly into 32-bits. */
				124	static __inline __m64
				125	_mm_packs_pu64 (__m64 __m1, __m64 __m2)
				126	{
				127	return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2);
				128	}
				129
				130	/* Interleave the four 8-bit values from the high half of M1 with the four
				131	8-bit values from the high half of M2. */
				132	static __inline __m64
				133	_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
				134	{
				135	return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2);
				136	}
				137
				138	/* Interleave the two 16-bit values from the high half of M1 with the two
				139	16-bit values from the high half of M2. */
				140	static __inline __m64
				141	_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
				142	{
				143	return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2);
				144	}
				145
				146	/* Interleave the 32-bit value from the high half of M1 with the 32-bit
				147	value from the high half of M2. */
				148	static __inline __m64
				149	_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
				150	{
				151	return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2);
				152	}
				153
				154	/* Interleave the four 8-bit values from the low half of M1 with the four
				155	8-bit values from the low half of M2. */
				156	static __inline __m64
				157	_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
				158	{
				159	return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2);
				160	}
				161
				162	/* Interleave the two 16-bit values from the low half of M1 with the two
				163	16-bit values from the low half of M2. */
				164	static __inline __m64
				165	_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
				166	{
				167	return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2);
				168	}
				169
				170	/* Interleave the 32-bit value from the low half of M1 with the 32-bit
				171	value from the low half of M2. */
				172	static __inline __m64
				173	_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
				174	{
				175	return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2);
				176	}
				177
				178	/* Take the four 8-bit values from the low half of M1, sign extend them,
				179	and return the result as a vector of four 16-bit quantities. */
				180	static __inline __m64
				181	_mm_unpackel_pi8 (__m64 __m1)
				182	{
				183	return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1);
				184	}
				185
				186	/* Take the two 16-bit values from the low half of M1, sign extend them,
				187	and return the result as a vector of two 32-bit quantities. */
				188	static __inline __m64
				189	_mm_unpackel_pi16 (__m64 __m1)
				190	{
				191	return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1);
				192	}
				193
				194	/* Take the 32-bit value from the low half of M1, and return it sign extended
				195	to 64 bits. */
				196	static __inline __m64
				197	_mm_unpackel_pi32 (__m64 __m1)
				198	{
				199	return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1);
				200	}
				201
				202	/* Take the four 8-bit values from the high half of M1, sign extend them,
				203	and return the result as a vector of four 16-bit quantities. */
				204	static __inline __m64
				205	_mm_unpackeh_pi8 (__m64 __m1)
				206	{
				207	return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1);
				208	}
				209
				210	/* Take the two 16-bit values from the high half of M1, sign extend them,
				211	and return the result as a vector of two 32-bit quantities. */
				212	static __inline __m64
				213	_mm_unpackeh_pi16 (__m64 __m1)
				214	{
				215	return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1);
				216	}
				217
				218	/* Take the 32-bit value from the high half of M1, and return it sign extended
				219	to 64 bits. */
				220	static __inline __m64
				221	_mm_unpackeh_pi32 (__m64 __m1)
				222	{
				223	return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1);
				224	}
				225
				226	/* Take the four 8-bit values from the low half of M1, zero extend them,
				227	and return the result as a vector of four 16-bit quantities. */
				228	static __inline __m64
				229	_mm_unpackel_pu8 (__m64 __m1)
				230	{
				231	return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1);
				232	}
				233
				234	/* Take the two 16-bit values from the low half of M1, zero extend them,
				235	and return the result as a vector of two 32-bit quantities. */
				236	static __inline __m64
				237	_mm_unpackel_pu16 (__m64 __m1)
				238	{
				239	return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1);
				240	}
				241
				242	/* Take the 32-bit value from the low half of M1, and return it zero extended
				243	to 64 bits. */
				244	static __inline __m64
				245	_mm_unpackel_pu32 (__m64 __m1)
				246	{
				247	return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1);
				248	}
				249
				250	/* Take the four 8-bit values from the high half of M1, zero extend them,
				251	and return the result as a vector of four 16-bit quantities. */
				252	static __inline __m64
				253	_mm_unpackeh_pu8 (__m64 __m1)
				254	{
				255	return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1);
				256	}
				257
				258	/* Take the two 16-bit values from the high half of M1, zero extend them,
				259	and return the result as a vector of two 32-bit quantities. */
				260	static __inline __m64
				261	_mm_unpackeh_pu16 (__m64 __m1)
				262	{
				263	return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1);
				264	}
				265
				266	/* Take the 32-bit value from the high half of M1, and return it zero extended
				267	to 64 bits. */
				268	static __inline __m64
				269	_mm_unpackeh_pu32 (__m64 __m1)
				270	{
				271	return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1);
				272	}
				273
				274	/* Add the 8-bit values in M1 to the 8-bit values in M2. */
				275	static __inline __m64
				276	_mm_add_pi8 (__m64 __m1, __m64 __m2)
				277	{
				278	return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2);
				279	}
				280
				281	/* Add the 16-bit values in M1 to the 16-bit values in M2. */
				282	static __inline __m64
				283	_mm_add_pi16 (__m64 __m1, __m64 __m2)
				284	{
				285	return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2);
				286	}
				287
				288	/* Add the 32-bit values in M1 to the 32-bit values in M2. */
				289	static __inline __m64
				290	_mm_add_pi32 (__m64 __m1, __m64 __m2)
				291	{
				292	return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2);
				293	}
				294
				295	/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
				296	saturated arithmetic. */
				297	static __inline __m64
				298	_mm_adds_pi8 (__m64 __m1, __m64 __m2)
				299	{
				300	return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2);
				301	}
				302
				303	/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
				304	saturated arithmetic. */
				305	static __inline __m64
				306	_mm_adds_pi16 (__m64 __m1, __m64 __m2)
				307	{
				308	return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2);
				309	}
				310
				311	/* Add the 32-bit values in M1 to the 32-bit values in M2 using signed
				312	saturated arithmetic. */
				313	static __inline __m64
				314	_mm_adds_pi32 (__m64 __m1, __m64 __m2)
				315	{
				316	return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2);
				317	}
				318
				319	/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
				320	saturated arithmetic. */
				321	static __inline __m64
				322	_mm_adds_pu8 (__m64 __m1, __m64 __m2)
				323	{
				324	return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2);
				325	}
				326
				327	/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
				328	saturated arithmetic. */
				329	static __inline __m64
				330	_mm_adds_pu16 (__m64 __m1, __m64 __m2)
				331	{
				332	return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2);
				333	}
				334
				335	/* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned
				336	saturated arithmetic. */
				337	static __inline __m64
				338	_mm_adds_pu32 (__m64 __m1, __m64 __m2)
				339	{
				340	return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2);
				341	}
				342
				343	/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
				344	static __inline __m64
				345	_mm_sub_pi8 (__m64 __m1, __m64 __m2)
				346	{
				347	return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2);
				348	}
				349
				350	/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
				351	static __inline __m64
				352	_mm_sub_pi16 (__m64 __m1, __m64 __m2)
				353	{
				354	return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2);
				355	}
				356
				357	/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
				358	static __inline __m64
				359	_mm_sub_pi32 (__m64 __m1, __m64 __m2)
				360	{
				361	return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2);
				362	}
				363
				364	/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
				365	saturating arithmetic. */
				366	static __inline __m64
				367	_mm_subs_pi8 (__m64 __m1, __m64 __m2)
				368	{
				369	return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2);
				370	}
				371
				372	/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
				373	signed saturating arithmetic. */
				374	static __inline __m64
				375	_mm_subs_pi16 (__m64 __m1, __m64 __m2)
				376	{
				377	return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2);
				378	}
				379
				380	/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
				381	signed saturating arithmetic. */
				382	static __inline __m64
				383	_mm_subs_pi32 (__m64 __m1, __m64 __m2)
				384	{
				385	return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2);
				386	}
				387
				388	/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
				389	unsigned saturating arithmetic. */
				390	static __inline __m64
				391	_mm_subs_pu8 (__m64 __m1, __m64 __m2)
				392	{
				393	return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2);
				394	}
				395
				396	/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
				397	unsigned saturating arithmetic. */
				398	static __inline __m64
				399	_mm_subs_pu16 (__m64 __m1, __m64 __m2)
				400	{
				401	return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2);
				402	}
				403
				404	/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
				405	unsigned saturating arithmetic. */
				406	static __inline __m64
				407	_mm_subs_pu32 (__m64 __m1, __m64 __m2)
				408	{
				409	return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2);
				410	}
				411
				412	/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
				413	M2 producing four 32-bit intermediate results, which are then summed by
				414	pairs to produce two 32-bit results. */
				415	static __inline __m64
				416	_mm_madd_pi16 (__m64 __m1, __m64 __m2)
				417	{
				418	return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2);
				419	}
				420
				421	/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
				422	in M2 producing four 32-bit intermediate results, which are then summed by
				423	pairs to produce two 32-bit results. */
				424	static __inline __m64
				425	_mm_madd_pu16 (__m64 __m1, __m64 __m2)
				426	{
				427	return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2);
				428	}
				429
				430	/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
				431	M2 and produce the high 16 bits of the 32-bit results. */
				432	static __inline __m64
				433	_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
				434	{
				435	return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2);
				436	}
				437
				438	/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit values
				439	in M2 and produce the high 16 bits of the 32-bit results. */
				440	static __inline __m64
				441	_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
				442	{
				443	return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2);
				444	}
				445
				446	/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
				447	the low 16 bits of the results. */
				448	static __inline __m64
				449	_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
				450	{
				451	return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2);
				452	}
				453
				454	/* Shift four 16-bit values in M left by COUNT. */
				455	static __inline __m64
				456	_mm_sll_pi16 (__m64 __m, __m64 __count)
				457	{
				458	return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count);
				459	}
				460
				461	static __inline __m64
				462	_mm_slli_pi16 (__m64 __m, int __count)
				463	{
				464	return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count);
				465	}
				466
				467	/* Shift two 32-bit values in M left by COUNT. */
				468	static __inline __m64
				469	_mm_sll_pi32 (__m64 __m, __m64 __count)
				470	{
				471	return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count);
				472	}
				473
				474	static __inline __m64
				475	_mm_slli_pi32 (__m64 __m, int __count)
				476	{
				477	return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count);
				478	}
				479
				480	/* Shift the 64-bit value in M left by COUNT. */
				481	static __inline __m64
				482	_mm_sll_si64 (__m64 __m, __m64 __count)
				483	{
				484	return (__m64) __builtin_arm_wslld (__m, __count);
				485	}
				486
				487	static __inline __m64
				488	_mm_slli_si64 (__m64 __m, int __count)
				489	{
				490	return (__m64) __builtin_arm_wslldi (__m, __count);
				491	}
				492
				493	/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
				494	static __inline __m64
				495	_mm_sra_pi16 (__m64 __m, __m64 __count)
				496	{
				497	return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count);
				498	}
				499
				500	static __inline __m64
				501	_mm_srai_pi16 (__m64 __m, int __count)
				502	{
				503	return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count);
				504	}
				505
				506	/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
				507	static __inline __m64
				508	_mm_sra_pi32 (__m64 __m, __m64 __count)
				509	{
				510	return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count);
				511	}
				512
				513	static __inline __m64
				514	_mm_srai_pi32 (__m64 __m, int __count)
				515	{
				516	return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count);
				517	}
				518
				519	/* Shift the 64-bit value in M right by COUNT; shift in the sign bit. */
				520	static __inline __m64
				521	_mm_sra_si64 (__m64 __m, __m64 __count)
				522	{
				523	return (__m64) __builtin_arm_wsrad (__m, __count);
				524	}
				525
				526	static __inline __m64
				527	_mm_srai_si64 (__m64 __m, int __count)
				528	{
				529	return (__m64) __builtin_arm_wsradi (__m, __count);
				530	}
				531
				532	/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
				533	static __inline __m64
				534	_mm_srl_pi16 (__m64 __m, __m64 __count)
				535	{
				536	return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count);
				537	}
				538
				539	static __inline __m64
				540	_mm_srli_pi16 (__m64 __m, int __count)
				541	{
				542	return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count);
				543	}
				544
				545	/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
				546	static __inline __m64
				547	_mm_srl_pi32 (__m64 __m, __m64 __count)
				548	{
				549	return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count);
				550	}
				551
				552	static __inline __m64
				553	_mm_srli_pi32 (__m64 __m, int __count)
				554	{
				555	return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count);
				556	}
				557
				558	/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
				559	static __inline __m64
				560	_mm_srl_si64 (__m64 __m, __m64 __count)
				561	{
				562	return (__m64) __builtin_arm_wsrld (__m, __count);
				563	}
				564
				565	static __inline __m64
				566	_mm_srli_si64 (__m64 __m, int __count)
				567	{
				568	return (__m64) __builtin_arm_wsrldi (__m, __count);
				569	}
				570
				571	/* Rotate four 16-bit values in M right by COUNT. */
				572	static __inline __m64
				573	_mm_ror_pi16 (__m64 __m, __m64 __count)
				574	{
				575	return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count);
				576	}
				577
				578	static __inline __m64
				579	_mm_rori_pi16 (__m64 __m, int __count)
				580	{
				581	return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count);
				582	}
				583
				584	/* Rotate two 32-bit values in M right by COUNT. */
				585	static __inline __m64
				586	_mm_ror_pi32 (__m64 __m, __m64 __count)
				587	{
				588	return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count);
				589	}
				590
				591	static __inline __m64
				592	_mm_rori_pi32 (__m64 __m, int __count)
				593	{
				594	return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count);
				595	}
				596
				597	/* Rotate the 64-bit value in M right by COUNT. */
				598	static __inline __m64
				599	_mm_ror_si64 (__m64 __m, __m64 __count)
				600	{
				601	return (__m64) __builtin_arm_wrord (__m, __count);
				602	}
				603
				604	static __inline __m64
				605	_mm_rori_si64 (__m64 __m, int __count)
				606	{
				607	return (__m64) __builtin_arm_wrordi (__m, __count);
				608	}
				609
				610	/* Bit-wise AND the 64-bit values in M1 and M2. */
				611	static __inline __m64
				612	_mm_and_si64 (__m64 __m1, __m64 __m2)
				613	{
				614	return __builtin_arm_wand (__m1, __m2);
				615	}
				616
				617	/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
				618	64-bit value in M2. */
				619	static __inline __m64
				620	_mm_andnot_si64 (__m64 __m1, __m64 __m2)
				621	{
				622	return __builtin_arm_wandn (__m2, __m1);
				623	}
				624
				625	/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
				626	static __inline __m64
				627	_mm_or_si64 (__m64 __m1, __m64 __m2)
				628	{
				629	return __builtin_arm_wor (__m1, __m2);
				630	}
				631
				632	/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
				633	static __inline __m64
				634	_mm_xor_si64 (__m64 __m1, __m64 __m2)
				635	{
				636	return __builtin_arm_wxor (__m1, __m2);
				637	}
				638
				639	/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
				640	test is true and zero if false. */
				641	static __inline __m64
				642	_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
				643	{
				644	return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
				645	}
				646
				647	static __inline __m64
				648	_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
				649	{
				650	return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2);
				651	}
				652
				653	static __inline __m64
				654	_mm_cmpgt_pu8 (__m64 __m1, __m64 __m2)
				655	{
				656	return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2);
				657	}
				658
				659	/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
				660	the test is true and zero if false. */
				661	static __inline __m64
				662	_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
				663	{
				664	return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2);
				665	}
				666
				667	static __inline __m64
				668	_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
				669	{
				670	return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2);
				671	}
				672
				673	static __inline __m64
				674	_mm_cmpgt_pu16 (__m64 __m1, __m64 __m2)
				675	{
				676	return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2);
				677	}
				678
				679	/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
				680	the test is true and zero if false. */
				681	static __inline __m64
				682	_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
				683	{
				684	return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2);
				685	}
				686
				687	static __inline __m64
				688	_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
				689	{
				690	return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2);
				691	}
				692
				693	static __inline __m64
				694	_mm_cmpgt_pu32 (__m64 __m1, __m64 __m2)
				695	{
				696	return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2);
				697	}
				698
				699	/* Element-wise multiplication of unsigned 16-bit values __B and __C, followed
				700	by accumulate across all elements and __A. */
				701	static __inline __m64
				702	_mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C)
				703	{
				704	return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C);
				705	}
				706
				707	/* Element-wise multiplication of signed 16-bit values __B and __C, followed
				708	by accumulate across all elements and __A. */
				709	static __inline __m64
				710	_mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C)
				711	{
				712	return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C);
				713	}
				714
				715	/* Element-wise multiplication of unsigned 16-bit values __A and __B, followed
				716	by accumulate across all elements. */
				717	static __inline __m64
				718	_mm_macz_pu16 (__m64 __A, __m64 __B)
				719	{
				720	return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B);
				721	}
				722
				723	/* Element-wise multiplication of signed 16-bit values __A and __B, followed
				724	by accumulate across all elements. */
				725	static __inline __m64
				726	_mm_macz_pi16 (__m64 __A, __m64 __B)
				727	{
				728	return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B);
				729	}
				730
				731	/* Accumulate across all unsigned 8-bit values in __A. */
				732	static __inline __m64
				733	_mm_acc_pu8 (__m64 __A)
				734	{
				735	return __builtin_arm_waccb ((__v8qi)__A);
				736	}
				737
				738	/* Accumulate across all unsigned 16-bit values in __A. */
				739	static __inline __m64
				740	_mm_acc_pu16 (__m64 __A)
				741	{
				742	return __builtin_arm_wacch ((__v4hi)__A);
				743	}
				744
				745	/* Accumulate across all unsigned 32-bit values in __A. */
				746	static __inline __m64
				747	_mm_acc_pu32 (__m64 __A)
				748	{
				749	return __builtin_arm_waccw ((__v2si)__A);
				750	}
				751
				752	static __inline __m64
				753	_mm_mia_si64 (__m64 __A, int __B, int __C)
				754	{
				755	return __builtin_arm_tmia (__A, __B, __C);
				756	}
				757
				758	static __inline __m64
				759	_mm_miaph_si64 (__m64 __A, int __B, int __C)
				760	{
				761	return __builtin_arm_tmiaph (__A, __B, __C);
				762	}
				763
				764	static __inline __m64
				765	_mm_miabb_si64 (__m64 __A, int __B, int __C)
				766	{
				767	return __builtin_arm_tmiabb (__A, __B, __C);
				768	}
				769
				770	static __inline __m64
				771	_mm_miabt_si64 (__m64 __A, int __B, int __C)
				772	{
				773	return __builtin_arm_tmiabt (__A, __B, __C);
				774	}
				775
				776	static __inline __m64
				777	_mm_miatb_si64 (__m64 __A, int __B, int __C)
				778	{
				779	return __builtin_arm_tmiatb (__A, __B, __C);
				780	}
				781
				782	static __inline __m64
				783	_mm_miatt_si64 (__m64 __A, int __B, int __C)
				784	{
				785	return __builtin_arm_tmiatt (__A, __B, __C);
				786	}
				787
				788	/* Extract one of the elements of A and sign extend. The selector N must
				789	be immediate. */
				790	#define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N))
				791	#define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N))
				792	#define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N))
				793
				794	/* Extract one of the elements of A and zero extend. The selector N must
				795	be immediate. */
				796	#define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N))
				797	#define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N))
				798	#define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N))
				799
				800	/* Inserts word D into one of the elements of A. The selector N must be
				801	immediate. */
				802	#define _mm_insert_pi8(A, D, N) \
				803	((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N)))
				804	#define _mm_insert_pi16(A, D, N) \
				805	((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N)))
				806	#define _mm_insert_pi32(A, D, N) \
				807	((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N)))
				808
				809	/* Compute the element-wise maximum of signed 8-bit values. */
				810	static __inline __m64
				811	_mm_max_pi8 (__m64 __A, __m64 __B)
				812	{
				813	return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B);
				814	}
				815
				816	/* Compute the element-wise maximum of signed 16-bit values. */
				817	static __inline __m64
				818	_mm_max_pi16 (__m64 __A, __m64 __B)
				819	{
				820	return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B);
				821	}
				822
				823	/* Compute the element-wise maximum of signed 32-bit values. */
				824	static __inline __m64
				825	_mm_max_pi32 (__m64 __A, __m64 __B)
				826	{
				827	return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B);
				828	}
				829
				830	/* Compute the element-wise maximum of unsigned 8-bit values. */
				831	static __inline __m64
				832	_mm_max_pu8 (__m64 __A, __m64 __B)
				833	{
				834	return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B);
				835	}
				836
				837	/* Compute the element-wise maximum of unsigned 16-bit values. */
				838	static __inline __m64
				839	_mm_max_pu16 (__m64 __A, __m64 __B)
				840	{
				841	return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B);
				842	}
				843
				844	/* Compute the element-wise maximum of unsigned 32-bit values. */
				845	static __inline __m64
				846	_mm_max_pu32 (__m64 __A, __m64 __B)
				847	{
				848	return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B);
				849	}
				850
				851	/* Compute the element-wise minimum of signed 8-bit values. */
				852	static __inline __m64
				853	_mm_min_pi8 (__m64 __A, __m64 __B)
				854	{
				855	return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B);
				856	}
				857
				858	/* Compute the element-wise minimum of signed 16-bit values. */
				859	static __inline __m64
				860	_mm_min_pi16 (__m64 __A, __m64 __B)
				861	{
				862	return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B);
				863	}
				864
				865	/* Compute the element-wise minimum of signed 32-bit values. */
				866	static __inline __m64
				867	_mm_min_pi32 (__m64 __A, __m64 __B)
				868	{
				869	return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B);
				870	}
				871
				872	/* Compute the element-wise minimum of unsigned 8-bit values. */
				873	static __inline __m64
				874	_mm_min_pu8 (__m64 __A, __m64 __B)
				875	{
				876	return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B);
				877	}
				878
				879	/* Compute the element-wise minimum of unsigned 16-bit values. */
				880	static __inline __m64
				881	_mm_min_pu16 (__m64 __A, __m64 __B)
				882	{
				883	return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B);
				884	}
				885
				886	/* Compute the element-wise minimum of unsigned 32-bit values. */
				887	static __inline __m64
				888	_mm_min_pu32 (__m64 __A, __m64 __B)
				889	{
				890	return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B);
				891	}
				892
				893	/* Create an 8-bit mask of the signs of 8-bit values. */
				894	static __inline int
				895	_mm_movemask_pi8 (__m64 __A)
				896	{
				897	return __builtin_arm_tmovmskb ((__v8qi)__A);
				898	}
				899
				900	/* Create a 4-bit mask of the signs of the four 16-bit values in A. */
				901	static __inline int
				902	_mm_movemask_pi16 (__m64 __A)
				903	{
				904	return __builtin_arm_tmovmskh ((__v4hi)__A);
				905	}
				906
				907	/* Create a 2-bit mask of the signs of the two 32-bit values in A. */
				908	static __inline int
				909	_mm_movemask_pi32 (__m64 __A)
				910	{
				911	return __builtin_arm_tmovmskw ((__v2si)__A);
				912	}
				913
				914	/* Return a combination of the four 16-bit values in A. The selector
				915	must be an immediate. */
				916	#define _mm_shuffle_pi16(A, N) \
				917	((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N)))
				918
				919
				920	/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
				921	static __inline __m64
				922	_mm_avg_pu8 (__m64 __A, __m64 __B)
				923	{
				924	return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B);
				925	}
				926
				927	/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
				928	static __inline __m64
				929	_mm_avg_pu16 (__m64 __A, __m64 __B)
				930	{
				931	return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B);
				932	}
				933
				934	/* Compute the averages of the unsigned 8-bit values in A and B. */
				935	static __inline __m64
				936	_mm_avg2_pu8 (__m64 __A, __m64 __B)
				937	{
				938	return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B);
				939	}
				940
				941	/* Compute the averages of the unsigned 16-bit values in A and B. */
				942	static __inline __m64
				943	_mm_avg2_pu16 (__m64 __A, __m64 __B)
				944	{
				945	return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B);
				946	}
				947
				948	/* Compute the sum of the absolute differences of the unsigned 8-bit
				949	values in A and B. Return the value in the lower 16-bit word; the
				950	upper words are cleared. */
				951	static __inline __m64
				952	_mm_sad_pu8 (__m64 __A, __m64 __B)
				953	{
				954	return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
				955	}
				956
				957	static __inline __m64
				958	_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
				959	{
				960	return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
				961	}
				962
				963	/* Compute the sum of the absolute differences of the unsigned 16-bit
				964	values in A and B. Return the value in the lower 32-bit word; the
				965	upper words are cleared. */
				966	static __inline __m64
				967	_mm_sad_pu16 (__m64 __A, __m64 __B)
				968	{
				969	return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
				970	}
				971
				972	static __inline __m64
				973	_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
				974	{
				975	return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
				976	}
				977
				978
				979	/* Compute the sum of the absolute differences of the unsigned 8-bit
				980	values in A and B. Return the value in the lower 16-bit word; the
				981	upper words are cleared. */
				982	static __inline __m64
				983	_mm_sadz_pu8 (__m64 __A, __m64 __B)
				984	{
				985	return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
				986	}
				987
				988	/* Compute the sum of the absolute differences of the unsigned 16-bit
				989	values in A and B. Return the value in the lower 32-bit word; the
				990	upper words are cleared. */
				991	static __inline __m64
				992	_mm_sadz_pu16 (__m64 __A, __m64 __B)
				993	{
				994	return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
				995	}
				996
				997	#define _mm_align_si64(__A,__B, N) \
				998	(__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
				999
				1000	/* Creates a 64-bit zero. */
				1001	static __inline __m64
				1002	_mm_setzero_si64 (void)
				1003	{
				1004	return __builtin_arm_wzero ();
				1005	}
				1006
				1007	/* Set and Get arbitrary iWMMXt Control registers.
				1008	Note only registers 0-3 and 8-11 are currently defined,
				1009	the rest are reserved. */
				1010
					/* Write __VALUE to the iWMMXt control register selected by __REGNO.
					   Registers 0-3 (wcid, wcon, wcssf, wcasf) are written with a tmcr
					   instruction, registers 8-11 through the __builtin_arm_setwcgr0..3
					   builtins.  Any other register number is silently ignored. */
				1011	static __inline void
				1012	_mm_setwcx (const int __value, const int __regno)
				1013	{
				1014	switch (__regno)
				1015	{
				1016	case 0:
				1017	__asm __volatile ("tmcr wcid, %0" :: "r"(__value));
				1018	break;
				1019	case 1:
				1020	__asm __volatile ("tmcr wcon, %0" :: "r"(__value));
				1021	break;
				1022	case 2:
				1023	__asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
				1024	break;
				1025	case 3:
				1026	__asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
				1027	break;
				1028	case 8:
				1029	__builtin_arm_setwcgr0 (__value);
				1030	break;
				1031	case 9:
				1032	__builtin_arm_setwcgr1 (__value);
				1033	break;
				1034	case 10:
				1035	__builtin_arm_setwcgr2 (__value);
				1036	break;
				1037	case 11:
				1038	__builtin_arm_setwcgr3 (__value);
				1039	break;
				1040	default: /* Reserved register numbers: nothing is written. */
				1041	break;
				1042	}
				1043	}
				1044
				1045	static __inline int
				1046	_mm_getwcx (const int __regno)
				1047	{
				1048	int __value;
				1049	switch (__regno)
				1050	{
				1051	case 0:
				1052	__asm __volatile ("tmrc %0, wcid" : "=r"(__value));
				1053	break;
				1054	case 1:
				1055	__asm __volatile ("tmrc %0, wcon" : "=r"(__value));
				1056	break;
				1057	case 2:
				1058	__asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
				1059	break;
				1060	case 3:
				1061	__asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
				1062	break;
				1063	case 8:
				1064	return __builtin_arm_getwcgr0 ();
				1065	case 9:
				1066	return __builtin_arm_getwcgr1 ();
				1067	case 10:
				1068	return __builtin_arm_getwcgr2 ();
				1069	case 11:
				1070	return __builtin_arm_getwcgr3 ();
				1071	default:
				1072	break;
				1073	}
				1074	return __value;
				1075	}
				1076
				1077	/* Creates a vector of two 32-bit values; I0 is least significant. */
				1078	static __inline __m64
				1079	_mm_set_pi32 (int __i1, int __i0)
				1080	{
				1081	union /* Overlays the 64-bit result with its two 32-bit halves. */
				1082	{
				1083	__m64 __q;
				1084	struct
				1085	{
				1086	unsigned int __i0; /* NOTE(review): the first member is the low half only on a little-endian layout -- confirm for big-endian targets. */
				1087	unsigned int __i1;
				1088	} __s;
				1089	} __u;
				1090
				1091	__u.__s.__i0 = __i0;
				1092	__u.__s.__i1 = __i1;
				1093
				1094	return __u.__q; /* Read both halves back as a single __m64. */
				1095	}
				1096
				1097	/* Creates a vector of four 16-bit values; W0 is least significant.
					   Each pair is packed into one 32-bit word (W3:W2 and W1:W0) and handed to
					   _mm_set_pi32.  The high element is widened to unsigned int before the
					   shift: an unsigned short promotes to int, and shifting a value with
					   bit 15 set left by 16 would overflow a signed int. */
				1098	static __inline __m64
				1099	_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
				1100	{
				1101	unsigned int __i1 = ((unsigned int) (unsigned short) __w3 << 16) | (unsigned short) __w2;
				1102	unsigned int __i0 = ((unsigned int) (unsigned short) __w1 << 16) | (unsigned short) __w0;
				1103
				1104	return _mm_set_pi32 (__i1, __i0);
				1105	}
				1106
				1107	/* Creates a vector of eight 8-bit values; B0 is least significant.
					   B7..B4 are packed into the upper 32-bit word and B3..B0 into the lower
					   one, most significant byte first, then combined by _mm_set_pi32. */
				1108	static __inline __m64
				1109	_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
				1110	char __b3, char __b2, char __b1, char __b0)
				1111	{
				1112	unsigned int __i1, __i0;
				1113
				1114	__i1 = (unsigned char)__b7;
				1115	__i1 = (__i1 << 8) | (unsigned char)__b6;
				1116	__i1 = (__i1 << 8) | (unsigned char)__b5;
				1117	__i1 = (__i1 << 8) | (unsigned char)__b4;
				1118
				1119	__i0 = (unsigned char)__b3;
				1120	__i0 = (__i0 << 8) | (unsigned char)__b2;
				1121	__i0 = (__i0 << 8) | (unsigned char)__b1;
				1122	__i0 = (__i0 << 8) | (unsigned char)__b0;
				1123
				1124	return _mm_set_pi32 (__i1, __i0);
				1125	}
				1126
				1127	/* Similar, but with the arguments in reverse order. */
				1128	static __inline __m64
				1129	_mm_setr_pi32 (int __i0, int __i1)
				1130	{
				1131	return _mm_set_pi32 (__i1, __i0);
				1132	}
				1133
				1134	static __inline __m64
				1135	_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
				1136	{
				1137	return _mm_set_pi16 (__w3, __w2, __w1, __w0);
				1138	}
				1139
				1140	static __inline __m64
				1141	_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
				1142	char __b4, char __b5, char __b6, char __b7)
				1143	{
				1144	return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
				1145	}
				1146
				1147	/* Creates a vector of two 32-bit values, both elements containing I. */
				1148	static __inline __m64
				1149	_mm_set1_pi32 (int __i)
				1150	{
				1151	return _mm_set_pi32 (__i, __i);
				1152	}
				1153
				1154	/* Creates a vector of four 16-bit values, all elements containing W.
					   W is widened to unsigned int before the shift so that a value with
					   bit 15 set cannot overflow the int it would otherwise promote to. */
				1155	static __inline __m64
				1156	_mm_set1_pi16 (short __w)
				1157	{
				1158	unsigned int __i = ((unsigned int) (unsigned short)__w << 16) | (unsigned short)__w;
				1159	return _mm_set1_pi32 (__i);
				1160	}
				1161
				1162	/* Creates a vector of eight 8-bit values, all elements containing B. */
				1163	static __inline __m64
				1164	_mm_set1_pi8 (char __b)
				1165	{
				1166	unsigned int __w = ((unsigned char)__b << 8) | (unsigned char)__b; /* B duplicated into 16 bits. */
				1167	unsigned int __i = (__w << 16) | __w; /* ...and again into 32 bits. */
				1168	return _mm_set1_pi32 (__i);
				1169	}
				1170
				1171	#ifdef __IWMMXT2__
				1172	static __inline __m64
				1173	_mm_abs_pi8 (__m64 m1)
				1174	{
				1175	return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
				1176	}
				1177
				1178	static __inline __m64
				1179	_mm_abs_pi16 (__m64 m1)
				1180	{
				1181	return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
				1182
				1183	}
				1184
				1185	static __inline __m64
				1186	_mm_abs_pi32 (__m64 m1)
				1187	{
				1188	return (__m64) __builtin_arm_wabsw ((__v2si)m1);
				1189
				1190	}
				1191
				1192	static __inline __m64
				1193	_mm_addsubhx_pi16 (__m64 a, __m64 b)
				1194	{
				1195	return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
				1196	}
				1197
				1198	static __inline __m64
				1199	_mm_absdiff_pu8 (__m64 a, __m64 b)
				1200	{
				1201	return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
				1202	}
				1203
				1204	static __inline __m64
				1205	_mm_absdiff_pu16 (__m64 a, __m64 b)
				1206	{
				1207	return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
				1208	}
				1209
				1210	static __inline __m64
				1211	_mm_absdiff_pu32 (__m64 a, __m64 b)
				1212	{
				1213	return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
				1214	}
				1215
				1216	static __inline __m64
				1217	_mm_addc_pu16 (__m64 a, __m64 b)
				1218	{
				1219	__m64 result;
				1220	__asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b));
				1221	return result;
				1222	}
				1223
				1224	static __inline __m64
				1225	_mm_addc_pu32 (__m64 a, __m64 b)
				1226	{
				1227	__m64 result;
				1228	__asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b));
				1229	return result;
				1230	}
				1231
				1232	static __inline __m64
				1233	_mm_avg4_pu8 (__m64 a, __m64 b)
				1234	{
				1235	return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
				1236	}
				1237
				1238	static __inline __m64
				1239	_mm_avg4r_pu8 (__m64 a, __m64 b)
				1240	{
				1241	return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
				1242	}
				1243
				1244	static __inline __m64
				1245	_mm_maddx_pi16 (__m64 a, __m64 b)
				1246	{
				1247	return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
				1248	}
				1249
				1250	static __inline __m64
				1251	_mm_maddx_pu16 (__m64 a, __m64 b)
				1252	{
				1253	return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
				1254	}
				1255
				1256	static __inline __m64
				1257	_mm_msub_pi16 (__m64 a, __m64 b)
				1258	{
				1259	return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
				1260	}
				1261
				1262	static __inline __m64
				1263	_mm_msub_pu16 (__m64 a, __m64 b)
				1264	{
				1265	return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
				1266	}
				1267
				1268	static __inline __m64
				1269	_mm_mulhi_pi32 (__m64 a, __m64 b)
				1270	{
				1271	return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
				1272	}
				1273
				1274	static __inline __m64
				1275	_mm_mulhi_pu32 (__m64 a, __m64 b)
				1276	{
				1277	return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
				1278	}
				1279
				1280	static __inline __m64
				1281	_mm_mulhir_pi16 (__m64 a, __m64 b)
				1282	{
				1283	return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
				1284	}
				1285
				1286	static __inline __m64
				1287	_mm_mulhir_pi32 (__m64 a, __m64 b)
				1288	{
				1289	return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
				1290	}
				1291
				1292	static __inline __m64
				1293	_mm_mulhir_pu16 (__m64 a, __m64 b)
				1294	{
				1295	return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
				1296	}
				1297
				1298	static __inline __m64
				1299	_mm_mulhir_pu32 (__m64 a, __m64 b)
				1300	{
				1301	return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
				1302	}
				1303
				1304	static __inline __m64
				1305	_mm_mullo_pi32 (__m64 a, __m64 b)
				1306	{
				1307	return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
				1308	}
				1309
				1310	static __inline __m64
				1311	_mm_qmulm_pi16 (__m64 a, __m64 b)
				1312	{
				1313	return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
				1314	}
				1315
				1316	static __inline __m64
				1317	_mm_qmulm_pi32 (__m64 a, __m64 b)
				1318	{
				1319	return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
				1320	}
				1321
				1322	static __inline __m64
				1323	_mm_qmulmr_pi16 (__m64 a, __m64 b)
				1324	{
				1325	return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
				1326	}
				1327
				1328	static __inline __m64
				1329	_mm_qmulmr_pi32 (__m64 a, __m64 b)
				1330	{
				1331	return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
				1332	}
				1333
				1334	static __inline __m64
				1335	_mm_subaddhx_pi16 (__m64 a, __m64 b)
				1336	{
				1337	return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
				1338	}
				1339
				1340	static __inline __m64
				1341	_mm_addbhusl_pu8 (__m64 a, __m64 b)
				1342	{
				1343	return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
				1344	}
				1345
				1346	static __inline __m64
				1347	_mm_addbhusm_pu8 (__m64 a, __m64 b)
				1348	{
				1349	return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
				1350	}
				1351
				1352	#define _mm_qmiabb_pi32(acc, m1, m2) \
				1353	({\
				1354	__m64 _acc = acc;\
				1355	__m64 _m1 = m1;\
				1356	__m64 _m2 = m2;\
				1357	_acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1358	_acc;\
				1359	})
				1360
				1361	#define _mm_qmiabbn_pi32(acc, m1, m2) \
				1362	({\
				1363	__m64 _acc = acc;\
				1364	__m64 _m1 = m1;\
				1365	__m64 _m2 = m2;\
				1366	_acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1367	_acc;\
				1368	})
				1369
				1370	#define _mm_qmiabt_pi32(acc, m1, m2) \
				1371	({\
				1372	__m64 _acc = acc;\
				1373	__m64 _m1 = m1;\
				1374	__m64 _m2 = m2;\
				1375	_acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1376	_acc;\
				1377	})
				1378
				1379	#define _mm_qmiabtn_pi32(acc, m1, m2) \
				1380	({\
				1381	__m64 _acc=acc;\
				1382	__m64 _m1=m1;\
				1383	__m64 _m2=m2;\
				1384	_acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1385	_acc;\
				1386	})
				1387
				1388	#define _mm_qmiatb_pi32(acc, m1, m2) \
				1389	({\
				1390	__m64 _acc = acc;\
				1391	__m64 _m1 = m1;\
				1392	__m64 _m2 = m2;\
				1393	_acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1394	_acc;\
				1395	})
				1396
				1397	#define _mm_qmiatbn_pi32(acc, m1, m2) \
				1398	({\
				1399	__m64 _acc = acc;\
				1400	__m64 _m1 = m1;\
				1401	__m64 _m2 = m2;\
				1402	_acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1403	_acc;\
				1404	})
				1405
				1406	#define _mm_qmiatt_pi32(acc, m1, m2) \
				1407	({\
				1408	__m64 _acc = acc;\
				1409	__m64 _m1 = m1;\
				1410	__m64 _m2 = m2;\
				1411	_acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1412	_acc;\
				1413	})
				1414
				1415	#define _mm_qmiattn_pi32(acc, m1, m2) \
				1416	({\
				1417	__m64 _acc = acc;\
				1418	__m64 _m1 = m1;\
				1419	__m64 _m2 = m2;\
				1420	_acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1421	_acc;\
				1422	})
				1423
				1424	#define _mm_wmiabb_si64(acc, m1, m2) \
				1425	({\
				1426	__m64 _acc = acc;\
				1427	__m64 _m1 = m1;\
				1428	__m64 _m2 = m2;\
				1429	_acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1430	_acc;\
				1431	})
				1432
				1433	#define _mm_wmiabbn_si64(acc, m1, m2) \
				1434	({\
				1435	__m64 _acc = acc;\
				1436	__m64 _m1 = m1;\
				1437	__m64 _m2 = m2;\
				1438	_acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1439	_acc;\
				1440	})
				1441
				1442	#define _mm_wmiabt_si64(acc, m1, m2) \
				1443	({\
				1444	__m64 _acc = acc;\
				1445	__m64 _m1 = m1;\
				1446	__m64 _m2 = m2;\
				1447	_acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1448	_acc;\
				1449	})
				1450
				1451	#define _mm_wmiabtn_si64(acc, m1, m2) \
				1452	({\
				1453	__m64 _acc = acc;\
				1454	__m64 _m1 = m1;\
				1455	__m64 _m2 = m2;\
				1456	_acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1457	_acc;\
				1458	})
				1459
				1460	#define _mm_wmiatb_si64(acc, m1, m2) \
				1461	({\
				1462	__m64 _acc = acc;\
				1463	__m64 _m1 = m1;\
				1464	__m64 _m2 = m2;\
				1465	_acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1466	_acc;\
				1467	})
				1468
				1469	#define _mm_wmiatbn_si64(acc, m1, m2) \
				1470	({\
				1471	__m64 _acc = acc;\
				1472	__m64 _m1 = m1;\
				1473	__m64 _m2 = m2;\
				1474	_acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1475	_acc;\
				1476	})
				1477
				1478	#define _mm_wmiatt_si64(acc, m1, m2) \
				1479	({\
				1480	__m64 _acc = acc;\
				1481	__m64 _m1 = m1;\
				1482	__m64 _m2 = m2;\
				1483	_acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1484	_acc;\
				1485	})
				1486
				1487	#define _mm_wmiattn_si64(acc, m1, m2) \
				1488	({\
				1489	__m64 _acc = acc;\
				1490	__m64 _m1 = m1;\
				1491	__m64 _m2 = m2;\
				1492	_acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
				1493	_acc;\
				1494	})
				1495
				1496	#define _mm_wmiawbb_si64(acc, m1, m2) \
				1497	({\
				1498	__m64 _acc = acc;\
				1499	__m64 _m1 = m1;\
				1500	__m64 _m2 = m2;\
				1501	_acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
				1502	_acc;\
				1503	})
				1504
				1505	#define _mm_wmiawbbn_si64(acc, m1, m2) \
				1506	({\
				1507	__m64 _acc = acc;\
				1508	__m64 _m1 = m1;\
				1509	__m64 _m2 = m2;\
				1510	_acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
				1511	_acc;\
				1512	})
				1513
				1514	#define _mm_wmiawbt_si64(acc, m1, m2) \
				1515	({\
				1516	__m64 _acc = acc;\
				1517	__m64 _m1 = m1;\
				1518	__m64 _m2 = m2;\
				1519	_acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
				1520	_acc;\
				1521	})
				1522
				1523	#define _mm_wmiawbtn_si64(acc, m1, m2) \
				1524	({\
				1525	__m64 _acc = acc;\
				1526	__m64 _m1 = m1;\
				1527	__m64 _m2 = m2;\
				1528	_acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
				1529	_acc;\
				1530	})
				1531
				1532	#define _mm_wmiawtb_si64(acc, m1, m2) \
				1533	({\
				1534	__m64 _acc = acc;\
				1535	__m64 _m1 = m1;\
				1536	__m64 _m2 = m2;\
				1537	_acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
				1538	_acc;\
				1539	})
				1540
				1541	#define _mm_wmiawtbn_si64(acc, m1, m2) \
				1542	({\
				1543	__m64 _acc = acc;\
				1544	__m64 _m1 = m1;\
				1545	__m64 _m2 = m2;\
				1546	_acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
				1547	_acc;\
				1548	})
				1549
				1550	#define _mm_wmiawtt_si64(acc, m1, m2) \
				1551	({\
				1552	__m64 _acc = acc;\
				1553	__m64 _m1 = m1;\
				1554	__m64 _m2 = m2;\
				1555	_acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
				1556	_acc;\
				1557	})
				1558
				1559	#define _mm_wmiawttn_si64(acc, m1, m2) \
				1560	({\
				1561	__m64 _acc = acc;\
				1562	__m64 _m1 = m1;\
				1563	__m64 _m2 = m2;\
				1564	_acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
				1565	_acc;\
				1566	})
				1567
				1568	/* Passes A, B and N to __builtin_arm_wmerge and yields the result as an __m64.
					   The third argument should be an immediate.  The temporary uses a reserved
					   name so that an argument spelled `result' is not captured by it. */
				1569	#define _mm_merge_si64(a, b, n) \
				1570	({\
				1571	__m64 __result;\
				1572	__result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
				1573	__result;\
				1574	})
				1575	#endif /* __IWMMXT2__ */
				1576
				1577	static __inline __m64
				1578	_mm_alignr0_si64 (__m64 a, __m64 b)
				1579	{
				1580	return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
				1581	}
				1582
				1583	static __inline __m64
				1584	_mm_alignr1_si64 (__m64 a, __m64 b)
				1585	{
				1586	return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
				1587	}
				1588
				1589	static __inline __m64
				1590	_mm_alignr2_si64 (__m64 a, __m64 b)
				1591	{
				1592	return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
				1593	}
				1594
				1595	static __inline __m64
				1596	_mm_alignr3_si64 (__m64 a, __m64 b)
				1597	{
				1598	return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
				1599	}
				1600
				1601	static __inline void
				1602	_mm_tandcb ()
				1603	{
				1604	__asm __volatile ("tandcb r15");
				1605	}
				1606
				1607	static __inline void
				1608	_mm_tandch ()
				1609	{
				1610	__asm __volatile ("tandch r15");
				1611	}
				1612
				1613	static __inline void
				1614	_mm_tandcw ()
				1615	{
				1616	__asm __volatile ("tandcw r15");
				1617	}
				1618
				1619	#define _mm_textrcb(n) \
				1620	({\
				1621	__asm__ __volatile__ (\
				1622	"textrcb r15, %0" : : "i" (n));\
				1623	})
				1624
				1625	#define _mm_textrch(n) \
				1626	({\
				1627	__asm__ __volatile__ (\
				1628	"textrch r15, %0" : : "i" (n));\
				1629	})
				1630
				1631	#define _mm_textrcw(n) \
				1632	({\
				1633	__asm__ __volatile__ (\
				1634	"textrcw r15, %0" : : "i" (n));\
				1635	})
				1636
				1637	static __inline void
				1638	_mm_torcb ()
				1639	{
				1640	__asm __volatile ("torcb r15");
				1641	}
				1642
				1643	static __inline void
				1644	_mm_torch ()
				1645	{
				1646	__asm __volatile ("torch r15");
				1647	}
				1648
				1649	static __inline void
				1650	_mm_torcw ()
				1651	{
				1652	__asm __volatile ("torcw r15");
				1653	}
				1654
				1655	#ifdef __IWMMXT2__
				1656	static __inline void
				1657	_mm_torvscb ()
				1658	{
				1659	__asm __volatile ("torvscb r15");
				1660	}
				1661
				1662	static __inline void
				1663	_mm_torvsch ()
				1664	{
				1665	__asm __volatile ("torvsch r15");
				1666	}
				1667
				1668	static __inline void
				1669	_mm_torvscw ()
				1670	{
				1671	__asm __volatile ("torvscw r15");
				1672	}
				1673	#endif /* __IWMMXT2__ */
				1674
				1675	static __inline __m64
				1676	_mm_tbcst_pi8 (int value)
				1677	{
				1678	return (__m64) __builtin_arm_tbcstb ((signed char) value);
				1679	}
				1680
				1681	static __inline __m64
				1682	_mm_tbcst_pi16 (int value)
				1683	{
				1684	return (__m64) __builtin_arm_tbcsth ((short) value);
				1685	}
				1686
				1687	static __inline __m64
				1688	_mm_tbcst_pi32 (int value)
				1689	{
				1690	return (__m64) __builtin_arm_tbcstw (value);
				1691	}
				1692
				1693	#define _m_empty _mm_empty
				1694	#define _m_packsswb _mm_packs_pi16
				1695	#define _m_packssdw _mm_packs_pi32
				1696	#define _m_packuswb _mm_packs_pu16
				1697	#define _m_packusdw _mm_packs_pu32
				1698	#define _m_packssqd _mm_packs_pi64
				1699	#define _m_packusqd _mm_packs_pu64
				1700	#define _mm_packs_si64 _mm_packs_pi64
				1701	#define _mm_packs_su64 _mm_packs_pu64
				1702	#define _m_punpckhbw _mm_unpackhi_pi8
				1703	#define _m_punpckhwd _mm_unpackhi_pi16
				1704	#define _m_punpckhdq _mm_unpackhi_pi32
				1705	#define _m_punpcklbw _mm_unpacklo_pi8
				1706	#define _m_punpcklwd _mm_unpacklo_pi16
				1707	#define _m_punpckldq _mm_unpacklo_pi32
				1708	#define _m_punpckehsbw _mm_unpackeh_pi8
				1709	#define _m_punpckehswd _mm_unpackeh_pi16
				1710	#define _m_punpckehsdq _mm_unpackeh_pi32
				1711	#define _m_punpckehubw _mm_unpackeh_pu8
				1712	#define _m_punpckehuwd _mm_unpackeh_pu16
				1713	#define _m_punpckehudq _mm_unpackeh_pu32
				1714	#define _m_punpckelsbw _mm_unpackel_pi8
				1715	#define _m_punpckelswd _mm_unpackel_pi16
				1716	#define _m_punpckelsdq _mm_unpackel_pi32
				1717	#define _m_punpckelubw _mm_unpackel_pu8
				1718	#define _m_punpckeluwd _mm_unpackel_pu16
				1719	#define _m_punpckeludq _mm_unpackel_pu32
				1720	#define _m_paddb _mm_add_pi8
				1721	#define _m_paddw _mm_add_pi16
				1722	#define _m_paddd _mm_add_pi32
				1723	#define _m_paddsb _mm_adds_pi8
				1724	#define _m_paddsw _mm_adds_pi16
				1725	#define _m_paddsd _mm_adds_pi32
				1726	#define _m_paddusb _mm_adds_pu8
				1727	#define _m_paddusw _mm_adds_pu16
				1728	#define _m_paddusd _mm_adds_pu32
				1729	#define _m_psubb _mm_sub_pi8
				1730	#define _m_psubw _mm_sub_pi16
				1731	#define _m_psubd _mm_sub_pi32
				1732	#define _m_psubsb _mm_subs_pi8
				1733	#define _m_psubsw _mm_subs_pi16
				1734	#define _m_psubuw _mm_subs_pi32 /* NOTE(review): breaks the _m_psubsb/_m_psubsw naming pattern; presumably intended as _m_psubsd -- confirm before relying on it. */
				1735	#define _m_psubusb _mm_subs_pu8
				1736	#define _m_psubusw _mm_subs_pu16
				1737	#define _m_psubusd _mm_subs_pu32
				1738	#define _m_pmaddwd _mm_madd_pi16
				1739	#define _m_pmadduwd _mm_madd_pu16
				1740	#define _m_pmulhw _mm_mulhi_pi16
				1741	#define _m_pmulhuw _mm_mulhi_pu16
				1742	#define _m_pmullw _mm_mullo_pi16
				1743	#define _m_pmacsw _mm_mac_pi16
				1744	#define _m_pmacuw _mm_mac_pu16
				1745	#define _m_pmacszw _mm_macz_pi16
				1746	#define _m_pmacuzw _mm_macz_pu16
				1747	#define _m_paccb _mm_acc_pu8
				1748	#define _m_paccw _mm_acc_pu16
				1749	#define _m_paccd _mm_acc_pu32
				1750	#define _m_pmia _mm_mia_si64
				1751	#define _m_pmiaph _mm_miaph_si64
				1752	#define _m_pmiabb _mm_miabb_si64
				1753	#define _m_pmiabt _mm_miabt_si64
				1754	#define _m_pmiatb _mm_miatb_si64
				1755	#define _m_pmiatt _mm_miatt_si64
				1756	#define _m_psllw _mm_sll_pi16
				1757	#define _m_psllwi _mm_slli_pi16
				1758	#define _m_pslld _mm_sll_pi32
				1759	#define _m_pslldi _mm_slli_pi32
				1760	#define _m_psllq _mm_sll_si64
				1761	#define _m_psllqi _mm_slli_si64
				1762	#define _m_psraw _mm_sra_pi16
				1763	#define _m_psrawi _mm_srai_pi16
				1764	#define _m_psrad _mm_sra_pi32
				1765	#define _m_psradi _mm_srai_pi32
				1766	#define _m_psraq _mm_sra_si64
				1767	#define _m_psraqi _mm_srai_si64
				1768	#define _m_psrlw _mm_srl_pi16
				1769	#define _m_psrlwi _mm_srli_pi16
				1770	#define _m_psrld _mm_srl_pi32
				1771	#define _m_psrldi _mm_srli_pi32
				1772	#define _m_psrlq _mm_srl_si64
				1773	#define _m_psrlqi _mm_srli_si64
				1774	#define _m_prorw _mm_ror_pi16
				1775	#define _m_prorwi _mm_rori_pi16
				1776	#define _m_prord _mm_ror_pi32
				1777	#define _m_prordi _mm_rori_pi32
				1778	#define _m_prorq _mm_ror_si64
				1779	#define _m_prorqi _mm_rori_si64
				1780	#define _m_pand _mm_and_si64
				1781	#define _m_pandn _mm_andnot_si64
				1782	#define _m_por _mm_or_si64
				1783	#define _m_pxor _mm_xor_si64
				1784	#define _m_pcmpeqb _mm_cmpeq_pi8
				1785	#define _m_pcmpeqw _mm_cmpeq_pi16
				1786	#define _m_pcmpeqd _mm_cmpeq_pi32
				1787	#define _m_pcmpgtb _mm_cmpgt_pi8
				1788	#define _m_pcmpgtub _mm_cmpgt_pu8
				1789	#define _m_pcmpgtw _mm_cmpgt_pi16
				1790	#define _m_pcmpgtuw _mm_cmpgt_pu16
				1791	#define _m_pcmpgtd _mm_cmpgt_pi32
				1792	#define _m_pcmpgtud _mm_cmpgt_pu32
				1793	#define _m_pextrb _mm_extract_pi8
				1794	#define _m_pextrw _mm_extract_pi16
				1795	#define _m_pextrd _mm_extract_pi32
				1796	#define _m_pextrub _mm_extract_pu8
				1797	#define _m_pextruw _mm_extract_pu16
				1798	#define _m_pextrud _mm_extract_pu32
				1799	#define _m_pinsrb _mm_insert_pi8
				1800	#define _m_pinsrw _mm_insert_pi16
				1801	#define _m_pinsrd _mm_insert_pi32
				1802	#define _m_pmaxsb _mm_max_pi8
				1803	#define _m_pmaxsw _mm_max_pi16
				1804	#define _m_pmaxsd _mm_max_pi32
				1805	#define _m_pmaxub _mm_max_pu8
				1806	#define _m_pmaxuw _mm_max_pu16
				1807	#define _m_pmaxud _mm_max_pu32
				1808	#define _m_pminsb _mm_min_pi8
				1809	#define _m_pminsw _mm_min_pi16
				1810	#define _m_pminsd _mm_min_pi32
				1811	#define _m_pminub _mm_min_pu8
				1812	#define _m_pminuw _mm_min_pu16
				1813	#define _m_pminud _mm_min_pu32
				1814	#define _m_pmovmskb _mm_movemask_pi8
				1815	#define _m_pmovmskw _mm_movemask_pi16
				1816	#define _m_pmovmskd _mm_movemask_pi32
				1817	#define _m_pshufw _mm_shuffle_pi16
				1818	#define _m_pavgb _mm_avg_pu8
				1819	#define _m_pavgw _mm_avg_pu16
				1820	#define _m_pavg2b _mm_avg2_pu8
				1821	#define _m_pavg2w _mm_avg2_pu16
				1822	#define _m_psadbw _mm_sad_pu8
				1823	#define _m_psadwd _mm_sad_pu16
				1824	#define _m_psadzbw _mm_sadz_pu8
				1825	#define _m_psadzwd _mm_sadz_pu16
				1826	#define _m_paligniq _mm_align_si64
				1827	#define _m_cvt_si2pi _mm_cvtsi64_m64
				1828	#define _m_cvt_pi2si _mm_cvtm64_si64
				1829	#define _m_from_int _mm_cvtsi32_si64
				1830	#define _m_to_int _mm_cvtsi64_si32
				1831
				1832	#if defined __cplusplus
				1833	} /* End of the extern "C" block; no trailing semicolon, which would be a stray empty declaration. */
				1834	#endif /* __cplusplus */
				1835
				1836	#endif /* _MMINTRIN_H_INCLUDED */