/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help with porting code that uses Intel
   intrinsics explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target does not support a native 64-bit vector type, we
   typedef __m64 to a 64-bit unsigned long long in the MMX intrinsics, which
   works well for the _si64 and some of the _pi32 operations.

   For the _pi16 and _pi8 operations, it is better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help to implement this more efficiently.

   It is the user's responsibility to determine whether the results of such a
   port are acceptable or whether further changes are needed. Note that much
   code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions using 64-bit scalar operations,
   or 128-bit SSE/Altivec operations, which is the recommended approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
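
/* As a rough illustration of that last point (a sketch only, not part of this
   header's API; the typedef and function name below are hypothetical): a use
   of _mm_add_pi16 on packed 16-bit lanes can often be rewritten with GNU C
   vector extensions, which the PowerPC backend vectorizes directly.

     typedef short __v4hi __attribute__((vector_size(8)));
     static inline __v4hi add_pi16_portable(__v4hi __a, __v4hi __b) {
       return __a + __b;   // four element-wise 16-bit adds, no MMX needed
     }
*/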

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
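
/* Illustrative only (a minimal usage sketch, not an intrinsic defined here):
   the scalar fallback paths below use this union to view one __m64 value as
   its individual lanes, e.g.

     __m64_union __u;
     __u.as_m64 = _mm_set_pi32(1, 2);
     int __lo = __u.as_int[0];   // 2: least-significant 32-bit lane
     int __hi = __u.as_int[1];   // 1: most-significant 32-bit lane
*/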

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}
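
/* Implementation note (descriptive only): most of the _ARCH_PWR8 paths in
   this file follow the pattern shown above -- assemble or splat the 64-bit
   __m64 value(s) into a 128-bit Altivec/VSX vector, perform the 128-bit
   vec_* operation, and return one doubleword of the result.  A hypothetical
   caller sees only the packed 64-bit behaviour, e.g.

     __m64 __a = _mm_set_pi16(300, -300, 127, 5);
     __m64 __p = _mm_packs_pi16(__a, __a);
     // each 16-bit lane is saturated to the signed 8-bit range [-128, 127]
*/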

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}
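
/* Worked example (illustrative only): if the byte lanes of __m1 are
   {0,1,2,3,4,5,6,7} and those of __m2 are {8,9,10,11,12,13,14,15}, the
   result's byte lanes are {4,12,5,13,6,14,7,15} -- the two high halves
   interleaved, as in the scalar path above. */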

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
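  /* cmpb (ISA 2.05 / POWER6) compares the two source registers byte by
     byte, setting each result byte to all ones where the bytes are equal
     and to all zeros otherwise, which matches the pcmpeqb semantics
     required here. */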
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}
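
/* Scalar equivalent (illustrative only), following the description above:
   writing the 16-bit lanes of __m1 as a[0..3] and of __m2 as b[0..3], the
   two 32-bit result lanes are

     res[0] = (int)a[0] * b[0] + (int)a[1] * b[1];
     res[1] = (int)a[2] * b[2] + (int)a[3] * b[3];
*/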

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}
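
/* Implementation note (descriptive): vec_vmulesh/vec_vmulosh form the full
   32-bit products of the even and odd 16-bit lanes, and the permute mask
   xform1 then gathers the high 16 bits of each product back into adjacent
   16-bit lanes of the result. */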

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = a * b;
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sl(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sra(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector unsigned short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sr(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
        */

#endif /* _MMINTRIN_H_INCLUDED */