| 1 | /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------=== |
| 2 | * |
| 3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | * See https://llvm.org/LICENSE.txt for license information. |
| 5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | * |
| 7 | *===-----------------------------------------------------------------------=== |
| 8 | */ |
| 9 | |
| 10 | /* Implemented from the specification included in the Intel C++ Compiler |
| 11 | User Guide and Reference, version 9.0. */ |
| 12 | |
| 13 | #ifndef NO_WARN_X86_INTRINSICS |
| 14 | /* This header file is intended to help with porting code that uses Intel |
| 15 | intrinsics from x86_64 to powerpc64/powerpc64le. |
| 16 | |
| 17 | Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC |
| 18 | VMX/VSX ISA is a good match for vector float SIMD operations. |
| 19 | However, scalar float operations in vector (XMM) registers require |
| 20 | the POWER8 VSX ISA (2.07) level. There are differences in the data |
| 21 | format and placement of float scalars in the vector register, which |
| 22 | require extra steps to match SSE scalar float semantics on POWER. |
| 23 | |
| 24 | Note that the X86_64 MXCSR register and the PowerISA FPSCR/VSCR |
| 25 | registers differ significantly. It is recommended to use the portable |
| 26 | <fenv.h> interface instead of accessing the MXCSR directly. |
| 27 | |
| 28 | Most SSE scalar float intrinsic operations can be performed more |
| 29 | efficiently as C language float scalar operations or optimized to |
| 30 | use vector SIMD operations. We recommend this for new applications. */ |
| 31 | #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." |
| 32 | #endif |
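/* Editorial note: the following is an illustrative sketch, not part of the
   original header.  As recommended above, portable code should manipulate the
   floating-point environment through the standard <fenv.h> interface rather
   than poking the MXCSR (or, on POWER, the FPSCR) directly.  For example, to
   change the rounding mode around a block of conversions:

     #include <fenv.h>

     int saved = fegetround ();
     fesetround (FE_TOWARDZERO);
     // ... operations that honour the current rounding mode ...
     fesetround (saved);
*/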
| 33 | |
| 34 | #ifndef _XMMINTRIN_H_INCLUDED |
| 35 | #define _XMMINTRIN_H_INCLUDED |
| 36 | |
| 37 | #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) |
| 38 | |
| 39 | /* Define four value permute mask */ |
| 40 | #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) |
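/* Editorial note: an illustrative example, not part of the original header.
   _MM_SHUFFLE packs four 2-bit element selectors into one 8-bit immediate,
   with the first argument landing in the most significant pair of bits:

     _MM_SHUFFLE (3, 2, 1, 0) == 0xE4   // the identity selector
     _MM_SHUFFLE (0, 0, 0, 0) == 0x00   // broadcast element 0

   The resulting mask is consumed by _mm_shuffle_ps () further below. */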
| 41 | |
| 42 | #include <altivec.h> |
| 43 | |
| 44 | /* Avoid collisions between altivec.h and strict adherence to C++ and |
| 45 | C11 standards. This should eventually be done inside altivec.h itself, |
| 46 | but only after testing a full distro build. */ |
| 47 | #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \ |
| 48 | (defined(__STDC_VERSION__) && \ |
| 49 | __STDC_VERSION__ >= 201112L)) |
| 50 | #undef vector |
| 51 | #undef pixel |
| 52 | #undef bool |
| 53 | #endif |
| 54 | |
| 55 | /* We need type definitions from the MMX header file. */ |
| 56 | #include <mmintrin.h> |
| 57 | |
| 58 | /* Get _mm_malloc () and _mm_free (). */ |
| 59 | #if __STDC_HOSTED__ |
| 60 | #include <mm_malloc.h> |
| 61 | #endif |
| 62 | |
| 63 | /* The Intel API is flexible enough that we must allow aliasing with other |
| 64 | vector types, and their scalar components. */ |
| 65 | typedef vector float __m128 __attribute__((__may_alias__)); |
| 66 | |
| 67 | /* Unaligned version of the same type. */ |
| 68 | typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); |
| 69 | |
| 70 | /* Internal data types for implementing the intrinsics. */ |
| 71 | typedef vector float __v4sf; |
| 72 | |
| 73 | /* Create an undefined vector. */ |
| 74 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 75 | _mm_undefined_ps (void) |
| 76 | { |
| 77 | __m128 __Y = __Y; |
| 78 | return __Y; |
| 79 | } |
| 80 | |
| 81 | /* Create a vector of zeros. */ |
| 82 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 83 | _mm_setzero_ps (void) |
| 84 | { |
| 85 | return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; |
| 86 | } |
| 87 | |
| 88 | /* Load four SPFP values from P. The address must be 16-byte aligned. */ |
| 89 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 90 | _mm_load_ps (float const *__P) |
| 91 | { |
| 92 | return ((__m128)vec_ld(0, (__v4sf*)__P)); |
| 93 | } |
| 94 | |
| 95 | /* Load four SPFP values from P. The address need not be 16-byte aligned. */ |
| 96 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 97 | _mm_loadu_ps (float const *__P) |
| 98 | { |
| 99 | return (vec_vsx_ld(0, __P)); |
| 100 | } |
| 101 | |
| 102 | /* Load four SPFP values in reverse order. The address must be aligned. */ |
| 103 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 104 | _mm_loadr_ps (float const *__P) |
| 105 | { |
| 106 | __v4sf __tmp; |
| 107 | __m128 result; |
| 108 | static const __vector unsigned char permute_vector = |
| 109 | { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, |
| 110 | 0x17, 0x10, 0x11, 0x12, 0x13 }; |
| 111 | |
| 112 | __tmp = vec_ld (0, (__v4sf *) __P); |
| 113 | result = (__m128) vec_perm (__tmp, __tmp, permute_vector); |
| 114 | return result; |
| 115 | } |
| 116 | |
| 117 | /* Create a vector with all four elements equal to F. */ |
| 118 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 119 | _mm_set1_ps (float __F) |
| 120 | { |
| 121 | return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; |
| 122 | } |
| 123 | |
| 124 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 125 | _mm_set_ps1 (float __F) |
| 126 | { |
| 127 | return _mm_set1_ps (__F); |
| 128 | } |
| 129 | |
| 130 | /* Create the vector [Z Y X W]. */ |
| 131 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 132 | _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) |
| 133 | { |
| 134 | return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; |
| 135 | } |
| 136 | |
| 137 | /* Create the vector [W X Y Z]. */ |
| 138 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 139 | _mm_setr_ps (float __Z, float __Y, float __X, float __W) |
| 140 | { |
| 141 | return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; |
| 142 | } |
| 143 | |
| 144 | /* Store four SPFP values. The address must be 16-byte aligned. */ |
| 145 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 146 | _mm_store_ps (float *__P, __m128 __A) |
| 147 | { |
| 148 | vec_st((__v4sf)__A, 0, (__v4sf*)__P); |
| 149 | } |
| 150 | |
| 151 | /* Store four SPFP values. The address need not be 16-byte aligned. */ |
| 152 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 153 | _mm_storeu_ps (float *__P, __m128 __A) |
| 154 | { |
| 155 | *(__m128_u *)__P = __A; |
| 156 | } |
| 157 | |
| 158 | /* Store four SPFP values in reverse order. The address must be aligned. */ |
| 159 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 160 | _mm_storer_ps (float *__P, __m128 __A) |
| 161 | { |
| 162 | __v4sf __tmp; |
| 163 | static const __vector unsigned char permute_vector = |
| 164 | { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, |
| 165 | 0x17, 0x10, 0x11, 0x12, 0x13 }; |
| 166 | |
| 167 | __tmp = (__m128) vec_perm (__A, __A, permute_vector); |
| 168 | |
| 169 | _mm_store_ps (__P, __tmp); |
| 170 | } |
| 171 | |
| 172 | /* Store the lower SPFP value across four words. */ |
| 173 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 174 | _mm_store1_ps (float *__P, __m128 __A) |
| 175 | { |
| 176 | __v4sf __va = vec_splat((__v4sf)__A, 0); |
| 177 | _mm_store_ps (__P, __va); |
| 178 | } |
| 179 | |
| 180 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 181 | _mm_store_ps1 (float *__P, __m128 __A) |
| 182 | { |
| 183 | _mm_store1_ps (__P, __A); |
| 184 | } |
| 185 | |
| 186 | /* Create a vector with element 0 as F and the rest zero. */ |
| 187 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 188 | _mm_set_ss (float __F) |
| 189 | { |
| 190 | return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; |
| 191 | } |
| 192 | |
| 193 | /* Sets the low SPFP value of A from the low value of B. */ |
| 194 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 195 | _mm_move_ss (__m128 __A, __m128 __B) |
| 196 | { |
| 197 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 198 | |
| 199 | return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); |
| 200 | } |
| 201 | |
| 202 | /* Create a vector with element 0 as *P and the rest zero. */ |
| 203 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 204 | _mm_load_ss (float const *__P) |
| 205 | { |
| 206 | return _mm_set_ss (*__P); |
| 207 | } |
| 208 | |
| 209 | /* Stores the lower SPFP value. */ |
| 210 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 211 | _mm_store_ss (float *__P, __m128 __A) |
| 212 | { |
| 213 | *__P = ((__v4sf)__A)[0]; |
| 214 | } |
| 215 | |
| 216 | /* Perform the respective operation on the lower SPFP (single-precision |
| 217 | floating-point) values of A and B; the upper three SPFP values are |
| 218 | passed through from A. */ |
| 219 | |
| 220 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 221 | _mm_add_ss (__m128 __A, __m128 __B) |
| 222 | { |
| 223 | #ifdef _ARCH_PWR7 |
| 224 | __m128 a, b, c; |
| 225 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 226 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 227 | results. So to ensure we don't generate spurious exceptions |
| 228 | (from the upper float values) we splat the lower float |
| 229 | before we do the operation. */ |
| 230 | a = vec_splat (__A, 0); |
| 231 | b = vec_splat (__B, 0); |
| 232 | c = a + b; |
| 233 | /* Then we merge the lower float result with the original upper |
| 234 | float elements from __A. */ |
| 235 | return (vec_sel (__A, c, mask)); |
| 236 | #else |
| 237 | __A[0] = __A[0] + __B[0]; |
| 238 | return (__A); |
| 239 | #endif |
| 240 | } |
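/* Editorial note: an illustrative usage sketch, not part of the original
   header.  As on x86, only element 0 is combined; the upper three elements
   of __A pass through unchanged:

     __m128 a = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);   // a = {10, 20, 30, 40}
     __m128 b = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);       // b = {1, 2, 3, 4}
     __m128 r = _mm_add_ss (a, b);                         // r = {11, 20, 30, 40}
*/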
| 241 | |
| 242 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 243 | _mm_sub_ss (__m128 __A, __m128 __B) |
| 244 | { |
| 245 | #ifdef _ARCH_PWR7 |
| 246 | __m128 a, b, c; |
| 247 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 248 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 249 | results. So to ensure we don't generate spurious exceptions |
| 250 | (from the upper float values) we splat the lower float |
| 251 | before we do the operation. */ |
| 252 | a = vec_splat (__A, 0); |
| 253 | b = vec_splat (__B, 0); |
| 254 | c = a - b; |
| 255 | /* Then we merge the lower float result with the original upper |
| 256 | float elements from __A. */ |
| 257 | return (vec_sel (__A, c, mask)); |
| 258 | #else |
| 259 | __A[0] = __A[0] - __B[0]; |
| 260 | return (__A); |
| 261 | #endif |
| 262 | } |
| 263 | |
| 264 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 265 | _mm_mul_ss (__m128 __A, __m128 __B) |
| 266 | { |
| 267 | #ifdef _ARCH_PWR7 |
| 268 | __m128 a, b, c; |
| 269 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 270 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 271 | results. So to ensure we don't generate spurious exceptions |
| 272 | (from the upper float values) we splat the lower float |
| 273 | before we do the operation. */ |
| 274 | a = vec_splat (__A, 0); |
| 275 | b = vec_splat (__B, 0); |
| 276 | c = a * b; |
| 277 | /* Then we merge the lower float result with the original upper |
| 278 | float elements from __A. */ |
| 279 | return (vec_sel (__A, c, mask)); |
| 280 | #else |
| 281 | __A[0] = __A[0] * __B[0]; |
| 282 | return (__A); |
| 283 | #endif |
| 284 | } |
| 285 | |
| 286 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 287 | _mm_div_ss (__m128 __A, __m128 __B) |
| 288 | { |
| 289 | #ifdef _ARCH_PWR7 |
| 290 | __m128 a, b, c; |
| 291 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 292 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 293 | results. So to ensure we don't generate spurious exceptions |
| 294 | (from the upper float values) we splat the lower float |
| 295 | before we do the operation. */ |
| 296 | a = vec_splat (__A, 0); |
| 297 | b = vec_splat (__B, 0); |
| 298 | c = a / b; |
| 299 | /* Then we merge the lower float result with the original upper |
| 300 | float elements from __A. */ |
| 301 | return (vec_sel (__A, c, mask)); |
| 302 | #else |
| 303 | __A[0] = __A[0] / __B[0]; |
| 304 | return (__A); |
| 305 | #endif |
| 306 | } |
| 307 | |
| 308 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 309 | _mm_sqrt_ss (__m128 __A) |
| 310 | { |
| 311 | __m128 a, c; |
| 312 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 313 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 314 | * results. So to ensure we don't generate spurious exceptions |
| 315 | * (from the upper float values) we splat the lower float |
| 316 | * before we do the operation. */ |
| 317 | a = vec_splat (__A, 0); |
| 318 | c = vec_sqrt (a); |
| 319 | /* Then we merge the lower float result with the original upper |
| 320 | * float elements from __A. */ |
| 321 | return (vec_sel (__A, c, mask)); |
| 322 | } |
| 323 | |
| 324 | /* Perform the respective operation on the four SPFP values in A and B. */ |
| 325 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 326 | _mm_add_ps (__m128 __A, __m128 __B) |
| 327 | { |
| 328 | return (__m128) ((__v4sf)__A + (__v4sf)__B); |
| 329 | } |
| 330 | |
| 331 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 332 | _mm_sub_ps (__m128 __A, __m128 __B) |
| 333 | { |
| 334 | return (__m128) ((__v4sf)__A - (__v4sf)__B); |
| 335 | } |
| 336 | |
| 337 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 338 | _mm_mul_ps (__m128 __A, __m128 __B) |
| 339 | { |
| 340 | return (__m128) ((__v4sf)__A * (__v4sf)__B); |
| 341 | } |
| 342 | |
| 343 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 344 | _mm_div_ps (__m128 __A, __m128 __B) |
| 345 | { |
| 346 | return (__m128) ((__v4sf)__A / (__v4sf)__B); |
| 347 | } |
| 348 | |
| 349 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 350 | _mm_sqrt_ps (__m128 __A) |
| 351 | { |
| 352 | return (vec_sqrt ((__v4sf)__A)); |
| 353 | } |
| 354 | |
| 355 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 356 | _mm_rcp_ps (__m128 __A) |
| 357 | { |
| 358 | return (vec_re ((__v4sf)__A)); |
| 359 | } |
| 360 | |
| 361 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 362 | _mm_rsqrt_ps (__m128 __A) |
| 363 | { |
| 364 | return (vec_rsqrte (__A)); |
| 365 | } |
| 366 | |
| 367 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 368 | _mm_rcp_ss (__m128 __A) |
| 369 | { |
| 370 | __m128 a, c; |
| 371 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 372 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 373 | * results. So to ensure we don't generate spurious exceptions |
| 374 | * (from the upper float values) we splat the lower float |
| 375 | * before we do the operation. */ |
| 376 | a = vec_splat (__A, 0); |
| 377 | c = _mm_rcp_ps (a); |
| 378 | /* Then we merge the lower float result with the original upper |
| 379 | * float elements from __A. */ |
| 380 | return (vec_sel (__A, c, mask)); |
| 381 | } |
| 382 | |
| 383 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 384 | _mm_rsqrt_ss (__m128 __A) |
| 385 | { |
| 386 | __m128 a, c; |
| 387 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 388 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 389 | * results. So to ensure we don't generate spurious exceptions |
| 390 | * (from the upper float values) we splat the lower float |
| 391 | * before we do the operation. */ |
| 392 | a = vec_splat (__A, 0); |
| 393 | c = vec_rsqrte (a); |
| 394 | /* Then we merge the lower float result with the original upper |
| 395 | * float elements from __A. */ |
| 396 | return (vec_sel (__A, c, mask)); |
| 397 | } |
| 398 | |
| 399 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 400 | _mm_min_ss (__m128 __A, __m128 __B) |
| 401 | { |
| 402 | __v4sf a, b, c; |
| 403 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 404 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 405 | * results. So to ensure we don't generate spurious exceptions |
| 406 | * (from the upper float values) we splat the lower float |
| 407 | * before we do the operation. */ |
| 408 | a = vec_splat ((__v4sf)__A, 0); |
| 409 | b = vec_splat ((__v4sf)__B, 0); |
| 410 | c = vec_min (a, b); |
| 411 | /* Then we merge the lower float result with the original upper |
| 412 | * float elements from __A. */ |
| 413 | return (vec_sel ((__v4sf)__A, c, mask)); |
| 414 | } |
| 415 | |
| 416 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 417 | _mm_max_ss (__m128 __A, __m128 __B) |
| 418 | { |
| 419 | __v4sf a, b, c; |
| 420 | static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; |
| 421 | /* PowerISA VSX does not allow partial (for just the lower float) |
| 422 | * results. So to ensure we don't generate spurious exceptions |
| 423 | * (from the upper float values) we splat the lower float |
| 424 | * before we do the operation. */ |
| 425 | a = vec_splat (__A, 0); |
| 426 | b = vec_splat (__B, 0); |
| 427 | c = vec_max (a, b); |
| 428 | /* Then we merge the lower float result with the original upper |
| 429 | * float elements from __A. */ |
| 430 | return (vec_sel ((__v4sf)__A, c, mask)); |
| 431 | } |
| 432 | |
| 433 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 434 | _mm_min_ps (__m128 __A, __m128 __B) |
| 435 | { |
| 436 | __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); |
| 437 | return vec_sel (__B, __A, m); |
| 438 | } |
| 439 | |
| 440 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 441 | _mm_max_ps (__m128 __A, __m128 __B) |
| 442 | { |
| 443 | __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); |
| 444 | return vec_sel (__B, __A, m); |
| 445 | } |
| 446 | |
| 447 | /* Perform logical bit-wise operations on 128-bit values. */ |
| 448 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 449 | _mm_and_ps (__m128 __A, __m128 __B) |
| 450 | { |
| 451 | return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); |
| 452 | // return __builtin_ia32_andps (__A, __B); |
| 453 | } |
| 454 | |
| 455 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 456 | _mm_andnot_ps (__m128 __A, __m128 __B) |
| 457 | { |
| 458 | return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); |
| 459 | } |
| 460 | |
| 461 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 462 | _mm_or_ps (__m128 __A, __m128 __B) |
| 463 | { |
| 464 | return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); |
| 465 | } |
| 466 | |
| 467 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 468 | _mm_xor_ps (__m128 __A, __m128 __B) |
| 469 | { |
| 470 | return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); |
| 471 | } |
| 472 | |
| 473 | /* Perform a comparison on the four SPFP values of A and B. For each |
| 474 | element, if the comparison is true, place a mask of all ones in the |
| 475 | result, otherwise a mask of zeros. */ |
| 476 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 477 | _mm_cmpeq_ps (__m128 __A, __m128 __B) |
| 478 | { |
| 479 | return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); |
| 480 | } |
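/* Editorial note: an illustrative usage sketch, not part of the original
   header.  The all-ones / all-zeros element masks produced by these compares
   combine naturally with the bit-wise operations above, e.g. to keep only the
   matching lanes of a vector:

     __m128 a    = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);   // a = {1, 2, 3, 4}
     __m128 b    = _mm_set_ps (4.0f, 0.0f, 2.0f, 0.0f);   // b = {0, 2, 0, 4}
     __m128 eq   = _mm_cmpeq_ps (a, b);                   // {0, ~0, 0, ~0}
     __m128 kept = _mm_and_ps (eq, a);                    // {0.0f, 2.0f, 0.0f, 4.0f}
*/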
| 481 | |
| 482 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 483 | _mm_cmplt_ps (__m128 __A, __m128 __B) |
| 484 | { |
| 485 | return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); |
| 486 | } |
| 487 | |
| 488 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 489 | _mm_cmple_ps (__m128 __A, __m128 __B) |
| 490 | { |
| 491 | return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); |
| 492 | } |
| 493 | |
| 494 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 495 | _mm_cmpgt_ps (__m128 __A, __m128 __B) |
| 496 | { |
| 497 | return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); |
| 498 | } |
| 499 | |
| 500 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 501 | _mm_cmpge_ps (__m128 __A, __m128 __B) |
| 502 | { |
| 503 | return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); |
| 504 | } |
| 505 | |
| 506 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 507 | _mm_cmpneq_ps (__m128 __A, __m128 __B) |
| 508 | { |
| 509 | __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); |
| 510 | return ((__m128)vec_nor (temp, temp)); |
| 511 | } |
| 512 | |
| 513 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 514 | _mm_cmpnlt_ps (__m128 __A, __m128 __B) |
| 515 | { |
| 516 | return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); |
| 517 | } |
| 518 | |
| 519 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 520 | _mm_cmpnle_ps (__m128 __A, __m128 __B) |
| 521 | { |
| 522 | return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); |
| 523 | } |
| 524 | |
| 525 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 526 | _mm_cmpngt_ps (__m128 __A, __m128 __B) |
| 527 | { |
| 528 | return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); |
| 529 | } |
| 530 | |
| 531 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 532 | _mm_cmpnge_ps (__m128 __A, __m128 __B) |
| 533 | { |
| 534 | return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); |
| 535 | } |
| 536 | |
| 537 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 538 | _mm_cmpord_ps (__m128 __A, __m128 __B) |
| 539 | { |
| 540 | __vector unsigned int a, b; |
| 541 | __vector unsigned int c, d; |
| 542 | static const __vector unsigned int float_exp_mask = |
| 543 | { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; |
| 544 | |
| 545 | a = (__vector unsigned int) vec_abs ((__v4sf)__A); |
| 546 | b = (__vector unsigned int) vec_abs ((__v4sf)__B); |
| 547 | c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); |
| 548 | d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); |
| 549 | return ((__m128 ) vec_and (c, d)); |
| 550 | } |
| 551 | |
| 552 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 553 | _mm_cmpunord_ps (__m128 __A, __m128 __B) |
| 554 | { |
| 555 | __vector unsigned int a, b; |
| 556 | __vector unsigned int c, d; |
| 557 | static const __vector unsigned int float_exp_mask = |
| 558 | { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; |
| 559 | |
| 560 | a = (__vector unsigned int) vec_abs ((__v4sf)__A); |
| 561 | b = (__vector unsigned int) vec_abs ((__v4sf)__B); |
| 562 | c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); |
| 563 | d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); |
| 564 | return ((__m128 ) vec_or (c, d)); |
| 565 | } |
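/* Editorial note, not part of the original header: the exponent-mask trick
   above works because a single-precision NaN has all exponent bits set and a
   non-zero fraction, so its absolute-value bit pattern compares greater than
   0x7f800000 as an unsigned integer.  An illustrative use:

     __m128 x = _mm_set1_ps (__builtin_nanf (""));
     __m128 y = _mm_set1_ps (1.0f);
     __m128 m = _mm_cmpunord_ps (x, y);   // all four elements are all-ones
*/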
| 566 | |
| 567 | /* Perform a comparison on the lower SPFP values of A and B. If the |
| 568 | comparison is true, place a mask of all ones in the result, otherwise a |
| 569 | mask of zeros. The upper three SPFP values are passed through from A. */ |
| 570 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 571 | _mm_cmpeq_ss (__m128 __A, __m128 __B) |
| 572 | { |
| 573 | static const __vector unsigned int mask = |
| 574 | { 0xffffffff, 0, 0, 0 }; |
| 575 | __v4sf a, b, c; |
| 576 | /* PowerISA VMX does not allow partial (for just element 0) |
| 577 | * results. So to ensure we don't generate spurious exceptions |
| 578 | * (from the upper elements) we splat the lower float |
| 579 | * before we do the operation. */ |
| 580 | a = vec_splat ((__v4sf) __A, 0); |
| 581 | b = vec_splat ((__v4sf) __B, 0); |
| 582 | c = (__v4sf) vec_cmpeq(a, b); |
| 583 | /* Then we merge the lower float result with the original upper |
| 584 | * float elements from __A. */ |
| 585 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 586 | } |
| 587 | |
| 588 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 589 | _mm_cmplt_ss (__m128 __A, __m128 __B) |
| 590 | { |
| 591 | static const __vector unsigned int mask = |
| 592 | { 0xffffffff, 0, 0, 0 }; |
| 593 | __v4sf a, b, c; |
| 594 | /* PowerISA VMX does not allow partial (for just element 0) |
| 595 | * results. So to ensure we don't generate spurious exceptions |
| 596 | * (from the upper elements) we splat the lower float |
| 597 | * before we do the operation. */ |
| 598 | a = vec_splat ((__v4sf) __A, 0); |
| 599 | b = vec_splat ((__v4sf) __B, 0); |
| 600 | c = (__v4sf) vec_cmplt(a, b); |
| 601 | /* Then we merge the lower float result with the original upper |
| 602 | * float elements from __A. */ |
| 603 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 604 | } |
| 605 | |
| 606 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 607 | _mm_cmple_ss (__m128 __A, __m128 __B) |
| 608 | { |
| 609 | static const __vector unsigned int mask = |
| 610 | { 0xffffffff, 0, 0, 0 }; |
| 611 | __v4sf a, b, c; |
| 612 | /* PowerISA VMX does not allow partial (for just element 0) |
| 613 | * results. So to ensure we don't generate spurious exceptions |
| 614 | * (from the upper elements) we splat the lower float |
| 615 | * before we do the operation. */ |
| 616 | a = vec_splat ((__v4sf) __A, 0); |
| 617 | b = vec_splat ((__v4sf) __B, 0); |
| 618 | c = (__v4sf) vec_cmple(a, b); |
| 619 | /* Then we merge the lower float result with the original upper |
| 620 | * float elements from __A. */ |
| 621 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 622 | } |
| 623 | |
| 624 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 625 | _mm_cmpgt_ss (__m128 __A, __m128 __B) |
| 626 | { |
| 627 | static const __vector unsigned int mask = |
| 628 | { 0xffffffff, 0, 0, 0 }; |
| 629 | __v4sf a, b, c; |
| 630 | /* PowerISA VMX does not allow partial (for just element 0) |
| 631 | * results. So to ensure we don't generate spurious exceptions |
| 632 | * (from the upper elements) we splat the lower float |
| 633 | * before we do the operation. */ |
| 634 | a = vec_splat ((__v4sf) __A, 0); |
| 635 | b = vec_splat ((__v4sf) __B, 0); |
| 636 | c = (__v4sf) vec_cmpgt(a, b); |
| 637 | /* Then we merge the lower float result with the original upper |
| 638 | * float elements from __A. */ |
| 639 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 640 | } |
| 641 | |
| 642 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 643 | _mm_cmpge_ss (__m128 __A, __m128 __B) |
| 644 | { |
| 645 | static const __vector unsigned int mask = |
| 646 | { 0xffffffff, 0, 0, 0 }; |
| 647 | __v4sf a, b, c; |
| 648 | /* PowerISA VMX does not allow partial (for just element 0) |
| 649 | * results. So to ensure we don't generate spurious exceptions |
| 650 | * (from the upper elements) we splat the lower float |
| 651 | * before we do the operation. */ |
| 652 | a = vec_splat ((__v4sf) __A, 0); |
| 653 | b = vec_splat ((__v4sf) __B, 0); |
| 654 | c = (__v4sf) vec_cmpge(a, b); |
| 655 | /* Then we merge the lower float result with the original upper |
| 656 | * float elements from __A. */ |
| 657 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 658 | } |
| 659 | |
| 660 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 661 | _mm_cmpneq_ss (__m128 __A, __m128 __B) |
| 662 | { |
| 663 | static const __vector unsigned int mask = |
| 664 | { 0xffffffff, 0, 0, 0 }; |
| 665 | __v4sf a, b, c; |
| 666 | /* PowerISA VMX does not allow partial (for just element 0) |
| 667 | * results. So to ensure we don't generate spurious exceptions |
| 668 | * (from the upper elements) we splat the lower float |
| 669 | * before we do the operation. */ |
| 670 | a = vec_splat ((__v4sf) __A, 0); |
| 671 | b = vec_splat ((__v4sf) __B, 0); |
| 672 | c = (__v4sf) vec_cmpeq(a, b); |
| 673 | c = vec_nor (c, c); |
| 674 | /* Then we merge the lower float result with the original upper |
| 675 | * float elements from __A. */ |
| 676 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 677 | } |
| 678 | |
| 679 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 680 | _mm_cmpnlt_ss (__m128 __A, __m128 __B) |
| 681 | { |
| 682 | static const __vector unsigned int mask = |
| 683 | { 0xffffffff, 0, 0, 0 }; |
| 684 | __v4sf a, b, c; |
| 685 | /* PowerISA VMX does not allow partial (for just element 0) |
| 686 | * results. So to ensure we don't generate spurious exceptions |
| 687 | * (from the upper elements) we splat the lower float |
| 688 | * before we do the operation. */ |
| 689 | a = vec_splat ((__v4sf) __A, 0); |
| 690 | b = vec_splat ((__v4sf) __B, 0); |
| 691 | c = (__v4sf) vec_cmpge(a, b); |
| 692 | /* Then we merge the lower float result with the original upper |
| 693 | * float elements from __A. */ |
| 694 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 695 | } |
| 696 | |
| 697 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 698 | _mm_cmpnle_ss (__m128 __A, __m128 __B) |
| 699 | { |
| 700 | static const __vector unsigned int mask = |
| 701 | { 0xffffffff, 0, 0, 0 }; |
| 702 | __v4sf a, b, c; |
| 703 | /* PowerISA VMX does not allow partial (for just element 0) |
| 704 | * results. So to ensure we don't generate spurious exceptions |
| 705 | * (from the upper elements) we splat the lower float |
| 706 | * before we do the operation. */ |
| 707 | a = vec_splat ((__v4sf) __A, 0); |
| 708 | b = vec_splat ((__v4sf) __B, 0); |
| 709 | c = (__v4sf) vec_cmpgt(a, b); |
| 710 | /* Then we merge the lower float result with the original upper |
| 711 | * float elements from __A. */ |
| 712 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 713 | } |
| 714 | |
| 715 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 716 | _mm_cmpngt_ss (__m128 __A, __m128 __B) |
| 717 | { |
| 718 | static const __vector unsigned int mask = |
| 719 | { 0xffffffff, 0, 0, 0 }; |
| 720 | __v4sf a, b, c; |
| 721 | /* PowerISA VMX does not allow partial (for just element 0) |
| 722 | * results. So to ensure we don't generate spurious exceptions |
| 723 | * (from the upper elements) we splat the lower float |
| 724 | * before we do the operation. */ |
| 725 | a = vec_splat ((__v4sf) __A, 0); |
| 726 | b = vec_splat ((__v4sf) __B, 0); |
| 727 | c = (__v4sf) vec_cmple(a, b); |
| 728 | /* Then we merge the lower float result with the original upper |
| 729 | * float elements from __A. */ |
| 730 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 731 | } |
| 732 | |
| 733 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 734 | _mm_cmpnge_ss (__m128 __A, __m128 __B) |
| 735 | { |
| 736 | static const __vector unsigned int mask = |
| 737 | { 0xffffffff, 0, 0, 0 }; |
| 738 | __v4sf a, b, c; |
| 739 | /* PowerISA VMX does not allow partial (for just element 0) |
| 740 | * results. So to ensure we don't generate spurious exceptions |
| 741 | * (from the upper elements) we splat the lower float |
| 742 | * before we do the operation. */ |
| 743 | a = vec_splat ((__v4sf) __A, 0); |
| 744 | b = vec_splat ((__v4sf) __B, 0); |
| 745 | c = (__v4sf) vec_cmplt(a, b); |
| 746 | /* Then we merge the lower float result with the original upper |
| 747 | * float elements from __A. */ |
| 748 | return ((__m128)vec_sel ((__v4sf)__A, c, mask)); |
| 749 | } |
| 750 | |
| 751 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 752 | _mm_cmpord_ss (__m128 __A, __m128 __B) |
| 753 | { |
| 754 | __vector unsigned int a, b; |
| 755 | __vector unsigned int c, d; |
| 756 | static const __vector unsigned int float_exp_mask = |
| 757 | { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; |
| 758 | static const __vector unsigned int mask = |
| 759 | { 0xffffffff, 0, 0, 0 }; |
| 760 | |
| 761 | a = (__vector unsigned int) vec_abs ((__v4sf)__A); |
| 762 | b = (__vector unsigned int) vec_abs ((__v4sf)__B); |
| 763 | c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); |
| 764 | d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); |
| 765 | c = vec_and (c, d); |
| 766 | /* Then we merge the lower float result with the original upper |
| 767 | * float elements from __A. */ |
| 768 | return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); |
| 769 | } |
| 770 | |
| 771 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 772 | _mm_cmpunord_ss (__m128 __A, __m128 __B) |
| 773 | { |
| 774 | __vector unsigned int a, b; |
| 775 | __vector unsigned int c, d; |
| 776 | static const __vector unsigned int float_exp_mask = |
| 777 | { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; |
| 778 | static const __vector unsigned int mask = |
| 779 | { 0xffffffff, 0, 0, 0 }; |
| 780 | |
| 781 | a = (__vector unsigned int) vec_abs ((__v4sf)__A); |
| 782 | b = (__vector unsigned int) vec_abs ((__v4sf)__B); |
| 783 | c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); |
| 784 | d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); |
| 785 | c = vec_or (c, d); |
| 786 | /* Then we merge the lower float result with the original upper |
| 787 | * float elements from __A. */ |
| 788 | return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask)); |
| 789 | } |
| 790 | |
| 791 | /* Compare the lower SPFP values of A and B and return 1 if true |
| 792 | and 0 if false. */ |
| 793 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 794 | _mm_comieq_ss (__m128 __A, __m128 __B) |
| 795 | { |
| 796 | return (__A[0] == __B[0]); |
| 797 | } |
| 798 | |
| 799 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 800 | _mm_comilt_ss (__m128 __A, __m128 __B) |
| 801 | { |
| 802 | return (__A[0] < __B[0]); |
| 803 | } |
| 804 | |
| 805 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 806 | _mm_comile_ss (__m128 __A, __m128 __B) |
| 807 | { |
| 808 | return (__A[0] <= __B[0]); |
| 809 | } |
| 810 | |
| 811 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 812 | _mm_comigt_ss (__m128 __A, __m128 __B) |
| 813 | { |
| 814 | return (__A[0] > __B[0]); |
| 815 | } |
| 816 | |
| 817 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 818 | _mm_comige_ss (__m128 __A, __m128 __B) |
| 819 | { |
| 820 | return (__A[0] >= __B[0]); |
| 821 | } |
| 822 | |
| 823 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 824 | _mm_comineq_ss (__m128 __A, __m128 __B) |
| 825 | { |
| 826 | return (__A[0] != __B[0]); |
| 827 | } |
| 828 | |
| 829 | /* FIXME |
| 830 | * The _mm_ucomi??_ss implementations below are exactly the same as |
| 831 | * _mm_comi??_ss because GCC for PowerPC only generates unordered |
| 832 | * compares (scalar and vector). |
| 833 | * Technically _mm_comieq_ss et al. should be using the ordered |
| 834 | * compare and signal for QNaNs. |
| 835 | * The _mm_ucomieq_ss et al. should be OK as is. |
| 836 | */ |
| 837 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 838 | _mm_ucomieq_ss (__m128 __A, __m128 __B) |
| 839 | { |
| 840 | return (__A[0] == __B[0]); |
| 841 | } |
| 842 | |
| 843 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 844 | _mm_ucomilt_ss (__m128 __A, __m128 __B) |
| 845 | { |
| 846 | return (__A[0] < __B[0]); |
| 847 | } |
| 848 | |
| 849 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 850 | _mm_ucomile_ss (__m128 __A, __m128 __B) |
| 851 | { |
| 852 | return (__A[0] <= __B[0]); |
| 853 | } |
| 854 | |
| 855 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 856 | _mm_ucomigt_ss (__m128 __A, __m128 __B) |
| 857 | { |
| 858 | return (__A[0] > __B[0]); |
| 859 | } |
| 860 | |
| 861 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 862 | _mm_ucomige_ss (__m128 __A, __m128 __B) |
| 863 | { |
| 864 | return (__A[0] >= __B[0]); |
| 865 | } |
| 866 | |
| 867 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 868 | _mm_ucomineq_ss (__m128 __A, __m128 __B) |
| 869 | { |
| 870 | return (__A[0] != __B[0]); |
| 871 | } |
| 872 | |
| 873 | extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 874 | _mm_cvtss_f32 (__m128 __A) |
| 875 | { |
| 876 | return ((__v4sf)__A)[0]; |
| 877 | } |
| 878 | |
| 879 | /* Convert the lower SPFP value to a 32-bit integer according to the current |
| 880 | rounding mode. */ |
| 881 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 882 | _mm_cvtss_si32 (__m128 __A) |
| 883 | { |
| 884 | __m64 res = 0; |
| 885 | #ifdef _ARCH_PWR8 |
| 886 | double dtmp; |
| 887 | __asm__( |
| 888 | #ifdef __LITTLE_ENDIAN__ |
| 889 | "xxsldwi %x0,%x0,%x0,3;\n" |
| 890 | #endif |
| 891 | "xscvspdp %x2,%x0;\n" |
| 892 | "fctiw %2,%2;\n" |
| 893 | "mfvsrd %1,%x2;\n" |
| 894 | : "+wa" (__A), |
| 895 | "=r" (res), |
| 896 | "=f" (dtmp) |
| 897 | : ); |
| 898 | #else |
| 899 | res = __builtin_rint(__A[0]); |
| 900 | #endif |
| 901 | return (res); |
| 902 | } |
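/* Editorial note: an illustrative usage sketch, not part of the original
   header.  The conversion honours the current rounding mode (round to
   nearest, ties to even, by default), so halfway cases round to the even
   integer:

     int i = _mm_cvtss_si32 (_mm_set_ss (2.5f));   // 2 under the default mode
     int j = _mm_cvtss_si32 (_mm_set_ss (3.5f));   // 4 under the default mode
*/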
| 903 | |
| 904 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 905 | _mm_cvt_ss2si (__m128 __A) |
| 906 | { |
| 907 | return _mm_cvtss_si32 (__A); |
| 908 | } |
| 909 | |
| 910 | /* Convert the lower SPFP value to a 64-bit integer according to the |
| 911 | current rounding mode. */ |
| 912 | |
| 913 | /* Intel intrinsic. */ |
| 914 | extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 915 | _mm_cvtss_si64 (__m128 __A) |
| 916 | { |
| 917 | __m64 res = 0; |
| 918 | #ifdef _ARCH_PWR8 |
| 919 | double dtmp; |
| 920 | __asm__( |
| 921 | #ifdef __LITTLE_ENDIAN__ |
| 922 | "xxsldwi %x0,%x0,%x0,3;\n" |
| 923 | #endif |
| 924 | "xscvspdp %x2,%x0;\n" |
| 925 | "fctid %2,%2;\n" |
| 926 | "mfvsrd %1,%x2;\n" |
| 927 | : "+wa" (__A), |
| 928 | "=r" (res), |
| 929 | "=f" (dtmp) |
| 930 | : ); |
| 931 | #else |
| 932 | res = __builtin_llrint(__A[0]); |
| 933 | #endif |
| 934 | return (res); |
| 935 | } |
| 936 | |
| 937 | /* Microsoft intrinsic. */ |
| 938 | extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 939 | _mm_cvtss_si64x (__m128 __A) |
| 940 | { |
| 941 | return _mm_cvtss_si64 ((__v4sf) __A); |
| 942 | } |
| 943 | |
| 944 | /* Constants for use with _mm_prefetch. */ |
| 945 | enum _mm_hint |
| 946 | { |
| 947 | /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */ |
| 948 | _MM_HINT_ET0 = 7, |
| 949 | _MM_HINT_ET1 = 6, |
| 950 | _MM_HINT_T0 = 3, |
| 951 | _MM_HINT_T1 = 2, |
| 952 | _MM_HINT_T2 = 1, |
| 953 | _MM_HINT_NTA = 0 |
| 954 | }; |
| 955 | |
| 956 | /* Loads one cache line from address P to a location "closer" to the |
| 957 | processor. The selector I specifies the type of prefetch operation. */ |
| 958 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 959 | _mm_prefetch (const void *__P, enum _mm_hint __I) |
| 960 | { |
| 961 | /* Current PowerPC implementations ignore the hint parameter. */ |
| 962 | __builtin_prefetch (__P); |
| 963 | } |
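/* Editorial note: an illustrative usage sketch, not part of the original
   header.  The hint argument is accepted for source compatibility, but the
   implementation above issues the same generic prefetch regardless of it:

     extern float data[];
     _mm_prefetch (&data[512], _MM_HINT_T0);   // pull a later cache line closer
*/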
| 964 | |
| 965 | /* Convert the two lower SPFP values to 32-bit integers according to the |
| 966 | current rounding mode. Return the integers in packed form. */ |
| 967 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 968 | _mm_cvtps_pi32 (__m128 __A) |
| 969 | { |
| 971 | __v4sf temp, rounded; |
| 972 | __vector unsigned long long result; |
| 973 | |
| 974 | /* Splat two lower SPFP values to both halves. */ |
| 975 | temp = (__v4sf) vec_splat ((__vector long long)__A, 0); |
| 976 | rounded = vec_rint(temp); |
| 977 | result = (__vector unsigned long long) vec_cts (rounded, 0); |
| 978 | |
| 979 | return (__m64) ((__vector long long) result)[0]; |
| 980 | } |
| 981 | |
| 982 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 983 | _mm_cvt_ps2pi (__m128 __A) |
| 984 | { |
| 985 | return _mm_cvtps_pi32 (__A); |
| 986 | } |
| 987 | |
| 988 | /* Truncate the lower SPFP value to a 32-bit integer. */ |
| 989 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 990 | _mm_cvttss_si32 (__m128 __A) |
| 991 | { |
| 992 | /* Extract the lower float element. */ |
| 993 | float temp = __A[0]; |
| 994 | /* truncate to 32-bit integer and return. */ |
| 995 | return temp; |
| 996 | } |
| 997 | |
| 998 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 999 | _mm_cvtt_ss2si (__m128 __A) |
| 1000 | { |
| 1001 | return _mm_cvttss_si32 (__A); |
| 1002 | } |
| 1003 | |
| 1004 | /* Intel intrinsic. */ |
| 1005 | extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1006 | _mm_cvttss_si64 (__m128 __A) |
| 1007 | { |
| 1008 | /* Extract the lower float element. */ |
| 1009 | float temp = __A[0]; |
| 1010 | /* Truncate to a 64-bit integer and return. */ |
| 1011 | return temp; |
| 1012 | } |
| 1013 | |
| 1014 | /* Microsoft intrinsic. */ |
| 1015 | extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1016 | _mm_cvttss_si64x (__m128 __A) |
| 1017 | { |
| 1018 | /* Extract the lower float element. */ |
| 1019 | float temp = __A[0]; |
| 1020 | /* Truncate to a 64-bit integer and return. */ |
| 1021 | return temp; |
| 1022 | } |
| 1023 | |
| 1024 | /* Truncate the two lower SPFP values to 32-bit integers. Return the |
| 1025 | integers in packed form. */ |
| 1026 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1027 | _mm_cvttps_pi32 (__m128 __A) |
| 1028 | { |
| 1029 | __v4sf temp; |
| 1030 | __vector unsigned long long result; |
| 1031 | |
| 1032 | /* Splat two lower SPFP values to both halves. */ |
| 1033 | temp = (__v4sf) vec_splat ((__vector long long)__A, 0); |
| 1034 | result = (__vector unsigned long long) vec_cts (temp, 0); |
| 1035 | |
| 1036 | return (__m64) ((__vector long long) result)[0]; |
| 1037 | } |
| 1038 | |
| 1039 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1040 | _mm_cvtt_ps2pi (__m128 __A) |
| 1041 | { |
| 1042 | return _mm_cvttps_pi32 (__A); |
| 1043 | } |
| 1044 | |
| 1045 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
| 1046 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1047 | _mm_cvtsi32_ss (__m128 __A, int __B) |
| 1048 | { |
| 1049 | float temp = __B; |
| 1050 | __A[0] = temp; |
| 1051 | |
| 1052 | return __A; |
| 1053 | } |
| 1054 | |
| 1055 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1056 | _mm_cvt_si2ss (__m128 __A, int __B) |
| 1057 | { |
| 1058 | return _mm_cvtsi32_ss (__A, __B); |
| 1059 | } |
| 1060 | |
| 1061 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
| 1062 | /* Intel intrinsic. */ |
| 1063 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1064 | _mm_cvtsi64_ss (__m128 __A, long long __B) |
| 1065 | { |
| 1066 | float temp = __B; |
| 1067 | __A[0] = temp; |
| 1068 | |
| 1069 | return __A; |
| 1070 | } |
| 1071 | |
| 1072 | /* Microsoft intrinsic. */ |
| 1073 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1074 | _mm_cvtsi64x_ss (__m128 __A, long long __B) |
| 1075 | { |
| 1076 | return _mm_cvtsi64_ss (__A, __B); |
| 1077 | } |
| 1078 | |
| 1079 | /* Convert the two 32-bit values in B to SPFP form and insert them |
| 1080 | as the two lower elements in A. */ |
| 1081 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1082 | _mm_cvtpi32_ps (__m128 __A, __m64 __B) |
| 1083 | { |
| 1084 | __vector signed int vm1; |
| 1085 | __vector float vf1; |
| 1086 | |
| 1087 | vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; |
| 1088 | vf1 = (__vector float) vec_ctf (vm1, 0); |
| 1089 | |
| 1090 | return ((__m128) (__vector unsigned long long) |
| 1091 | { ((__vector unsigned long long)vf1) [0], |
| 1092 | ((__vector unsigned long long)__A) [1]}); |
| 1093 | } |
| 1094 | |
| 1095 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1096 | _mm_cvt_pi2ps (__m128 __A, __m64 __B) |
| 1097 | { |
| 1098 | return _mm_cvtpi32_ps (__A, __B); |
| 1099 | } |
| 1100 | |
| 1101 | /* Convert the four signed 16-bit values in A to SPFP form. */ |
| 1102 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1103 | _mm_cvtpi16_ps (__m64 __A) |
| 1104 | { |
| 1105 | __vector signed short vs8; |
| 1106 | __vector signed int vi4; |
| 1107 | __vector float vf1; |
| 1108 | |
| 1109 | vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; |
| 1110 | vi4 = vec_vupklsh (vs8); |
| 1111 | vf1 = (__vector float) vec_ctf (vi4, 0); |
| 1112 | |
| 1113 | return (__m128) vf1; |
| 1114 | } |
| 1115 | |
| 1116 | /* Convert the four unsigned 16-bit values in A to SPFP form. */ |
| 1117 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1118 | _mm_cvtpu16_ps (__m64 __A) |
| 1119 | { |
| 1120 | const __vector unsigned short zero = |
| 1121 | { 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 1122 | __vector unsigned short vs8; |
| 1123 | __vector unsigned int vi4; |
| 1124 | __vector float vf1; |
| 1125 | |
| 1126 | vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; |
| 1127 | vi4 = (__vector unsigned int) vec_mergel |
| 1128 | #ifdef __LITTLE_ENDIAN__ |
| 1129 | (vs8, zero); |
| 1130 | #else |
| 1131 | (zero, vs8); |
| 1132 | #endif |
| 1133 | vf1 = (__vector float) vec_ctf (vi4, 0); |
| 1134 | |
| 1135 | return (__m128) vf1; |
| 1136 | } |
| 1137 | |
| 1138 | /* Convert the low four signed 8-bit values in A to SPFP form. */ |
| 1139 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1140 | _mm_cvtpi8_ps (__m64 __A) |
| 1141 | { |
| 1142 | __vector signed char vc16; |
| 1143 | __vector signed short vs8; |
| 1144 | __vector signed int vi4; |
| 1145 | __vector float vf1; |
| 1146 | |
| 1147 | vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; |
| 1148 | vs8 = vec_vupkhsb (vc16); |
| 1149 | vi4 = vec_vupkhsh (vs8); |
| 1150 | vf1 = (__vector float) vec_ctf (vi4, 0); |
| 1151 | |
| 1152 | return (__m128) vf1; |
| 1153 | } |
| 1154 | |
| 1155 | /* Convert the low four unsigned 8-bit values in A to SPFP form. */ |
| 1156 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1158 | _mm_cvtpu8_ps (__m64 __A) |
| 1159 | { |
| 1160 | const __vector unsigned char zero = |
| 1161 | { 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 1162 | __vector unsigned char vc16; |
| 1163 | __vector unsigned short vs8; |
| 1164 | __vector unsigned int vi4; |
| 1165 | __vector float vf1; |
| 1166 | |
| 1167 | vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; |
| 1168 | #ifdef __LITTLE_ENDIAN__ |
| 1169 | vs8 = (__vector unsigned short) vec_mergel (vc16, zero); |
| 1170 | vi4 = (__vector unsigned int) vec_mergeh (vs8, |
| 1171 | (__vector unsigned short) zero); |
| 1172 | #else |
| 1173 | vs8 = (__vector unsigned short) vec_mergel (zero, vc16); |
| 1174 | vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, |
| 1175 | vs8); |
| 1176 | #endif |
| 1177 | vf1 = (__vector float) vec_ctf (vi4, 0); |
| 1178 | |
| 1179 | return (__m128) vf1; |
| 1180 | } |
| 1181 | |
| 1182 | /* Convert the four signed 32-bit values in A and B to SPFP form. */ |
| 1183 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1184 | _mm_cvtpi32x2_ps (__m64 __A, __m64 __B) |
| 1185 | { |
| 1186 | __vector signed int vi4; |
| 1187 | __vector float vf4; |
| 1188 | |
| 1189 | vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; |
| 1190 | vf4 = (__vector float) vec_ctf (vi4, 0); |
| 1191 | return (__m128) vf4; |
| 1192 | } |
| 1193 | |
| 1194 | /* Convert the four SPFP values in A to four signed 16-bit integers. */ |
| 1195 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1196 | _mm_cvtps_pi16 (__m128 __A) |
| 1197 | { |
| 1198 | __v4sf rounded; |
| 1199 | __vector signed int temp; |
| 1200 | __vector unsigned long long result; |
| 1201 | |
| 1202 | rounded = vec_rint(__A); |
| 1203 | temp = vec_cts (rounded, 0); |
| 1204 | result = (__vector unsigned long long) vec_pack (temp, temp); |
| 1205 | |
| 1206 | return (__m64) ((__vector long long) result)[0]; |
| 1207 | } |
| 1208 | |
| 1209 | /* Convert the four SPFP values in A to four signed 8-bit integers. */ |
| 1210 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1211 | _mm_cvtps_pi8 (__m128 __A) |
| 1212 | { |
| 1213 | __v4sf rounded; |
| 1214 | __vector signed int tmp_i; |
| 1215 | static const __vector signed int zero = {0, 0, 0, 0}; |
| 1216 | __vector signed short tmp_s; |
| 1217 | __vector signed char res_v; |
| 1218 | |
| 1219 | rounded = vec_rint(__A); |
| 1220 | tmp_i = vec_cts (rounded, 0); |
| 1221 | tmp_s = vec_pack (tmp_i, zero); |
| 1222 | res_v = vec_pack (tmp_s, tmp_s); |
| 1223 | return (__m64) ((__vector long long) res_v)[0]; |
| 1224 | } |
| 1225 | |
| 1226 | /* Selects four specific SPFP values from A and B based on MASK. */ |
| 1227 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1229 | _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) |
| 1230 | { |
| 1231 | unsigned long element_selector_10 = __mask & 0x03; |
| 1232 | unsigned long element_selector_32 = (__mask >> 2) & 0x03; |
| 1233 | unsigned long element_selector_54 = (__mask >> 4) & 0x03; |
| 1234 | unsigned long element_selector_76 = (__mask >> 6) & 0x03; |
| 1235 | static const unsigned int permute_selectors[4] = |
| 1236 | { |
| 1237 | #ifdef __LITTLE_ENDIAN__ |
| 1238 | 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C |
| 1239 | #else |
| 1240 | 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F |
| 1241 | #endif |
| 1242 | }; |
| 1243 | __vector unsigned int t; |
| 1244 | |
| 1245 | t[0] = permute_selectors[element_selector_10]; |
| 1246 | t[1] = permute_selectors[element_selector_32]; |
| 1247 | t[2] = permute_selectors[element_selector_54] + 0x10101010; |
| 1248 | t[3] = permute_selectors[element_selector_76] + 0x10101010; |
| 1249 | return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); |
| 1250 | } |
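/* Editorial note: an illustrative usage sketch, not part of the original
   header.  The low two selector fields pick elements of __A and the high two
   pick elements of __B, exactly as on x86:

     __m128 a = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);   // a = {0, 1, 2, 3}
     __m128 b = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);   // b = {4, 5, 6, 7}
     __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (1, 0, 3, 2));
     // r = {a[2], a[3], b[0], b[1]} = {2, 3, 4, 5}
*/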
| 1251 | |
| 1252 | /* Selects and interleaves the upper two SPFP values from A and B. */ |
| 1253 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1254 | _mm_unpackhi_ps (__m128 __A, __m128 __B) |
| 1255 | { |
| 1256 | return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); |
| 1257 | } |
| 1258 | |
| 1259 | /* Selects and interleaves the lower two SPFP values from A and B. */ |
| 1260 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1261 | _mm_unpacklo_ps (__m128 __A, __m128 __B) |
| 1262 | { |
| 1263 | return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); |
| 1264 | } |
| 1265 | |
| 1266 | /* Sets the upper two SPFP values with 64-bits of data loaded from P; |
| 1267 | the lower two values are passed through from A. */ |
| 1268 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1269 | _mm_loadh_pi (__m128 __A, __m64 const *__P) |
| 1270 | { |
| 1271 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
| 1272 | __vector unsigned long long __p = vec_splats(*__P); |
| 1273 | __a [1] = __p [1]; |
| 1274 | |
| 1275 | return (__m128)__a; |
| 1276 | } |
| 1277 | |
| 1278 | /* Stores the upper two SPFP values of A into P. */ |
| 1279 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1280 | _mm_storeh_pi (__m64 *__P, __m128 __A) |
| 1281 | { |
| 1282 | __vector unsigned long long __a = (__vector unsigned long long) __A; |
| 1283 | |
| 1284 | *__P = __a[1]; |
| 1285 | } |
| 1286 | |
| 1287 | /* Moves the upper two values of B into the lower two values of A. */ |
| 1288 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1289 | _mm_movehl_ps (__m128 __A, __m128 __B) |
| 1290 | { |
| 1291 | return (__m128) vec_mergel ((__vector unsigned long long)__B, |
| 1292 | (__vector unsigned long long)__A); |
| 1293 | } |
| 1294 | |
| 1295 | /* Moves the lower two values of B into the upper two values of A. */ |
| 1296 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1297 | _mm_movelh_ps (__m128 __A, __m128 __B) |
| 1298 | { |
| 1299 | return (__m128) vec_mergeh ((__vector unsigned long long)__A, |
| 1300 | (__vector unsigned long long)__B); |
| 1301 | } |
| 1302 | |
| 1303 | /* Sets the lower two SPFP values with 64-bits of data loaded from P; |
| 1304 | the upper two values are passed through from A. */ |
| 1305 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1306 | _mm_loadl_pi (__m128 __A, __m64 const *__P) |
| 1307 | { |
| 1308 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
| 1309 | __vector unsigned long long __p = vec_splats(*__P); |
| 1310 | __a [0] = __p [0]; |
| 1311 | |
| 1312 | return (__m128)__a; |
| 1313 | } |
| 1314 | |
| 1315 | /* Stores the lower two SPFP values of A into P. */ |
| 1316 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1317 | _mm_storel_pi (__m64 *__P, __m128 __A) |
| 1318 | { |
| 1319 | __vector unsigned long long __a = (__vector unsigned long long) __A; |
| 1320 | |
| 1321 | *__P = __a[0]; |
| 1322 | } |
| 1323 | |
| 1324 | #ifdef _ARCH_PWR8 |
| 1325 | /* Intrinsic functions that require PowerISA 2.07 minimum. */ |
| 1326 | |
| 1327 | /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ |
| 1328 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1329 | _mm_movemask_ps (__m128 __A) |
| 1330 | { |
| 1331 | __vector unsigned long long result; |
| 1332 | static const __vector unsigned int perm_mask = |
| 1333 | { |
| 1334 | #ifdef __LITTLE_ENDIAN__ |
| 1335 | 0x00204060, 0x80808080, 0x80808080, 0x80808080 |
| 1336 | #else |
| 1337 | 0x80808080, 0x80808080, 0x80808080, 0x00204060 |
| 1338 | #endif |
| 1339 | }; |
| 1340 | |
| 1341 | result = ((__vector unsigned long long) |
| 1342 | vec_vbpermq ((__vector unsigned char) __A, |
| 1343 | (__vector unsigned char) perm_mask)); |
| 1344 | |
| 1345 | #ifdef __LITTLE_ENDIAN__ |
| 1346 | return result[1]; |
| 1347 | #else |
| 1348 | return result[0]; |
| 1349 | #endif |
| 1350 | } |
| 1351 | #endif /* _ARCH_PWR8 */ |
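/* Illustrative sketch: bit N of the result is the sign bit of lane N.
   For example, _mm_movemask_ps (_mm_set_ps (-4.0f, 3.0f, -2.0f, 1.0f))
   yields 0xa, since lanes 1 and 3 are negative.  */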
| 1352 | |
| 1353 | /* Create a vector with all four elements equal to *P. */ |
| 1354 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1355 | _mm_load1_ps (float const *__P) |
| 1356 | { |
| 1357 | return _mm_set1_ps (*__P); |
| 1358 | } |
| 1359 | |
| 1360 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1361 | _mm_load_ps1 (float const *__P) |
| 1362 | { |
| 1363 | return _mm_load1_ps (__P); |
| 1364 | } |
| 1365 | |
| 1366 | /* Extracts one of the four words of A. The selector N must be immediate. */ |
| 1367 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1368 | _mm_extract_pi16 (__m64 const __A, int const __N) |
| 1369 | { |
| 1370 | unsigned int shiftr = __N & 3; |
| 1371 | #ifdef __BIG_ENDIAN__ |
| 1372 | shiftr = 3 - shiftr; |
| 1373 | #endif |
| 1374 | |
| 1375 | return ((__A >> (shiftr * 16)) & 0xffff); |
| 1376 | } |
| 1377 | |
| 1378 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1379 | _m_pextrw (__m64 const __A, int const __N) |
| 1380 | { |
| 1381 | return _mm_extract_pi16 (__A, __N); |
| 1382 | } |
| 1383 | |
| 1384 | /* Inserts word D into one of four words of A. The selector N must be |
| 1385 | immediate. */ |
| 1386 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1387 | _mm_insert_pi16 (__m64 const __A, int const __D, int const __N) |
| 1388 | { |
| 1389 | const int shiftl = (__N & 3) * 16; |
| 1390 | const __m64 shiftD = (const __m64) __D << shiftl; |
| 1391 | const __m64 mask = 0xffffUL << shiftl; |
| 1392 | __m64 result = (__A & (~mask)) | (shiftD & mask); |
| 1393 | |
| 1394 | return (result); |
| 1395 | } |
| 1396 | |
| 1397 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1398 | _m_pinsrw (__m64 const __A, int const __D, int const __N) |
| 1399 | { |
| 1400 | return _mm_insert_pi16 (__A, __D, __N); |
| 1401 | } |
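/* A small sketch of the word selectors used above (illustrative only;
   word 0 is the least-significant 16 bits):

     __m64 __w  = _mm_set_pi16 (4, 3, 2, 1);       // words {1,2,3,4}
     int   __e  = _mm_extract_pi16 (__w, 2);       // 3
     __m64 __w2 = _mm_insert_pi16 (__w, 99, 0);    // words {99,2,3,4}
*/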
| 1402 | |
| 1403 | /* Compute the element-wise maximum of signed 16-bit values. */ |
| 1404 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1406 | _mm_max_pi16 (__m64 __A, __m64 __B) |
| 1407 | { |
| 1408 | #if _ARCH_PWR8 |
| 1409 | __vector signed short a, b, r; |
| 1410 | __vector __bool short c; |
| 1411 | |
| 1412 | a = (__vector signed short)vec_splats (__A); |
| 1413 | b = (__vector signed short)vec_splats (__B); |
| 1414 | c = (__vector __bool short)vec_cmpgt (a, b); |
| 1415 | r = vec_sel (b, a, c); |
| 1416 | return (__m64) ((__vector long long) r)[0]; |
| 1417 | #else |
| 1418 | __m64_union m1, m2, res; |
| 1419 | |
| 1420 | m1.as_m64 = __A; |
| 1421 | m2.as_m64 = __B; |
| 1422 | |
| 1423 | res.as_short[0] = |
| 1424 | (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; |
| 1425 | res.as_short[1] = |
| 1426 | (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; |
| 1427 | res.as_short[2] = |
| 1428 | (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; |
| 1429 | res.as_short[3] = |
| 1430 | (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; |
| 1431 | |
| 1432 | return (__m64) res.as_m64; |
| 1433 | #endif |
| 1434 | } |
| 1435 | |
| 1436 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1437 | _m_pmaxsw (__m64 __A, __m64 __B) |
| 1438 | { |
| 1439 | return _mm_max_pi16 (__A, __B); |
| 1440 | } |
| 1441 | |
| 1442 | /* Compute the element-wise maximum of unsigned 8-bit values. */ |
| 1443 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1444 | _mm_max_pu8 (__m64 __A, __m64 __B) |
| 1445 | { |
| 1446 | #if _ARCH_PWR8 |
| 1447 | __vector unsigned char a, b, r; |
| 1448 | __vector __bool char c; |
| 1449 | |
| 1450 | a = (__vector unsigned char)vec_splats (__A); |
| 1451 | b = (__vector unsigned char)vec_splats (__B); |
| 1452 | c = (__vector __bool char)vec_cmpgt (a, b); |
| 1453 | r = vec_sel (b, a, c); |
| 1454 | return (__m64) ((__vector long long) r)[0]; |
| 1455 | #else |
| 1456 | __m64_union m1, m2, res; |
| 1457 | long i; |
| 1458 | |
| 1459 | m1.as_m64 = __A; |
| 1460 | m2.as_m64 = __B; |
| 1461 | |
| 1462 | |
| 1463 | for (i = 0; i < 8; i++) |
| 1464 | res.as_char[i] = |
| 1465 | ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? |
| 1466 | m1.as_char[i] : m2.as_char[i]; |
| 1467 | |
| 1468 | return (__m64) res.as_m64; |
| 1469 | #endif |
| 1470 | } |
| 1471 | |
| 1472 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1473 | _m_pmaxub (__m64 __A, __m64 __B) |
| 1474 | { |
| 1475 | return _mm_max_pu8 (__A, __B); |
| 1476 | } |
| 1477 | |
| 1478 | /* Compute the element-wise minimum of signed 16-bit values. */ |
| 1479 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1480 | _mm_min_pi16 (__m64 __A, __m64 __B) |
| 1481 | { |
| 1482 | #if _ARCH_PWR8 |
| 1483 | __vector signed short a, b, r; |
| 1484 | __vector __bool short c; |
| 1485 | |
| 1486 | a = (__vector signed short)vec_splats (__A); |
| 1487 | b = (__vector signed short)vec_splats (__B); |
| 1488 | c = (__vector __bool short)vec_cmplt (a, b); |
| 1489 | r = vec_sel (b, a, c); |
| 1490 | return (__m64) ((__vector long long) r)[0]; |
| 1491 | #else |
| 1492 | __m64_union m1, m2, res; |
| 1493 | |
| 1494 | m1.as_m64 = __A; |
| 1495 | m2.as_m64 = __B; |
| 1496 | |
| 1497 | res.as_short[0] = |
| 1498 | (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; |
| 1499 | res.as_short[1] = |
| 1500 | (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; |
| 1501 | res.as_short[2] = |
| 1502 | (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; |
| 1503 | res.as_short[3] = |
| 1504 | (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; |
| 1505 | |
| 1506 | return (__m64) res.as_m64; |
| 1507 | #endif |
| 1508 | } |
| 1509 | |
| 1510 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1511 | _m_pminsw (__m64 __A, __m64 __B) |
| 1512 | { |
| 1513 | return _mm_min_pi16 (__A, __B); |
| 1514 | } |
| 1515 | |
| 1516 | /* Compute the element-wise minimum of unsigned 8-bit values. */ |
| 1517 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1518 | _mm_min_pu8 (__m64 __A, __m64 __B) |
| 1519 | { |
| 1520 | #if _ARCH_PWR8 |
| 1521 | __vector unsigned char a, b, r; |
| 1522 | __vector __bool char c; |
| 1523 | |
| 1524 | a = (__vector unsigned char)vec_splats (__A); |
| 1525 | b = (__vector unsigned char)vec_splats (__B); |
| 1526 | c = (__vector __bool char)vec_cmplt (a, b); |
| 1527 | r = vec_sel (b, a, c); |
| 1528 | return (__m64) ((__vector long long) r)[0]; |
| 1529 | #else |
| 1530 | __m64_union m1, m2, res; |
| 1531 | long i; |
| 1532 | |
| 1533 | m1.as_m64 = __A; |
| 1534 | m2.as_m64 = __B; |
| 1535 | |
| 1536 | |
| 1537 | for (i = 0; i < 8; i++) |
| 1538 | res.as_char[i] = |
| 1539 | ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? |
| 1540 | m1.as_char[i] : m2.as_char[i]; |
| 1541 | |
| 1542 | return (__m64) res.as_m64; |
| 1543 | #endif |
| 1544 | } |
| 1545 | |
| 1546 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1547 | _m_pminub (__m64 __A, __m64 __B) |
| 1548 | { |
| 1549 | return _mm_min_pu8 (__A, __B); |
| 1550 | } |
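/* Element-wise min/max sketch (illustrative values only):

     __m64 __a = _mm_set_pi16 (-1,  7, 0,  5);     // words {5,0,7,-1}
     __m64 __b = _mm_set_pi16 ( 2, -7, 3, -5);     // words {-5,3,-7,2}
     __m64 __mx = _mm_max_pi16 (__a, __b);         // words {5,3,7,2}
     __m64 __mn = _mm_min_pi16 (__a, __b);         // words {-5,0,-7,-1}
*/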
| 1551 | |
| 1552 | /* Create an 8-bit mask of the signs of 8-bit values. */ |
| 1553 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1554 | _mm_movemask_pi8 (__m64 __A) |
| 1555 | { |
| 1556 | unsigned long long p = |
| 1557 | #ifdef __LITTLE_ENDIAN__ |
| 1558 | 0x0008101820283038UL; // permute control for sign bits |
| 1559 | #else |
| 1560 | 0x3830282018100800UL; // permute control for sign bits |
| 1561 | #endif |
| 1562 | return __builtin_bpermd (p, __A); |
| 1563 | } |
| 1564 | |
| 1565 | extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1566 | _m_pmovmskb (__m64 __A) |
| 1567 | { |
| 1568 | return _mm_movemask_pi8 (__A); |
| 1569 | } |
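/* Sketch: bit N of the result is the high (sign) bit of byte N.  For
   example, with bytes 0 and 3 negative:

     __m64 __v = _mm_set_pi8 (0, 0, 0, 0, -1, 0, 0, -1);
     int   __m = _mm_movemask_pi8 (__v);           // 0x9 (bits 0 and 3 set)
*/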
| 1570 | |
| 1571 | /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values |
| 1572 | in B and produce the high 16 bits of the 32-bit results. */ |
| 1573 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1574 | _mm_mulhi_pu16 (__m64 __A, __m64 __B) |
| 1575 | { |
| 1576 | __vector unsigned short a, b; |
| 1577 | __vector unsigned short c; |
| 1578 | __vector unsigned int w0, w1; |
| 1579 | __vector unsigned char xform1 = { |
| 1580 | #ifdef __LITTLE_ENDIAN__ |
| 1581 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, |
| 1582 | 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F |
| 1583 | #else |
| 1584 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, |
| 1585 | 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D |
| 1586 | #endif |
| 1587 | }; |
| 1588 | |
| 1589 | a = (__vector unsigned short)vec_splats (__A); |
| 1590 | b = (__vector unsigned short)vec_splats (__B); |
| 1591 | |
| 1592 | w0 = vec_vmuleuh (a, b); |
| 1593 | w1 = vec_vmulouh (a, b); |
| 1594 | c = (__vector unsigned short)vec_perm (w0, w1, xform1); |
| 1595 | |
| 1596 | return (__m64) ((__vector long long) c)[0]; |
| 1597 | } |
| 1598 | |
| 1599 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1600 | _m_pmulhuw (__m64 __A, __m64 __B) |
| 1601 | { |
| 1602 | return _mm_mulhi_pu16 (__A, __B); |
| 1603 | } |
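/* For instance (illustrative only; note the cast, since 40000 does not fit
   in a signed short but its bit pattern is the intended unsigned value):

     __m64 __a = _mm_set1_pi16 ((short) 40000);
     __m64 __h = _mm_mulhi_pu16 (__a, __a);   // each word: (40000*40000) >> 16 == 24414
*/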
| 1604 | |
| 1605 | /* Return a combination of the four 16-bit values in A. The selector |
| 1606 | must be an immediate. */ |
| 1607 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1608 | _mm_shuffle_pi16 (__m64 __A, int const __N) |
| 1609 | { |
| 1610 | unsigned long element_selector_10 = __N & 0x03; |
| 1611 | unsigned long element_selector_32 = (__N >> 2) & 0x03; |
| 1612 | unsigned long element_selector_54 = (__N >> 4) & 0x03; |
| 1613 | unsigned long element_selector_76 = (__N >> 6) & 0x03; |
| 1614 | static const unsigned short permute_selectors[4] = |
| 1615 | { |
| 1616 | #ifdef __LITTLE_ENDIAN__ |
| 1617 | 0x0908, 0x0B0A, 0x0D0C, 0x0F0E |
| 1618 | #else |
| 1619 | 0x0607, 0x0405, 0x0203, 0x0001 |
| 1620 | #endif |
| 1621 | }; |
| 1622 | __m64_union t; |
| 1623 | __vector unsigned long long a, p, r; |
| 1624 | |
| 1625 | #ifdef __LITTLE_ENDIAN__ |
| 1626 | t.as_short[0] = permute_selectors[element_selector_10]; |
| 1627 | t.as_short[1] = permute_selectors[element_selector_32]; |
| 1628 | t.as_short[2] = permute_selectors[element_selector_54]; |
| 1629 | t.as_short[3] = permute_selectors[element_selector_76]; |
| 1630 | #else |
| 1631 | t.as_short[3] = permute_selectors[element_selector_10]; |
| 1632 | t.as_short[2] = permute_selectors[element_selector_32]; |
| 1633 | t.as_short[1] = permute_selectors[element_selector_54]; |
| 1634 | t.as_short[0] = permute_selectors[element_selector_76]; |
| 1635 | #endif |
| 1636 | p = vec_splats (t.as_m64); |
| 1637 | a = vec_splats (__A); |
| 1638 | r = vec_perm (a, a, (__vector unsigned char)p); |
| 1639 | return (__m64) ((__vector long long) r)[0]; |
| 1640 | } |
| 1641 | |
| 1642 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1643 | _m_pshufw (__m64 __A, int const __N) |
| 1644 | { |
| 1645 | return _mm_shuffle_pi16 (__A, __N); |
| 1646 | } |
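/* Selector sketch: each 2-bit field of N picks the source word for the
   corresponding result word, so _MM_SHUFFLE (0,1,2,3) reverses the words.
   Illustrative only:

     __m64 __w = _mm_set_pi16 (4, 3, 2, 1);                        // words {1,2,3,4}
     __m64 __r = _mm_shuffle_pi16 (__w, _MM_SHUFFLE (0, 1, 2, 3)); // words {4,3,2,1}
*/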
| 1647 | |
| 1648 | /* Conditionally store byte elements of A into P. The high bit of each |
| 1649 | byte in the selector N determines whether the corresponding byte from |
| 1650 | A is stored. */ |
| 1651 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1652 | _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) |
| 1653 | { |
| 1654 | __m64 hibit = 0x8080808080808080UL; |
| 1655 | __m64 mask, tmp; |
| 1656 | __m64 *p = (__m64*)__P; |
| 1657 | |
| 1658 | tmp = *p; |
| 1659 | mask = _mm_cmpeq_pi8 ((__N & hibit), hibit); |
| 1660 | tmp = (tmp & (~mask)) | (__A & mask); |
| 1661 | *p = tmp; |
| 1662 | } |
| 1663 | |
| 1664 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1665 | _m_maskmovq (__m64 __A, __m64 __N, char *__P) |
| 1666 | { |
| 1667 | _mm_maskmove_si64 (__A, __N, __P); |
| 1668 | } |
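/* Masked-store sketch (illustrative; __buf is a hypothetical 8-byte
   destination).  Only bytes whose mask byte has the high bit set are
   written:

     char  __buf[8] __attribute__ ((aligned (8))) = { 0 };
     __m64 __data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __mask = _mm_set_pi8 (0, 0, 0, 0, 0, 0, -128, -128);
     _mm_maskmove_si64 (__data, __mask, __buf);   // __buf[0]=1, __buf[1]=2, rest unchanged
*/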
| 1669 | |
| 1670 | /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ |
| 1671 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1672 | _mm_avg_pu8 (__m64 __A, __m64 __B) |
| 1673 | { |
| 1674 | __vector unsigned char a, b, c; |
| 1675 | |
| 1676 | a = (__vector unsigned char)vec_splats (__A); |
| 1677 | b = (__vector unsigned char)vec_splats (__B); |
| 1678 | c = vec_avg (a, b); |
| 1679 | return (__m64) ((__vector long long) c)[0]; |
| 1680 | } |
| 1681 | |
| 1682 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1683 | _m_pavgb (__m64 __A, __m64 __B) |
| 1684 | { |
| 1685 | return _mm_avg_pu8 (__A, __B); |
| 1686 | } |
| 1687 | |
| 1688 | /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ |
| 1689 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1690 | _mm_avg_pu16 (__m64 __A, __m64 __B) |
| 1691 | { |
| 1692 | __vector unsigned short a, b, c; |
| 1693 | |
| 1694 | a = (__vector unsigned short)vec_splats (__A); |
| 1695 | b = (__vector unsigned short)vec_splats (__B); |
| 1696 | c = vec_avg (a, b); |
| 1697 | return (__m64) ((__vector long long) c)[0]; |
| 1698 | } |
| 1699 | |
| 1700 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1701 | _m_pavgw (__m64 __A, __m64 __B) |
| 1702 | { |
| 1703 | return _mm_avg_pu16 (__A, __B); |
| 1704 | } |
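/* The averages round up on ties, e.g. in this illustrative sketch:

     __m64 __c = _mm_avg_pu8 (_mm_set1_pi8 (1), _mm_set1_pi8 (2));
     // each byte of __c is (1 + 2 + 1) >> 1 == 2
*/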
| 1705 | |
| 1706 | /* Compute the sum of the absolute differences of the unsigned 8-bit |
| 1707 | values in A and B. Return the value in the lower 16-bit word; the |
| 1708 | upper words are cleared. */ |
| 1709 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1710 | _mm_sad_pu8 (__m64 __A, __m64 __B) |
| 1711 | { |
| 1712 | __vector unsigned char a, b; |
| 1713 | __vector unsigned char vmin, vmax, vabsdiff; |
| 1714 | __vector signed int vsum; |
| 1715 | const __vector unsigned int zero = |
| 1716 | { 0, 0, 0, 0 }; |
| 1717 | __m64_union result = {0}; |
| 1718 | |
| 1719 | a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A }; |
| 1720 | b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B }; |
| 1721 | vmin = vec_min (a, b); |
| 1722 | vmax = vec_max (a, b); |
| 1723 | vabsdiff = vec_sub (vmax, vmin); |
| 1724 | /* Sum four groups of bytes into integers. */ |
| 1725 | vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); |
| 1726 | /* Sum across four integers with integer result. */ |
| 1727 | vsum = vec_sums (vsum, (__vector signed int) zero); |
| 1728 | /* The sum is in the rightmost 32 bits of the vector result. |
| 1729 | Transfer to a GPR and truncate to 16 bits. */ |
| 1730 | result.as_short[0] = vsum[3]; |
| 1731 | return result.as_m64; |
| 1732 | } |
| 1733 | |
| 1734 | extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1735 | _m_psadbw (__m64 __A, __m64 __B) |
| 1736 | { |
| 1737 | return _mm_sad_pu8 (__A, __B); |
| 1738 | } |
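/* Sketch of the reduction (illustrative values only):

     __m64 __a = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __b = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
     __m64 __s = _mm_sad_pu8 (__a, __b);   // low 16 bits hold 7+5+3+1+1+3+5+7 == 32
*/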
| 1739 | |
| 1740 | /* Stores the data in A to the address P without polluting the caches. */ |
| 1741 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1742 | _mm_stream_pi (__m64 *__P, __m64 __A) |
| 1743 | { |
| 1744 | /* Use the data cache block touch for store transient. */ |
| 1745 | __asm__ ( |
| 1746 | " dcbtstt 0,%0" |
| 1747 | : |
| 1748 | : "b" (__P) |
| 1749 | : "memory" |
| 1750 | ); |
| 1751 | *__P = __A; |
| 1752 | } |
| 1753 | |
| 1754 | /* Likewise. The address must be 16-byte aligned. */ |
| 1755 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1756 | _mm_stream_ps (float *__P, __m128 __A) |
| 1757 | { |
| 1758 | /* Use the data cache block touch for store transient. */ |
| 1759 | __asm__ ( |
| 1760 | " dcbtstt 0,%0" |
| 1761 | : |
| 1762 | : "b" (__P) |
| 1763 | : "memory" |
| 1764 | ); |
| 1765 | _mm_store_ps (__P, __A); |
| 1766 | } |
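/* Streaming-store sketch (illustrative; on this port the store itself is a
   normal store preceded by a transient cache-touch hint).  The destination
   must be 16-byte aligned, e.g. obtained with _mm_malloc:

     float *__out = (float *) _mm_malloc (4 * sizeof (float), 16);
     if (__out)
       {
         _mm_stream_ps (__out, _mm_set1_ps (0.0f));
         _mm_free (__out);
       }
*/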
| 1767 | |
| 1768 | /* Guarantees that every preceding store is globally visible before |
| 1769 | any subsequent store. */ |
| 1770 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1771 | _mm_sfence (void) |
| 1772 | { |
| 1773 | /* Generate a light weight sync. */ |
| 1774 | __atomic_thread_fence (__ATOMIC_RELEASE); |
| 1775 | } |
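/* Ordering sketch (illustrative; __payload and __ready are hypothetical
   variables shared with another thread):

     __payload = 42.0f;   // publish the data first
     _mm_sfence ();       // release barrier: data is visible before the flag
     __ready = 1;         // then publish the flag
*/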
| 1776 | |
| 1777 | /* The execution of the next instruction is delayed by an implementation |
| 1778 | specific amount of time.  The instruction does not modify the |
| 1779 | architectural state.  On x86 it does not require SSE support in the |
| 1780 | processor--the encoding is a nop on processors that do not support |
| 1781 | it. */ |
| 1782 | extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| 1783 | _mm_pause (void) |
| 1784 | { |
| 1785 | /* There is no exact match with this construct, but the following is |
| 1786 | close to the desired effect. */ |
| 1787 | #if _ARCH_PWR8 |
| 1788 | /* On power8 and later processors we can depend on Program Priority |
| 1789 | (PRI) and the associated "very low" PRI setting.  Since we don't know |
| 1790 | what PRI this thread is running at we: 1) save the current PRI |
| 1791 | from the PPR SPR into a local GPR, 2) set the PRI to "very low" |
| 1792 | via the special or 31,31,31 encoding, 3) issue an "isync" to |
| 1793 | ensure the PRI change takes effect before we execute any more |
| 1794 | instructions. |
| 1795 | Now we can execute a lwsync (release barrier) while we execute |
| 1796 | this thread at "very low" PRI.  Finally we restore the original |
| 1797 | PRI and continue execution. */ |
| 1798 | unsigned long __PPR; |
| 1799 | |
| 1800 | __asm__ volatile ( |
| 1801 | " mfppr %0;" |
| 1802 | " or 31,31,31;" |
| 1803 | " isync;" |
| 1804 | " lwsync;" |
| 1805 | " isync;" |
| 1806 | " mtppr %0;" |
| 1807 | : "=r" (__PPR) |
| 1808 | : |
| 1809 | : "memory" |
| 1810 | ); |
| 1811 | #else |
| 1812 | /* For older processors, where we may not even have Program Priority |
| 1813 | controls, we can only depend on a heavyweight sync. */ |
| 1814 | __atomic_thread_fence (__ATOMIC_SEQ_CST); |
| 1815 | #endif |
| 1816 | } |
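/* Typical spin-wait sketch (illustrative; __go is a hypothetical flag set
   by another thread):

     while (!__go)
       _mm_pause ();   // back off while polling
*/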
| 1817 | |
| 1818 | /* Transpose the 4x4 matrix composed of row[0-3]. */ |
| 1819 | #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ |
| 1820 | do { \ |
| 1821 | __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ |
| 1822 | __v4sf __t0 = vec_vmrghw (__r0, __r1); \ |
| 1823 | __v4sf __t1 = vec_vmrghw (__r2, __r3); \ |
| 1824 | __v4sf __t2 = vec_vmrglw (__r0, __r1); \ |
| 1825 | __v4sf __t3 = vec_vmrglw (__r2, __r3); \ |
| 1826 | (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \ |
| 1827 | (__vector long long)__t1); \ |
| 1828 | (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \ |
| 1829 | (__vector long long)__t1); \ |
| 1830 | (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \ |
| 1831 | (__vector long long)__t3); \ |
| 1832 | (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \ |
| 1833 | (__vector long long)__t3); \ |
| 1834 | } while (0) |
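/* Transpose usage sketch (illustrative values only):

     __m128 __r0 = _mm_setr_ps ( 1.0f,  2.0f,  3.0f,  4.0f);
     __m128 __r1 = _mm_setr_ps ( 5.0f,  6.0f,  7.0f,  8.0f);
     __m128 __r2 = _mm_setr_ps ( 9.0f, 10.0f, 11.0f, 12.0f);
     __m128 __r3 = _mm_setr_ps (13.0f, 14.0f, 15.0f, 16.0f);
     _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
     // __r0 == {1,5,9,13}, __r1 == {2,6,10,14}, __r2 == {3,7,11,15}, __r3 == {4,8,12,16}
*/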
| 1835 | |
| 1836 | /* For backward source compatibility. */ |
| 1837 | //# include <emmintrin.h> |
| 1838 | |
| 1839 | #else |
| 1840 | #include_next <xmmintrin.h> |
| 1841 | #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ |
| 1842 | */ |
| 1843 | |
| 1844 | #endif /* _XMMINTRIN_H_INCLUDED */ |