#ifndef SSE2NEON_H
#define SSE2NEON_H
3
// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions.
6//
7// This header file does not yet translate all of the SSE intrinsics.
8//
9// Contributors to this work are:
10// John W. Ratcliff <jratcliffscarab@gmail.com>
11// Brandon Rowlett <browlett@nvidia.com>
12// Ken Fast <kfast@gdeb.com>
13// Eric van Beurden <evanbeurden@nvidia.com>
14// Alexander Potylitsin <apotylitsin@nvidia.com>
15// Hasindu Gamaarachchi <hasindu2008@gmail.com>
16// Jim Huang <jserv@biilabs.io>
17// Mark Cheng <marktwtn@biilabs.io>
18// Malcolm James MacLeod <malcolm@gulden.com>
19// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20// Sebastian Pop <spop@amazon.com>
21// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22// Danila Kutenin <danilak@google.com>
23// François Turban (JishinMaster) <francois.turban@gmail.com>
24// Pei-Hsuan Hung <afcidk@gmail.com>
25// Yang-Hao Yuan <yanghau@biilabs.io>
26
27/*
28 * sse2neon is freely redistributable under the MIT License.
29 *
30 * Permission is hereby granted, free of charge, to any person obtaining a copy
31 * of this software and associated documentation files (the "Software"), to deal
32 * in the Software without restriction, including without limitation the rights
33 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34 * copies of the Software, and to permit persons to whom the Software is
35 * furnished to do so, subject to the following conditions:
36 *
37 * The above copyright notice and this permission notice shall be included in
38 * all copies or substantial portions of the Software.
39 *
40 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46 * SOFTWARE.
47 */
48
49/* Tunable configurations */
50
/* Enable precise implementation of _mm_min_ps and _mm_max_ps.
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE2 (e.g. it can fix a hole or NaN pixel in a rendering result).
 */
55#ifndef SSE2NEON_PRECISE_MINMAX
56#define SSE2NEON_PRECISE_MINMAX (0)
57#endif
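
/* Illustrative usage (an assumed example, not part of the original header):
 * define the macro before including this header, or pass it on the command
 * line, e.g. -DSSE2NEON_PRECISE_MINMAX=1
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #include "sse2neon.h"
 */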
58
59#if defined(__GNUC__) || defined(__clang__)
60#pragma push_macro("FORCE_INLINE")
61#pragma push_macro("ALIGN_STRUCT")
62#define FORCE_INLINE static inline __attribute__((always_inline))
63#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
64#else
#error "Macro name collisions may happen with an unsupported compiler."
66#ifdef FORCE_INLINE
67#undef FORCE_INLINE
68#endif
69#define FORCE_INLINE static inline
70#ifndef ALIGN_STRUCT
71#define ALIGN_STRUCT(x) __declspec(align(x))
72#endif
73#endif
74
75#include <stdint.h>
76#include <stdlib.h>
77
78/* Architecture-specific build options */
79/* FIXME: #pragma GCC push_options is only available on GCC */
80#if defined(__GNUC__)
81#if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification,
 * __ARM_NEON is defined to a value indicating that the Advanced SIMD (NEON)
 * architecture is supported.
 */
86#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
87#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
88#endif
89#pragma GCC push_options
90#pragma GCC target("fpu=neon")
91#elif defined(__aarch64__)
92#pragma GCC push_options
93#pragma GCC target("+simd")
94#else
95#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
96#endif
97#endif
98
99#include <arm_neon.h>
100
/* Rounding functions require either AArch64 instructions or a libm fallback */
102#if !defined(__aarch64__)
103#include <math.h>
104#endif
105
106/* "__has_builtin" can be used to query support for built-in functions
107 * provided by gcc/clang and other compilers that support it.
108 */
109#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
110/* Compatibility with gcc <= 9 */
111#if __GNUC__ <= 9
112#define __has_builtin(x) HAS##x
113#define HAS__builtin_popcount 1
114#define HAS__builtin_popcountll 1
115#else
116#define __has_builtin(x) 0
117#endif
118#endif
119
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit [0123] that represents the float from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 of the result. fp1 is a digit [0123] that represents the float from
 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
128#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
129 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
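
// Illustrative usage (an assumed example, not part of the original header):
// the macro packs four 2-bit lane selectors into a single immediate byte, so
// _MM_SHUFFLE(0, 1, 2, 3) below evaluates to 0x1B and reverses the lanes.
//
//   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);            // lanes: 1, 2, 3, 4
//   __m128 r = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes: 4, 3, 2, 1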
130
131/* Rounding mode macros. */
132#define _MM_FROUND_TO_NEAREST_INT 0x00
133#define _MM_FROUND_TO_NEG_INF 0x01
134#define _MM_FROUND_TO_POS_INF 0x02
135#define _MM_FROUND_TO_ZERO 0x03
136#define _MM_FROUND_CUR_DIRECTION 0x04
137#define _MM_FROUND_NO_EXC 0x08
138
139/* indicate immediate constant argument in a given range */
140#define __constrange(a, b) const
141
142/* A few intrinsics accept traditional data types like ints or floats, but
143 * most operate on data types that are specific to SSE.
144 * If a vector type ends in d, it contains doubles, and if it does not have
145 * a suffix, it contains floats. An integer vector type can contain any type
146 * of integer, from chars to shorts to unsigned long longs.
147 */
148typedef int64x1_t __m64;
149typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
150// On ARM 32-bit architecture, the float64x2_t is not supported.
151// The data type __m128d should be represented in a different way for related
152// intrinsic conversion.
153#if defined(__aarch64__)
154typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
155#else
156typedef float32x4_t __m128d;
157#endif
158typedef int64x2_t __m128i; /* 128-bit vector containing integers */
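
// Illustrative declarations (an assumed example, not part of the original
// header):
//   __m128 f = _mm_set1_ps(1.0f);     // four packed single-precision floats
//   __m128i i = _mm_set1_epi32(-1);   // four packed 32-bit signed integers
//   __m128d d = _mm_set_pd(2.0, 1.0); // two packed doubles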
159
160/* type-safe casting between types */
161
162#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
163#define vreinterpretq_m128_f32(x) (x)
164#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
165
166#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
167#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
168#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
169#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
170
171#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
172#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
173#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
174#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
175
176#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
177#define vreinterpretq_f32_m128(x) (x)
178#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
179
180#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
181#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
182#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
183#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
184
185#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
186#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
187#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
188#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
189
190#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
191#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
192#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
193#define vreinterpretq_m128i_s64(x) (x)
194
195#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
196#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
197#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
198#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
199
200#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
201#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
202#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
203#define vreinterpretq_s64_m128i(x) (x)
204
205#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
206#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
207#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
208#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
209
210#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
211#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
212#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
213#define vreinterpret_m64_s64(x) (x)
214
215#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
216#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
217#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
218#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
219
220#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
221#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
222#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
223
224#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
225#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
226#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
227#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
228
229#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
230#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
231#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
232#define vreinterpret_s64_m64(x) (x)
233
234#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
235
236#if defined(__aarch64__)
237#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
238#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
239
240#define vreinterpretq_m128d_f64(x) (x)
241
242#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
243
244#define vreinterpretq_f64_m128d(x) (x)
245#else
246#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
247#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
248
249#define vreinterpretq_m128d_f32(x) (x)
250
251#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
252
253#define vreinterpretq_f32_m128d(x) (x)
254#endif
255
// A struct named 'SIMDVec' is defined in this header file and can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
260// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
261//
262// However, some legacy source code may try to access the contents of an __m128
263// struct directly so the developer can use the SIMDVec as an alias for it. Any
264// casting must be done manually by the developer, as you cannot cast or
265// otherwise alias the base NEON data type for intrinsic operations.
266//
// This union is intended to allow direct access to an __m128 variable using
// the names that the MSVC compiler provides. This union should really only
// be used when
269// trying to access the members of the vector as integer values. GCC/clang
270// allow native access to the float members through a simple array access
271// operator (in C since 4.6, in C++ since 4.8).
272//
// Ideally, direct accesses to SIMD vectors should not be used since they can
// cause a performance hit. If it is needed, however, the original __m128
275// variable can be aliased with a pointer to this union and used to access
276// individual components. The use of this union should be hidden behind a macro
277// that is used throughout the codebase to access the members instead of always
278// declaring this type of variable.
279typedef union ALIGN_STRUCT(16) SIMDVec {
280 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
281 int8_t m128_i8[16]; // as signed 8-bit integers.
282 int16_t m128_i16[8]; // as signed 16-bit integers.
283 int32_t m128_i32[4]; // as signed 32-bit integers.
284 int64_t m128_i64[2]; // as signed 64-bit integers.
285 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
286 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
287 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
288 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
289} SIMDVec;
290
291// casting using SIMDVec
292#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
293#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
294#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
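
// Illustrative usage (an assumed example, not part of the original header):
//   __m128i v = _mm_set1_epi32(7);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // lane0 == 7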
295
296/* Backwards compatibility for compilers with lack of specific type support */
297
// Older gcc does not provide the vld1q_u8_x4 intrinsic
299#if defined(__GNUC__) && !defined(__clang__)
300#if __GNUC__ <= 9
301FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
302{
303 uint8x16x4_t ret;
304 ret.val[0] = vld1q_u8(p + 0);
305 ret.val[1] = vld1q_u8(p + 16);
306 ret.val[2] = vld1q_u8(p + 32);
307 ret.val[3] = vld1q_u8(p + 48);
308 return ret;
309}
310#endif
311#endif
312
313/* Function Naming Conventions
314 * The naming convention of SSE intrinsics is straightforward. A generic SSE
315 * intrinsic function is given as follows:
316 * _mm_<name>_<data_type>
317 *
318 * The parts of this format are given as follows:
319 * 1. <name> describes the operation performed by the intrinsic
320 * 2. <data_type> identifies the data type of the function's primary arguments
321 *
322 * This last part, <data_type>, is a little complicated. It identifies the
323 * content of the input values, and can be set to any of the following values:
324 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
326 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
327 * signed integers
328 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329 * unsigned integers
 * + si128 - unspecified 128-bit vector
331 * + m128/m128i/m128d - identifies input vector types when they are different
332 * than the type of the returned vector
333 *
334 * For example, _mm_setzero_ps. The _mm implies that the function returns
335 * a 128-bit vector. The _ps at the end implies that the argument vectors
336 * contain floats.
337 *
338 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
340 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 * // Set packed 8-bit integers: 128 bits, 16 chars, 8 bits each
343 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
344 * 4, 5, 12, 13, 6, 7, 14, 15);
345 * // Shuffle packed 8-bit integers
346 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
347 *
348 * Data (Number, Binary, Byte Index):
349 +------+------+-------------+------+------+-------------+
350 | 1 | 2 | 3 | 4 | Number
351 +------+------+------+------+------+------+------+------+
352 | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
353 +------+------+------+------+------+------+------+------+
354 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
355 +------+------+------+------+------+------+------+------+
356
357 +------+------+------+------+------+------+------+------+
358 | 5 | 6 | 7 | 8 | Number
359 +------+------+------+------+------+------+------+------+
360 | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
361 +------+------+------+------+------+------+------+------+
362 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
363 +------+------+------+------+------+------+------+------+
364 * Index (Byte Index):
365 +------+------+------+------+------+------+------+------+
366 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
367 +------+------+------+------+------+------+------+------+
368
369 +------+------+------+------+------+------+------+------+
370 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
371 +------+------+------+------+------+------+------+------+
372 * Result:
373 +------+------+------+------+------+------+------+------+
374 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
375 +------+------+------+------+------+------+------+------+
376 | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
377 +------+------+------+------+------+------+------+------+
378 | 256 | 2 | 5 | 6 | Number
379 +------+------+------+------+------+------+------+------+
380
381 +------+------+------+------+------+------+------+------+
382 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
383 +------+------+------+------+------+------+------+------+
384 | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
385 +------+------+------+------+------+------+------+------+
386 | 3 | 7 | 4 | 8 | Number
387 +------+------+------+------+------+------+-------------+
388 */
389
390/* Set/get methods */
391
392/* Constants for use with _mm_prefetch. */
393enum _mm_hint {
394 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
395 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
396 _MM_HINT_T1 = 2, /* load data to L2 cache only */
397 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
398 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
399 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
400 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
401 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
402};
403
404// Loads one cache line of data from address p to a location closer to the
405// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
406FORCE_INLINE void _mm_prefetch(const void *p, int i)
407{
408 (void) i;
409 __builtin_prefetch(p);
410}
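
// Illustrative usage (an assumed example, not part of the original header;
// `buf` is a hypothetical pointer into a large array):
//   _mm_prefetch(buf + 64, _MM_HINT_T0);  // hint that the next line is needed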
411
412// Copy the lower single-precision (32-bit) floating-point element of a to dst.
413//
414// dst[31:0] := a[31:0]
415//
416// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
417FORCE_INLINE float _mm_cvtss_f32(__m128 a)
418{
419 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
420}
421
422// Sets the 128-bit value to zero
423// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
424FORCE_INLINE __m128i _mm_setzero_si128(void)
425{
426 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
427}
428
429// Clears the four single-precision, floating-point values.
430// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
431FORCE_INLINE __m128 _mm_setzero_ps(void)
432{
433 return vreinterpretq_m128_f32(vdupq_n_f32(0));
434}
435
436// Sets the four single-precision, floating-point values to w.
437//
438// r0 := r1 := r2 := r3 := w
439//
440// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
441FORCE_INLINE __m128 _mm_set1_ps(float _w)
442{
443 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
444}
445
446// Sets the four single-precision, floating-point values to w.
447// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
448FORCE_INLINE __m128 _mm_set_ps1(float _w)
449{
450 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
451}
452
453// Sets the four single-precision, floating-point values to the four inputs.
454// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
455FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
456{
457 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
458 return vreinterpretq_m128_f32(vld1q_f32(data));
459}
460
461// Copy single-precision (32-bit) floating-point element a to the lower element
462// of dst, and zero the upper 3 elements.
463// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
464FORCE_INLINE __m128 _mm_set_ss(float a)
465{
466 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
467 return vreinterpretq_m128_f32(vld1q_f32(data));
468}
469
470// Sets the four single-precision, floating-point values to the four inputs in
471// reverse order.
472// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
473FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
474{
475 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
476 return vreinterpretq_m128_f32(vld1q_f32(data));
477}
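
// Illustrative lane ordering (an assumed example, not part of the original
// header): _mm_set_ps takes arguments from the highest lane down, while
// _mm_setr_ps takes them in memory order.
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes: 1, 2, 3, 4
//   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // lanes: 1, 2, 3, 4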
478
479// Sets the 8 signed 16-bit integer values in reverse order.
480//
481// Return Value
482// r0 := w0
483// r1 := w1
484// ...
485// r7 := w7
486FORCE_INLINE __m128i _mm_setr_epi16(short w0,
487 short w1,
488 short w2,
489 short w3,
490 short w4,
491 short w5,
492 short w6,
493 short w7)
494{
495 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
496 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
497}
498
499// Sets the 4 signed 32-bit integer values in reverse order
500// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
501FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
502{
503 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
504 return vreinterpretq_m128i_s32(vld1q_s32(data));
505}
506
507// Set packed 64-bit integers in dst with the supplied values in reverse order.
508// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
509FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
510{
511 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
512}
513
514// Sets the 16 signed 8-bit integer values to b.
515//
516// r0 := b
517// r1 := b
518// ...
519// r15 := b
520//
521// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
522FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
523{
524 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
525}
526
527// Sets the 8 signed 16-bit integer values to w.
528//
529// r0 := w
530// r1 := w
531// ...
532// r7 := w
533//
534// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
535FORCE_INLINE __m128i _mm_set1_epi16(short w)
536{
537 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
538}
539
540// Sets the 16 signed 8-bit integer values.
541// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
542FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
543 signed char b14,
544 signed char b13,
545 signed char b12,
546 signed char b11,
547 signed char b10,
548 signed char b9,
549 signed char b8,
550 signed char b7,
551 signed char b6,
552 signed char b5,
553 signed char b4,
554 signed char b3,
555 signed char b2,
556 signed char b1,
557 signed char b0)
558{
559 int8_t ALIGN_STRUCT(16)
560 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
561 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
562 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
563 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
564 return (__m128i) vld1q_s8(data);
565}
566
567// Sets the 8 signed 16-bit integer values.
568// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
569FORCE_INLINE __m128i _mm_set_epi16(short i7,
570 short i6,
571 short i5,
572 short i4,
573 short i3,
574 short i2,
575 short i1,
576 short i0)
577{
578 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
579 return vreinterpretq_m128i_s16(vld1q_s16(data));
580}
581
582// Sets the 16 signed 8-bit integer values in reverse order.
583// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
584FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
585 signed char b1,
586 signed char b2,
587 signed char b3,
588 signed char b4,
589 signed char b5,
590 signed char b6,
591 signed char b7,
592 signed char b8,
593 signed char b9,
594 signed char b10,
595 signed char b11,
596 signed char b12,
597 signed char b13,
598 signed char b14,
599 signed char b15)
600{
601 int8_t ALIGN_STRUCT(16)
602 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
603 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
604 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
605 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
606 return (__m128i) vld1q_s8(data);
607}
608
609// Sets the 4 signed 32-bit integer values to i.
610//
611// r0 := i
612// r1 := i
613// r2 := i
// r3 := i
615//
616// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
617FORCE_INLINE __m128i _mm_set1_epi32(int _i)
618{
619 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
620}
621
622// Sets the 2 signed 64-bit integer values to i.
623// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
624FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
625{
626 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
627}
628
629// Sets the 2 signed 64-bit integer values to i.
630// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
631FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
632{
633 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
634}
635
636// Sets the 4 signed 32-bit integer values.
637// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
638FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
639{
640 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
641 return vreinterpretq_m128i_s32(vld1q_s32(data));
642}
643
644// Returns the __m128i structure with its two 64-bit integer values
645// initialized to the values of the two 64-bit integers passed in.
646// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
647FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
648{
649 int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
650 return vreinterpretq_m128i_s64(vld1q_s64(data));
651}
652
653// Returns the __m128i structure with its two 64-bit integer values
654// initialized to the values of the two 64-bit integers passed in.
655// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
656FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
657{
658 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
659}
660
661// Set packed double-precision (64-bit) floating-point elements in dst with the
662// supplied values.
663// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
664FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
665{
666 double ALIGN_STRUCT(16) data[2] = {e0, e1};
667#if defined(__aarch64__)
668 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
669#else
670 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
671#endif
672}
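
// Illustrative lane ordering (an assumed example, not part of the original
// header):
//   __m128d d = _mm_set_pd(2.0, 1.0);  // lane 0 == 1.0, lane 1 == 2.0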
673
674// Stores four single-precision, floating-point values.
675// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
676FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
677{
678 vst1q_f32(p, vreinterpretq_f32_m128(a));
679}
680
681// Stores four single-precision, floating-point values.
682// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
683FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
684{
685 vst1q_f32(p, vreinterpretq_f32_m128(a));
686}
687
// Stores four 32-bit integer values (as a __m128i value) at the address p.
689// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
690FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
691{
692 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
693}
694
// Stores four 32-bit integer values (as a __m128i value) at the address p.
696// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
697FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
698{
699 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
700}
701
// Stores the lower single-precision, floating-point value.
703// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
704FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
705{
706 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
707}
708
709// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
710// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
711// or a general-protection exception may be generated.
712// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
713FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
714{
715#if defined(__aarch64__)
716 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
717#else
718 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
719#endif
720}
721
722// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
723// elements) from a into memory. mem_addr does not need to be aligned on any
724// particular boundary.
725// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
726FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
727{
728 _mm_store_pd(mem_addr, a);
729}
730
731// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
732// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
733FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
734{
735 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
736 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
737 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
738}
739
740// Stores the lower two single-precision floating point values of a to the
741// address p.
742//
743// *p0 := a0
744// *p1 := a1
745//
746// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
747FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
748{
    *p = vreinterpret_m64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
750}
751
752// Stores the upper two single-precision, floating-point values of a to the
753// address p.
754//
755// *p0 := a2
756// *p1 := a3
757//
758// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
759FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
760{
    *p = vreinterpret_m64_f32(vget_high_f32(vreinterpretq_f32_m128(a)));
762}
763
764// Loads a single single-precision, floating-point value, copying it into all
765// four words
766// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
767FORCE_INLINE __m128 _mm_load1_ps(const float *p)
768{
769 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
770}
771
772// Load a single-precision (32-bit) floating-point element from memory into all
773// elements of dst.
774//
775// dst[31:0] := MEM[mem_addr+31:mem_addr]
776// dst[63:32] := MEM[mem_addr+31:mem_addr]
777// dst[95:64] := MEM[mem_addr+31:mem_addr]
778// dst[127:96] := MEM[mem_addr+31:mem_addr]
779//
780// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
781#define _mm_load_ps1 _mm_load1_ps
782
783// Sets the lower two single-precision, floating-point values with 64
784// bits of data loaded from the address p; the upper two values are passed
785// through from a.
786//
787// Return Value
788// r0 := *p0
789// r1 := *p1
790// r2 := a2
791// r3 := a3
792//
793// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
794FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
795{
796 return vreinterpretq_m128_f32(
797 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
798}
799
800// Load 4 single-precision (32-bit) floating-point elements from memory into dst
801// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
802// general-protection exception may be generated.
803//
804// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
805// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
806// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
807// dst[127:96] := MEM[mem_addr+31:mem_addr]
808//
809// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
810FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
811{
812 float32x4_t v = vrev64q_f32(vld1q_f32(p));
813 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
814}
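
// Illustrative usage (an assumed example, not part of the original header;
// `buf` should be 16-byte aligned for the x86 intrinsic, although the NEON
// load here does not require it):
//   float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   __m128 r = _mm_loadr_ps(buf);  // lanes: 4, 3, 2, 1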
815
816// Sets the upper two single-precision, floating-point values with 64
817// bits of data loaded from the address p; the lower two values are passed
818// through from a.
819//
820// r0 := a0
821// r1 := a1
822// r2 := *p0
823// r3 := *p1
824//
825// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
826FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
827{
828 return vreinterpretq_m128_f32(
829 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
830}
831
832// Loads four single-precision, floating-point values.
833// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
834FORCE_INLINE __m128 _mm_load_ps(const float *p)
835{
836 return vreinterpretq_m128_f32(vld1q_f32(p));
837}
838
839// Loads four single-precision, floating-point values.
840// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
841FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
842{
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
    // are equivalent.
845 return vreinterpretq_m128_f32(vld1q_f32(p));
846}
847
848// Load unaligned 16-bit integer from memory into the first element of dst.
849//
850// dst[15:0] := MEM[mem_addr+15:mem_addr]
851// dst[MAX:16] := 0
852//
853// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
854FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
855{
856 return vreinterpretq_m128i_s16(
857 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
858}
859
860// Load unaligned 64-bit integer from memory into the first element of dst.
861//
862// dst[63:0] := MEM[mem_addr+63:mem_addr]
863// dst[MAX:64] := 0
864//
865// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
866FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
867{
868 return vreinterpretq_m128i_s64(
869 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
870}
871
872// Load a double-precision (64-bit) floating-point element from memory into the
873// lower of dst, and zero the upper element. mem_addr does not need to be
874// aligned on any particular boundary.
875//
876// dst[63:0] := MEM[mem_addr+63:mem_addr]
877// dst[127:64] := 0
878//
879// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
880FORCE_INLINE __m128d _mm_load_sd(const double *p)
881{
882#if defined(__aarch64__)
883 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
884#else
885 const float *fp = (const float *) p;
886 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
887 return vreinterpretq_m128d_f32(vld1q_f32(data));
888#endif
889}
890
// Loads two double-precision, floating-point values from 16-byte aligned
// memory.
893//
894// dst[127:0] := MEM[mem_addr+127:mem_addr]
895//
896// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
897FORCE_INLINE __m128d _mm_load_pd(const double *p)
898{
899#if defined(__aarch64__)
900 return vreinterpretq_m128d_f64(vld1q_f64(p));
901#else
902 const float *fp = (const float *) p;
903 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
904 return vreinterpretq_m128d_f32(vld1q_f32(data));
905#endif
906}
907
// Loads two double-precision, floating-point values from unaligned memory.
909// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
910FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
911{
912 return _mm_load_pd(p);
913}
914
// Loads a single-precision, floating-point value into the low word and clears
// the upper three words.
917// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
918FORCE_INLINE __m128 _mm_load_ss(const float *p)
919{
920 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
921}
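
// Illustrative usage (an assumed example, not part of the original header):
//   float x = 3.5f;
//   __m128 v = _mm_load_ss(&x);  // lanes: 3.5, 0, 0, 0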
922
923FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
924{
925 /* Load the lower 64 bits of the value pointed to by p into the
926 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
927 */
928 return vreinterpretq_m128i_s32(
929 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
930}
931
932// Load a double-precision (64-bit) floating-point element from memory into the
933// lower element of dst, and copy the upper element from a to dst. mem_addr does
934// not need to be aligned on any particular boundary.
935//
936// dst[63:0] := MEM[mem_addr+63:mem_addr]
937// dst[127:64] := a[127:64]
938//
939// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
940FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
941{
942#if defined(__aarch64__)
943 return vreinterpretq_m128d_f64(
944 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
945#else
946 return vreinterpretq_m128d_f32(
947 vcombine_f32(vld1_f32((const float *) p),
948 vget_high_f32(vreinterpretq_f32_m128d(a))));
949#endif
950}
951
952// Load 2 double-precision (64-bit) floating-point elements from memory into dst
953// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
954// general-protection exception may be generated.
955//
956// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
957// dst[127:64] := MEM[mem_addr+63:mem_addr]
958//
959// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
960FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
961{
962#if defined(__aarch64__)
963 float64x2_t v = vld1q_f64(p);
964 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
965#else
966 int64x2_t v = vld1q_s64((const int64_t *) p);
967 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
968#endif
969}
970
971// Sets the low word to the single-precision, floating-point value of b
972// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
973FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
974{
975 return vreinterpretq_m128_f32(
976 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
977 vreinterpretq_f32_m128(a), 0));
978}
979
980// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
981// upper element.
982//
983// dst[63:0] := a[63:0]
984// dst[127:64] := 0
985//
986// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
987FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
988{
989 return vreinterpretq_m128i_s64(
990 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
991}
992
993// Return vector of type __m128 with undefined elements.
994// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
995FORCE_INLINE __m128 _mm_undefined_ps(void)
996{
997 __m128 a;
998 return a;
999}
1000
1001/* Logic/Binary operations */
1002
1003// Computes the bitwise AND-NOT of the four single-precision, floating-point
1004// values of a and b.
1005//
1006// r0 := ~a0 & b0
1007// r1 := ~a1 & b1
1008// r2 := ~a2 & b2
1009// r3 := ~a3 & b3
1010//
1011// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1012FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1013{
1014 return vreinterpretq_m128_s32(
1015 vbicq_s32(vreinterpretq_s32_m128(b),
1016 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1017}
1018
1019// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1020// elements in a and then AND with b, and store the results in dst.
1021//
1022// FOR j := 0 to 1
1023// i := j*64
1024// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1025// ENDFOR
1026//
1027// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
1028FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1029{
1030 // *NOTE* argument swap
1031 return vreinterpretq_m128d_s64(
1032 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1033}
1034
1035// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1036// 128-bit value in a.
1037//
1038// r := (~a) & b
1039//
1040// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
1041FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1042{
1043 return vreinterpretq_m128i_s32(
1044 vbicq_s32(vreinterpretq_s32_m128i(b),
1045 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
1046}
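
// Illustrative semantics (an assumed example, not part of the original
// header): note that the first argument is the one that gets inverted.
//   __m128i mask = _mm_set1_epi32(0x0F0F0F0F);
//   __m128i data = _mm_set1_epi32(0x12345678);
//   __m128i r = _mm_andnot_si128(mask, data);  // (~mask) & data == 0x10305070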
1047
1048// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1049// b.
1050//
1051// r := a & b
1052//
1053// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
1054FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1055{
1056 return vreinterpretq_m128i_s32(
1057 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1058}
1059
1060// Computes the bitwise AND of the four single-precision, floating-point values
1061// of a and b.
1062//
1063// r0 := a0 & b0
1064// r1 := a1 & b1
1065// r2 := a2 & b2
1066// r3 := a3 & b3
1067//
1068// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1069FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1070{
1071 return vreinterpretq_m128_s32(
1072 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1073}
1074
1075// Compute the bitwise AND of packed double-precision (64-bit) floating-point
1076// elements in a and b, and store the results in dst.
1077//
1078// FOR j := 0 to 1
1079// i := j*64
1080// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1081// ENDFOR
1082//
1083// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
1084FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1085{
1086 return vreinterpretq_m128d_s64(
1087 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1088}
1089
1090// Computes the bitwise OR of the four single-precision, floating-point values
1091// of a and b.
1092// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
1093FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1094{
1095 return vreinterpretq_m128_s32(
1096 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1097}
1098
// Computes the bitwise XOR (exclusive-or) of the four single-precision,
1100// floating-point values of a and b.
1101// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
1102FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1103{
1104 return vreinterpretq_m128_s32(
1105 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1106}
1107
1108// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1109// elements in a and b, and store the results in dst.
1110//
1111// FOR j := 0 to 1
1112// i := j*64
1113// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1114// ENDFOR
1115//
1116// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
1117FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1118{
1119 return vreinterpretq_m128d_s64(
1120 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1121}
1122
1123// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1124//
1125// r := a | b
1126//
1127// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
1128FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1129{
1130 return vreinterpretq_m128i_s32(
1131 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1132}
1133
1134// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1135// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
1136FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1137{
1138 return vreinterpretq_m128i_s32(
1139 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1140}
1141
1142// Duplicate odd-indexed single-precision (32-bit) floating-point elements
1143// from a, and store the results in dst.
1144// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
1145FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1146{
1147#if __has_builtin(__builtin_shufflevector)
1148 return vreinterpretq_m128_f32(__builtin_shufflevector(
1149 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1150#else
1151 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1152 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1153 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1154 return vreinterpretq_m128_f32(vld1q_f32(data));
1155#endif
1156}
1157
1158// Duplicate even-indexed single-precision (32-bit) floating-point elements
1159// from a, and store the results in dst.
1160// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
1161FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1162{
1163#if __has_builtin(__builtin_shufflevector)
1164 return vreinterpretq_m128_f32(__builtin_shufflevector(
1165 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1166#else
1167 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1168 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1169 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1170 return vreinterpretq_m128_f32(vld1q_f32(data));
1171#endif
1172}
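
// Illustrative results (an assumed example, not part of the original header):
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   _mm_movehdup_ps(v);  // lanes: 2, 2, 4, 4 (odd-indexed elements duplicated)
//   _mm_moveldup_ps(v);  // lanes: 1, 1, 3, 3 (even-indexed elements duplicated)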
1173
1174// Moves the upper two values of B into the lower two values of A.
1175//
1176// r3 := a3
1177// r2 := a2
1178// r1 := b3
1179// r0 := b2
1180FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1181{
1182 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1183 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1184 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1185}
1186
1187// Moves the lower two values of B into the upper two values of A.
1188//
1189// r3 := b1
1190// r2 := b0
1191// r1 := a1
1192// r0 := a0
1193FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1194{
1195 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1196 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1197 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1198}
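
// Illustrative results (an assumed example, not part of the original header):
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   _mm_movehl_ps(a, b);  // lanes: 7, 8, 3, 4
//   _mm_movelh_ps(a, b);  // lanes: 1, 2, 5, 6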
1199
1200// Compute the absolute value of packed signed 32-bit integers in a, and store
1201// the unsigned results in dst.
1202//
1203// FOR j := 0 to 3
1204// i := j*32
1205// dst[i+31:i] := ABS(a[i+31:i])
1206// ENDFOR
1207//
1208// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
1209FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1210{
1211 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1212}
1213
1214// Compute the absolute value of packed signed 16-bit integers in a, and store
1215// the unsigned results in dst.
1216//
1217// FOR j := 0 to 7
1218// i := j*16
1219// dst[i+15:i] := ABS(a[i+15:i])
1220// ENDFOR
1221//
1222// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
1223FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1224{
1225 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1226}
1227
1228// Compute the absolute value of packed signed 8-bit integers in a, and store
1229// the unsigned results in dst.
1230//
1231// FOR j := 0 to 15
1232// i := j*8
1233// dst[i+7:i] := ABS(a[i+7:i])
1234// ENDFOR
1235//
1236// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
1237FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1238{
1239 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1240}
1241
1242// Compute the absolute value of packed signed 32-bit integers in a, and store
1243// the unsigned results in dst.
1244//
1245// FOR j := 0 to 1
1246// i := j*32
1247// dst[i+31:i] := ABS(a[i+31:i])
1248// ENDFOR
1249//
1250// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
1251FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1252{
1253 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1254}
1255
1256// Compute the absolute value of packed signed 16-bit integers in a, and store
1257// the unsigned results in dst.
1258//
1259// FOR j := 0 to 3
1260// i := j*16
1261// dst[i+15:i] := ABS(a[i+15:i])
1262// ENDFOR
1263//
1264// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
1265FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1266{
1267 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1268}
1269
1270// Compute the absolute value of packed signed 8-bit integers in a, and store
1271// the unsigned results in dst.
1272//
1273// FOR j := 0 to 7
1274// i := j*8
1275// dst[i+7:i] := ABS(a[i+7:i])
1276// ENDFOR
1277//
1278// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
1279FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1280{
1281 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1282}
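
// Illustrative usage (an assumed example, not part of the original header):
//   __m128i v = _mm_setr_epi32(-1, 2, -3, 4);
//   _mm_abs_epi32(v);  // packed results: 1, 2, 3, 4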
1283
// Takes the upper 64 bits of a and places them in the low end of the result.
// Takes the lower 64 bits of b and places them in the high end of the result.
1286FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1287{
1288 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1289 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1290 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1291}
1292
// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
1296FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1297{
1298 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1299 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1300 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1301}
1302
1303FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1304{
1305 float32x2_t a21 = vget_high_f32(
1306 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1307 float32x2_t b03 = vget_low_f32(
1308 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1309 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1310}
1311
1312FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1313{
1314 float32x2_t a03 = vget_low_f32(
1315 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1316 float32x2_t b21 = vget_high_f32(
1317 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1318 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1319}
1320
1321FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1322{
1323 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1324 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1325 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1326}
1327
1328FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1329{
1330 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1331 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1332 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1333}
1334
1335FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1336{
1337 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1338 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1339 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1340}
1341
// Keeps the low 64 bits of a in the low end of the result and puts the high 64
// bits of b in the high end of the result.
1344FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1345{
1346 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1347 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1348 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1349}
1350
1351FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1352{
1353 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1354 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1355 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1356}
1357
1358FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1359{
1360 float32x2_t a22 =
1361 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1362 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1363 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1364}
1365
1366FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1367{
1368 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1369 float32x2_t b22 =
1370 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1371 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1372}
1373
1374FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1375{
1376 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1377 float32x2_t a22 =
1378 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1379 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1380 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1381 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1382}
1383
1384FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1385{
1386 float32x2_t a33 =
1387 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1388 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1389 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1390}
1391
1392FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1393{
1394 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1395 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1396 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1397 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1398 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1399}
1400
1401FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1402{
1403 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1405 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1406 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1407 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1408}
1409
1410FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1411{
1412 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1414 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1415 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1416 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1417}
1418
1419// NEON does not support a general purpose permute intrinsic
1420// Selects four specific single-precision, floating-point values from a and b,
1421// based on the mask i.
1422//
1423// C equivalent:
1424// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1425// __constrange(0, 255) int imm) {
1426// __m128 ret;
1427// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1428// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
1429// return ret;
1430// }
1431//
1432// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1433#define _mm_shuffle_ps_default(a, b, imm) \
1434 __extension__({ \
1435 float32x4_t ret; \
1436 ret = vmovq_n_f32( \
1437 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
1438 ret = vsetq_lane_f32( \
1439 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1440 ret, 1); \
1441 ret = vsetq_lane_f32( \
1442 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1443 ret, 2); \
1444 ret = vsetq_lane_f32( \
1445 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1446 ret, 3); \
1447 vreinterpretq_m128_f32(ret); \
1448 })
1449
1450// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1451// int imm)
1452#if __has_builtin(__builtin_shufflevector)
1453#define _mm_shuffle_ps(a, b, imm) \
1454 __extension__({ \
1455 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
1456 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
1457 float32x4_t _shuf = __builtin_shufflevector( \
1458 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1459 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1460 vreinterpretq_m128_f32(_shuf); \
1461 })
1462#else // generic
1463#define _mm_shuffle_ps(a, b, imm) \
1464 __extension__({ \
1465 __m128 ret; \
1466 switch (imm) { \
1467 case _MM_SHUFFLE(1, 0, 3, 2): \
1468 ret = _mm_shuffle_ps_1032((a), (b)); \
1469 break; \
1470 case _MM_SHUFFLE(2, 3, 0, 1): \
1471 ret = _mm_shuffle_ps_2301((a), (b)); \
1472 break; \
1473 case _MM_SHUFFLE(0, 3, 2, 1): \
1474 ret = _mm_shuffle_ps_0321((a), (b)); \
1475 break; \
1476 case _MM_SHUFFLE(2, 1, 0, 3): \
1477 ret = _mm_shuffle_ps_2103((a), (b)); \
1478 break; \
1479 case _MM_SHUFFLE(1, 0, 1, 0): \
1480 ret = _mm_movelh_ps((a), (b)); \
1481 break; \
1482 case _MM_SHUFFLE(1, 0, 0, 1): \
1483 ret = _mm_shuffle_ps_1001((a), (b)); \
1484 break; \
1485 case _MM_SHUFFLE(0, 1, 0, 1): \
1486 ret = _mm_shuffle_ps_0101((a), (b)); \
1487 break; \
1488 case _MM_SHUFFLE(3, 2, 1, 0): \
1489 ret = _mm_shuffle_ps_3210((a), (b)); \
1490 break; \
1491 case _MM_SHUFFLE(0, 0, 1, 1): \
1492 ret = _mm_shuffle_ps_0011((a), (b)); \
1493 break; \
1494 case _MM_SHUFFLE(0, 0, 2, 2): \
1495 ret = _mm_shuffle_ps_0022((a), (b)); \
1496 break; \
1497 case _MM_SHUFFLE(2, 2, 0, 0): \
1498 ret = _mm_shuffle_ps_2200((a), (b)); \
1499 break; \
1500 case _MM_SHUFFLE(3, 2, 0, 2): \
1501 ret = _mm_shuffle_ps_3202((a), (b)); \
1502 break; \
1503 case _MM_SHUFFLE(3, 2, 3, 2): \
1504 ret = _mm_movehl_ps((b), (a)); \
1505 break; \
1506 case _MM_SHUFFLE(1, 1, 3, 3): \
1507 ret = _mm_shuffle_ps_1133((a), (b)); \
1508 break; \
1509 case _MM_SHUFFLE(2, 0, 1, 0): \
1510 ret = _mm_shuffle_ps_2010((a), (b)); \
1511 break; \
1512 case _MM_SHUFFLE(2, 0, 0, 1): \
1513 ret = _mm_shuffle_ps_2001((a), (b)); \
1514 break; \
1515 case _MM_SHUFFLE(2, 0, 3, 2): \
1516 ret = _mm_shuffle_ps_2032((a), (b)); \
1517 break; \
1518 default: \
1519 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1520 break; \
1521 } \
1522 ret; \
1523 })
1524#endif
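
// Example (illustrative): with _MM_SHUFFLE(z, y, x, w) the result is
// { a[w], a[x], b[y], b[z] }, e.g.
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
// yields r = { a[0], a[1], b[2], b[3] }.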
1525
1526// Takes the upper 64 bits of a and places it in the low end of the result
1527// Takes the lower 64 bits of a and places it into the high end of the result.
1528FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1529{
1530 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1531 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1532 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1533}
1534
1535// Takes the lower two 32-bit values from a, swaps them, and places them in the
1536// low end of the result; takes the upper two 32-bit values from a, swaps them,
1537// and places them in the high end of the result.
1538FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1539{
1540 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1541 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1542 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1543}
1544
1545// Rotates the least significant 32 bits into the most significant 32 bits, and
1546// shifts the rest down.
1547FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1548{
1549 return vreinterpretq_m128i_s32(
1550 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1551}
1552
1553// Rotates the most significant 32 bits into the least significant 32 bits, and
1554// shifts the rest up.
1555FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1556{
1557 return vreinterpretq_m128i_s32(
1558 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1559}
1560
1561// Gets the lower 64 bits of a and places it in the upper 64 bits;
1562// gets the lower 64 bits of a and places it in the lower 64 bits.
1563FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1564{
1565 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1566 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1567}
1568
1569// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
1570// the lower 64 bits; gets the lower 64 bits of a and places it in the upper 64 bits.
1571FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1572{
1573 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1574 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1575 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1576}
1577
1578// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
1579// swapped pair in the upper 64 bits; the same swapped pair is also placed in
1580// the lower 64 bits.
1581FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1582{
1583 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1584 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1585}
1586
1587FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1588{
1589 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1590 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1591 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1592}
1593
1594FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1595{
1596 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1597 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1598 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1599}
1600
1601FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1602{
1603 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1604 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1605 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1606}
1607
1608// Shuffle packed 8-bit integers in a according to shuffle control mask in the
1609// corresponding 8-bit element of b, and store the results in dst.
1610// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
1611FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1612{
1613 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
1614 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
1615 uint8x16_t idx_masked =
1616 vandq_u8(idx, vdupq_n_u8(0x8F)); // keep bit 7 (zeroing flag) and the 4 index bits
1617#if defined(__aarch64__)
1618 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1619#elif defined(__GNUC__)
1620 int8x16_t ret;
1621 // %e and %f represent the even and odd D registers
1622 // respectively.
1623 __asm__ __volatile__(
1624 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1625 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1626 : [ret] "=&w"(ret)
1627 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1628 return vreinterpretq_m128i_s8(ret);
1629#else
1630 // Generic fallback: 16-byte table lookup via vtbl2 on the low and high halves of the index.
1631 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1632 return vreinterpretq_m128i_s8(
1633 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1634 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1635#endif
1636}
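
// Example (illustrative): reversing the 16 bytes of a vector, assuming
// _mm_set_epi8 is available (its arguments run from byte 15 down to byte 0):
//   __m128i rev = _mm_shuffle_epi8(
//       a, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
// Any control byte with its most significant bit set zeroes the corresponding
// destination byte, matching PSHUFB.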
1637
1638// C equivalent:
1639// __m128i _mm_shuffle_epi32_default(__m128i a,
1640// __constrange(0, 255) int imm) {
1641// __m128i ret;
1642// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1643// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
1644// return ret;
1645// }
1646#define _mm_shuffle_epi32_default(a, imm) \
1647 __extension__({ \
1648 int32x4_t ret; \
1649 ret = vmovq_n_s32( \
1650 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
1651 ret = vsetq_lane_s32( \
1652 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
1653 ret, 1); \
1654 ret = vsetq_lane_s32( \
1655 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1656 ret, 2); \
1657 ret = vsetq_lane_s32( \
1658 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1659 ret, 3); \
1660 vreinterpretq_m128i_s32(ret); \
1661 })
1662
1663// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1664// int imm)
1665#if defined(__aarch64__)
1666#define _mm_shuffle_epi32_splat(a, imm) \
1667 __extension__({ \
1668 vreinterpretq_m128i_s32( \
1669 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1670 })
1671#else
1672#define _mm_shuffle_epi32_splat(a, imm) \
1673 __extension__({ \
1674 vreinterpretq_m128i_s32( \
1675 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1676 })
1677#endif
1678
1679// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
1680// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
1681// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
1682// __constrange(0,255) int imm)
1683#if __has_builtin(__builtin_shufflevector)
1684#define _mm_shuffle_epi32(a, imm) \
1685 __extension__({ \
1686 int32x4_t _input = vreinterpretq_s32_m128i(a); \
1687 int32x4_t _shuf = __builtin_shufflevector( \
1688 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1689 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
1690 vreinterpretq_m128i_s32(_shuf); \
1691 })
1692#else // generic
1693#define _mm_shuffle_epi32(a, imm) \
1694 __extension__({ \
1695 __m128i ret; \
1696 switch (imm) { \
1697 case _MM_SHUFFLE(1, 0, 3, 2): \
1698 ret = _mm_shuffle_epi_1032((a)); \
1699 break; \
1700 case _MM_SHUFFLE(2, 3, 0, 1): \
1701 ret = _mm_shuffle_epi_2301((a)); \
1702 break; \
1703 case _MM_SHUFFLE(0, 3, 2, 1): \
1704 ret = _mm_shuffle_epi_0321((a)); \
1705 break; \
1706 case _MM_SHUFFLE(2, 1, 0, 3): \
1707 ret = _mm_shuffle_epi_2103((a)); \
1708 break; \
1709 case _MM_SHUFFLE(1, 0, 1, 0): \
1710 ret = _mm_shuffle_epi_1010((a)); \
1711 break; \
1712 case _MM_SHUFFLE(1, 0, 0, 1): \
1713 ret = _mm_shuffle_epi_1001((a)); \
1714 break; \
1715 case _MM_SHUFFLE(0, 1, 0, 1): \
1716 ret = _mm_shuffle_epi_0101((a)); \
1717 break; \
1718 case _MM_SHUFFLE(2, 2, 1, 1): \
1719 ret = _mm_shuffle_epi_2211((a)); \
1720 break; \
1721 case _MM_SHUFFLE(0, 1, 2, 2): \
1722 ret = _mm_shuffle_epi_0122((a)); \
1723 break; \
1724 case _MM_SHUFFLE(3, 3, 3, 2): \
1725 ret = _mm_shuffle_epi_3332((a)); \
1726 break; \
1727 case _MM_SHUFFLE(0, 0, 0, 0): \
1728 ret = _mm_shuffle_epi32_splat((a), 0); \
1729 break; \
1730 case _MM_SHUFFLE(1, 1, 1, 1): \
1731 ret = _mm_shuffle_epi32_splat((a), 1); \
1732 break; \
1733 case _MM_SHUFFLE(2, 2, 2, 2): \
1734 ret = _mm_shuffle_epi32_splat((a), 2); \
1735 break; \
1736 case _MM_SHUFFLE(3, 3, 3, 3): \
1737 ret = _mm_shuffle_epi32_splat((a), 3); \
1738 break; \
1739 default: \
1740 ret = _mm_shuffle_epi32_default((a), (imm)); \
1741 break; \
1742 } \
1743 ret; \
1744 })
1745#endif
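
// Example (illustrative):
//   _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast a[0] to all lanes
//   _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); // reverse: { a[3], a[2], a[1], a[0] }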
1746
1747// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
1748// by imm.
1749// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
1750// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
1751// __constrange(0,255) int
1752// imm)
1753#define _mm_shufflelo_epi16_function(a, imm) \
1754 __extension__({ \
1755 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1756 int16x4_t lowBits = vget_low_s16(ret); \
1757 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1758 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1759 1); \
1760 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1761 2); \
1762 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1763 3); \
1764 vreinterpretq_m128i_s16(ret); \
1765 })
1766
1767// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
1768// __constrange(0,255) int imm)
1769#if __has_builtin(__builtin_shufflevector)
1770#define _mm_shufflelo_epi16(a, imm) \
1771 __extension__({ \
1772 int16x8_t _input = vreinterpretq_s16_m128i(a); \
1773 int16x8_t _shuf = __builtin_shufflevector( \
1774 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
1775 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
1776 vreinterpretq_m128i_s16(_shuf); \
1777 })
1778#else // generic
1779#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
1780#endif
1781
1782// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1783// by imm.
1784// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1785// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1786// __constrange(0,255) int
1787// imm)
1788#define _mm_shufflehi_epi16_function(a, imm) \
1789 __extension__({ \
1790 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1791 int16x4_t highBits = vget_high_s16(ret); \
1792 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1793 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1794 5); \
1795 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1796 6); \
1797 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1798 7); \
1799 vreinterpretq_m128i_s16(ret); \
1800 })
1801
1802// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
1803// __constrange(0,255) int imm)
1804#if __has_builtin(__builtin_shufflevector)
1805#define _mm_shufflehi_epi16(a, imm) \
1806 __extension__({ \
1807 int16x8_t _input = vreinterpretq_s16_m128i(a); \
1808 int16x8_t _shuf = __builtin_shufflevector( \
1809 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
1810 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
1811 (((imm) >> 6) & 0x3) + 4); \
1812 vreinterpretq_m128i_s16(_shuf); \
1813 })
1814#else // generic
1815#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
1816#endif
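
// Example (illustrative): _mm_shufflelo_epi16 permutes only words 0-3 and
// _mm_shufflehi_epi16 only words 4-7, so
//   __m128i r = _mm_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
// keeps words 0-3 of a unchanged and reverses words 4-7.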
1817
1818// Blend packed 16-bit integers from a and b using control mask imm8, and store
1819// the results in dst.
1820//
1821// FOR j := 0 to 7
1822// i := j*16
1823// IF imm8[j]
1824// dst[i+15:i] := b[i+15:i]
1825// ELSE
1826// dst[i+15:i] := a[i+15:i]
1827// FI
1828// ENDFOR
1829// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
1830// __constrange(0,255) int imm)
1831#define _mm_blend_epi16(a, b, imm) \
1832 __extension__({ \
1833 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
1834 ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
1835 ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
1836 ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
1837 ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
1838 ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
1839 ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
1840 ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
1841 uint16x8_t _mask_vec = vld1q_u16(_mask); \
1842 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
1843 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
1844 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
1845 })
1846
1847// Blend packed 8-bit integers from a and b using mask, and store the results in
1848// dst.
1849//
1850// FOR j := 0 to 15
1851// i := j*8
1852// IF mask[i+7]
1853// dst[i+7:i] := b[i+7:i]
1854// ELSE
1855// dst[i+7:i] := a[i+7:i]
1856// FI
1857// ENDFOR
1858FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1859{
1860 // Use a signed shift right to create a mask with the sign bit
1861 uint8x16_t mask =
1862 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1863 uint8x16_t a = vreinterpretq_u8_m128i(_a);
1864 uint8x16_t b = vreinterpretq_u8_m128i(_b);
1865 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1866}
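
// Example (illustrative): a signed per-byte maximum built from a comparison
// mask, assuming _mm_cmpgt_epi8 is available:
//   __m128i gt = _mm_cmpgt_epi8(b, a);       // 0xFF where b > a
//   __m128i max = _mm_blendv_epi8(a, b, gt); // take b where the mask is set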
1867
1868/* Shifts */
1869
1870
1871// Shift packed 16-bit integers in a right by imm while shifting in sign
1872// bits, and store the results in dst.
1873// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
1874FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1875{
1876 const int count = (imm & ~15) ? 15 : imm;
1877 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1878}
1879
1880// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1881// shifting in zeros.
1882//
1883// r0 := a0 << count
1884// r1 := a1 << count
1885// ...
1886// r7 := a7 << count
1887//
1888// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
1889#define _mm_slli_epi16(a, imm) \
1890 __extension__({ \
1891 __m128i ret; \
1892 if ((imm) <= 0) { \
1893 ret = a; \
1894 } else if ((imm) > 15) { \
1895 ret = _mm_setzero_si128(); \
1896 } else { \
1897 ret = vreinterpretq_m128i_s16( \
1898 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
1899 } \
1900 ret; \
1901 })
1902
1903// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1904// shifting in zeros.
1905// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
1906// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
1907FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
1908{
1909 if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1910 return a;
1911 if (imm > 31) /* TODO: add unlikely macro */
1912 return _mm_setzero_si128();
1913 return vreinterpretq_m128i_s32(
1914 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
1915}
1916
1917// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
1918// store the results in dst.
1919FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
1920{
1921 if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1922 return a;
1923 if (imm > 63) /* TODO: add unlikely macro */
1924 return _mm_setzero_si128();
1925 return vreinterpretq_m128i_s64(
1926 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
1927}
1928
1929// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
1930// store the results in dst.
1931//
1932// FOR j := 0 to 7
1933// i := j*16
1934// IF imm8[7:0] > 15
1935// dst[i+15:i] := 0
1936// ELSE
1937// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
1938// FI
1939// ENDFOR
1940//
1941// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
1942#define _mm_srli_epi16(a, imm) \
1943 __extension__({ \
1944 __m128i ret; \
1945 if ((imm) == 0) { \
1946 ret = a; \
1947 } else if (0 < (imm) && (imm) < 16) { \
1948 ret = vreinterpretq_m128i_u16( \
1949 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
1950 } else { \
1951 ret = _mm_setzero_si128(); \
1952 } \
1953 ret; \
1954 })
1955
1956// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
1957// store the results in dst.
1958//
1959// FOR j := 0 to 3
1960// i := j*32
1961// IF imm8[7:0] > 31
1962// dst[i+31:i] := 0
1963// ELSE
1964// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
1965// FI
1966// ENDFOR
1967//
1968// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
1969// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
1970#define _mm_srli_epi32(a, imm) \
1971 __extension__({ \
1972 __m128i ret; \
1973 if ((imm) == 0) { \
1974 ret = a; \
1975 } else if (0 < (imm) && (imm) < 32) { \
1976 ret = vreinterpretq_m128i_u32( \
1977 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
1978 } else { \
1979 ret = _mm_setzero_si128(); \
1980 } \
1981 ret; \
1982 })
1983
1984// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
1985// store the results in dst.
1986//
1987// FOR j := 0 to 1
1988// i := j*64
1989// IF imm8[7:0] > 63
1990// dst[i+63:i] := 0
1991// ELSE
1992// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
1993// FI
1994// ENDFOR
1995//
1996// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
1997#define _mm_srli_epi64(a, imm) \
1998 __extension__({ \
1999 __m128i ret; \
2000 if ((imm) == 0) { \
2001 ret = a; \
2002 } else if (0 < (imm) && (imm) < 64) { \
2003 ret = vreinterpretq_m128i_u64( \
2004 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
2005 } else { \
2006 ret = _mm_setzero_si128(); \
2007 } \
2008 ret; \
2009 })
2010
2011// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2012// and store the results in dst.
2013//
2014// FOR j := 0 to 3
2015// i := j*32
2016// IF imm8[7:0] > 31
2017// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2018// ELSE
2019// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2020// FI
2021// ENDFOR
2022//
2023// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2024// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
2025#define _mm_srai_epi32(a, imm) \
2026 __extension__({ \
2027 __m128i ret; \
2028 if ((imm) == 0) { \
2029 ret = a; \
2030 } else if (0 < (imm) && (imm) < 32) { \
2031 ret = vreinterpretq_m128i_s32( \
2032 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
2033 } else { \
2034 ret = vreinterpretq_m128i_s32( \
2035 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
2036 } \
2037 ret; \
2038 })
2039
2040// Shifts the 128-bit value in a right by imm bytes while shifting in
2041// zeros. imm must be an immediate.
2042//
2043// r := srl(a, imm*8)
2044//
2045// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2046// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
2047#define _mm_srli_si128(a, imm) \
2048 __extension__({ \
2049 __m128i ret; \
2050 if ((imm) <= 0) { \
2051 ret = a; \
2052 } else if ((imm) > 15) { \
2053 ret = _mm_setzero_si128(); \
2054 } else { \
2055 ret = vreinterpretq_m128i_s8( \
2056 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
2057 } \
2058 ret; \
2059 })
2060
2061// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2062// must be an immediate.
2063//
2064// r := a << (imm * 8)
2065//
2066// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2067// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
2068#define _mm_slli_si128(a, imm) \
2069 __extension__({ \
2070 __m128i ret; \
2071 if ((imm) <= 0) { \
2072 ret = a; \
2073 } else if ((imm) > 15) { \
2074 ret = _mm_setzero_si128(); \
2075 } else { \
2076 ret = vreinterpretq_m128i_s8(vextq_s8( \
2077 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
2078 } \
2079 ret; \
2080 })
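
// Example (illustrative), viewing the register as bytes 0..15:
//   _mm_srli_si128(a, 4); // { a4, a5, ..., a15, 0, 0, 0, 0 }
//   _mm_slli_si128(a, 4); // { 0, 0, 0, 0, a0, a1, ..., a11 }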
2081
2082// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2083// shifting in zeros.
2084//
2085// r0 := a0 << count
2086// r1 := a1 << count
2087// ...
2088// r7 := a7 << count
2089//
2090// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
2091FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2092{
2093 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2094 if (c > 15)
2095 return _mm_setzero_si128();
2096
2097 int16x8_t vc = vdupq_n_s16((int16_t) c);
2098 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2099}
2100
2101// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2102// shifting in zeros.
2103//
2104// r0 := a0 << count
2105// r1 := a1 << count
2106// r2 := a2 << count
2107// r3 := a3 << count
2108//
2109// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
2110FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2111{
2112 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2113 if (c > 31)
2114 return _mm_setzero_si128();
2115
2116 int32x4_t vc = vdupq_n_s32((int32_t) c);
2117 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2118}
2119
2120// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2121// shifting in zeros.
2122//
2123// r0 := a0 << count
2124// r1 := a1 << count
2125//
2126// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
2127FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2128{
2129 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2130 if (c > 63)
2131 return _mm_setzero_si128();
2132
2133 int64x2_t vc = vdupq_n_s64((int64_t) c);
2134 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2135}
2136
2137// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2138// while shifting in zeros.
2139//
2140// r0 := srl(a0, count)
2141// r1 := srl(a1, count)
2142// ...
2143// r7 := srl(a7, count)
2144//
2145// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
2146FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2147{
2148 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2149 if (c > 15)
2150 return _mm_setzero_si128();
2151
2152 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2153 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2154}
2155
2156// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2157// while shifting in zeros.
2158//
2159// r0 := srl(a0, count)
2160// r1 := srl(a1, count)
2161// r2 := srl(a2, count)
2162// r3 := srl(a3, count)
2163//
2164// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
2165FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2166{
2167 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2168 if (c > 31)
2169 return _mm_setzero_si128();
2170
2171 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2172 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2173}
2174
2175// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2176// while shifting in zeros.
2177//
2178// r0 := srl(a0, count)
2179// r1 := srl(a1, count)
2180//
2181// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
2182FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2183{
2184 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2185 if (c > 63)
2186 return _mm_setzero_si128();
2187
2188 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2189 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2190}
2191
2192// NEON does not provide a version of this function.
2193// Creates a 16-bit mask from the most significant bits of the 16 signed or
2194// unsigned 8-bit integers in a and zero extends the upper bits.
2195// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
2196FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2197{
2198#if defined(__aarch64__)
2199 uint8x16_t input = vreinterpretq_u8_m128i(a);
2200 const int8_t ALIGN_STRUCT(16)
2201 xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2202 const uint8x16_t mask_and = vdupq_n_u8(0x80);
2203 const int8x16_t mask_shift = vld1q_s8(xr);
2204 const uint8x16_t mask_result =
2205 vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2206 uint8x8_t lo = vget_low_u8(mask_result);
2207 uint8x8_t hi = vget_high_u8(mask_result);
2208
2209 return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2210#else
2211 // Use increasingly wide shifts+adds to collect the sign bits
2212 // together.
2213 // Since the widening shifts would be rather confusing to follow in little
2214 // endian, everything will be illustrated in big endian order instead. This
2215 // has a different result - the bits would actually be reversed on a big
2216 // endian machine.
2217
2218 // Starting input (only half the elements are shown):
2219 // 89 ff 1d c0 00 10 99 33
2220 uint8x16_t input = vreinterpretq_u8_m128i(a);
2221
2222 // Shift out everything but the sign bits with an unsigned shift right.
2223 //
2224 // Bytes of the vector::
2225 // 89 ff 1d c0 00 10 99 33
2226 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
2227 // | | | | | | | |
2228 // 01 01 00 01 00 00 01 00
2229 //
2230 // Bits of first important lane(s):
2231 // 10001001 (89)
2232 // \______
2233 // |
2234 // 00000001 (01)
2235 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2236
2237 // Merge the even lanes together with a 16-bit unsigned shift right + add.
2238 // 'xx' represents garbage data which will be ignored in the final result.
2239 // In the important bytes, the add functions like a binary OR.
2240 //
2241 // 01 01 00 01 00 00 01 00
2242 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
2243 // \| \| \| \|
2244 // xx 03 xx 01 xx 00 xx 02
2245 //
2246 // 00000001 00000001 (01 01)
2247 // \_______ |
2248 // \|
2249 // xxxxxxxx xxxxxx11 (xx 03)
2250 uint32x4_t paired16 =
2251 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2252
2253 // Repeat with a wider 32-bit shift + add.
2254 // xx 03 xx 01 xx 00 xx 02
2255 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
2256 // 14))
2257 // \| \|
2258 // xx xx xx 0d xx xx xx 02
2259 //
2260 // 00000011 00000001 (03 01)
2261 // \\_____ ||
2262 // '----.\||
2263 // xxxxxxxx xxxx1101 (xx 0d)
2264 uint64x2_t paired32 =
2265 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2266
2267 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2268 // lanes. xx xx xx 0d xx xx xx 02
2269 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
2270 // 28))
2271 // \|
2272 // xx xx xx xx xx xx xx d2
2273 //
2274 // 00001101 00000010 (0d 02)
2275 // \ \___ | |
2276 // '---. \| |
2277 // xxxxxxxx 11010010 (xx d2)
2278 uint8x16_t paired64 =
2279 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2280
2281 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2282 // xx xx xx xx xx xx xx d2
2283 // || return paired64[0]
2284 // d2
2285 // Note: Little endian would return the correct value 4b (01001011) instead.
2286 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2287#endif
2288}
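
// Example (illustrative): find the first byte of x equal to c, assuming
// _mm_set1_epi8 and _mm_cmpeq_epi8 are available:
//   int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(x, _mm_set1_epi8(c)));
//   if (mask != 0) {
//       int first = __builtin_ctz(mask); // index of the first matching byte
//   }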
2289
2290// Copy the lower 64-bit integer in a to dst.
2291//
2292// dst[63:0] := a[63:0]
2293//
2294// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
2295FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2296{
2297 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2298}
2299
2300// Copy the 64-bit integer a to the lower element of dst, and zero the upper
2301// element.
2302//
2303// dst[63:0] := a[63:0]
2304// dst[127:64] := 0
2305//
2306// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
2307FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2308{
2309 return vreinterpretq_m128i_s64(
2310 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2311}
2312
2313// NEON does not provide this method
2314// Creates a 4-bit mask from the most significant bits of the four
2315// single-precision, floating-point values.
2316// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2317FORCE_INLINE int _mm_movemask_ps(__m128 a)
2318{
2319 uint32x4_t input = vreinterpretq_u32_m128(a);
2320#if defined(__aarch64__)
2321 static const int32x4_t shift = {0, 1, 2, 3};
2322 uint32x4_t tmp = vshrq_n_u32(input, 31);
2323 return vaddvq_u32(vshlq_u32(tmp, shift));
2324#else
2325 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2326 // Shift out everything but the sign bits with a 32-bit unsigned shift
2327 // right.
2328 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2329 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2330 uint8x16_t paired =
2331 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2332 // Extract the result.
2333 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2334#endif
2335}
2336
2337// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2338// all 1's, and return 1 if the result is zero, otherwise return 0.
2339// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
2340FORCE_INLINE int _mm_test_all_ones(__m128i a)
2341{
2342 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2343 ~(uint64_t) 0;
2344}
2345
2346// Compute the bitwise AND of 128 bits (representing integer data) in a and
2347// mask, and return 1 if the result is zero, otherwise return 0.
2348// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
2349FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2350{
2351 int64x2_t a_and_mask =
2352 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2353 return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2354 : 1;
2355}
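
// Example (illustrative): passing the same vector as both arguments checks
// whether the whole register is zero:
//   int is_zero = _mm_test_all_zeros(v, v); // 1 iff every bit of v is 0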
2356
2357/* Math operations */
2358
2359// Subtracts the four single-precision, floating-point values of a and b.
2360//
2361// r0 := a0 - b0
2362// r1 := a1 - b1
2363// r2 := a2 - b2
2364// r3 := a3 - b3
2365//
2366// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2367FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2368{
2369 return vreinterpretq_m128_f32(
2370 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2371}
2372
2373// Subtract the lower single-precision (32-bit) floating-point element in b from
2374// the lower single-precision (32-bit) floating-point element in a, store the
2375// result in the lower element of dst, and copy the upper 3 packed elements from
2376// a to the upper elements of dst.
2377//
2378// dst[31:0] := a[31:0] - b[31:0]
2379// dst[127:32] := a[127:32]
2380//
2381// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2382FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2383{
2384 return _mm_move_ss(a, _mm_sub_ps(a, b));
2385}
2386
2387// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2388// and store the results in dst.
2389// r0 := a0 - b0
2390// r1 := a1 - b1
2391FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2392{
2393 return vreinterpretq_m128i_s64(
2394 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2395}
2396
2397// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2398// unsigned 32-bit integers of a.
2399//
2400// r0 := a0 - b0
2401// r1 := a1 - b1
2402// r2 := a2 - b2
2403// r3 := a3 - b3
2404//
2405// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
2406FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2407{
2408 return vreinterpretq_m128i_s32(
2409 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2410}
2411
2412FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2413{
2414 return vreinterpretq_m128i_s16(
2415 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2416}
2417
2418FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2419{
2420 return vreinterpretq_m128i_s8(
2421 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2422}
2423
2424// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2425//
2426// dst[63:0] := a[63:0] - b[63:0]
2427//
2428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
2429FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2430{
2431 return vreinterpret_m64_s64(
2432 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2433}
2434
2435// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
2436// integers of a and saturates.
2437// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
2438FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2439{
2440 return vreinterpretq_m128i_u16(
2441 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2442}
2443
2444// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2445// integers of a and saturates.
2446//
2447// r0 := UnsignedSaturate(a0 - b0)
2448// r1 := UnsignedSaturate(a1 - b1)
2449// ...
2450// r15 := UnsignedSaturate(a15 - b15)
2451//
2452// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
2453FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2454{
2455 return vreinterpretq_m128i_u8(
2456 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2457}
2458
2459// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2460// of a and saturates.
2461//
2462// r0 := SignedSaturate(a0 - b0)
2463// r1 := SignedSaturate(a1 - b1)
2464// ...
2465// r15 := SignedSaturate(a15 - b15)
2466//
2467// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
2468FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2469{
2470 return vreinterpretq_m128i_s8(
2471 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2472}
2473
2474// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2475// of a and saturates.
2476//
2477// r0 := SignedSaturate(a0 - b0)
2478// r1 := SignedSaturate(a1 - b1)
2479// ...
2480// r7 := SignedSaturate(a7 - b7)
2481//
2482// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
2483FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2484{
2485 return vreinterpretq_m128i_s16(
2486 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2487}
2488
2489FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2490{
2491 return vreinterpretq_m128i_u16(
2492 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2493}
2494
2495// Negate packed 8-bit integers in a when the corresponding signed
2496// 8-bit integer in b is negative, and store the results in dst.
2497// Elements in dst are zeroed out when the corresponding element
2498// in b is zero.
2499//
2500// for i in 0..15
2501// if b[i] < 0
2502// r[i] := -a[i]
2503// else if b[i] == 0
2504// r[i] := 0
2505// else
2506// r[i] := a[i]
2507// fi
2508// done
2509FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2510{
2511 int8x16_t a = vreinterpretq_s8_m128i(_a);
2512 int8x16_t b = vreinterpretq_s8_m128i(_b);
2513
2514 // signed shift right: faster than vclt
2515 // (b < 0) ? 0xFF : 0
2516 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2517
2518 // (b == 0) ? 0xFF : 0
2519#if defined(__aarch64__)
2520 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2521#else
2522 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2523#endif
2524
2525 // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
2526 // based on ltMask
2527 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2528 // res = masked & (~zeroMask)
2529 int8x16_t res = vbicq_s8(masked, zeroMask);
2530
2531 return vreinterpretq_m128i_s8(res);
2532}
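
// Example (illustrative), element-wise:
//   a = {  5, -3, 7,  9, ... }
//   b = { -1,  0, 4, -2, ... }
//   _mm_sign_epi8(a, b) = { -5, 0, 7, -9, ... }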
2533
2534// Negate packed 16-bit integers in a when the corresponding signed
2535// 16-bit integer in b is negative, and store the results in dst.
2536// Elements in dst are zeroed out when the corresponding element
2537// in b is zero.
2538//
2539// for i in 0..7
2540// if b[i] < 0
2541// r[i] := -a[i]
2542// else if b[i] == 0
2543// r[i] := 0
2544// else
2545// r[i] := a[i]
2546// fi
2547// done
2548FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2549{
2550 int16x8_t a = vreinterpretq_s16_m128i(_a);
2551 int16x8_t b = vreinterpretq_s16_m128i(_b);
2552
2553 // signed shift right: faster than vclt
2554 // (b < 0) ? 0xFFFF : 0
2555 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2556 // (b == 0) ? 0xFFFF : 0
2557#if defined(__aarch64__)
2558 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2559#else
2560 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2561#endif
2562
2563 // bitwise select either a or negative 'a' (vnegq_s16(a) returns negative
2564 // 'a') based on ltMask
2565 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2566 // res = masked & (~zeroMask)
2567 int16x8_t res = vbicq_s16(masked, zeroMask);
2568 return vreinterpretq_m128i_s16(res);
2569}
2570
2571// Negate packed 32-bit integers in a when the corresponding signed
2572// 32-bit integer in b is negative, and store the results in dst.
2573// Elements in dst are zeroed out when the corresponding element
2574// in b is zero.
2575//
2576// for i in 0..3
2577// if b[i] < 0
2578// r[i] := -a[i]
2579// else if b[i] == 0
2580// r[i] := 0
2581// else
2582// r[i] := a[i]
2583// fi
2584// done
2585FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2586{
2587 int32x4_t a = vreinterpretq_s32_m128i(_a);
2588 int32x4_t b = vreinterpretq_s32_m128i(_b);
2589
2590 // signed shift right: faster than vclt
2591 // (b < 0) ? 0xFFFFFFFF : 0
2592 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2593
2594 // (b == 0) ? 0xFFFFFFFF : 0
2595#if defined(__aarch64__)
2596 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2597#else
2598 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2599#endif
2600
2601 // bitwise select either a or negative 'a' (vnegq_s32(a) returns negative
2602 // 'a') based on ltMask
2603 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2604 // res = masked & (~zeroMask)
2605 int32x4_t res = vbicq_s32(masked, zeroMask);
2606 return vreinterpretq_m128i_s32(res);
2607}
2608
2609// Negate packed 16-bit integers in a when the corresponding signed 16-bit
2610// integer in b is negative, and store the results in dst. Elements in dst are
2611// zeroed out when the corresponding element in b is zero.
2612//
2613// FOR j := 0 to 3
2614// i := j*16
2615// IF b[i+15:i] < 0
2616// dst[i+15:i] := -(a[i+15:i])
2617// ELSE IF b[i+15:i] == 0
2618// dst[i+15:i] := 0
2619// ELSE
2620// dst[i+15:i] := a[i+15:i]
2621// FI
2622// ENDFOR
2623//
2624// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
2625FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2626{
2627 int16x4_t a = vreinterpret_s16_m64(_a);
2628 int16x4_t b = vreinterpret_s16_m64(_b);
2629
2630 // signed shift right: faster than vclt
2631 // (b < 0) ? 0xFFFF : 0
2632 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2633
2634 // (b == 0) ? 0xFFFF : 0
2635#if defined(__aarch64__)
2636 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2637#else
2638 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2639#endif
2640
2641 // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
2642 // based on ltMask
2643 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2644 // res = masked & (~zeroMask)
2645 int16x4_t res = vbic_s16(masked, zeroMask);
2646
2647 return vreinterpret_m64_s16(res);
2648}
2649
2650// Negate packed 32-bit integers in a when the corresponding signed 32-bit
2651// integer in b is negative, and store the results in dst. Elements in dst are
2652// zeroed out when the corresponding element in b is zero.
2653//
2654// FOR j := 0 to 1
2655// i := j*32
2656// IF b[i+31:i] < 0
2657// dst[i+31:i] := -(a[i+31:i])
2658// ELSE IF b[i+31:i] == 0
2659// dst[i+31:i] := 0
2660// ELSE
2661// dst[i+31:i] := a[i+31:i]
2662// FI
2663// ENDFOR
2664//
2665// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
2666FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2667{
2668 int32x2_t a = vreinterpret_s32_m64(_a);
2669 int32x2_t b = vreinterpret_s32_m64(_b);
2670
2671 // signed shift right: faster than vclt
2672 // (b < 0) ? 0xFFFFFFFF : 0
2673 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2674
2675 // (b == 0) ? 0xFFFFFFFF : 0
2676#if defined(__aarch64__)
2677 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2678#else
2679 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2680#endif
2681
2682 // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
2683 // based on ltMask
2684 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2685 // res = masked & (~zeroMask)
2686 int32x2_t res = vbic_s32(masked, zeroMask);
2687
2688 return vreinterpret_m64_s32(res);
2689}
2690
2691// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
2692// in b is negative, and store the results in dst. Elements in dst are zeroed out
2693// when the corresponding element in b is zero.
2694//
2695// FOR j := 0 to 7
2696// i := j*8
2697// IF b[i+7:i] < 0
2698// dst[i+7:i] := -(a[i+7:i])
2699// ELSE IF b[i+7:i] == 0
2700// dst[i+7:i] := 0
2701// ELSE
2702// dst[i+7:i] := a[i+7:i]
2703// FI
2704// ENDFOR
2705//
2706// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
2707FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2708{
2709 int8x8_t a = vreinterpret_s8_m64(_a);
2710 int8x8_t b = vreinterpret_s8_m64(_b);
2711
2712 // signed shift right: faster than vclt
2713 // (b < 0) ? 0xFF : 0
2714 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2715
2716 // (b == 0) ? 0xFF : 0
2717#if defined(__aarch64__)
2718 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2719#else
2720 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2721#endif
2722
2723 // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
2724 // based on ltMask
2725 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2726 // res = masked & (~zeroMask)
2727 int8x8_t res = vbic_s8(masked, zeroMask);
2728
2729 return vreinterpret_m64_s8(res);
2730}
2731
2732// Average packed unsigned 16-bit integers in a and b, and store the results in
2733// dst.
2734//
2735// FOR j := 0 to 3
2736// i := j*16
2737// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2738// ENDFOR
2739//
2740// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
2741FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2742{
2743 return vreinterpret_m64_u16(
2744 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2745}
2746
2747// Average packed unsigned 8-bit integers in a and b, and store the results in
2748// dst.
2749//
2750// FOR j := 0 to 7
2751// i := j*8
2752// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2753// ENDFOR
2754//
2755// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
2756FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2757{
2758 return vreinterpret_m64_u8(
2759 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2760}
2761
2762// Average packed unsigned 8-bit integers in a and b, and store the results in
2763// dst.
2764//
2765// FOR j := 0 to 7
2766// i := j*8
2767// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2768// ENDFOR
2769//
2770// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2771#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2772
2773// Average packed unsigned 16-bit integers in a and b, and store the results in
2774// dst.
2775//
2776// FOR j := 0 to 3
2777// i := j*16
2778// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2779// ENDFOR
2780//
2781// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2782#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2783
2784// Computes the average of the 16 unsigned 8-bit integers in a and the 16
2785// unsigned 8-bit integers in b and rounds.
2786//
2787// r0 := (a0 + b0) / 2
2788// r1 := (a1 + b1) / 2
2789// ...
2790// r15 := (a15 + b15) / 2
2791//
2792// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
2793FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2794{
2795 return vreinterpretq_m128i_u8(
2796 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2797}
2798
2799// Computes the average of the 8 unsigned 16-bit integers in a and the 8
2800// unsigned 16-bit integers in b and rounds.
2801//
2802// r0 := (a0 + b0) / 2
2803// r1 := (a1 + b1) / 2
2804// ...
2805// r7 := (a7 + b7) / 2
2806//
2807// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2808FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2809{
2810 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
2811 vreinterpretq_u16_m128i(b));
2812}
2813
2814// Adds the four single-precision, floating-point values of a and b.
2815//
2816// r0 := a0 + b0
2817// r1 := a1 + b1
2818// r2 := a2 + b2
2819// r3 := a3 + b3
2820//
2821// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
2822FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2823{
2824 return vreinterpretq_m128_f32(
2825 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2826}
2827
2828// Add packed double-precision (64-bit) floating-point elements in a and b, and
2829// store the results in dst.
2830// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2831FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2832{
2833#if defined(__aarch64__)
2834 return vreinterpretq_m128d_f64(
2835 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2836#else
2837 double *da = (double *) &a;
2838 double *db = (double *) &b;
2839 double c[2];
2840 c[0] = da[0] + db[0];
2841 c[1] = da[1] + db[1];
2842 return vld1q_f32((float32_t *) c);
2843#endif
2844}
2845
2846// Add 64-bit integers a and b, and store the result in dst.
2847//
2848// dst[63:0] := a[63:0] + b[63:0]
2849//
2850// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2851FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2852{
2853 return vreinterpret_m64_s64(
2854 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2855}
2856
2857// Adds the scalar single-precision, floating-point values of a and b.
2858// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
2859FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2860{
2861 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2862 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2863 // the upper values in the result must be the remnants of <a>.
2864 return vreinterpretq_m128_f32(vaddq_f32(a, value));
2865}
2866
2867// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2868// unsigned 64-bit integers in b.
2869// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2870FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2871{
2872 return vreinterpretq_m128i_s64(
2873 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2874}
2875
2876// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2877// unsigned 32-bit integers in b.
2878//
2879// r0 := a0 + b0
2880// r1 := a1 + b1
2881// r2 := a2 + b2
2882// r3 := a3 + b3
2883//
2884// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2885FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2886{
2887 return vreinterpretq_m128i_s32(
2888 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2889}
2890
2891// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2892// unsigned 16-bit integers in b.
2893// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2894FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2895{
2896 return vreinterpretq_m128i_s16(
2897 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2898}
2899
2900// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2901// unsigned 8-bit integers in b.
2902// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2903FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2904{
2905 return vreinterpretq_m128i_s8(
2906 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2907}
2908
2909// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2910// and saturates.
2911//
2912// r0 := SignedSaturate(a0 + b0)
2913// r1 := SignedSaturate(a1 + b1)
2914// ...
2915// r7 := SignedSaturate(a7 + b7)
2916//
2917// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
2918FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2919{
2920 return vreinterpretq_m128i_s16(
2921 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2922}
2923
2924// Add packed signed 8-bit integers in a and b using saturation, and store the
2925// results in dst.
2926//
2927// FOR j := 0 to 15
2928// i := j*8
2929// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2930// ENDFOR
2931//
2932// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
2933FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2934{
2935 return vreinterpretq_m128i_s8(
2936 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2937}
2938
2939// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
2940// b and saturates.
2941// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
2942FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2943{
2944 return vreinterpretq_m128i_u8(
2945 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2946}
2947
2948// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
2949// unsigned 16-bit integers from b.
2950//
2951// r0 := (a0 * b0)[15:0]
2952// r1 := (a1 * b1)[15:0]
2953// ...
2954// r7 := (a7 * b7)[15:0]
2955//
2956// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
2957FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
2958{
2959 return vreinterpretq_m128i_s16(
2960 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2961}
2962
2963// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
2964// unsigned 32-bit integers from b.
2965// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
2966FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
2967{
2968 return vreinterpretq_m128i_s32(
2969 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2970}
2971
2972// Multiply the packed unsigned 16-bit integers in a and b, producing
2973// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2974// integers in dst.
2975//
2976// FOR j := 0 to 3
2977// i := j*16
2978// tmp[31:0] := a[i+15:i] * b[i+15:i]
2979// dst[i+15:i] := tmp[31:16]
2980// ENDFOR
2981//
2982// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2983#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2984
2985// Multiplies the four single-precision, floating-point values of a and b.
2986//
2987// r0 := a0 * b0
2988// r1 := a1 * b1
2989// r2 := a2 * b2
2990// r3 := a3 * b3
2991//
2992// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2993FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2994{
2995 return vreinterpretq_m128_f32(
2996 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2997}
2998
2999// Multiply the lower single-precision (32-bit) floating-point element in a and
3000// b, store the result in the lower element of dst, and copy the upper 3 packed
3001// elements from a to the upper elements of dst.
3002//
3003// dst[31:0] := a[31:0] * b[31:0]
3004// dst[127:32] := a[127:32]
3005//
3006// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
3007FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3008{
3009 return _mm_move_ss(a, _mm_mul_ps(a, b));
3010}
3011
3012// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3013// a and b, and store the unsigned 64-bit results in dst.
3014//
3015// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3016// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
3017FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3018{
3019 // vmull_u32 upcasts instead of masking, so we downcast.
3020 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3021 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3022 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3023}
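
// Example (illustrative): only the even 32-bit lanes (0 and 2) take part:
//   a = { a0, a1, a2, a3 }, b = { b0, b1, b2, b3 }
//   _mm_mul_epu32(a, b) = { (uint64_t) a0 * b0, (uint64_t) a2 * b2 }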
3024
3025// Multiply the low unsigned 32-bit integers from a and b, and store the
3026// unsigned 64-bit result in dst.
3027//
3028// dst[63:0] := a[31:0] * b[31:0]
3029//
3030// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
3031FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3032{
3033 return vreinterpret_m64_u64(vget_low_u64(
3034 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3035}
3036
3037// Multiply the low signed 32-bit integers from each packed 64-bit element in
3038// a and b, and store the signed 64-bit results in dst.
3039//
3040// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3041// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
3042FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3043{
3044 // vmull_s32 upcasts instead of masking, so we downcast.
3045 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3046 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3047 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3048}
3049
3050// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3051// integers from b.
3052//
3053// r0 := (a0 * b0) + (a1 * b1)
3054// r1 := (a2 * b2) + (a3 * b3)
3055// r2 := (a4 * b4) + (a5 * b5)
3056// r3 := (a6 * b6) + (a7 * b7)
3057// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
3058FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3059{
3060 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3061 vget_low_s16(vreinterpretq_s16_m128i(b)));
3062 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3063 vget_high_s16(vreinterpretq_s16_m128i(b)));
3064
3065 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3066 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3067
3068 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3069}
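
// Example (illustrative): an 8-lane 16-bit dot product, reducing the four
// 32-bit pair sums with shuffles and adds (assuming _mm_cvtsi128_si32 is
// available):
//   __m128i p = _mm_madd_epi16(a, b); // { a0*b0+a1*b1, ..., a6*b6+a7*b7 }
//   __m128i s = _mm_add_epi32(p, _mm_shuffle_epi32(p, _MM_SHUFFLE(1, 0, 3, 2)));
//   s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
//   int dot = _mm_cvtsi128_si32(s);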
3070
3071// Multiply packed signed 16-bit integers in a and b, producing intermediate
3072// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3073// the packed 16-bit integers in dst.
3074//
3075// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3076// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3077// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3078// ...
3079// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
3080FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3081{
3082 // Has issues due to saturation
3083 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3084
3085 // Multiply
3086 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3087 vget_low_s16(vreinterpretq_s16_m128i(b)));
3088 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3089 vget_high_s16(vreinterpretq_s16_m128i(b)));
3090
3091 // Rounding narrowing shift right
3092 // narrow = (int16_t)((mul + 16384) >> 15);
3093 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3094 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3095
3096 // Join together
3097 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3098}
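
// Example (illustrative), treating each lane as a Q15 fixed-point value:
//   0x4000 (0.5) * 0x2000 (0.25):
//   (0x4000 * 0x2000 + 0x4000) >> 15 = 0x1000, i.e. 0.125 in Q15.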
3099
3100// Vertically multiply each unsigned 8-bit integer from a with the corresponding
3101// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3102// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3103// and pack the saturated results in dst.
3104//
3105// FOR j := 0 to 7
3106// i := j*16
3107// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3108// a[i+7:i]*b[i+7:i] )
3109// ENDFOR
3110FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3111{
3112#if defined(__aarch64__)
3113 uint8x16_t a = vreinterpretq_u8_m128i(_a);
3114 int8x16_t b = vreinterpretq_s8_m128i(_b);
3115 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3116 vmovl_s8(vget_low_s8(b)));
3117 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3118 vmovl_s8(vget_high_s8(b)));
3119 return vreinterpretq_m128i_s16(
3120 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3121#else
3122 // This would be much simpler if x86 would choose to zero extend OR sign
3123 // extend, not both. This could probably be optimized better.
3124 uint16x8_t a = vreinterpretq_u16_m128i(_a);
3125 int16x8_t b = vreinterpretq_s16_m128i(_b);
3126
3127 // Zero extend a
3128 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3129 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3130
3131 // Sign extend by shifting left then shifting right.
3132 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3133 int16x8_t b_odd = vshrq_n_s16(b, 8);
3134
3135 // multiply
3136 int16x8_t prod1 = vmulq_s16(a_even, b_even);
3137 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3138
3139 // saturated add
3140 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3141#endif
3142}
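
// Usage sketch (illustrative values): a is treated as unsigned bytes, b as
// signed bytes; adjacent byte products are summed with signed saturation.
//
//   __m128i u = _mm_set1_epi8(10);   // unsigned bytes
//   __m128i s = _mm_set1_epi8(-3);   // signed bytes
//   __m128i r = _mm_maddubs_epi16(u, s);
//   // each 16-bit lane = 10 * (-3) + 10 * (-3) = -60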
3143
3144// Computes the fused multiply-add of 32-bit floating-point numbers.
3145//
3146// Return Value
3147// Multiplies a and b, and adds c to the temporary result before returning it.
3148// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
3149FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3150{
3151#if defined(__aarch64__)
3152 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3153 vreinterpretq_f32_m128(b),
3154 vreinterpretq_f32_m128(a)));
3155#else
3156 return _mm_add_ps(_mm_mul_ps(a, b), c);
3157#endif
3158}
3159
3160// Alternatively add and subtract packed single-precision (32-bit)
3161// floating-point elements in a to/from packed elements in b, and store the
3162// results in dst.
3163//
3164// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
3165FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3166{
3167 __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3168 return _mm_fmadd_ps(b, mask, a);
3169}
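
// Usage sketch (illustrative values): even lanes are subtracted, odd lanes are
// added.
//
//   __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes 1, 2, 3, 4
//   __m128 y = _mm_set1_ps(0.5f);
//   __m128 r = _mm_addsub_ps(x, y);
//   // r = { 1.0-0.5, 2.0+0.5, 3.0-0.5, 4.0+0.5 } = { 0.5, 2.5, 2.5, 4.5 }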
3170
3171// Compute the absolute differences of packed unsigned 8-bit integers in a and
3172// b, then horizontally sum each consecutive 8 differences to produce two
3173// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3174// 16 bits of 64-bit elements in dst.
3175// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
3176FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3177{
3178 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3179 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3180 uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3181 uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3182 return (__m128i) vsetq_lane_u16(r4, r, 4);
3183}
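
// Usage sketch (illustrative values): sum of absolute differences over each
// 8-byte half, a common building block for motion-estimation style metrics.
//
//   __m128i x = _mm_set1_epi8(9);
//   __m128i y = _mm_set1_epi8(5);
//   __m128i r = _mm_sad_epu8(x, y);
//   // both 64-bit lanes of r hold 8 * |9 - 5| = 32 in their low 16 bits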
3184
3185// Compute the absolute differences of packed unsigned 8-bit integers in a and
3186// b, then horizontally sum the 8 differences to produce a single unsigned
3187// 16-bit integer, and store it in the low 16 bits of dst (the upper bits of
3188// dst are zeroed).
3189// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
3190FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3191{
3192 uint16x4_t t =
3193 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3194 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3195 return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3196}
3197
3198// Compute the absolute differences of packed unsigned 8-bit integers in a and
3199// b, then horizontally sum the 8 differences to produce a single unsigned
3200// 16-bit integer, and store it in the low 16 bits of dst (the upper bits of
3201// dst are zeroed).
3202//
3203// FOR j := 0 to 7
3204// i := j*8
3205// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3206// ENDFOR
3207// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
3208//              tmp[47:40] + tmp[55:48] + tmp[63:56]
// dst[63:16] := 0
3209//
3210// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3211#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3212
3213// Divides the four single-precision, floating-point values of a by those of b.
3214//
3215// r0 := a0 / b0
3216// r1 := a1 / b1
3217// r2 := a2 / b2
3218// r3 := a3 / b3
3219//
3220// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
3221FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3222{
3223#if defined(__aarch64__)
3224 return vreinterpretq_m128_f32(
3225 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3226#else
3227 float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3228 float32x4_t recip1 =
3229 vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3230 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3231#endif
3232}
3233
3234// Divides the scalar single-precision floating point value of a by b.
3235// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
3236FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3237{
3238 float32_t value =
3239 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3240 return vreinterpretq_m128_f32(
3241 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3242}
3243
3244// Computes the approximations of reciprocals of the four single-precision,
3245// floating-point values of a.
3246// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
3247FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3248{
3249 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3250 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3251 return vreinterpretq_m128_f32(recip);
3252}
3253
3254// Compute the approximate reciprocal of the lower single-precision (32-bit)
3255// floating-point element in a, store the result in the lower element of dst,
3256// and copy the upper 3 packed elements from a to the upper elements of dst. The
3257// maximum relative error for this approximation is less than 1.5*2^-12.
3258//
3259// dst[31:0] := (1.0 / a[31:0])
3260// dst[127:32] := a[127:32]
3261//
3262// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
3263FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3264{
3265 return _mm_move_ss(a, _mm_rcp_ps(a));
3266}
3267
3268// Computes the approximations of square roots of the four single-precision,
3269// floating-point values of a. On ARMv7 this is approximated by computing the
3270// reciprocal square roots and then taking their reciprocals.
3271//
3272// r0 := sqrt(a0)
3273// r1 := sqrt(a1)
3274// r2 := sqrt(a2)
3275// r3 := sqrt(a3)
3276//
3277// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
3278FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3279{
3280#if defined(__aarch64__)
3281 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3282#else
3283 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3284 float32x4_t sq = vrecpeq_f32(recipsq);
3285    // TODO: refine with vrsqrtsq_f32/vrecpsq_f32 steps for better accuracy.
3286 return vreinterpretq_m128_f32(sq);
3287#endif
3288}
3289
3290// Computes the approximation of the square root of the scalar single-precision
3291// floating point value of in.
3292// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
3293FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3294{
3295 float32_t value =
3296 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3297 return vreinterpretq_m128_f32(
3298 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3299}
3300
3301// Computes the approximations of the reciprocal square roots of the four
3302// single-precision floating point values of in.
3303// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
3304FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3305{
3306 return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3307}
3308
3309// Compute the approximate reciprocal square root of the lower single-precision
3310// (32-bit) floating-point element in a, store the result in the lower element
3311// of dst, and copy the upper 3 packed elements from a to the upper elements of
3312// dst.
3313// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
3314FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3315{
3316 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3317}
3318
3319// Compare packed signed 16-bit integers in a and b, and store packed maximum
3320// values in dst.
3321//
3322// FOR j := 0 to 3
3323// i := j*16
3324// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3325// ENDFOR
3326//
3327// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3328FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3329{
3330 return vreinterpret_m64_s16(
3331 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3332}
3333
3334// Compare packed signed 16-bit integers in a and b, and store packed maximum
3335// values in dst.
3336//
3337// FOR j := 0 to 3
3338// i := j*16
3339// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3340// ENDFOR
3341//
3342// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3343#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3344
3345// Computes the maximums of the four single-precision, floating-point values of
3346// a and b.
3347// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
3348FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3349{
3350#if SSE2NEON_PRECISE_MINMAX
3351 float32x4_t _a = vreinterpretq_f32_m128(a);
3352 float32x4_t _b = vreinterpretq_f32_m128(b);
3353 return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3354#else
3355 return vreinterpretq_m128_f32(
3356 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3357#endif
3358}
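
// Note on NaN handling (illustrative sketch, values are not part of the API):
// with SSE2NEON_PRECISE_MINMAX enabled, _mm_max_ps follows SSE and returns the
// second operand when a comparison involves NaN, whereas the plain vmaxq_f32
// path typically propagates the NaN.
//
//   __m128 n = _mm_set1_ps(NAN);   // NAN comes from <math.h>
//   __m128 v = _mm_set1_ps(1.0f);
//   __m128 r = _mm_max_ps(n, v);   // precise path: every lane is 1.0f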
3359
3360// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3361// values in dst.
3362//
3363// FOR j := 0 to 7
3364// i := j*8
3365// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3366// ENDFOR
3367//
3368// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3369FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3370{
3371 return vreinterpret_m64_u8(
3372 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3373}
3374
3375// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3376// values in dst.
3377//
3378// FOR j := 0 to 7
3379// i := j*8
3380// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3381// ENDFOR
3382//
3383// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3384#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3385
3386// Compare packed signed 16-bit integers in a and b, and store packed minimum
3387// values in dst.
3388//
3389// FOR j := 0 to 3
3390// i := j*16
3391// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3392// ENDFOR
3393//
3394// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3395FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3396{
3397 return vreinterpret_m64_s16(
3398 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3399}
3400
3401// Compare packed signed 16-bit integers in a and b, and store packed minimum
3402// values in dst.
3403//
3404// FOR j := 0 to 3
3405// i := j*16
3406// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3407// ENDFOR
3408//
3409// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3410#define _m_pminsw(a, b) _mm_min_pi16(a, b)
3411
3412// Computes the minima of the four single-precision, floating-point values of a
3413// and b.
3414// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
3415FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3416{
3417#if SSE2NEON_PRECISE_MINMAX
3418 float32x4_t _a = vreinterpretq_f32_m128(a);
3419 float32x4_t _b = vreinterpretq_f32_m128(b);
3420 return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3421#else
3422 return vreinterpretq_m128_f32(
3423 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3424#endif
3425}
3426
3427// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3428// values in dst.
3429//
3430// FOR j := 0 to 7
3431// i := j*8
3432// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3433// ENDFOR
3434//
3435// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3436FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3437{
3438 return vreinterpret_m64_u8(
3439 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3440}
3441
3442// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3443// values in dst.
3444//
3445// FOR j := 0 to 7
3446// i := j*8
3447// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3448// ENDFOR
3449//
3450// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3451#define _m_pminub(a, b) _mm_min_pu8(a, b)
3452
3453// Computes the maximum of the two lower scalar single-precision floating point
3454// values of a and b.
3455// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
3456FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3457{
3458 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3459 return vreinterpretq_m128_f32(
3460 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3461}
3462
3463// Computes the minimum of the two lower scalar single-precision floating point
3464// values of a and b.
3465// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
3466FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3467{
3468 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3469 return vreinterpretq_m128_f32(
3470 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3471}
3472
3473// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3474// 16 unsigned 8-bit integers from b.
3475// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
3476FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3477{
3478 return vreinterpretq_m128i_u8(
3479 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3480}
3481
3482// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3483// 16 unsigned 8-bit integers from b.
3484// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
3485FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3486{
3487 return vreinterpretq_m128i_u8(
3488 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3489}
3490
3491// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3492// signed 16-bit integers from b.
3493// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
3494FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3495{
3496 return vreinterpretq_m128i_s16(
3497 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3498}
3499
3500// Compare packed signed 8-bit integers in a and b, and store packed maximum
3501// values in dst.
3502// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
3503FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3504{
3505 return vreinterpretq_m128i_s8(
3506 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3507}
3508
3509// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3510// signed 16-bit integers from b.
3511// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
3512FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3513{
3514 return vreinterpretq_m128i_s16(
3515 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3516}
3517
3518// epi versions of min/max
3519// Computes the pairwise maximums of the four signed 32-bit integer values of a
3520// and b.
3521//
3522// A 128-bit parameter that can be defined with the following equations:
3523// r0 := (a0 > b0) ? a0 : b0
3524// r1 := (a1 > b1) ? a1 : b1
3525// r2 := (a2 > b2) ? a2 : b2
3526// r3 := (a3 > b3) ? a3 : b3
3527//
3528// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
3529FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3530{
3531 return vreinterpretq_m128i_s32(
3532 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3533}
3534
3535// Computes the pairwise minima of the four signed 32-bit integer values of a
3536// and b.
3537//
3538// A 128-bit parameter that can be defined with the following equations:
3539// r0 := (a0 < b0) ? a0 : b0
3540// r1 := (a1 < b1) ? a1 : b1
3541// r2 := (a2 < b2) ? a2 : b2
3542// r3 := (a3 < b3) ? a3 : b3
3543//
3544// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
3545FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3546{
3547 return vreinterpretq_m128i_s32(
3548 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3549}
3550
3551// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3552// values in dst.
3553// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
3554FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3555{
3556 return vreinterpretq_m128i_u32(
3557 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3558}
3559
3560// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3561// values in dst.
3562// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
3563FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3564{
3565 return vreinterpretq_m128i_u32(
3566 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3567}
3568
3569// Multiply the packed unsigned 16-bit integers in a and b, producing
3570// intermediate 32-bit integers, and store the high 16 bits of the intermediate
3571// integers in dst.
3572// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
3573FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3574{
3575 return vreinterpret_m64_u16(vshrn_n_u32(
3576 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3577}
3578
3579// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3580// integers from b, and stores the upper 16 bits of each 32-bit product.
3581//
3582// r0 := (a0 * b0)[31:16]
3583// r1 := (a1 * b1)[31:16]
3584// ...
3585// r7 := (a7 * b7)[31:16]
3586//
3587// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
3588FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3589{
3590 /* FIXME: issue with large values because of result saturation */
3591 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3592 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3593 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3594 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3595 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3596 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3597 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3598 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3599 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3600 uint16x8x2_t r =
3601 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3602 return vreinterpretq_m128i_u16(r.val[1]);
3603}
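
// Usage sketch (illustrative values): keeps the upper halves of the 32-bit
// products.
//
//   __m128i x = _mm_set1_epi16(1024);   // 1 << 10
//   __m128i y = _mm_set1_epi16(512);    // 1 << 9
//   __m128i r = _mm_mulhi_epi16(x, y);
//   // each lane = (1024 * 512) >> 16 = (1 << 19) >> 16 = 8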
3604
3605// Horizontally adds adjacent pairs of single-precision (32-bit) floating-point
3606// elements in a and b, and packs the results.
3607// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
3608FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3609{
3610#if defined(__aarch64__)
3611 return vreinterpretq_m128_f32(
3612 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3613#else
3614 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3615 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3616 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3617 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3618 return vreinterpretq_m128_f32(
3619 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3620#endif
3621}
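
// Usage sketch (illustrative values): adjacent lanes of each argument are
// summed; the two sums from a land in the low half of the result.
//
//   __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes 1, 2, 3, 4
//   __m128 y = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  // lanes 5, 6, 7, 8
//   __m128 r = _mm_hadd_ps(x, y);
//   // r = { 1+2, 3+4, 5+6, 7+8 } = { 3, 7, 11, 15 }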
3622
3623// Horizontally adds adjacent pairs of 16-bit integers in a and b, and packs
3624// the signed 16-bit results.
3625FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3626{
3627 int16x8_t a = vreinterpretq_s16_m128i(_a);
3628 int16x8_t b = vreinterpretq_s16_m128i(_b);
3629#if defined(__aarch64__)
3630 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3631#else
3632 return vreinterpretq_m128i_s16(
3633 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3634 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3635#endif
3636}
3637
3638// Horizontally subtract adjacent pairs of single-precision (32-bit)
3639// floating-point elements in a and b, and pack the results in dst.
3640// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
3641FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3642{
3643#if defined(__aarch64__)
3644 return vreinterpretq_m128_f32(vsubq_f32(
3645 vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3646 vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3647#else
3648 float32x4x2_t c =
3649 vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3650 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3651#endif
3652}
3653
3654// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3655// signed 16-bit results in dst.
3656// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
3657FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3658{
3659 return vreinterpret_m64_s16(
3660 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3661}
3662
3663// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3664// signed 32-bit results in dst.
3665// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
3666FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3667{
3668 return vreinterpret_m64_s32(
3669 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3670}
3671
3672// Horizontally subtracts adjacent pairs of 16-bit integers in a and b, and
3673// packs the signed 16-bit results.
3674FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3675{
3676 int32x4_t a = vreinterpretq_s32_m128i(_a);
3677 int32x4_t b = vreinterpretq_s32_m128i(_b);
3678 // Interleave using vshrn/vmovn
3679 // [a0|a2|a4|a6|b0|b2|b4|b6]
3680 // [a1|a3|a5|a7|b1|b3|b5|b7]
3681 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3682 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3683 // Subtract
3684 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3685}
3686
3687// Horizontally adds adjacent pairs of 16-bit integers in a and b using
3688// saturation, and packs the signed 16-bit results.
3689FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3690{
3691#if defined(__aarch64__)
3692 int16x8_t a = vreinterpretq_s16_m128i(_a);
3693 int16x8_t b = vreinterpretq_s16_m128i(_b);
3694    return vreinterpretq_m128i_s16(
3695 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3696#else
3697 int32x4_t a = vreinterpretq_s32_m128i(_a);
3698 int32x4_t b = vreinterpretq_s32_m128i(_b);
3699 // Interleave using vshrn/vmovn
3700 // [a0|a2|a4|a6|b0|b2|b4|b6]
3701 // [a1|a3|a5|a7|b1|b3|b5|b7]
3702 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3703 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3704 // Saturated add
3705 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3706#endif
3707}
3708
3709// Horizontally subtracts adjacent pairs of 16-bit integers in a and b using
3710// saturation, and packs the signed 16-bit results.
3711// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
3712FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3713{
3714#if defined(__aarch64__)
3715 int16x8_t a = vreinterpretq_s16_m128i(_a);
3716 int16x8_t b = vreinterpretq_s16_m128i(_b);
3717    return vreinterpretq_m128i_s16(
3718 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3719#else
3720 int32x4_t a = vreinterpretq_s32_m128i(_a);
3721 int32x4_t b = vreinterpretq_s32_m128i(_b);
3722 // Interleave using vshrn/vmovn
3723 // [a0|a2|a4|a6|b0|b2|b4|b6]
3724 // [a1|a3|a5|a7|b1|b3|b5|b7]
3725 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3726 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3727 // Saturated subtract
3728 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3729#endif
3730}
3731
3732// Horizontally adds adjacent pairs of 32-bit integers in a and b, and packs
3733// the signed 32-bit results.
3734FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3735{
3736 int32x4_t a = vreinterpretq_s32_m128i(_a);
3737 int32x4_t b = vreinterpretq_s32_m128i(_b);
3738 return vreinterpretq_m128i_s32(
3739 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3740 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3741}
3742
3743// Horizontally subtracts adjacent pairs of 32-bit integers in a and b, and
3744// packs the signed 32-bit results.
3745FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3746{
3747 int64x2_t a = vreinterpretq_s64_m128i(_a);
3748 int64x2_t b = vreinterpretq_s64_m128i(_b);
3749 // Interleave using vshrn/vmovn
3750 // [a0|a2|b0|b2]
3751    // [a1|a3|b1|b3]
3752 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3753 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3754 // Subtract
3755 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3756}
3757
3758// Kahan summation for accurate summation of floating-point numbers.
3759// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
3760FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3761{
3762 y -= *c;
3763 float t = *sum + y;
3764 *c = (t - *sum) - y;
3765 *sum = t;
3766}
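
// Usage sketch: accumulate a long series with a running compensation term so
// that small addends are not lost (values illustrative only).
//
//   float sum = 0.0f, comp = 0.0f;
//   for (int i = 0; i < 1000000; i++)
//       sse2neon_kadd_f32(&sum, &comp, 1e-4f);
//   // sum (plus comp) tracks 100.0f much more closely than naive accumulation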
3767
3768// Conditionally multiply the packed single-precision (32-bit) floating-point
3769// elements in a and b using the high 4 bits in imm8, sum the four products,
3770// and conditionally store the sum in dst using the low 4 bits of imm.
3771// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
3772FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3773{
3774#if defined(__aarch64__)
3775 /* shortcuts */
3776 if (imm == 0xFF) {
3777 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3778 }
3779 if (imm == 0x7F) {
3780 float32x4_t m = _mm_mul_ps(a, b);
3781 m[3] = 0;
3782 return _mm_set1_ps(vaddvq_f32(m));
3783 }
3784#endif
3785
3786 float s = 0, c = 0;
3787 float32x4_t f32a = vreinterpretq_f32_m128(a);
3788 float32x4_t f32b = vreinterpretq_f32_m128(b);
3789
3790 /* To improve the accuracy of floating-point summation, Kahan algorithm
3791 * is used for each operation.
3792 */
3793 if (imm & (1 << 4))
3794 sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3795 if (imm & (1 << 5))
3796 sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3797 if (imm & (1 << 6))
3798 sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3799 if (imm & (1 << 7))
3800 sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3801 s += c;
3802
3803 float32x4_t res = {
3804 (imm & 0x1) ? s : 0,
3805 (imm & 0x2) ? s : 0,
3806 (imm & 0x4) ? s : 0,
3807 (imm & 0x8) ? s : 0,
3808 };
3809 return vreinterpretq_m128_f32(res);
3810}
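
// Usage sketch (illustrative values): imm 0x71 selects lanes 0-2 for the
// multiply (high nibble 0x7) and stores the sum only in lane 0 (low nibble
// 0x1), a common 3-component dot product pattern.
//
//   __m128 u = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);
//   __m128 v = _mm_set_ps(0.0f, 6.0f, 5.0f, 4.0f);
//   float dot =
//       vgetq_lane_f32(vreinterpretq_f32_m128(_mm_dp_ps(u, v, 0x71)), 0);
//   // dot = 1*4 + 2*5 + 3*6 = 32; lanes 1-3 of the vector result are zero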
3811
3812/* Compare operations */
3813
3814// Compares for less than
3815// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
3816FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3817{
3818 return vreinterpretq_m128_u32(
3819 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3820}
3821
3822// Compares for less than
3823// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
3824FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
3825{
3826 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
3827}
3828
3829// Compares for greater than.
3830//
3831// r0 := (a0 > b0) ? 0xffffffff : 0x0
3832// r1 := (a1 > b1) ? 0xffffffff : 0x0
3833// r2 := (a2 > b2) ? 0xffffffff : 0x0
3834// r3 := (a3 > b3) ? 0xffffffff : 0x0
3835//
3836// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
3837FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
3838{
3839 return vreinterpretq_m128_u32(
3840 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3841}
3842
3843// Compares for greater than.
3844// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
3845FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
3846{
3847 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
3848}
3849
3850// Compares for greater than or equal.
3851// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
3852FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
3853{
3854 return vreinterpretq_m128_u32(
3855 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3856}
3857
3858// Compares for greater than or equal.
3859// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
3860FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
3861{
3862 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
3863}
3864
3865// Compares for less than or equal.
3866//
3867// r0 := (a0 <= b0) ? 0xffffffff : 0x0
3868// r1 := (a1 <= b1) ? 0xffffffff : 0x0
3869// r2 := (a2 <= b2) ? 0xffffffff : 0x0
3870// r3 := (a3 <= b3) ? 0xffffffff : 0x0
3871//
3872// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
3873FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
3874{
3875 return vreinterpretq_m128_u32(
3876 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3877}
3878
3879// Compares for less than or equal.
3880// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
3881FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
3882{
3883 return _mm_move_ss(a, _mm_cmple_ps(a, b));
3884}
3885
3886// Compares for equality.
3887// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
3888FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
3889{
3890 return vreinterpretq_m128_u32(
3891 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3892}
3893
3894// Compares for equality.
3895// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
3896FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
3897{
3898 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
3899}
3900
3901// Compares for inequality.
3902// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
3903FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
3904{
3905 return vreinterpretq_m128_u32(vmvnq_u32(
3906 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
3907}
3908
3909// Compares for inequality.
3910// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
3911FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
3912{
3913 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
3914}
3915
3916// Compares for not greater than or equal.
3917// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
3918FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
3919{
3920 return _mm_cmplt_ps(a, b);
3921}
3922
3923// Compares for not greater than or equal.
3924// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
3925FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
3926{
3927 return _mm_cmplt_ss(a, b);
3928}
3929
3930// Compares for not greater than.
3931// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
3932FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
3933{
3934 return _mm_cmple_ps(a, b);
3935}
3936
3937// Compares for not greater than.
3938// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
3939FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
3940{
3941 return _mm_cmple_ss(a, b);
3942}
3943
3944// Compares for not less than or equal.
3945// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
3946FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
3947{
3948 return _mm_cmpgt_ps(a, b);
3949}
3950
3951// Compares for not less than or equal.
3952// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
3953FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
3954{
3955 return _mm_cmpgt_ss(a, b);
3956}
3957
3958// Compares for not less than.
3959// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
3960FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
3961{
3962 return _mm_cmpge_ps(a, b);
3963}
3964
3965// Compares for not less than.
3966// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
3967FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
3968{
3969 return _mm_cmpge_ss(a, b);
3970}
3971
3972// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3973// unsigned 8-bit integers in b for equality.
3974// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3975FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3976{
3977 return vreinterpretq_m128i_u8(
3978 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3979}
3980
3981// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3982// unsigned 16-bit integers in b for equality.
3983// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3984FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3985{
3986 return vreinterpretq_m128i_u16(
3987 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3988}
3989
3990// Compare packed 32-bit integers in a and b for equality, and store the results
3991// in dst
3992FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3993{
3994 return vreinterpretq_m128i_u32(
3995 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3996}
3997
3998// Compare packed 64-bit integers in a and b for equality, and store the results
3999// in dst
4000FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4001{
4002#if defined(__aarch64__)
4003 return vreinterpretq_m128i_u64(
4004 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4005#else
4006 // ARMv7 lacks vceqq_u64
4007 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4008 uint32x4_t cmp =
4009 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4010 uint32x4_t swapped = vrev64q_u32(cmp);
4011 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4012#endif
4013}
4014
4015// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4016// in b for less than.
4017// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
4018FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4019{
4020 return vreinterpretq_m128i_u8(
4021 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4022}
4023
4024// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4025// in b for greater than.
4026//
4027// r0 := (a0 > b0) ? 0xff : 0x0
4028// r1 := (a1 > b1) ? 0xff : 0x0
4029// ...
4030// r15 := (a15 > b15) ? 0xff : 0x0
4031//
4032// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
4033FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4034{
4035 return vreinterpretq_m128i_u8(
4036 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4037}
4038
4039// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4040// in b for less than.
4041//
4042// r0 := (a0 < b0) ? 0xffff : 0x0
4043// r1 := (a1 < b1) ? 0xffff : 0x0
4044// ...
4045// r7 := (a7 < b7) ? 0xffff : 0x0
4046//
4047// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
4048FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4049{
4050 return vreinterpretq_m128i_u16(
4051 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4052}
4053
4054// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4055// in b for greater than.
4056//
4057// r0 := (a0 > b0) ? 0xffff : 0x0
4058// r1 := (a1 > b1) ? 0xffff : 0x0
4059// ...
4060// r7 := (a7 > b7) ? 0xffff : 0x0
4061//
4062// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
4063FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4064{
4065 return vreinterpretq_m128i_u16(
4066 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4067}
4068
4069
4070// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4071// in b for less than.
4072// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
4073FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4074{
4075 return vreinterpretq_m128i_u32(
4076 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4077}
4078
4079// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4080// in b for greater than.
4081// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
4082FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4083{
4084 return vreinterpretq_m128i_u32(
4085 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4086}
4087
4088// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4089// in b for greater than.
4090FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4091{
4092#if defined(__aarch64__)
4093 return vreinterpretq_m128i_u64(
4094 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4095#else
4096 // ARMv7 lacks vcgtq_s64.
4097 // This is based off of Clang's SSE2 polyfill:
4098 // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4099
4100 // Mask the sign bit out since we need a signed AND an unsigned comparison
4101 // and it is ugly to try and split them.
4102 int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4103 int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4104 int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4105 // Check if a > b
4106 int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4107 // Copy upper mask to lower mask
4108 // a_hi > b_hi
4109 int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4110 // Copy lower mask to upper mask
4111 // a_lo > b_lo
4112 int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4113 // Compare for equality
4114 int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4115 // Copy upper mask to lower mask
4116 // a_hi == b_hi
4117 int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4118 // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4119 int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4120 return vreinterpretq_m128i_s64(ret);
4121#endif
4122}
4123
4124// Compares the four 32-bit floats in a and b to check if any values are NaN.
4125// Ordered compare between each value returns true for "orderable" and false for
4126// "not orderable" (NaN).
4127// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4128// also:
4129// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4130// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
4131FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4132{
4133 // Note: NEON does not have ordered compare builtin
4134 // Need to compare a eq a and b eq b to check for NaN
4135 // Do AND of results to get final
4136 uint32x4_t ceqaa =
4137 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4138 uint32x4_t ceqbb =
4139 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4140 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4141}
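
// Usage sketch (illustrative values): lanes are all-ones only where both
// inputs are orderable, i.e. neither is NaN.
//
//   __m128 x = _mm_set_ps(4.0f, NAN, 2.0f, 1.0f);   // NAN from <math.h>
//   __m128 y = _mm_set1_ps(0.0f);
//   __m128 m = _mm_cmpord_ps(x, y);
//   // the lane holding NAN yields 0x00000000; the other lanes yield 0xFFFFFFFF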
4142
4143// Compares for ordered.
4144// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
4145FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4146{
4147 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4148}
4149
4150// Compares for unordered.
4151// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
4152FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4153{
4154 uint32x4_t f32a =
4155 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4156 uint32x4_t f32b =
4157 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4158 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4159}
4160
4161// Compares for unordered.
4162// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
4163FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4164{
4165 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4166}
4167
4168// Compares the lower single-precision floating point scalar values of a and b
4169// using a less than operation. :
4170// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
4171// Important note: the MSDN documentation is incorrect. If either value is
4172// NaN, this returns 0, even though the docs claim it returns 1.
4173FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4174{
4175 uint32x4_t a_not_nan =
4176 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4177 uint32x4_t b_not_nan =
4178 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4179 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4180 uint32x4_t a_lt_b =
4181 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4182 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4183}
4184
4185// Compares the lower single-precision floating point scalar values of a and b
4186// using a greater than operation. :
4187// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
4188FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4189{
4190 // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4191 // vreinterpretq_f32_m128(b)), 0);
4192 uint32x4_t a_not_nan =
4193 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4194 uint32x4_t b_not_nan =
4195 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4196 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4197 uint32x4_t a_gt_b =
4198 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4199 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4200}
4201
4202// Compares the lower single-precision floating point scalar values of a and b
4203// using a less than or equal operation. :
4204// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
4205FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4206{
4207 // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4208 // vreinterpretq_f32_m128(b)), 0);
4209 uint32x4_t a_not_nan =
4210 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4211 uint32x4_t b_not_nan =
4212 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4213 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4214 uint32x4_t a_le_b =
4215 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4216 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4217}
4218
4219// Compares the lower single-precision floating point scalar values of a and b
4220// using a greater than or equal operation. :
4221// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
4222FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4223{
4224 // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4225 // vreinterpretq_f32_m128(b)), 0);
4226 uint32x4_t a_not_nan =
4227 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4228 uint32x4_t b_not_nan =
4229 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4230 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4231 uint32x4_t a_ge_b =
4232 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4233 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4234}
4235
4236// Compares the lower single-precision floating point scalar values of a and b
4237// using an equality operation. :
4238// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
4239FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4240{
4241 // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4242 // vreinterpretq_f32_m128(b)), 0);
4243 uint32x4_t a_not_nan =
4244 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4245 uint32x4_t b_not_nan =
4246 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4247 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4248 uint32x4_t a_eq_b =
4249 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4250 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4251}
4252
4253// Compares the lower single-precision floating point scalar values of a and b
4254// using an inequality operation. :
4255// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
4256FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4257{
4258 // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4259 // vreinterpretq_f32_m128(b)), 0);
4260 uint32x4_t a_not_nan =
4261 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4262 uint32x4_t b_not_nan =
4263 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4264 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4265 uint32x4_t a_neq_b = vmvnq_u32(
4266 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4267 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4268}
4269
4270// According to the documentation, these intrinsics behave the same as the
4271// non-'u' versions. We'll just alias them here.
4272#define _mm_ucomilt_ss _mm_comilt_ss
4273#define _mm_ucomile_ss _mm_comile_ss
4274#define _mm_ucomigt_ss _mm_comigt_ss
4275#define _mm_ucomige_ss _mm_comige_ss
4276#define _mm_ucomieq_ss _mm_comieq_ss
4277#define _mm_ucomineq_ss _mm_comineq_ss
4278
4279/* Conversions */
4280
4281// Convert packed signed 32-bit integers in b to packed single-precision
4282// (32-bit) floating-point elements, store the results in the lower 2 elements
4283// of dst, and copy the upper 2 packed elements from a to the upper elements of
4284// dst.
4285//
4286// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4287// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4288// dst[95:64] := a[95:64]
4289// dst[127:96] := a[127:96]
4290//
4291// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
4292FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4293{
4294 return vreinterpretq_m128_f32(
4295 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4296 vget_high_f32(vreinterpretq_f32_m128(a))));
4297}
4298
4299// Convert the signed 32-bit integer b to a single-precision (32-bit)
4300// floating-point element, store the result in the lower element of dst, and
4301// copy the upper 3 packed elements from a to the upper elements of dst.
4302//
4303// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4304// dst[127:32] := a[127:32]
4305//
4306// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
4307FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4308{
4309 __m128 ret = a;
4310 return vreinterpretq_m128_f32(
4311 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0));
4312}
4313
4314// Convert the lower single-precision (32-bit) floating-point element in a to a
4315// 32-bit integer, and store the result in dst.
4316// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
4317FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4318{
4319#if defined(__aarch64__)
4320 return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4321#else
4322 float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4323 float32_t diff = data - floor(data);
4324 if (diff > 0.5)
4325 return (int32_t) ceil(data);
4326 if (diff == 0.5) {
4327 int32_t f = (int32_t) floor(data);
4328 int32_t c = (int32_t) ceil(data);
4329 return c & 1 ? f : c;
4330 }
4331 return (int32_t) floor(data);
4332#endif
4333}
4334
4335// Convert packed 16-bit integers in a to packed single-precision (32-bit)
4336// floating-point elements, and store the results in dst.
4337//
4338// FOR j := 0 to 3
4339// i := j*16
4340// m := j*32
4341// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4342// ENDFOR
4343//
4344// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
4345FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4346{
4347 return vreinterpretq_m128_f32(
4348 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4349}
4350
4351// Convert packed 32-bit integers in b to packed single-precision (32-bit)
4352// floating-point elements, store the results in the lower 2 elements of dst,
4353// and copy the upper 2 packed elements from a to the upper elements of dst.
4354//
4355// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4356// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4357// dst[95:64] := a[95:64]
4358// dst[127:96] := a[127:96]
4359//
4360// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
4361FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4362{
4363 return vreinterpretq_m128_f32(
4364 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4365 vget_high_f32(vreinterpretq_f32_m128(a))));
4366}
4367
4368// Convert packed signed 32-bit integers in a to packed single-precision
4369// (32-bit) floating-point elements, store the results in the lower 2 elements
4370// of dst, then convert the packed signed 32-bit integers in b to
4371// single-precision (32-bit) floating-point element, and store the results in
4372// the upper 2 elements of dst.
4373//
4374// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4375// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4376// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4377// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4378//
4379// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
4380FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4381{
4382 return vreinterpretq_m128_f32(vcvtq_f32_s32(
4383 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4384}
4385
4386// Convert the lower packed 8-bit integers in a to packed single-precision
4387// (32-bit) floating-point elements, and store the results in dst.
4388//
4389// FOR j := 0 to 3
4390// i := j*8
4391// m := j*32
4392// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4393// ENDFOR
4394//
4395// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
4396FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4397{
4398 return vreinterpretq_m128_f32(vcvtq_f32_s32(
4399 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4400}
4401
4402// Convert packed unsigned 16-bit integers in a to packed single-precision
4403// (32-bit) floating-point elements, and store the results in dst.
4404//
4405// FOR j := 0 to 3
4406// i := j*16
4407// m := j*32
4408// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4409// ENDFOR
4410//
4411// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
4412FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4413{
4414 return vreinterpretq_m128_f32(
4415 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4416}
4417
4418// Convert the lower packed unsigned 8-bit integers in a to packed
4419// single-precision (32-bit) floating-point elements, and store the results in
4420// dst.
4421//
4422// FOR j := 0 to 3
4423// i := j*8
4424// m := j*32
4425// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4426// ENDFOR
4427//
4428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
4429FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4430{
4431 return vreinterpretq_m128_f32(vcvtq_f32_u32(
4432 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4433}
4434
4435// Converts the four single-precision, floating-point values of a to signed
4436// 32-bit integer values using truncate.
4437// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4438FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4439{
4440 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4441}
4442
4443// Converts the four signed 32-bit integer values of a to single-precision,
4444// floating-point values
4445// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
4446FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4447{
4448 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4449}
4450
4451// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
4452// unsigned 16-bit integers.
4453FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4454{
4455 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4456 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4457 return vreinterpretq_m128i_u16(u16x8);
4458}
4459
4460// Converts the four unsigned 8-bit integers in the lower 32 bits to four
4461// unsigned 32-bit integers.
4462// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
4463FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4464{
4465 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4466 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4467 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4468 return vreinterpretq_m128i_u32(u32x4);
4469}
4470
4471// Converts the two unsigned 8-bit integers in the lower 16 bits to two
4472// unsigned 64-bit integers.
4473FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4474{
4475 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
4476 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4477 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4478 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4479 return vreinterpretq_m128i_u64(u64x2);
4480}
4481
4482// Converts the eight signed 8-bit integers in the lower 64 bits to eight
4483// signed 16-bit integers.
4484FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4485{
4486 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4487 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4488 return vreinterpretq_m128i_s16(s16x8);
4489}
4490
4491// Converts the four signed 8-bit integers in the lower 32 bits to four
4492// signed 32-bit integers.
4493FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4494{
4495 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4496 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4497 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4498 return vreinterpretq_m128i_s32(s32x4);
4499}
4500
4501// Converts the two signed 8-bit integers in the lower 16 bits to two
4502// signed 64-bit integers.
4503FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4504{
4505 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
4506 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4507 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4508 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4509 return vreinterpretq_m128i_s64(s64x2);
4510}
4511
4512// Converts the four signed 16-bit integers in the lower 64 bits to four signed
4513// 32-bit integers.
4514FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4515{
4516 return vreinterpretq_m128i_s32(
4517 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4518}
4519
4520// Converts the two signed 16-bit integers in the lower 32 bits to two
4521// signed 64-bit integers.
4522FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4523{
4524 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4525 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4526 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4527 return vreinterpretq_m128i_s64(s64x2);
4528}
4529
4530// Converts the four unsigned 16-bit integers in the lower 64 bits to four
4531// unsigned 32-bit integers.
4532FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4533{
4534 return vreinterpretq_m128i_u32(
4535 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4536}
4537
4538// Converts the two unsigned 16-bit integers in the lower 32 bits to two
4539// unsigned 64-bit integers.
4540FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4541{
4542 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4543 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4544 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4545 return vreinterpretq_m128i_u64(u64x2);
4546}
4547
4548// Converts the two unsigned 32-bit integers in the lower 64 bits to two
4549// unsigned 64-bit integers.
4550FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4551{
4552 return vreinterpretq_m128i_u64(
4553 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4554}
4555
4556// Converts the two signed 32-bit integers in the lower 64 bits to two signed
4557// 64-bit integers.
4558FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4559{
4560 return vreinterpretq_m128i_s64(
4561 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4562}
4563
4564// Converts the four single-precision, floating-point values of a to signed
4565// 32-bit integer values.
4566//
4567// r0 := (int) a0
4568// r1 := (int) a1
4569// r2 := (int) a2
4570// r3 := (int) a3
4571//
4572// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4573// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4574// does not support! It is supported on ARMv8-A however.
4575FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4576{
4577#if defined(__aarch64__)
4578 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4579#else
4580 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4581 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4582 vdupq_n_f32(0.5f)); /* +/- 0.5 */
4583 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4584 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4585 int32x4_t r_trunc =
4586 vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4587 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4588 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4589 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4590 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4591 float32x4_t delta = vsubq_f32(
4592 vreinterpretq_f32_m128(a),
4593 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4594 uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4595 return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4596#endif
4597}
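
// Illustrative sketch (not from the original header): with the default
// round-to-nearest-even behaviour emulated above,
//   _mm_cvtps_epi32(_mm_setr_ps(1.5f, 2.5f, -2.5f, 3.7f))
// yields {2, 2, -2, 4}; the halfway cases 1.5, 2.5 and -2.5 all round to the
// nearest even integer, matching x86 SSE2 in its default rounding mode.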
4598
4599// Copy the lower 32-bit integer in a to dst.
4600//
4601// dst[31:0] := a[31:0]
4602//
4603// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4604FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4605{
4606 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4607}
4608
4609// Copy the lower 64-bit integer in a to dst.
4610//
4611// dst[63:0] := a[63:0]
4612//
4613// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4614FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4615{
4616 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4617}
4618
4619// Copy the lower 64-bit integer in a to dst.
4620//
4621// dst[63:0] := a[63:0]
4622//
4623// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4624#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4625
// Moves 32-bit integer a to the least significant 32 bits of an __m128i
// object, zero extending the upper bits.
4628//
4629// r0 := a
4630// r1 := 0x0
4631// r2 := 0x0
4632// r3 := 0x0
4633//
4634// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4635FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4636{
4637 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4638}
4639
// Moves 64-bit integer a to the least significant 64 bits of an __m128i
// object, zero extending the upper bits.
4642//
4643// r0 := a
4644// r1 := 0x0
4645FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4646{
4647 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4648}
4649
4650// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4651// compilation and does not generate any instructions, thus it has zero latency.
4652// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
4653FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4654{
4655 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4656}
4657
4658// Applies a type cast to reinterpret four 32-bit floating point values passed
4659// in as a 128-bit parameter as packed 32-bit integers.
4660// https://msdn.microsoft.com/en-us/library/bb514099.aspx
4661FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4662{
4663 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4664}
4665
4666// Applies a type cast to reinterpret four 32-bit integers passed in as a
4667// 128-bit parameter as packed 32-bit floating point values.
4668// https://msdn.microsoft.com/en-us/library/bb514029.aspx
4669FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4670{
4671 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4672}
4673
// Loads a 128-bit value.
4675// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4676FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4677{
4678 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4679}
4680
4681// Load a double-precision (64-bit) floating-point element from memory into both
4682// elements of dst.
4683//
4684// dst[63:0] := MEM[mem_addr+63:mem_addr]
4685// dst[127:64] := MEM[mem_addr+63:mem_addr]
4686//
4687// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4688FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4689{
4690#if defined(__aarch64__)
4691 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4692#else
4693 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4694#endif
4695}
4696
4697// Load a double-precision (64-bit) floating-point element from memory into the
4698// upper element of dst, and copy the lower element from a to dst. mem_addr does
4699// not need to be aligned on any particular boundary.
4700//
4701// dst[63:0] := a[63:0]
4702// dst[127:64] := MEM[mem_addr+63:mem_addr]
4703//
4704// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4705FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4706{
4707#if defined(__aarch64__)
4708 return vreinterpretq_m128d_f64(
4709 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4710#else
4711 return vreinterpretq_m128d_f32(vcombine_f32(
4712 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4713#endif
4714}
4715
4716// Load a double-precision (64-bit) floating-point element from memory into both
4717// elements of dst.
4718//
4719// dst[63:0] := MEM[mem_addr+63:mem_addr]
4720// dst[127:64] := MEM[mem_addr+63:mem_addr]
4721//
4722// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4723#define _mm_load_pd1 _mm_load1_pd
4724
4725// Load a double-precision (64-bit) floating-point element from memory into both
4726// elements of dst.
4727//
4728// dst[63:0] := MEM[mem_addr+63:mem_addr]
4729// dst[127:64] := MEM[mem_addr+63:mem_addr]
4730//
4731// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
4732#define _mm_loaddup_pd _mm_load1_pd
4733
// Loads a 128-bit value from memory that does not need to be 16-byte aligned.
4735// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4736FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4737{
4738 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4739}
4740
4741// Load unaligned 32-bit integer from memory into the first element of dst.
4742//
4743// dst[31:0] := MEM[mem_addr+31:mem_addr]
4744// dst[MAX:32] := 0
4745//
4746// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4747FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4748{
4749 return vreinterpretq_m128i_s32(
4750 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4751}
4752
4753// Convert packed double-precision (64-bit) floating-point elements in a to
4754// packed single-precision (32-bit) floating-point elements, and store the
4755// results in dst.
4756//
4757// FOR j := 0 to 1
4758// i := 32*j
4759// k := 64*j
4760// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
4761// ENDFOR
4762// dst[127:64] := 0
4763//
4764// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
4765FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4766{
4767#if defined(__aarch64__)
4768 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4769 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4770#else
4771 float a0 = (float) ((double *) &a)[0];
4772 float a1 = (float) ((double *) &a)[1];
4773 return _mm_set_ps(0, 0, a1, a0);
4774#endif
4775}
4776
4777// Copy the lower double-precision (64-bit) floating-point element of a to dst.
4778//
4779// dst[63:0] := a[63:0]
4780//
4781// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4782FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4783{
4784#if defined(__aarch64__)
4785 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4786#else
4787 return ((double *) &a)[0];
4788#endif
4789}
4790
4791// Convert packed single-precision (32-bit) floating-point elements in a to
4792// packed double-precision (64-bit) floating-point elements, and store the
4793// results in dst.
4794//
4795// FOR j := 0 to 1
4796// i := 64*j
4797// k := 32*j
4798// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4799// ENDFOR
4800//
4801// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4802FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4803{
4804#if defined(__aarch64__)
4805 return vreinterpretq_m128d_f64(
4806 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4807#else
4808 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4809 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4810 return _mm_set_pd(a1, a0);
4811#endif
4812}
4813
4814// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
4815// compilation and does not generate any instructions, thus it has zero latency.
4816// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
4817FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
4818{
4819 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
4820}
4821
4822// Blend packed single-precision (32-bit) floating-point elements from a and b
4823// using mask, and store the results in dst.
4824// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
4825FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
4826{
4827 return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
4828 vreinterpretq_f32_m128(b),
4829 vreinterpretq_f32_m128(a)));
4830}
4831
4832// Round the packed single-precision (32-bit) floating-point elements in a using
4833// the rounding parameter, and store the results as packed single-precision
4834// floating-point elements in dst.
4835// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
4836FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
4837{
4838#if defined(__aarch64__)
4839 switch (rounding) {
4840 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4841 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
4842 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4843 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
4844 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4845 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
4846 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4847 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
4848 default: //_MM_FROUND_CUR_DIRECTION
4849 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
4850 }
4851#else
4852 float *v_float = (float *) &a;
4853 __m128 zero, neg_inf, pos_inf;
4854
4855 switch (rounding) {
4856 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4857 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
4858 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4859 return (__m128){floorf(v_float[0]), floorf(v_float[1]),
4860 floorf(v_float[2]), floorf(v_float[3])};
4861 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4862 return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
4863 ceilf(v_float[3])};
4864 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4865 zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
4866 neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
4867 floorf(v_float[2]), floorf(v_float[3]));
4868 pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
4869 ceilf(v_float[2]), ceilf(v_float[3]));
4870 return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
4871 default: //_MM_FROUND_CUR_DIRECTION
4872 return (__m128){roundf(v_float[0]), roundf(v_float[1]),
4873 roundf(v_float[2]), roundf(v_float[3])};
4874 }
4875#endif
4876}
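
// Illustrative usage sketch (not from the original header): an explicit
// truncation toward zero looks like
//   __m128 t = _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
// The _mm_ceil_ps and _mm_floor_ps wrappers below simply pass the matching
// rounding-mode constants.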
4877
4878// Round the packed single-precision (32-bit) floating-point elements in a up to
4879// an integer value, and store the results as packed single-precision
4880// floating-point elements in dst.
4881// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
4882FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
4883{
4884 return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4885}
4886
4887// Round the packed single-precision (32-bit) floating-point elements in a down
4888// to an integer value, and store the results as packed single-precision
4889// floating-point elements in dst.
4890// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
4891FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
4892{
4893 return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4894}
4895
4896
4897// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
4898// may perform better than _mm_loadu_si128 when the data crosses a cache line
4899// boundary.
4900//
4901// dst[127:0] := MEM[mem_addr+127:mem_addr]
4902//
4903// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
4904#define _mm_lddqu_si128 _mm_loadu_si128
4905
4906/* Miscellaneous Operations */
4907
4908// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
4909// in the sign bit.
4910//
4911// r0 := a0 >> count
4912// r1 := a1 >> count
4913// ...
4914// r7 := a7 >> count
4915//
4916// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
4917FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
4918{
4919 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4920 if (c > 15)
4921 return _mm_cmplt_epi16(a, _mm_setzero_si128());
4922 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
4923}
4924
4925// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
4926// in the sign bit.
4927//
4928// r0 := a0 >> count
4929// r1 := a1 >> count
4930// r2 := a2 >> count
4931// r3 := a3 >> count
4932//
4933// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
4934FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
4935{
4936 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4937 if (c > 31)
4938 return _mm_cmplt_epi32(a, _mm_setzero_si128());
4939 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
4940}
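
// Illustrative sketch (not from the original header): shift counts of 32 or
// more leave only the sign in every lane, e.g.
//   _mm_sra_epi32(_mm_setr_epi32(-8, 8, -1, 1), _mm_cvtsi32_si128(32))
// yields {-1, 0, -1, 0}.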
4941
4942// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4943// saturates.
4944// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4945FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4946{
4947 return vreinterpretq_m128i_s8(
4948 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4949 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4950}
4951
// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
// integers and saturates.
4954//
4955// r0 := UnsignedSaturate(a0)
4956// r1 := UnsignedSaturate(a1)
4957// ...
4958// r7 := UnsignedSaturate(a7)
4959// r8 := UnsignedSaturate(b0)
4960// r9 := UnsignedSaturate(b1)
4961// ...
4962// r15 := UnsignedSaturate(b7)
4963//
4964// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
4965FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4966{
4967 return vreinterpretq_m128i_u8(
4968 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4969 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4970}
4971
4972// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4973// and saturates.
4974//
4975// r0 := SignedSaturate(a0)
4976// r1 := SignedSaturate(a1)
4977// r2 := SignedSaturate(a2)
4978// r3 := SignedSaturate(a3)
4979// r4 := SignedSaturate(b0)
4980// r5 := SignedSaturate(b1)
4981// r6 := SignedSaturate(b2)
4982// r7 := SignedSaturate(b3)
4983//
4984// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
4985FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4986{
4987 return vreinterpretq_m128i_s16(
4988 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4989 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4990}
4991
4992// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
4993// integers and saturates.
4994//
4995// r0 := UnsignedSaturate(a0)
4996// r1 := UnsignedSaturate(a1)
4997// r2 := UnsignedSaturate(a2)
4998// r3 := UnsignedSaturate(a3)
4999// r4 := UnsignedSaturate(b0)
5000// r5 := UnsignedSaturate(b1)
5001// r6 := UnsignedSaturate(b2)
5002// r7 := UnsignedSaturate(b3)
5003FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5004{
5005 return vreinterpretq_m128i_u16(
5006 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5007 vqmovun_s32(vreinterpretq_s32_m128i(b))));
5008}
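
// Illustrative sketch (not from the original header): unsigned saturation
// clamps each signed 32-bit input into [0, 65535], so an input vector of
// {-1, 70000, 5, 40000} contributes the 16-bit lanes {0, 65535, 5, 40000}.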
5009
5010// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5011// 8 signed or unsigned 8-bit integers in b.
5012//
5013// r0 := a0
5014// r1 := b0
5015// r2 := a1
5016// r3 := b1
5017// ...
5018// r14 := a7
5019// r15 := b7
5020//
5021// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
5022FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5023{
5024#if defined(__aarch64__)
5025 return vreinterpretq_m128i_s8(
5026 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5027#else
5028 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5029 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5030 int8x8x2_t result = vzip_s8(a1, b1);
5031 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5032#endif
5033}
5034
5035// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5036// lower 4 signed or unsigned 16-bit integers in b.
5037//
5038// r0 := a0
5039// r1 := b0
5040// r2 := a1
5041// r3 := b1
5042// r4 := a2
5043// r5 := b2
5044// r6 := a3
5045// r7 := b3
5046//
5047// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
5048FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5049{
5050#if defined(__aarch64__)
5051 return vreinterpretq_m128i_s16(
5052 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5053#else
5054 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5055 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5056 int16x4x2_t result = vzip_s16(a1, b1);
5057 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5058#endif
5059}
5060
// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
5063//
5064// r0 := a0
5065// r1 := b0
5066// r2 := a1
5067// r3 := b1
5068//
5069// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
5070FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5071{
5072#if defined(__aarch64__)
5073 return vreinterpretq_m128i_s32(
5074 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5075#else
5076 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5077 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5078 int32x2x2_t result = vzip_s32(a1, b1);
5079 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5080#endif
5081}
5082
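// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
//
// r0 := a0
// r1 := b0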
5083FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5084{
5085 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5086 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5087 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5088}
5089
5090// Selects and interleaves the lower two single-precision, floating-point values
5091// from a and b.
5092//
5093// r0 := a0
5094// r1 := b0
5095// r2 := a1
5096// r3 := b1
5097//
5098// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
5099FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5100{
5101#if defined(__aarch64__)
5102 return vreinterpretq_m128_f32(
5103 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5104#else
5105 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5106 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5107 float32x2x2_t result = vzip_f32(a1, b1);
5108 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5109#endif
5110}
5111
5112// Selects and interleaves the upper two single-precision, floating-point values
5113// from a and b.
5114//
5115// r0 := a2
5116// r1 := b2
5117// r2 := a3
5118// r3 := b3
5119//
5120// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
5121FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5122{
5123#if defined(__aarch64__)
5124 return vreinterpretq_m128_f32(
5125 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5126#else
5127 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5128 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5129 float32x2x2_t result = vzip_f32(a1, b1);
5130 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5131#endif
5132}
5133
5134// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5135// 8 signed or unsigned 8-bit integers in b.
5136//
5137// r0 := a8
5138// r1 := b8
5139// r2 := a9
5140// r3 := b9
5141// ...
5142// r14 := a15
5143// r15 := b15
5144//
5145// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
5146FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5147{
5148#if defined(__aarch64__)
5149 return vreinterpretq_m128i_s8(
5150 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5151#else
5152 int8x8_t a1 =
5153 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5154 int8x8_t b1 =
5155 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5156 int8x8x2_t result = vzip_s8(a1, b1);
5157 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5158#endif
5159}
5160
5161// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5162// upper 4 signed or unsigned 16-bit integers in b.
5163//
5164// r0 := a4
5165// r1 := b4
5166// r2 := a5
5167// r3 := b5
5168// r4 := a6
5169// r5 := b6
5170// r6 := a7
5171// r7 := b7
5172//
5173// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5174FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5175{
5176#if defined(__aarch64__)
5177 return vreinterpretq_m128i_s16(
5178 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5179#else
5180 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5181 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5182 int16x4x2_t result = vzip_s16(a1, b1);
5183 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5184#endif
5185}
5186
5187// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5188// upper 2 signed or unsigned 32-bit integers in b.
5189// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
5190FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5191{
5192#if defined(__aarch64__)
5193 return vreinterpretq_m128i_s32(
5194 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5195#else
5196 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5197 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5198 int32x2x2_t result = vzip_s32(a1, b1);
5199 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5200#endif
5201}
5202
5203// Interleaves the upper signed or unsigned 64-bit integer in a with the
5204// upper signed or unsigned 64-bit integer in b.
5205//
5206// r0 := a1
5207// r1 := b1
5208FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5209{
5210 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5211 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5212 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5213}
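
// Illustrative sketch (not from the original header): the lo/hi pairs above
// together form a full interleave. For 8-bit lanes,
//   _mm_unpacklo_epi8(a, b) -> {a0,b0,a1,b1, ... ,a7,b7}
//   _mm_unpackhi_epi8(a, b) -> {a8,b8,a9,b9, ... ,a15,b15}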
5214
5215// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5216// in a, store the minimum and index in dst, and zero the remaining bits in dst.
5217//
5218// index[2:0] := 0
5219// min[15:0] := a[15:0]
5220// FOR j := 0 to 7
5221// i := j*16
5222// IF a[i+15:i] < min[15:0]
5223// index[2:0] := j
5224// min[15:0] := a[i+15:i]
5225// FI
5226// ENDFOR
5227// dst[15:0] := min[15:0]
5228// dst[18:16] := index[2:0]
5229// dst[127:19] := 0
5230//
5231// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
5232FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5233{
5234 __m128i dst;
5235 uint16_t min, idx = 0;
5236 // Find the minimum value
5237#if defined(__aarch64__)
5238 min = vminvq_u16(vreinterpretq_u16_m128i(a));
5239#else
5240 __m64 tmp;
5241 tmp = vreinterpret_m64_u16(
5242 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5243 vget_high_u16(vreinterpretq_u16_m128i(a))));
5244 tmp = vreinterpret_m64_u16(
5245 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5246 tmp = vreinterpret_m64_u16(
5247 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5248 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5249#endif
5250 // Get the index of the minimum value
5251 int i;
5252 for (i = 0; i < 8; i++) {
5253 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5254 idx = (uint16_t) i;
5255 break;
5256 }
5257 a = _mm_srli_si128(a, 2);
5258 }
5259 // Generate result
5260 dst = _mm_setzero_si128();
5261 dst = vreinterpretq_m128i_u16(
5262 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5263 dst = vreinterpretq_m128i_u16(
5264 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5265 return dst;
5266}
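
// Illustrative sketch (not from the original header):
//   __m128i v = _mm_setr_epi16(3, 7, 1, 9, 1, 5, 6, 2);
//   __m128i r = _mm_minpos_epu16(v);
// then _mm_extract_epi16(r, 0) == 1 (the minimum) and
// _mm_extract_epi16(r, 1) == 2 (the index of its first occurrence).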
5267
// Concatenates a and b, then shifts the 256-bit composite right by c bytes.
5269// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
5270// http://blog.csdn.net/hemmingway/article/details/44828303
5271// Clang requires a macro here, as it is extremely picky about c being a
5272// literal.
5273#define _mm_alignr_epi8(a, b, c) \
5274 ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
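
// Illustrative sketch (not from the original header): the concatenation (a:b)
// is shifted right by c bytes and truncated to 128 bits, so
//   _mm_alignr_epi8(a, b, 4)
// returns bytes b[4..15] followed by a[0..3].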
5275
5276// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5277// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5278// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5279// otherwise set CF to 0. Return the CF value.
5280// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
5281FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5282{
5283 int64x2_t s64 =
5284 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5285 vreinterpretq_s64_m128i(b));
5286 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5287}
5288
5289// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5290// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5291// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5292// otherwise set CF to 0. Return the ZF value.
5293// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
5294FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5295{
5296 int64x2_t s64 =
5297 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5298 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5299}
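
// Illustrative sketch (not from the original header):
//   _mm_testz_si128(a, b) == 1  whenever (a & b) is all zeros;
//   _mm_testc_si128(a, b) == 1  whenever every bit set in b is also set in a,
// e.g. _mm_testc_si128(_mm_set1_epi32(-1), x) is 1 for any x.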
5300
5301// Extracts the selected signed or unsigned 8-bit integer from a and zero
5302// extends.
5303// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
5304#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
5305
5306// Inserts the least significant 8 bits of b into the selected 8-bit integer
5307// of a.
5308// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
5309// __constrange(0,16) int imm)
5310#define _mm_insert_epi8(a, b, imm) \
5311 __extension__({ \
5312 vreinterpretq_m128i_s8( \
5313 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
5314 })
5315
5316// Extracts the selected signed or unsigned 16-bit integer from a and zero
5317// extends.
5318// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
5319// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
5320#define _mm_extract_epi16(a, imm) \
5321 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
5322
5323// Inserts the least significant 16 bits of b into the selected 16-bit integer
5324// of a.
5325// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
5326// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
5327// __constrange(0,8) int imm)
5328#define _mm_insert_epi16(a, b, imm) \
5329 __extension__({ \
5330 vreinterpretq_m128i_s16( \
5331 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
5332 })
5333
5334// Extracts the selected signed or unsigned 32-bit integer from a and zero
5335// extends.
5336// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
5337#define _mm_extract_epi32(a, imm) \
5338 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
5339
// Extracts the selected single-precision (32-bit) floating-point value from a
// as its raw 32-bit integer bit pattern.
5341// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
5342#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
5343
5344// Inserts the least significant 32 bits of b into the selected 32-bit integer
5345// of a.
5346// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
5347// __constrange(0,4) int imm)
5348#define _mm_insert_epi32(a, b, imm) \
5349 __extension__({ \
5350 vreinterpretq_m128i_s32( \
5351 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
5352 })
5353
5354// Extracts the selected signed or unsigned 64-bit integer from a and zero
5355// extends.
5356// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
5357#define _mm_extract_epi64(a, imm) \
5358 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
5359
5360// Inserts the least significant 64 bits of b into the selected 64-bit integer
5361// of a.
5362// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
5363// __constrange(0,2) int imm)
5364#define _mm_insert_epi64(a, b, imm) \
5365 __extension__({ \
5366 vreinterpretq_m128i_s64( \
5367 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
5368 })
5369
5370// Count the number of bits set to 1 in unsigned 32-bit integer a, and
5371// return that count in dst.
5372// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
5373FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5374{
5375#if defined(__aarch64__)
5376#if __has_builtin(__builtin_popcount)
5377 return __builtin_popcount(a);
5378#else
5379 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5380#endif
5381#else
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    /* Widen the 32-bit input into an 8-byte vector (upper bytes zero) so the
     * count never reads or writes memory outside the argument. */
    input_val = vcreate_u8((uint64_t) a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    return vget_lane_u32(count32x2_val, 0);
5394#endif
5395}
5396
5397// Count the number of bits set to 1 in unsigned 64-bit integer a, and
5398// return that count in dst.
5399// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
5400FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5401{
5402#if defined(__aarch64__)
5403#if __has_builtin(__builtin_popcountll)
5404 return __builtin_popcountll(a);
5405#else
5406 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5407#endif
5408#else
5409 uint64_t count = 0;
5410 uint8x8_t input_val, count8x8_val;
5411 uint16x4_t count16x4_val;
5412 uint32x2_t count32x2_val;
5413 uint64x1_t count64x1_val;
5414
5415 input_val = vld1_u8((uint8_t *) &a);
5416 count8x8_val = vcnt_u8(input_val);
5417 count16x4_val = vpaddl_u8(count8x8_val);
5418 count32x2_val = vpaddl_u16(count16x4_val);
5419 count64x1_val = vpaddl_u32(count32x2_val);
5420 vst1_u64(&count, count64x1_val);
5421 return count;
5422#endif
5423}
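
// Illustrative sketch (not from the original header):
//   _mm_popcnt_u32(0xF0F0F0F0u) == 16 and _mm_popcnt_u64(~0ULL) == 64.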
5424
5425// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
5426// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
5427// transposed matrix in these vectors (row0 now contains column 0, etc.).
5428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
5429#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
5430 do { \
5431 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
5432 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
5433 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
5434 vget_low_f32(ROW23.val[0])); \
5435 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
5436 vget_low_f32(ROW23.val[1])); \
5437 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
5438 vget_high_f32(ROW23.val[0])); \
5439 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
5440 vget_high_f32(ROW23.val[1])); \
5441 } while (0)
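
// Illustrative sketch (not from the original header): given
//   row0 = {a0,a1,a2,a3}, row1 = {b0,b1,b2,b3},
//   row2 = {c0,c1,c2,c3}, row3 = {d0,d1,d2,d3}
// the macro rewrites the rows in place as columns:
//   row0 = {a0,b0,c0,d0}, row1 = {a1,b1,c1,d1},
//   row2 = {a2,b2,c2,d2}, row3 = {a3,b3,c3,d3}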
5442
5443/* Crypto Extensions */
5444
5445#if defined(__ARM_FEATURE_CRYPTO)
5446// Wraps vmull_p64
5447FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5448{
5449 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5450 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5451 return vreinterpretq_u64_p128(vmull_p64(a, b));
5452}
5453#else // ARMv7 polyfill
// ARMv7 and some AArch64 cores lack vmull_p64, but they do have vmull_p8.
5455//
5456// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5457// 64-bit->128-bit polynomial multiply.
5458//
5459// It needs some work and is somewhat slow, but it is still faster than all
5460// known scalar methods.
5461//
5462// Algorithm adapted to C from
5463// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5464// from "Fast Software Polynomial Multiplication on ARM Processors Using the
5465// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5466// (https://hal.inria.fr/hal-01506572)
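//
// Carry-less (polynomial) multiplication is ordinary long multiplication with
// the additions replaced by XOR, e.g.
//   0b11 * 0b11 = (0b11 << 1) ^ (0b11 << 0) = 0b110 ^ 0b011 = 0b101
// (ordinary multiplication would give 0b1001; the carries are simply dropped).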
5467static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5468{
5469 poly8x8_t a = vreinterpret_p8_u64(_a);
5470 poly8x8_t b = vreinterpret_p8_u64(_b);
5471
5472 // Masks
5473 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5474 vcreate_u8(0x00000000ffffffff));
5475 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5476 vcreate_u8(0x0000000000000000));
5477
5478 // Do the multiplies, rotating with vext to get all combinations
5479 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
5480 uint8x16_t e =
5481 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
5482 uint8x16_t f =
5483 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
5484 uint8x16_t g =
5485 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
5486 uint8x16_t h =
5487 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
5488 uint8x16_t i =
5489 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
5490 uint8x16_t j =
5491 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
5492 uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
5494
5495 // Add cross products
5496 uint8x16_t l = veorq_u8(e, f); // L = E + F
5497 uint8x16_t m = veorq_u8(g, h); // M = G + H
5498 uint8x16_t n = veorq_u8(i, j); // N = I + J
5499
5500 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5501 // instructions.
5502#if defined(__aarch64__)
5503 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5504 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5505 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5506 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5507 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5508 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5509 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5510 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5511#else
5512 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5513 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5514 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5515 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5516#endif
5517 // t0 = (L) (P0 + P1) << 8
5518 // t1 = (M) (P2 + P3) << 16
5519 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5520 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5521 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5522
5523 // t2 = (N) (P4 + P5) << 24
5524 // t3 = (K) (P6 + P7) << 32
5525 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5526 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5527 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5528
5529 // De-interleave
5530#if defined(__aarch64__)
5531 uint8x16_t t0 = vreinterpretq_u8_u64(
5532 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5533 uint8x16_t t1 = vreinterpretq_u8_u64(
5534 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5535 uint8x16_t t2 = vreinterpretq_u8_u64(
5536 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5537 uint8x16_t t3 = vreinterpretq_u8_u64(
5538 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5539#else
5540 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5541 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5542 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5543 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5544#endif
5545 // Shift the cross products
5546 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
5547 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
5548 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
5549 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
5550
5551 // Accumulate the products
5552 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5553 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5554 uint8x16_t mix = veorq_u8(d, cross1);
5555 uint8x16_t r = veorq_u8(mix, cross2);
5556 return vreinterpretq_u64_u8(r);
5557}
5558#endif // ARMv7 polyfill
5559
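// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm, and store the 128-bit result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128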
5560FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5561{
5562 uint64x2_t a = vreinterpretq_u64_m128i(_a);
5563 uint64x2_t b = vreinterpretq_u64_m128i(_b);
5564 switch (imm & 0x11) {
5565 case 0x00:
5566 return vreinterpretq_m128i_u64(
5567 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5568 case 0x01:
5569 return vreinterpretq_m128i_u64(
5570 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5571 case 0x10:
5572 return vreinterpretq_m128i_u64(
5573 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5574 case 0x11:
5575 return vreinterpretq_m128i_u64(
5576 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5577 default:
5578 abort();
5579 }
5580}
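
// Illustrative sketch (not from the original header): bit 0 of imm selects the
// 64-bit half of a and bit 4 selects the half of b, so
//   _mm_clmulepi64_si128(a, b, 0x10)
// carry-lessly multiplies the low half of a by the high half of b.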
5581
5582#if !defined(__ARM_FEATURE_CRYPTO)
5583/* clang-format off */
5584#define SSE2NEON_AES_DATA(w) \
5585 { \
5586 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
5587 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
5588 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
5589 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
5590 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
5591 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
5592 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
5593 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
5594 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
5595 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
5596 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
5597 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
5598 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
5599 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
5600 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
5601 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
5602 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
5603 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
5604 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
5605 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
5606 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
5607 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
5608 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
5609 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
5610 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
5611 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
5612 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
5613 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
5614 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
5615 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
5616 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
5617 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
5618 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
5619 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
5620 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
5621 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
5622 w(0xb0), w(0x54), w(0xbb), w(0x16) \
5623 }
5624/* clang-format on */
5625
5626/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
5627#define SSE2NEON_AES_H0(x) (x)
5628static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
5629#undef SSE2NEON_AES_H0
5630
5631// In the absence of crypto extensions, implement aesenc using regular neon
5632// intrinsics instead. See:
5633// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5634// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5635// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
5637FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5638{
5639#if defined(__aarch64__)
5640 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5641 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5642 0xc, 0x1, 0x6, 0xb};
5643 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5644 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5645
5646 uint8x16_t v;
5647 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5648
5649 // shift rows
5650 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5651
5652 // sub bytes
5653 v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5654 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5655 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5656 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5657
5658 // mix columns
5659 w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5660 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5661 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5662
5663 // add round key
5664 return vreinterpretq_m128i_u8(w) ^ RoundKey;
5665
5666#else /* ARMv7-A NEON implementation */
5667#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
5668 (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5669 (b0))
5670#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5671#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5672#define SSE2NEON_AES_U0(p) \
5673 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5674#define SSE2NEON_AES_U1(p) \
5675 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5676#define SSE2NEON_AES_U2(p) \
5677 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5678#define SSE2NEON_AES_U3(p) \
5679 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5680 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5681 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5682 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5683 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5684 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5685 };
5686#undef SSE2NEON_AES_B2W
5687#undef SSE2NEON_AES_F2
5688#undef SSE2NEON_AES_F3
5689#undef SSE2NEON_AES_U0
5690#undef SSE2NEON_AES_U1
5691#undef SSE2NEON_AES_U2
5692#undef SSE2NEON_AES_U3
5693
5694 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5695 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5696 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5697 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5698
5699 __m128i out = _mm_set_epi32(
5700 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5701 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5702 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5703 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5704 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5705 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5706 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5707 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5708
5709 return _mm_xor_si128(out, RoundKey);
5710#endif
5711}
5712
5713FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5714{
    /* FIXME: optimize this for NEON */
5716 uint8_t v[4][4] = {
5717 [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5718 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5719 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5720 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5721 [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5722 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5723 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5724 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5725 [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5726 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5727 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5728 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5729 [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5730 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5731 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5732 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5733 };
5734 for (int i = 0; i < 16; i++)
5735 vreinterpretq_nth_u8_m128i(a, i) =
5736 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5737 return a;
5738}
5739
5740// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5741// This instruction generates a round key for AES encryption. See
5742// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5743// for details.
5744//
5745// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
5746FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5747{
5748 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5749 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5750 for (int i = 0; i < 4; ++i) {
5751 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5752 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
5753 }
5754 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
5755 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
5756}
5757#undef SSE2NEON_AES_DATA
5758
5759#else /* __ARM_FEATURE_CRYPTO */
5760// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
5761// AESMC and then manually applying the real key as an xor operation. This
5762// unfortunately means an additional xor op; the compiler should be able to
5763// optimize this away for repeated calls however. See
5764// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
5765// for more details.
5766FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
5767{
5768 return vreinterpretq_m128i_u8(
5769 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
5770 vreinterpretq_u8_m128i(b));
5771}
5772
5773// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
5774FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5775{
5776 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
5777 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
5778 RoundKey);
5779}
5780
5781FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
5782{
5783 // AESE does ShiftRows and SubBytes on A
5784 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
5785
5786 uint8x16_t dest = {
5787 // Undo ShiftRows step from AESE and extract X1 and X3
5788 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
5789 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
5790 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
5791 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
5792 };
5793 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
5794 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
5795}
5796#endif
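
// Illustrative usage sketch (not from the original header): encrypting one
// block with an already-expanded AES-128 key schedule rk[0..10] (hypothetical
// array) follows the usual pattern:
//   __m128i s = _mm_xor_si128(block, rk[0]);
//   for (int r = 1; r < 10; r++)
//       s = _mm_aesenc_si128(s, rk[r]);
//   s = _mm_aesenclast_si128(s, rk[10]);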
5797
5798/* Streaming Extensions */
5799
5800// Guarantees that every preceding store is globally visible before any
5801// subsequent store.
5802// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
5803FORCE_INLINE void _mm_sfence(void)
5804{
5805 __sync_synchronize();
5806}
5807
5808// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
5809// point elements) from a into memory using a non-temporal memory hint.
5810// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
5811FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
5812{
5813#if __has_builtin(__builtin_nontemporal_store)
5814 __builtin_nontemporal_store(a, (float32x4_t *) p);
5815#else
5816 vst1q_f32(p, vreinterpretq_f32_m128(a));
5817#endif
5818}
5819
5820// Stores the data in a to the address p without polluting the caches. If the
5821// cache line containing address p is already in the cache, the cache will be
5822// updated.
5823// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
5824FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5825{
5826#if __has_builtin(__builtin_nontemporal_store)
5827 __builtin_nontemporal_store(a, p);
5828#else
5829 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5830#endif
5831}
5832
5833// Load 128-bits of integer data from memory into dst using a non-temporal
5834// memory hint. mem_addr must be aligned on a 16-byte boundary or a
5835// general-protection exception may be generated.
5836//
5837// dst[127:0] := MEM[mem_addr+127:mem_addr]
5838//
5839// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
5840FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
5841{
#if __has_builtin(__builtin_nontemporal_load)
5843 return __builtin_nontemporal_load(p);
5844#else
5845 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
5846#endif
5847}
5848
// The cache line containing p is flushed and invalidated from all caches in
// the coherency domain.
5851// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
5852FORCE_INLINE void _mm_clflush(void const *p)
5853{
5854 (void) p;
    // no direct NEON/ACLE equivalent, so this is a no-op
5856}
5857
5858// Allocate aligned blocks of memory.
5859// https://software.intel.com/en-us/
5860// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
5861FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
5862{
5863 void *ptr;
5864 if (align == 1)
5865 return malloc(size);
5866 if (align == 2 || (sizeof(void *) == 8 && align == 4))
5867 align = sizeof(void *);
5868 if (!posix_memalign(&ptr, align, size))
5869 return ptr;
5870 return NULL;
5871}
5872
5873FORCE_INLINE void _mm_free(void *addr)
5874{
5875 free(addr);
5876}
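
// Illustrative usage sketch (not from the original header):
//   float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//   if (buf) {
//       /* buf is 16-byte aligned, so e.g. _mm_load_ps(buf) is safe */
//       _mm_free(buf);
//   }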
5877
5878// Starting with the initial value in crc, accumulates a CRC32 value for
5879// unsigned 8-bit integer v.
5880// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
5881FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
5882{
5883#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5884 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
5885 : [c] "+r"(crc)
5886 : [v] "r"(v));
5887#else
5888 crc ^= v;
5889 for (int bit = 0; bit < 8; bit++) {
5890 if (crc & 1)
5891 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
5892 else
5893 crc = (crc >> 1);
5894 }
5895#endif
5896 return crc;
5897}
5898
5899// Starting with the initial value in crc, accumulates a CRC32 value for
5900// unsigned 16-bit integer v.
5901// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
5902FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
5903{
5904#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5905 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
5906 : [c] "+r"(crc)
5907 : [v] "r"(v));
5908#else
5909 crc = _mm_crc32_u8(crc, v & 0xff);
5910 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
5911#endif
5912 return crc;
5913}
5914
5915// Starting with the initial value in crc, accumulates a CRC32 value for
5916// unsigned 32-bit integer v.
5917// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
5918FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
5919{
5920#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5921 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
5922 : [c] "+r"(crc)
5923 : [v] "r"(v));
5924#else
5925 crc = _mm_crc32_u16(crc, v & 0xffff);
5926 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
5927#endif
5928 return crc;
5929}
5930
5931// Starting with the initial value in crc, accumulates a CRC32 value for
5932// unsigned 64-bit integer v.
5933// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
5934FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
5935{
5936#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5937 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
5938 : [c] "+r"(crc)
5939 : [v] "r"(v));
5940#else
5941 crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
5942 crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
5943#endif
5944 return crc;
5945}
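
// Illustrative usage sketch (not from the original header): accumulating a
// CRC32-C over a byte buffer (hypothetical buf/len) with the customary initial
// value and final inversion, neither of which the intrinsics apply themselves:
//   uint32_t crc = 0xFFFFFFFF;
//   for (size_t i = 0; i < len; i++)
//       crc = _mm_crc32_u8(crc, buf[i]);
//   crc = ~crc;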
5946
5947#if defined(__GNUC__) || defined(__clang__)
5948#pragma pop_macro("ALIGN_STRUCT")
5949#pragma pop_macro("FORCE_INLINE")
5950#endif
5951
5952#if defined(__GNUC__)
5953#pragma GCC pop_options
5954#endif
5955
5956#endif