#ifndef SSE2NEON_H
#define SSE2NEON_H
3
// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions.
6//
7// This header file does not yet translate all of the SSE intrinsics.
8//
9// Contributors to this work are:
10// John W. Ratcliff <jratcliffscarab@gmail.com>
11// Brandon Rowlett <browlett@nvidia.com>
12// Ken Fast <kfast@gdeb.com>
13// Eric van Beurden <evanbeurden@nvidia.com>
14// Alexander Potylitsin <apotylitsin@nvidia.com>
15// Hasindu Gamaarachchi <hasindu2008@gmail.com>
16// Jim Huang <jserv@biilabs.io>
17// Mark Cheng <marktwtn@biilabs.io>
18// Malcolm James MacLeod <malcolm@gulden.com>
19// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20// Sebastian Pop <spop@amazon.com>
21// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22// Danila Kutenin <danilak@google.com>
23// François Turban (JishinMaster) <francois.turban@gmail.com>
24// Pei-Hsuan Hung <afcidk@gmail.com>
25// Yang-Hao Yuan <yanghau@biilabs.io>
26
27/*
28 * sse2neon is freely redistributable under the MIT License.
29 *
30 * Permission is hereby granted, free of charge, to any person obtaining a copy
31 * of this software and associated documentation files (the "Software"), to deal
32 * in the Software without restriction, including without limitation the rights
33 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34 * copies of the Software, and to permit persons to whom the Software is
35 * furnished to do so, subject to the following conditions:
36 *
37 * The above copyright notice and this permission notice shall be included in
38 * all copies or substantial portions of the Software.
39 *
40 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46 * SOFTWARE.
47 */
48
49/* Tunable configurations */
50
/* Enable precise implementation of _mm_min_ps and _mm_max_ps.
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE2 (e.g. it can fix a hole or NaN pixel in a rendering result).
 */
55#ifndef SSE2NEON_PRECISE_MINMAX
56#define SSE2NEON_PRECISE_MINMAX (0)
57#endif
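
/* Illustrative usage (an assumed example, not part of the original header):
 * define the macro before including this header, or pass it on the command
 * line, e.g. -DSSE2NEON_PRECISE_MINMAX=1
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #include "sse2neon.h"
 */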
58
59#if defined(__GNUC__) || defined(__clang__)
60#pragma push_macro("FORCE_INLINE")
61#pragma push_macro("ALIGN_STRUCT")
62#define FORCE_INLINE static inline __attribute__((always_inline))
63#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
64#else
#error "Macro name collisions may happen with an unsupported compiler."
66#ifdef FORCE_INLINE
67#undef FORCE_INLINE
68#endif
69#define FORCE_INLINE static inline
70#ifndef ALIGN_STRUCT
71#define ALIGN_STRUCT(x) __declspec(align(x))
72#endif
73#endif
74
75#include <stdint.h>
76#include <stdlib.h>
77
78/* Architecture-specific build options */
79/* FIXME: #pragma GCC push_options is only available on GCC */
80#if defined(__GNUC__)
81#if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification,
 * __ARM_NEON is defined to a value indicating that the Advanced SIMD (NEON)
 * architecture is supported.
 */
86#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
87#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
88#endif
89#pragma GCC push_options
90#pragma GCC target("fpu=neon")
91#elif defined(__aarch64__)
92#pragma GCC push_options
93#pragma GCC target("+simd")
94#else
95#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
96#endif
97#endif
98
99#include <arm_neon.h>
100
/* Rounding functions require either AArch64 instructions or a libm fallback */
102#if !defined(__aarch64__)
103#include <math.h>
104#endif
105
106/* "__has_builtin" can be used to query support for built-in functions
107 * provided by gcc/clang and other compilers that support it.
108 */
109#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
110/* Compatibility with gcc <= 9 */
111#if __GNUC__ <= 9
112#define __has_builtin(x) HAS##x
113#define HAS__builtin_popcount 1
114#define HAS__builtin_popcountll 1
115#else
116#define __has_builtin(x) 0
117#endif
118#endif
119
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit [0123] that represents the float from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 of the result. fp1 is a digit [0123] that represents the float from
 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
128#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
129 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
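
// Illustrative usage (an assumed example, not part of the original header):
// the macro packs four 2-bit lane selectors into a single immediate byte, so
// _MM_SHUFFLE(0, 1, 2, 3) below evaluates to 0x1B and reverses the lanes.
//
//   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);            // lanes: 1, 2, 3, 4
//   __m128 r = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes: 4, 3, 2, 1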
130
131/* Rounding mode macros. */
132#define _MM_FROUND_TO_NEAREST_INT 0x00
133#define _MM_FROUND_TO_NEG_INF 0x01
134#define _MM_FROUND_TO_POS_INF 0x02
135#define _MM_FROUND_TO_ZERO 0x03
136#define _MM_FROUND_CUR_DIRECTION 0x04
137#define _MM_FROUND_NO_EXC 0x08
138
139/* indicate immediate constant argument in a given range */
140#define __constrange(a, b) const
141
142/* A few intrinsics accept traditional data types like ints or floats, but
143 * most operate on data types that are specific to SSE.
144 * If a vector type ends in d, it contains doubles, and if it does not have
145 * a suffix, it contains floats. An integer vector type can contain any type
146 * of integer, from chars to shorts to unsigned long longs.
147 */
148typedef int64x1_t __m64;
149typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
150// On ARM 32-bit architecture, the float64x2_t is not supported.
151// The data type __m128d should be represented in a different way for related
152// intrinsic conversion.
153#if defined(__aarch64__)
154typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
155#else
156typedef float32x4_t __m128d;
157#endif
158typedef int64x2_t __m128i; /* 128-bit vector containing integers */
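
// Illustrative declarations (an assumed example, not part of the original
// header):
//   __m128 f = _mm_set1_ps(1.0f);     // four packed single-precision floats
//   __m128i i = _mm_set1_epi32(-1);   // four packed 32-bit signed integers
//   __m128d d = _mm_set_pd(2.0, 1.0); // two packed doubles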
159
160/* type-safe casting between types */
161
162#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
163#define vreinterpretq_m128_f32(x) (x)
164#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
165
166#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
167#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
168#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
169#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
170
171#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
172#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
173#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
174#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
175
176#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
177#define vreinterpretq_f32_m128(x) (x)
178#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
179
180#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
181#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
182#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
183#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
184
185#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
186#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
187#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
188#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
189
190#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
191#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
192#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
193#define vreinterpretq_m128i_s64(x) (x)
194
195#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
196#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
197#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
198#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
199
200#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
201#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
202#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
203#define vreinterpretq_s64_m128i(x) (x)
204
205#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
206#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
207#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
208#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
209
210#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
211#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
212#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
213#define vreinterpret_m64_s64(x) (x)
214
215#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
216#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
217#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
218#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
219
220#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
221#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
222#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
223
224#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
225#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
226#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
227#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
228
229#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
230#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
231#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
232#define vreinterpret_s64_m64(x) (x)
233
234#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
235
236#if defined(__aarch64__)
237#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
238#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
239
240#define vreinterpretq_m128d_f64(x) (x)
241
242#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
243
244#define vreinterpretq_f64_m128d(x) (x)
245#else
246#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
247#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
248
249#define vreinterpretq_m128d_f32(x) (x)
250
251#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
252
253#define vreinterpretq_f32_m128d(x) (x)
254#endif
255
// A struct named 'SIMDVec' is defined in this header file and can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
260// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
261//
262// However, some legacy source code may try to access the contents of an __m128
263// struct directly so the developer can use the SIMDVec as an alias for it. Any
264// casting must be done manually by the developer, as you cannot cast or
265// otherwise alias the base NEON data type for intrinsic operations.
266//
// This union is intended to allow direct access to an __m128 variable using
// the names that the MSVC compiler provides. This union should really only
// be used when
269// trying to access the members of the vector as integer values. GCC/clang
270// allow native access to the float members through a simple array access
271// operator (in C since 4.6, in C++ since 4.8).
272//
// Ideally, direct accesses to SIMD vectors should not be used since they can
// cause a performance hit. If it is needed, however, the original __m128
275// variable can be aliased with a pointer to this union and used to access
276// individual components. The use of this union should be hidden behind a macro
277// that is used throughout the codebase to access the members instead of always
278// declaring this type of variable.
279typedef union ALIGN_STRUCT(16) SIMDVec {
280 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
281 int8_t m128_i8[16]; // as signed 8-bit integers.
282 int16_t m128_i16[8]; // as signed 16-bit integers.
283 int32_t m128_i32[4]; // as signed 32-bit integers.
284 int64_t m128_i64[2]; // as signed 64-bit integers.
285 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
286 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
287 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
288 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
289} SIMDVec;
290
291// casting using SIMDVec
292#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
293#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
294#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
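
// Illustrative usage (an assumed example, not part of the original header):
//   __m128i v = _mm_set1_epi32(7);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // lane0 == 7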
295
296/* Backwards compatibility for compilers with lack of specific type support */
297
// Older gcc does not provide the vld1q_u8_x4 intrinsic
299#if defined(__GNUC__) && !defined(__clang__)
300#if __GNUC__ <= 9
301FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
302{
303 uint8x16x4_t ret;
304 ret.val[0] = vld1q_u8(p + 0);
305 ret.val[1] = vld1q_u8(p + 16);
306 ret.val[2] = vld1q_u8(p + 32);
307 ret.val[3] = vld1q_u8(p + 48);
308 return ret;
309}
310#endif
311#endif
312
313/* Function Naming Conventions
314 * The naming convention of SSE intrinsics is straightforward. A generic SSE
315 * intrinsic function is given as follows:
316 * _mm_<name>_<data_type>
317 *
318 * The parts of this format are given as follows:
319 * 1. <name> describes the operation performed by the intrinsic
320 * 2. <data_type> identifies the data type of the function's primary arguments
321 *
322 * This last part, <data_type>, is a little complicated. It identifies the
323 * content of the input values, and can be set to any of the following values:
324 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
326 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
327 * signed integers
328 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329 * unsigned integers
 * + si128 - unspecified 128-bit vector
331 * + m128/m128i/m128d - identifies input vector types when they are different
332 * than the type of the returned vector
333 *
334 * For example, _mm_setzero_ps. The _mm implies that the function returns
335 * a 128-bit vector. The _ps at the end implies that the argument vectors
336 * contain floats.
337 *
338 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
340 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 * // Set packed 8-bit integers: 128 bits, 16 chars, 8 bits each
343 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
344 * 4, 5, 12, 13, 6, 7, 14, 15);
345 * // Shuffle packed 8-bit integers
346 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
347 *
348 * Data (Number, Binary, Byte Index):
349 +------+------+-------------+------+------+-------------+
350 | 1 | 2 | 3 | 4 | Number
351 +------+------+------+------+------+------+------+------+
352 | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
353 +------+------+------+------+------+------+------+------+
354 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
355 +------+------+------+------+------+------+------+------+
356
357 +------+------+------+------+------+------+------+------+
358 | 5 | 6 | 7 | 8 | Number
359 +------+------+------+------+------+------+------+------+
360 | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
361 +------+------+------+------+------+------+------+------+
362 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
363 +------+------+------+------+------+------+------+------+
364 * Index (Byte Index):
365 +------+------+------+------+------+------+------+------+
366 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
367 +------+------+------+------+------+------+------+------+
368
369 +------+------+------+------+------+------+------+------+
370 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
371 +------+------+------+------+------+------+------+------+
372 * Result:
373 +------+------+------+------+------+------+------+------+
374 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
375 +------+------+------+------+------+------+------+------+
376 | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
377 +------+------+------+------+------+------+------+------+
378 | 256 | 2 | 5 | 6 | Number
379 +------+------+------+------+------+------+------+------+
380
381 +------+------+------+------+------+------+------+------+
382 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
383 +------+------+------+------+------+------+------+------+
384 | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
385 +------+------+------+------+------+------+------+------+
386 | 3 | 7 | 4 | 8 | Number
387 +------+------+------+------+------+------+-------------+
388 */
389
390/* Set/get methods */
391
392/* Constants for use with _mm_prefetch. */
393enum _mm_hint {
394 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
395 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
396 _MM_HINT_T1 = 2, /* load data to L2 cache only */
397 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
398 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
399 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
400 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
401 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
402};
403
404// Loads one cache line of data from address p to a location closer to the
405// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
406FORCE_INLINE void _mm_prefetch(const void *p, int i)
407{
408 (void) i;
409 __builtin_prefetch(p);
410}
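
// Illustrative usage (an assumed example, not part of the original header;
// `buf` is a hypothetical pointer into a large array):
//   _mm_prefetch(buf + 64, _MM_HINT_T0);  // hint that the next line is needed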
411
412// Copy the lower single-precision (32-bit) floating-point element of a to dst.
413//
414// dst[31:0] := a[31:0]
415//
416// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
417FORCE_INLINE float _mm_cvtss_f32(__m128 a)
418{
419 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
420}
421
422// Sets the 128-bit value to zero
423// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
424FORCE_INLINE __m128i _mm_setzero_si128(void)
425{
426 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
427}
428
429// Clears the four single-precision, floating-point values.
430// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
431FORCE_INLINE __m128 _mm_setzero_ps(void)
432{
433 return vreinterpretq_m128_f32(vdupq_n_f32(0));
434}
435
436// Sets the four single-precision, floating-point values to w.
437//
438// r0 := r1 := r2 := r3 := w
439//
440// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
441FORCE_INLINE __m128 _mm_set1_ps(float _w)
442{
443 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
444}
445
446// Sets the four single-precision, floating-point values to w.
447// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
448FORCE_INLINE __m128 _mm_set_ps1(float _w)
449{
450 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
451}
452
453// Sets the four single-precision, floating-point values to the four inputs.
454// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
455FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
456{
457 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
458 return vreinterpretq_m128_f32(vld1q_f32(data));
459}
460
461// Copy single-precision (32-bit) floating-point element a to the lower element
462// of dst, and zero the upper 3 elements.
463// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
464FORCE_INLINE __m128 _mm_set_ss(float a)
465{
466 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
467 return vreinterpretq_m128_f32(vld1q_f32(data));
468}
469
470// Sets the four single-precision, floating-point values to the four inputs in
471// reverse order.
472// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
473FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
474{
475 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
476 return vreinterpretq_m128_f32(vld1q_f32(data));
477}
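
// Illustrative lane ordering (an assumed example, not part of the original
// header): _mm_set_ps takes arguments from the highest lane down, while
// _mm_setr_ps takes them in memory order.
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes: 1, 2, 3, 4
//   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // lanes: 1, 2, 3, 4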
478
479// Sets the 8 signed 16-bit integer values in reverse order.
480//
481// Return Value
482// r0 := w0
483// r1 := w1
484// ...
485// r7 := w7
486FORCE_INLINE __m128i _mm_setr_epi16(short w0,
487 short w1,
488 short w2,
489 short w3,
490 short w4,
491 short w5,
492 short w6,
493 short w7)
494{
495 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
496 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
497}
498
499// Sets the 4 signed 32-bit integer values in reverse order
500// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
501FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
502{
503 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
504 return vreinterpretq_m128i_s32(vld1q_s32(data));
505}
506
507// Set packed 64-bit integers in dst with the supplied values in reverse order.
508// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
509FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
510{
511 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
512}
513
514// Sets the 16 signed 8-bit integer values to b.
515//
516// r0 := b
517// r1 := b
518// ...
519// r15 := b
520//
521// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
522FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
523{
524 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
525}
526
527// Sets the 8 signed 16-bit integer values to w.
528//
529// r0 := w
530// r1 := w
531// ...
532// r7 := w
533//
534// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
535FORCE_INLINE __m128i _mm_set1_epi16(short w)
536{
537 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
538}
539
540// Sets the 16 signed 8-bit integer values.
541// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
542FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
543 signed char b14,
544 signed char b13,
545 signed char b12,
546 signed char b11,
547 signed char b10,
548 signed char b9,
549 signed char b8,
550 signed char b7,
551 signed char b6,
552 signed char b5,
553 signed char b4,
554 signed char b3,
555 signed char b2,
556 signed char b1,
557 signed char b0)
558{
559 int8_t ALIGN_STRUCT(16)
560 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
561 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
562 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
563 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
564 return (__m128i) vld1q_s8(data);
565}
566
567// Sets the 8 signed 16-bit integer values.
568// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
569FORCE_INLINE __m128i _mm_set_epi16(short i7,
570 short i6,
571 short i5,
572 short i4,
573 short i3,
574 short i2,
575 short i1,
576 short i0)
577{
578 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
579 return vreinterpretq_m128i_s16(vld1q_s16(data));
580}
581
582// Sets the 16 signed 8-bit integer values in reverse order.
583// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
584FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
585 signed char b1,
586 signed char b2,
587 signed char b3,
588 signed char b4,
589 signed char b5,
590 signed char b6,
591 signed char b7,
592 signed char b8,
593 signed char b9,
594 signed char b10,
595 signed char b11,
596 signed char b12,
597 signed char b13,
598 signed char b14,
599 signed char b15)
600{
601 int8_t ALIGN_STRUCT(16)
602 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
603 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
604 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
605 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
606 return (__m128i) vld1q_s8(data);
607}
608
609// Sets the 4 signed 32-bit integer values to i.
610//
611// r0 := i
612// r1 := i
613// r2 := i
// r3 := i
615//
616// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
617FORCE_INLINE __m128i _mm_set1_epi32(int _i)
618{
619 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
620}
621
622// Sets the 2 signed 64-bit integer values to i.
623// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
624FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
625{
626 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
627}
628
629// Sets the 2 signed 64-bit integer values to i.
630// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
631FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
632{
633 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
634}
635
636// Sets the 4 signed 32-bit integer values.
637// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
638FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
639{
640 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
641 return vreinterpretq_m128i_s32(vld1q_s32(data));
642}
643
644// Returns the __m128i structure with its two 64-bit integer values
645// initialized to the values of the two 64-bit integers passed in.
646// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
647FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
648{
649 int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
650 return vreinterpretq_m128i_s64(vld1q_s64(data));
651}
652
653// Returns the __m128i structure with its two 64-bit integer values
654// initialized to the values of the two 64-bit integers passed in.
655// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
656FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
657{
658 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
659}
660
661// Set packed double-precision (64-bit) floating-point elements in dst with the
662// supplied values.
663// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
664FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
665{
666 double ALIGN_STRUCT(16) data[2] = {e0, e1};
667#if defined(__aarch64__)
668 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
669#else
670 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
671#endif
672}
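
// Illustrative lane ordering (an assumed example, not part of the original
// header):
//   __m128d d = _mm_set_pd(2.0, 1.0);  // lane 0 == 1.0, lane 1 == 2.0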
673
674// Stores four single-precision, floating-point values.
675// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
676FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
677{
678 vst1q_f32(p, vreinterpretq_f32_m128(a));
679}
680
681// Stores four single-precision, floating-point values.
682// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
683FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
684{
685 vst1q_f32(p, vreinterpretq_f32_m128(a));
686}
687
// Stores four 32-bit integer values (as a __m128i value) at the address p.
689// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
690FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
691{
692 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
693}
694
// Stores four 32-bit integer values (as a __m128i value) at the address p.
696// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
697FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
698{
699 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
700}
701
// Stores the lower single-precision, floating-point value.
703// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
704FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
705{
706 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
707}
708
709// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
710// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
711// or a general-protection exception may be generated.
712// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
713FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
714{
715#if defined(__aarch64__)
716 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
717#else
718 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
719#endif
720}
721
722// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
723// elements) from a into memory. mem_addr does not need to be aligned on any
724// particular boundary.
725// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
726FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
727{
728 _mm_store_pd(mem_addr, a);
729}
730
731// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
732// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
733FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
734{
735 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
736 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
737 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
738}
739
740// Stores the lower two single-precision floating point values of a to the
741// address p.
742//
743// *p0 := a0
744// *p1 := a1
745//
746// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
747FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
748{
    *p = vreinterpret_m64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
750}
751
752// Stores the upper two single-precision, floating-point values of a to the
753// address p.
754//
755// *p0 := a2
756// *p1 := a3
757//
758// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
759FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
760{
    *p = vreinterpret_m64_f32(vget_high_f32(vreinterpretq_f32_m128(a)));
762}
763
764// Loads a single single-precision, floating-point value, copying it into all
765// four words
766// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
767FORCE_INLINE __m128 _mm_load1_ps(const float *p)
768{
769 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
770}
771
772// Load a single-precision (32-bit) floating-point element from memory into all
773// elements of dst.
774//
775// dst[31:0] := MEM[mem_addr+31:mem_addr]
776// dst[63:32] := MEM[mem_addr+31:mem_addr]
777// dst[95:64] := MEM[mem_addr+31:mem_addr]
778// dst[127:96] := MEM[mem_addr+31:mem_addr]
779//
780// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
781#define _mm_load_ps1 _mm_load1_ps
782
783// Sets the lower two single-precision, floating-point values with 64
784// bits of data loaded from the address p; the upper two values are passed
785// through from a.
786//
787// Return Value
788// r0 := *p0
789// r1 := *p1
790// r2 := a2
791// r3 := a3
792//
793// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
794FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
795{
796 return vreinterpretq_m128_f32(
797 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
798}
799
800// Load 4 single-precision (32-bit) floating-point elements from memory into dst
801// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
802// general-protection exception may be generated.
803//
804// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
805// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
806// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
807// dst[127:96] := MEM[mem_addr+31:mem_addr]
808//
809// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
810FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
811{
812 float32x4_t v = vrev64q_f32(vld1q_f32(p));
813 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
814}
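
// Illustrative usage (an assumed example, not part of the original header;
// `buf` should be 16-byte aligned for the x86 intrinsic, although the NEON
// load here does not require it):
//   float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   __m128 r = _mm_loadr_ps(buf);  // lanes: 4, 3, 2, 1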
815
816// Sets the upper two single-precision, floating-point values with 64
817// bits of data loaded from the address p; the lower two values are passed
818// through from a.
819//
820// r0 := a0
821// r1 := a1
822// r2 := *p0
823// r3 := *p1
824//
825// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
826FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
827{
828 return vreinterpretq_m128_f32(
829 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
830}
831
832// Loads four single-precision, floating-point values.
833// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
834FORCE_INLINE __m128 _mm_load_ps(const float *p)
835{
836 return vreinterpretq_m128_f32(vld1q_f32(p));
837}
838
839// Loads four single-precision, floating-point values.
840// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
841FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
842{
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
    // are equivalent.
845 return vreinterpretq_m128_f32(vld1q_f32(p));
846}
847
848// Load unaligned 16-bit integer from memory into the first element of dst.
849//
850// dst[15:0] := MEM[mem_addr+15:mem_addr]
851// dst[MAX:16] := 0
852//
853// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
854FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
855{
856 return vreinterpretq_m128i_s16(
857 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
858}
859
860// Load unaligned 64-bit integer from memory into the first element of dst.
861//
862// dst[63:0] := MEM[mem_addr+63:mem_addr]
863// dst[MAX:64] := 0
864//
865// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
866FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
867{
868 return vreinterpretq_m128i_s64(
869 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
870}
871
872// Load a double-precision (64-bit) floating-point element from memory into the
873// lower of dst, and zero the upper element. mem_addr does not need to be
874// aligned on any particular boundary.
875//
876// dst[63:0] := MEM[mem_addr+63:mem_addr]
877// dst[127:64] := 0
878//
879// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
880FORCE_INLINE __m128d _mm_load_sd(const double *p)
881{
882#if defined(__aarch64__)
883 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
884#else
885 const float *fp = (const float *) p;
886 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
887 return vreinterpretq_m128d_f32(vld1q_f32(data));
888#endif
889}
890
// Loads two double-precision, floating-point values from 16-byte aligned
// memory.
893//
894// dst[127:0] := MEM[mem_addr+127:mem_addr]
895//
896// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
897FORCE_INLINE __m128d _mm_load_pd(const double *p)
898{
899#if defined(__aarch64__)
900 return vreinterpretq_m128d_f64(vld1q_f64(p));
901#else
902 const float *fp = (const float *) p;
903 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
904 return vreinterpretq_m128d_f32(vld1q_f32(data));
905#endif
906}
907
// Loads two double-precision, floating-point values from unaligned memory.
909// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
910FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
911{
912 return _mm_load_pd(p);
913}
914
// Loads a single-precision, floating-point value into the low word and clears
// the upper three words.
917// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
918FORCE_INLINE __m128 _mm_load_ss(const float *p)
919{
920 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
921}
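
// Illustrative usage (an assumed example, not part of the original header):
//   float x = 3.5f;
//   __m128 v = _mm_load_ss(&x);  // lanes: 3.5, 0, 0, 0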
922
923FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
924{
925 /* Load the lower 64 bits of the value pointed to by p into the
926 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
927 */
928 return vreinterpretq_m128i_s32(
929 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
930}
931
932// Load a double-precision (64-bit) floating-point element from memory into the
933// lower element of dst, and copy the upper element from a to dst. mem_addr does
934// not need to be aligned on any particular boundary.
935//
936// dst[63:0] := MEM[mem_addr+63:mem_addr]
937// dst[127:64] := a[127:64]
938//
939// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
940FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
941{
942#if defined(__aarch64__)
943 return vreinterpretq_m128d_f64(
944 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
945#else
946 return vreinterpretq_m128d_f32(
947 vcombine_f32(vld1_f32((const float *) p),
948 vget_high_f32(vreinterpretq_f32_m128d(a))));
949#endif
950}
951
952// Load 2 double-precision (64-bit) floating-point elements from memory into dst
953// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
954// general-protection exception may be generated.
955//
956// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
957// dst[127:64] := MEM[mem_addr+63:mem_addr]
958//
959// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
960FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
961{
962#if defined(__aarch64__)
963 float64x2_t v = vld1q_f64(p);
964 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
965#else
966 int64x2_t v = vld1q_s64((const int64_t *) p);
967 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
968#endif
969}
970
971// Sets the low word to the single-precision, floating-point value of b
972// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
973FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
974{
975 return vreinterpretq_m128_f32(
976 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
977 vreinterpretq_f32_m128(a), 0));
978}
979
980// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
981// upper element.
982//
983// dst[63:0] := a[63:0]
984// dst[127:64] := 0
985//
986// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
987FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
988{
989 return vreinterpretq_m128i_s64(
990 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
991}
992
993// Return vector of type __m128 with undefined elements.
994// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
995FORCE_INLINE __m128 _mm_undefined_ps(void)
996{
997 __m128 a;
998 return a;
999}
1000
1001/* Logic/Binary operations */
1002
1003// Computes the bitwise AND-NOT of the four single-precision, floating-point
1004// values of a and b.
1005//
1006// r0 := ~a0 & b0
1007// r1 := ~a1 & b1
1008// r2 := ~a2 & b2
1009// r3 := ~a3 & b3
1010//
1011// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1012FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1013{
1014 return vreinterpretq_m128_s32(
1015 vbicq_s32(vreinterpretq_s32_m128(b),
1016 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1017}
1018
1019// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1020// elements in a and then AND with b, and store the results in dst.
1021//
1022// FOR j := 0 to 1
1023// i := j*64
1024// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1025// ENDFOR
1026//
1027// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
1028FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1029{
1030 // *NOTE* argument swap
1031 return vreinterpretq_m128d_s64(
1032 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1033}
1034
1035// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1036// 128-bit value in a.
1037//
1038// r := (~a) & b
1039//
1040// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
1041FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1042{
1043 return vreinterpretq_m128i_s32(
1044 vbicq_s32(vreinterpretq_s32_m128i(b),
1045 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
1046}
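
// Illustrative semantics (an assumed example, not part of the original
// header): note that the first argument is the one that gets inverted.
//   __m128i mask = _mm_set1_epi32(0x0F0F0F0F);
//   __m128i data = _mm_set1_epi32(0x12345678);
//   __m128i r = _mm_andnot_si128(mask, data);  // (~mask) & data == 0x10305070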
1047
1048// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1049// b.
1050//
1051// r := a & b
1052//
1053// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
1054FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1055{
1056 return vreinterpretq_m128i_s32(
1057 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1058}
1059
1060// Computes the bitwise AND of the four single-precision, floating-point values
1061// of a and b.
1062//
1063// r0 := a0 & b0
1064// r1 := a1 & b1
1065// r2 := a2 & b2
1066// r3 := a3 & b3
1067//
1068// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1069FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1070{
1071 return vreinterpretq_m128_s32(
1072 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1073}
1074
1075// Compute the bitwise AND of packed double-precision (64-bit) floating-point
1076// elements in a and b, and store the results in dst.
1077//
1078// FOR j := 0 to 1
1079// i := j*64
1080// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1081// ENDFOR
1082//
1083// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
1084FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1085{
1086 return vreinterpretq_m128d_s64(
1087 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1088}
1089
1090// Computes the bitwise OR of the four single-precision, floating-point values
1091// of a and b.
1092// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
1093FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1094{
1095 return vreinterpretq_m128_s32(
1096 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1097}
1098
// Computes the bitwise XOR (exclusive-or) of the four single-precision,
1100// floating-point values of a and b.
1101// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
1102FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1103{
1104 return vreinterpretq_m128_s32(
1105 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1106}
1107
1108// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1109// elements in a and b, and store the results in dst.
1110//
1111// FOR j := 0 to 1
1112// i := j*64
1113// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1114// ENDFOR
1115//
1116// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
1117FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1118{
1119 return vreinterpretq_m128d_s64(
1120 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1121}
1122
1123// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1124//
1125// r := a | b
1126//
1127// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
1128FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1129{
1130 return vreinterpretq_m128i_s32(
1131 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1132}
1133
1134// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1135// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
1136FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1137{
1138 return vreinterpretq_m128i_s32(
1139 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1140}
1141
1142// Duplicate odd-indexed single-precision (32-bit) floating-point elements
1143// from a, and store the results in dst.
1144// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
1145FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1146{
1147#if __has_builtin(__builtin_shufflevector)
1148 return vreinterpretq_m128_f32(__builtin_shufflevector(
1149 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1150#else
1151 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1152 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1153 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1154 return vreinterpretq_m128_f32(vld1q_f32(data));
1155#endif
1156}
1157
1158// Duplicate even-indexed single-precision (32-bit) floating-point elements
1159// from a, and store the results in dst.
1160// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
1161FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1162{
1163#if __has_builtin(__builtin_shufflevector)
1164 return vreinterpretq_m128_f32(__builtin_shufflevector(
1165 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1166#else
1167 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1168 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1169 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1170 return vreinterpretq_m128_f32(vld1q_f32(data));
1171#endif
1172}
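
// Illustrative results (an assumed example, not part of the original header):
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   _mm_movehdup_ps(v);  // lanes: 2, 2, 4, 4 (odd-indexed elements duplicated)
//   _mm_moveldup_ps(v);  // lanes: 1, 1, 3, 3 (even-indexed elements duplicated)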
1173
1174// Moves the upper two values of B into the lower two values of A.
1175//
1176// r3 := a3
1177// r2 := a2
1178// r1 := b3
1179// r0 := b2
1180FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1181{
1182 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1183 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1184 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1185}
1186
1187// Moves the lower two values of B into the upper two values of A.
1188//
1189// r3 := b1
1190// r2 := b0
1191// r1 := a1
1192// r0 := a0
1193FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1194{
1195 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1196 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1197 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1198}
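
// Illustrative results (an assumed example, not part of the original header):
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   _mm_movehl_ps(a, b);  // lanes: 7, 8, 3, 4
//   _mm_movelh_ps(a, b);  // lanes: 1, 2, 5, 6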
1199
1200// Compute the absolute value of packed signed 32-bit integers in a, and store
1201// the unsigned results in dst.
1202//
1203// FOR j := 0 to 3
1204// i := j*32
1205// dst[i+31:i] := ABS(a[i+31:i])
1206// ENDFOR
1207//
1208// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
1209FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1210{
1211 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1212}
1213
1214// Compute the absolute value of packed signed 16-bit integers in a, and store
1215// the unsigned results in dst.
1216//
1217// FOR j := 0 to 7
1218// i := j*16
1219// dst[i+15:i] := ABS(a[i+15:i])
1220// ENDFOR
1221//
1222// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
1223FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1224{
1225 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1226}
1227
1228// Compute the absolute value of packed signed 8-bit integers in a, and store
1229// the unsigned results in dst.
1230//
1231// FOR j := 0 to 15
1232// i := j*8
1233// dst[i+7:i] := ABS(a[i+7:i])
1234// ENDFOR
1235//
1236// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
1237FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1238{
1239 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1240}
1241
1242// Compute the absolute value of packed signed 32-bit integers in a, and store
1243// the unsigned results in dst.
1244//
1245// FOR j := 0 to 1
1246// i := j*32
1247// dst[i+31:i] := ABS(a[i+31:i])
1248// ENDFOR
1249//
1250// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
1251FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1252{
1253 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1254}
1255
1256// Compute the absolute value of packed signed 16-bit integers in a, and store
1257// the unsigned results in dst.
1258//
1259// FOR j := 0 to 3
1260// i := j*16
1261// dst[i+15:i] := ABS(a[i+15:i])
1262// ENDFOR
1263//
1264// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
1265FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1266{
1267 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1268}
1269
1270// Compute the absolute value of packed signed 8-bit integers in a, and store
1271// the unsigned results in dst.
1272//
1273// FOR j := 0 to 7
1274// i := j*8
1275// dst[i+7:i] := ABS(a[i+7:i])
1276// ENDFOR
1277//
1278// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
1279FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1280{
1281 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1282}
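
// Illustrative usage (an assumed example, not part of the original header):
//   __m128i v = _mm_setr_epi32(-1, 2, -3, 4);
//   _mm_abs_epi32(v);  // packed results: 1, 2, 3, 4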
1283
// Takes the upper 64 bits of a and places them in the low end of the result.
// Takes the lower 64 bits of b and places them in the high end of the result.
1286FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1287{
1288 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1289 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1290 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1291}
1292
// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
1296FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1297{
1298 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1299 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1300 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1301}
1302
1303FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1304{
1305 float32x2_t a21 = vget_high_f32(
1306 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1307 float32x2_t b03 = vget_low_f32(
1308 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1309 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1310}
1311
1312FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1313{
1314 float32x2_t a03 = vget_low_f32(
1315 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1316 float32x2_t b21 = vget_high_f32(
1317 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1318 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1319}
1320
1321FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1322{
1323 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1324 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1325 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1326}
1327
1328FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1329{
1330 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1331 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1332 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1333}
1334
1335FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1336{
1337 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1338 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1339 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1340}
1341
// Keeps the low 64 bits of a in the low end of the result and puts the high 64
// bits of b in the high end of the result.
1344FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1345{
1346 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1347 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1348 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1349}
1350
1351FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1352{
1353 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1354 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1355 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1356}
1357
1358FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1359{
1360 float32x2_t a22 =
1361 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1362 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1363 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1364}
1365
1366FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1367{
1368 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1369 float32x2_t b22 =
1370 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1371 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1372}
1373
1374FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1375{
1376 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1377 float32x2_t a22 =
1378 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1379 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1380 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1381 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1382}
1383
1384FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1385{
1386 float32x2_t a33 =
1387 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1388 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1389 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1390}
1391
1392FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1393{
1394 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1395 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1396 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1397 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1398 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1399}
1400
1401FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1402{
1403 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1405 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1406 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1407 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1408}
1409
1410FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1411{
1412 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1414 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1415 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1416 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1417}
1418
1419// NEON does not support a general purpose permute intrinsic
1420// Selects four specific single-precision, floating-point values from a and b,
1421// based on the mask i.
1422//
1423// C equivalent:
1424// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1425// __constrange(0, 255) int imm) {
1426// __m128 ret;
1427// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1428// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
1429// return ret;
1430// }
1431//
1432// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1433#define _mm_shuffle_ps_default(a, b, imm) \
1434 __extension__({ \
1435 float32x4_t ret; \
1436 ret = vmovq_n_f32( \
1437 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
1438 ret = vsetq_lane_f32( \
1439 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1440 ret, 1); \
1441 ret = vsetq_lane_f32( \
1442 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1443 ret, 2); \
1444 ret = vsetq_lane_f32( \
1445 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1446 ret, 3); \
1447 vreinterpretq_m128_f32(ret); \
1448 })
1449
1450// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1451// int imm)
1452#if __has_builtin(__builtin_shufflevector)
1453#define _mm_shuffle_ps(a, b, imm) \
1454 __extension__({ \
1455 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
1456 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
1457 float32x4_t _shuf = __builtin_shufflevector( \
1458 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1459 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1460 vreinterpretq_m128_f32(_shuf); \
1461 })
1462#else // generic
1463#define _mm_shuffle_ps(a, b, imm) \
1464 __extension__({ \
1465 __m128 ret; \
1466 switch (imm) { \
1467 case _MM_SHUFFLE(1, 0, 3, 2): \
1468 ret = _mm_shuffle_ps_1032((a), (b)); \
1469 break; \
1470 case _MM_SHUFFLE(2, 3, 0, 1): \
1471 ret = _mm_shuffle_ps_2301((a), (b)); \
1472 break; \
1473 case _MM_SHUFFLE(0, 3, 2, 1): \
1474 ret = _mm_shuffle_ps_0321((a), (b)); \
1475 break; \
1476 case _MM_SHUFFLE(2, 1, 0, 3): \
1477 ret = _mm_shuffle_ps_2103((a), (b)); \
1478 break; \
1479 case _MM_SHUFFLE(1, 0, 1, 0): \
1480 ret = _mm_movelh_ps((a), (b)); \
1481 break; \
1482 case _MM_SHUFFLE(1, 0, 0, 1): \
1483 ret = _mm_shuffle_ps_1001((a), (b)); \
1484 break; \
1485 case _MM_SHUFFLE(0, 1, 0, 1): \
1486 ret = _mm_shuffle_ps_0101((a), (b)); \
1487 break; \
1488 case _MM_SHUFFLE(3, 2, 1, 0): \
1489 ret = _mm_shuffle_ps_3210((a), (b)); \
1490 break; \
1491 case _MM_SHUFFLE(0, 0, 1, 1): \
1492 ret = _mm_shuffle_ps_0011((a), (b)); \
1493 break; \
1494 case _MM_SHUFFLE(0, 0, 2, 2): \
1495 ret = _mm_shuffle_ps_0022((a), (b)); \
1496 break; \
1497 case _MM_SHUFFLE(2, 2, 0, 0): \
1498 ret = _mm_shuffle_ps_2200((a), (b)); \
1499 break; \
1500 case _MM_SHUFFLE(3, 2, 0, 2): \
1501 ret = _mm_shuffle_ps_3202((a), (b)); \
1502 break; \
1503 case _MM_SHUFFLE(3, 2, 3, 2): \
1504 ret = _mm_movehl_ps((b), (a)); \
1505 break; \
1506 case _MM_SHUFFLE(1, 1, 3, 3): \
1507 ret = _mm_shuffle_ps_1133((a), (b)); \
1508 break; \
1509 case _MM_SHUFFLE(2, 0, 1, 0): \
1510 ret = _mm_shuffle_ps_2010((a), (b)); \
1511 break; \
1512 case _MM_SHUFFLE(2, 0, 0, 1): \
1513 ret = _mm_shuffle_ps_2001((a), (b)); \
1514 break; \
1515 case _MM_SHUFFLE(2, 0, 3, 2): \
1516 ret = _mm_shuffle_ps_2032((a), (b)); \
1517 break; \
1518 default: \
1519 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1520 break; \
1521 } \
1522 ret; \
1523 })
1524#endif
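
// Example (illustrative): with _MM_SHUFFLE(z, y, x, w) the result is
// { a[w], a[x], b[y], b[z] }, e.g.
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
// yields r = { a[0], a[1], b[2], b[3] }.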
1525
1526// Takes the upper 64 bits of a and places it in the low end of the result
1527// Takes the lower 64 bits of a and places it into the high end of the result.
1528FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1529{
1530 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1531 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1532 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1533}
1534
1535// Takes the lower two 32-bit values from a, swaps them, and places them in the
1536// low end of the result; takes the upper two 32-bit values from a, swaps them,
1537// and places them in the high end of the result.
1538FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1539{
1540 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1541 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1542 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1543}
1544
1545// Rotates the least significant 32 bits into the most significant 32 bits, and
1546// shifts the rest down.
1547FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1548{
1549 return vreinterpretq_m128i_s32(
1550 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1551}
1552
1553// Rotates the most significant 32 bits into the least significant 32 bits, and
1554// shifts the rest up.
1555FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1556{
1557 return vreinterpretq_m128i_s32(
1558 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1559}
1560
1561// Gets the lower 64 bits of a and places it in the upper 64 bits;
1562// gets the lower 64 bits of a and places it in the lower 64 bits.
1563FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1564{
1565 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1566 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1567}
1568
1569// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
1570// the lower 64 bits; gets the lower 64 bits of a and places it in the upper 64 bits.
1571FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1572{
1573 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1574 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1575 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1576}
1577
1578// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
1579// swapped pair in the upper 64 bits; the same swapped pair is also placed in
1580// the lower 64 bits.
1581FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1582{
1583 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1584 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1585}
1586
1587FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1588{
1589 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1590 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1591 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1592}
1593
1594FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1595{
1596 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1597 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1598 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1599}
1600
1601FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1602{
1603 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1604 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1605 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1606}
1607
1608// Shuffle packed 8-bit integers in a according to shuffle control mask in the
1609// corresponding 8-bit element of b, and store the results in dst.
1610// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
1611FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1612{
1613 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
1614 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
1615 uint8x16_t idx_masked =
1616 vandq_u8(idx, vdupq_n_u8(0x8F)); // keep bit 7 (zeroing flag) and the 4 index bits
1617#if defined(__aarch64__)
1618 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1619#elif defined(__GNUC__)
1620 int8x16_t ret;
1621 // %e and %f represent the even and odd D registers
1622 // respectively.
1623 __asm__ __volatile__(
1624 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1625 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1626 : [ret] "=&w"(ret)
1627 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1628 return vreinterpretq_m128i_s8(ret);
1629#else
1630 // Generic fallback: 16-byte table lookup via vtbl2 on the low and high halves of the index.
1631 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1632 return vreinterpretq_m128i_s8(
1633 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1634 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1635#endif
1636}
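
// Example (illustrative): reversing the 16 bytes of a vector, assuming
// _mm_set_epi8 is available (its arguments run from byte 15 down to byte 0):
//   __m128i rev = _mm_shuffle_epi8(
//       a, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
// Any control byte with its most significant bit set zeroes the corresponding
// destination byte, matching PSHUFB.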
1637
1638// C equivalent:
1639// __m128i _mm_shuffle_epi32_default(__m128i a,
1640// __constrange(0, 255) int imm) {
1641// __m128i ret;
1642// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1643// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
1644// return ret;
1645// }
1646#define _mm_shuffle_epi32_default(a, imm) \
1647 __extension__({ \
1648 int32x4_t ret; \
1649 ret = vmovq_n_s32( \
1650 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
1651 ret = vsetq_lane_s32( \
1652 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
1653 ret, 1); \
1654 ret = vsetq_lane_s32( \
1655 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1656 ret, 2); \
1657 ret = vsetq_lane_s32( \
1658 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1659 ret, 3); \
1660 vreinterpretq_m128i_s32(ret); \
1661 })
1662
1663// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1664// int imm)
1665#if defined(__aarch64__)
1666#define _mm_shuffle_epi32_splat(a, imm) \
1667 __extension__({ \
1668 vreinterpretq_m128i_s32( \
1669 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1670 })
1671#else
1672#define _mm_shuffle_epi32_splat(a, imm) \
1673 __extension__({ \
1674 vreinterpretq_m128i_s32( \
1675 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1676 })
1677#endif
1678
1679// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
1680// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
1681// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
1682// __constrange(0,255) int imm)
1683#if __has_builtin(__builtin_shufflevector)
1684#define _mm_shuffle_epi32(a, imm) \
1685 __extension__({ \
1686 int32x4_t _input = vreinterpretq_s32_m128i(a); \
1687 int32x4_t _shuf = __builtin_shufflevector( \
1688 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1689 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
1690 vreinterpretq_m128i_s32(_shuf); \
1691 })
1692#else // generic
1693#define _mm_shuffle_epi32(a, imm) \
1694 __extension__({ \
1695 __m128i ret; \
1696 switch (imm) { \
1697 case _MM_SHUFFLE(1, 0, 3, 2): \
1698 ret = _mm_shuffle_epi_1032((a)); \
1699 break; \
1700 case _MM_SHUFFLE(2, 3, 0, 1): \
1701 ret = _mm_shuffle_epi_2301((a)); \
1702 break; \
1703 case _MM_SHUFFLE(0, 3, 2, 1): \
1704 ret = _mm_shuffle_epi_0321((a)); \
1705 break; \
1706 case _MM_SHUFFLE(2, 1, 0, 3): \
1707 ret = _mm_shuffle_epi_2103((a)); \
1708 break; \
1709 case _MM_SHUFFLE(1, 0, 1, 0): \
1710 ret = _mm_shuffle_epi_1010((a)); \
1711 break; \
1712 case _MM_SHUFFLE(1, 0, 0, 1): \
1713 ret = _mm_shuffle_epi_1001((a)); \
1714 break; \
1715 case _MM_SHUFFLE(0, 1, 0, 1): \
1716 ret = _mm_shuffle_epi_0101((a)); \
1717 break; \
1718 case _MM_SHUFFLE(2, 2, 1, 1): \
1719 ret = _mm_shuffle_epi_2211((a)); \
1720 break; \
1721 case _MM_SHUFFLE(0, 1, 2, 2): \
1722 ret = _mm_shuffle_epi_0122((a)); \
1723 break; \
1724 case _MM_SHUFFLE(3, 3, 3, 2): \
1725 ret = _mm_shuffle_epi_3332((a)); \
1726 break; \
1727 case _MM_SHUFFLE(0, 0, 0, 0): \
1728 ret = _mm_shuffle_epi32_splat((a), 0); \
1729 break; \
1730 case _MM_SHUFFLE(1, 1, 1, 1): \
1731 ret = _mm_shuffle_epi32_splat((a), 1); \
1732 break; \
1733 case _MM_SHUFFLE(2, 2, 2, 2): \
1734 ret = _mm_shuffle_epi32_splat((a), 2); \
1735 break; \
1736 case _MM_SHUFFLE(3, 3, 3, 3): \
1737 ret = _mm_shuffle_epi32_splat((a), 3); \
1738 break; \
1739 default: \
1740 ret = _mm_shuffle_epi32_default((a), (imm)); \
1741 break; \
1742 } \
1743 ret; \
1744 })
1745#endif
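
// Example (illustrative):
//   _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast a[0] to all lanes
//   _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); // reverse: { a[3], a[2], a[1], a[0] }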
1746
1747// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
1748// by imm.
1749// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
1750// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
1751// __constrange(0,255) int
1752// imm)
1753#define _mm_shufflelo_epi16_function(a, imm) \
1754 __extension__({ \
1755 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1756 int16x4_t lowBits = vget_low_s16(ret); \
1757 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1758 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1759 1); \
1760 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1761 2); \
1762 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1763 3); \
1764 vreinterpretq_m128i_s16(ret); \
1765 })
1766
1767// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
1768// __constrange(0,255) int imm)
1769#if __has_builtin(__builtin_shufflevector)
1770#define _mm_shufflelo_epi16(a, imm) \
1771 __extension__({ \
1772 int16x8_t _input = vreinterpretq_s16_m128i(a); \
1773 int16x8_t _shuf = __builtin_shufflevector( \
1774 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
1775 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
1776 vreinterpretq_m128i_s16(_shuf); \
1777 })
1778#else // generic
1779#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
1780#endif
1781
1782// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1783// by imm.
1784// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1785// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1786// __constrange(0,255) int
1787// imm)
1788#define _mm_shufflehi_epi16_function(a, imm) \
1789 __extension__({ \
1790 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1791 int16x4_t highBits = vget_high_s16(ret); \
1792 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1793 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1794 5); \
1795 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1796 6); \
1797 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1798 7); \
1799 vreinterpretq_m128i_s16(ret); \
1800 })
1801
1802// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
1803// __constrange(0,255) int imm)
1804#if __has_builtin(__builtin_shufflevector)
1805#define _mm_shufflehi_epi16(a, imm) \
1806 __extension__({ \
1807 int16x8_t _input = vreinterpretq_s16_m128i(a); \
1808 int16x8_t _shuf = __builtin_shufflevector( \
1809 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
1810 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
1811 (((imm) >> 6) & 0x3) + 4); \
1812 vreinterpretq_m128i_s16(_shuf); \
1813 })
1814#else // generic
1815#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
1816#endif
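
// Example (illustrative): _mm_shufflelo_epi16 permutes only words 0-3 and
// _mm_shufflehi_epi16 only words 4-7, so
//   __m128i r = _mm_shufflehi_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
// keeps words 0-3 of a unchanged and reverses words 4-7.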
1817
1818// Blend packed 16-bit integers from a and b using control mask imm8, and store
1819// the results in dst.
1820//
1821// FOR j := 0 to 7
1822// i := j*16
1823// IF imm8[j]
1824// dst[i+15:i] := b[i+15:i]
1825// ELSE
1826// dst[i+15:i] := a[i+15:i]
1827// FI
1828// ENDFOR
1829// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
1830// __constrange(0,255) int imm)
1831#define _mm_blend_epi16(a, b, imm) \
1832 __extension__({ \
1833 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
1834 ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
1835 ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
1836 ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
1837 ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
1838 ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
1839 ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
1840 ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
1841 uint16x8_t _mask_vec = vld1q_u16(_mask); \
1842 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
1843 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
1844 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
1845 })
1846
1847// Blend packed 8-bit integers from a and b using mask, and store the results in
1848// dst.
1849//
1850// FOR j := 0 to 15
1851// i := j*8
1852// IF mask[i+7]
1853// dst[i+7:i] := b[i+7:i]
1854// ELSE
1855// dst[i+7:i] := a[i+7:i]
1856// FI
1857// ENDFOR
1858FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1859{
1860 // Use a signed shift right to create a mask with the sign bit
1861 uint8x16_t mask =
1862 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1863 uint8x16_t a = vreinterpretq_u8_m128i(_a);
1864 uint8x16_t b = vreinterpretq_u8_m128i(_b);
1865 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1866}
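
// Example (illustrative): a signed per-byte maximum built from a comparison
// mask, assuming _mm_cmpgt_epi8 is available:
//   __m128i gt = _mm_cmpgt_epi8(b, a);       // 0xFF where b > a
//   __m128i max = _mm_blendv_epi8(a, b, gt); // take b where the mask is set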
1867
1868/* Shifts */
1869
1870
1871// Shift packed 16-bit integers in a right by imm while shifting in sign
1872// bits, and store the results in dst.
1873// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
1874FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1875{
1876 const int count = (imm & ~15) ? 15 : imm;
1877 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1878}
1879
1880// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1881// shifting in zeros.
1882//
1883// r0 := a0 << count
1884// r1 := a1 << count
1885// ...
1886// r7 := a7 << count
1887//
1888// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
1889#define _mm_slli_epi16(a, imm) \
1890 __extension__({ \
1891 __m128i ret; \
1892 if ((imm) <= 0) { \
1893 ret = a; \
1894 } else if ((imm) > 15) { \
1895 ret = _mm_setzero_si128(); \
1896 } else { \
1897 ret = vreinterpretq_m128i_s16( \
1898 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
1899 } \
1900 ret; \
1901 })
1902
1903// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1904// shifting in zeros.
1905// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
1906// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
1907FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
1908{
1909 if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1910 return a;
1911 if (imm > 31) /* TODO: add unlikely macro */
1912 return _mm_setzero_si128();
1913 return vreinterpretq_m128i_s32(
1914 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
1915}
1916
1917// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
1918// store the results in dst.
1919FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
1920{
1921 if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1922 return a;
1923 if (imm > 63) /* TODO: add unlikely macro */
1924 return _mm_setzero_si128();
1925 return vreinterpretq_m128i_s64(
1926 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
1927}
1928
1929// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
1930// store the results in dst.
1931//
1932// FOR j := 0 to 7
1933// i := j*16
1934// IF imm8[7:0] > 15
1935// dst[i+15:i] := 0
1936// ELSE
1937// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
1938// FI
1939// ENDFOR
1940//
1941// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
1942#define _mm_srli_epi16(a, imm) \
1943 __extension__({ \
1944 __m128i ret; \
1945 if ((imm) == 0) { \
1946 ret = a; \
1947 } else if (0 < (imm) && (imm) < 16) { \
1948 ret = vreinterpretq_m128i_u16( \
1949 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
1950 } else { \
1951 ret = _mm_setzero_si128(); \
1952 } \
1953 ret; \
1954 })
1955
1956// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
1957// store the results in dst.
1958//
1959// FOR j := 0 to 3
1960// i := j*32
1961// IF imm8[7:0] > 31
1962// dst[i+31:i] := 0
1963// ELSE
1964// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
1965// FI
1966// ENDFOR
1967//
1968// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
1969// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
1970#define _mm_srli_epi32(a, imm) \
1971 __extension__({ \
1972 __m128i ret; \
1973 if ((imm) == 0) { \
1974 ret = a; \
1975 } else if (0 < (imm) && (imm) < 32) { \
1976 ret = vreinterpretq_m128i_u32( \
1977 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
1978 } else { \
1979 ret = _mm_setzero_si128(); \
1980 } \
1981 ret; \
1982 })
1983
1984// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
1985// store the results in dst.
1986//
1987// FOR j := 0 to 1
1988// i := j*64
1989// IF imm8[7:0] > 63
1990// dst[i+63:i] := 0
1991// ELSE
1992// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
1993// FI
1994// ENDFOR
1995//
1996// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
1997#define _mm_srli_epi64(a, imm) \
1998 __extension__({ \
1999 __m128i ret; \
2000 if ((imm) == 0) { \
2001 ret = a; \
2002 } else if (0 < (imm) && (imm) < 64) { \
2003 ret = vreinterpretq_m128i_u64( \
2004 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
2005 } else { \
2006 ret = _mm_setzero_si128(); \
2007 } \
2008 ret; \
2009 })
2010
2011// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2012// and store the results in dst.
2013//
2014// FOR j := 0 to 3
2015// i := j*32
2016// IF imm8[7:0] > 31
2017// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2018// ELSE
2019// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2020// FI
2021// ENDFOR
2022//
2023// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2024// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
2025#define _mm_srai_epi32(a, imm) \
2026 __extension__({ \
2027 __m128i ret; \
2028 if ((imm) == 0) { \
2029 ret = a; \
2030 } else if (0 < (imm) && (imm) < 32) { \
2031 ret = vreinterpretq_m128i_s32( \
2032 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
2033 } else { \
2034 ret = vreinterpretq_m128i_s32( \
2035 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
2036 } \
2037 ret; \
2038 })
2039
2040// Shifts the 128-bit value in a right by imm bytes while shifting in
2041// zeros. imm must be an immediate.
2042//
2043// r := srl(a, imm*8)
2044//
2045// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2046// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
2047#define _mm_srli_si128(a, imm) \
2048 __extension__({ \
2049 __m128i ret; \
2050 if ((imm) <= 0) { \
2051 ret = a; \
2052 } else if ((imm) > 15) { \
2053 ret = _mm_setzero_si128(); \
2054 } else { \
2055 ret = vreinterpretq_m128i_s8( \
2056 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
2057 } \
2058 ret; \
2059 })
2060
2061// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2062// must be an immediate.
2063//
2064// r := a << (imm * 8)
2065//
2066// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2067// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
2068#define _mm_slli_si128(a, imm) \
2069 __extension__({ \
2070 __m128i ret; \
2071 if ((imm) <= 0) { \
2072 ret = a; \
2073 } else if ((imm) > 15) { \
2074 ret = _mm_setzero_si128(); \
2075 } else { \
2076 ret = vreinterpretq_m128i_s8(vextq_s8( \
2077 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
2078 } \
2079 ret; \
2080 })
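
// Example (illustrative), viewing the register as bytes 0..15:
//   _mm_srli_si128(a, 4); // { a4, a5, ..., a15, 0, 0, 0, 0 }
//   _mm_slli_si128(a, 4); // { 0, 0, 0, 0, a0, a1, ..., a11 }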
2081
2082// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2083// shifting in zeros.
2084//
2085// r0 := a0 << count
2086// r1 := a1 << count
2087// ...
2088// r7 := a7 << count
2089//
2090// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
2091FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2092{
2093 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2094 if (c > 15)
2095 return _mm_setzero_si128();
2096
2097 int16x8_t vc = vdupq_n_s16((int16_t) c);
2098 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2099}
2100
2101// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2102// shifting in zeros.
2103//
2104// r0 := a0 << count
2105// r1 := a1 << count
2106// r2 := a2 << count
2107// r3 := a3 << count
2108//
2109// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
2110FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2111{
2112 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2113 if (c > 31)
2114 return _mm_setzero_si128();
2115
2116 int32x4_t vc = vdupq_n_s32((int32_t) c);
2117 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2118}
2119
2120// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2121// shifting in zeros.
2122//
2123// r0 := a0 << count
2124// r1 := a1 << count
2125//
2126// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
2127FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2128{
2129 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2130 if (c > 63)
2131 return _mm_setzero_si128();
2132
2133 int64x2_t vc = vdupq_n_s64((int64_t) c);
2134 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2135}
2136
2137// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2138// while shifting in zeros.
2139//
2140// r0 := srl(a0, count)
2141// r1 := srl(a1, count)
2142// ...
2143// r7 := srl(a7, count)
2144//
2145// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
2146FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2147{
2148 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2149 if (c > 15)
2150 return _mm_setzero_si128();
2151
2152 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2153 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2154}
2155
2156// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2157// while shifting in zeros.
2158//
2159// r0 := srl(a0, count)
2160// r1 := srl(a1, count)
2161// r2 := srl(a2, count)
2162// r3 := srl(a3, count)
2163//
2164// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
2165FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2166{
2167 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2168 if (c > 31)
2169 return _mm_setzero_si128();
2170
2171 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2172 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2173}
2174
2175// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2176// while shifting in zeros.
2177//
2178// r0 := srl(a0, count)
2179// r1 := srl(a1, count)
2180//
2181// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
2182FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2183{
2184 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2185 if (c > 63)
2186 return _mm_setzero_si128();
2187
2188 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2189 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2190}
2191
2192// NEON does not provide a version of this function.
2193// Creates a 16-bit mask from the most significant bits of the 16 signed or
2194// unsigned 8-bit integers in a and zero extends the upper bits.
2195// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
2196FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2197{
2198#if defined(__aarch64__)
2199 uint8x16_t input = vreinterpretq_u8_m128i(a);
2200 const int8_t ALIGN_STRUCT(16)
2201 xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2202 const uint8x16_t mask_and = vdupq_n_u8(0x80);
2203 const int8x16_t mask_shift = vld1q_s8(xr);
2204 const uint8x16_t mask_result =
2205 vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2206 uint8x8_t lo = vget_low_u8(mask_result);
2207 uint8x8_t hi = vget_high_u8(mask_result);
2208
2209 return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2210#else
2211 // Use increasingly wide shifts+adds to collect the sign bits
2212 // together.
2213 // Since the widening shifts would be rather confusing to follow in little
2214 // endian, everything will be illustrated in big endian order instead. This
2215 // has a different result - the bits would actually be reversed on a big
2216 // endian machine.
2217
2218 // Starting input (only half the elements are shown):
2219 // 89 ff 1d c0 00 10 99 33
2220 uint8x16_t input = vreinterpretq_u8_m128i(a);
2221
2222 // Shift out everything but the sign bits with an unsigned shift right.
2223 //
2224 // Bytes of the vector::
2225 // 89 ff 1d c0 00 10 99 33
2226 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
2227 // | | | | | | | |
2228 // 01 01 00 01 00 00 01 00
2229 //
2230 // Bits of first important lane(s):
2231 // 10001001 (89)
2232 // \______
2233 // |
2234 // 00000001 (01)
2235 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2236
2237 // Merge the even lanes together with a 16-bit unsigned shift right + add.
2238 // 'xx' represents garbage data which will be ignored in the final result.
2239 // In the important bytes, the add functions like a binary OR.
2240 //
2241 // 01 01 00 01 00 00 01 00
2242 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
2243 // \| \| \| \|
2244 // xx 03 xx 01 xx 00 xx 02
2245 //
2246 // 00000001 00000001 (01 01)
2247 // \_______ |
2248 // \|
2249 // xxxxxxxx xxxxxx11 (xx 03)
2250 uint32x4_t paired16 =
2251 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2252
2253 // Repeat with a wider 32-bit shift + add.
2254 // xx 03 xx 01 xx 00 xx 02
2255 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
2256 // 14))
2257 // \| \|
2258 // xx xx xx 0d xx xx xx 02
2259 //
2260 // 00000011 00000001 (03 01)
2261 // \\_____ ||
2262 // '----.\||
2263 // xxxxxxxx xxxx1101 (xx 0d)
2264 uint64x2_t paired32 =
2265 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2266
2267 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2268 // lanes. xx xx xx 0d xx xx xx 02
2269 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
2270 // 28))
2271 // \|
2272 // xx xx xx xx xx xx xx d2
2273 //
2274 // 00001101 00000010 (0d 02)
2275 // \ \___ | |
2276 // '---. \| |
2277 // xxxxxxxx 11010010 (xx d2)
2278 uint8x16_t paired64 =
2279 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2280
2281 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2282 // xx xx xx xx xx xx xx d2
2283 // || return paired64[0]
2284 // d2
2285 // Note: Little endian would return the correct value 4b (01001011) instead.
2286 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2287#endif
2288}
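
// Example (illustrative): find the first byte of x equal to c, assuming
// _mm_set1_epi8 and _mm_cmpeq_epi8 are available:
//   int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(x, _mm_set1_epi8(c)));
//   if (mask != 0) {
//       int first = __builtin_ctz(mask); // index of the first matching byte
//   }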
2289
2290// Copy the lower 64-bit integer in a to dst.
2291//
2292// dst[63:0] := a[63:0]
2293//
2294// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
2295FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2296{
2297 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2298}
2299
2300// Copy the 64-bit integer a to the lower element of dst, and zero the upper
2301// element.
2302//
2303// dst[63:0] := a[63:0]
2304// dst[127:64] := 0
2305//
2306// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
2307FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2308{
2309 return vreinterpretq_m128i_s64(
2310 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2311}
2312
2313// NEON does not provide this method
2314// Creates a 4-bit mask from the most significant bits of the four
2315// single-precision, floating-point values.
2316// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2317FORCE_INLINE int _mm_movemask_ps(__m128 a)
2318{
2319 uint32x4_t input = vreinterpretq_u32_m128(a);
2320#if defined(__aarch64__)
2321 static const int32x4_t shift = {0, 1, 2, 3};
2322 uint32x4_t tmp = vshrq_n_u32(input, 31);
2323 return vaddvq_u32(vshlq_u32(tmp, shift));
2324#else
2325 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2326 // Shift out everything but the sign bits with a 32-bit unsigned shift
2327 // right.
2328 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2329 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2330 uint8x16_t paired =
2331 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2332 // Extract the result.
2333 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2334#endif
2335}
2336
2337// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2338// all 1's, and return 1 if the result is zero, otherwise return 0.
2339// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
2340FORCE_INLINE int _mm_test_all_ones(__m128i a)
2341{
2342 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2343 ~(uint64_t) 0;
2344}
2345
2346// Compute the bitwise AND of 128 bits (representing integer data) in a and
2347// mask, and return 1 if the result is zero, otherwise return 0.
2348// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
2349FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2350{
2351 int64x2_t a_and_mask =
2352 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2353 return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2354 : 1;
2355}
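
// Example (illustrative): passing the same vector as both arguments checks
// whether the whole register is zero:
//   int is_zero = _mm_test_all_zeros(v, v); // 1 iff every bit of v is 0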
2356
2357/* Math operations */
2358
2359// Subtracts the four single-precision, floating-point values of a and b.
2360//
2361// r0 := a0 - b0
2362// r1 := a1 - b1
2363// r2 := a2 - b2
2364// r3 := a3 - b3
2365//
2366// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2367FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2368{
2369 return vreinterpretq_m128_f32(
2370 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2371}
2372
2373// Subtract the lower single-precision (32-bit) floating-point element in b from
2374// the lower single-precision (32-bit) floating-point element in a, store the
2375// result in the lower element of dst, and copy the upper 3 packed elements from
2376// a to the upper elements of dst.
2377//
2378// dst[31:0] := a[31:0] - b[31:0]
2379// dst[127:32] := a[127:32]
2380//
2381// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2382FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2383{
2384 return _mm_move_ss(a, _mm_sub_ps(a, b));
2385}
2386
2387// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2388// and store the results in dst.
2389// r0 := a0 - b0
2390// r1 := a1 - b1
2391FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2392{
2393 return vreinterpretq_m128i_s64(
2394 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2395}
2396
2397// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2398// unsigned 32-bit integers of a.
2399//
2400// r0 := a0 - b0
2401// r1 := a1 - b1
2402// r2 := a2 - b2
2403// r3 := a3 - b3
2404//
2405// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
2406FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2407{
2408 return vreinterpretq_m128i_s32(
2409 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2410}
2411
2412FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2413{
2414 return vreinterpretq_m128i_s16(
2415 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2416}
2417
2418FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2419{
2420 return vreinterpretq_m128i_s8(
2421 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2422}
2423
2424// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2425//
2426// dst[63:0] := a[63:0] - b[63:0]
2427//
2428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
2429FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2430{
2431 return vreinterpret_m64_s64(
2432 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2433}
2434
2435// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
2436// integers of a and saturates.
2437// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
2438FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2439{
2440 return vreinterpretq_m128i_u16(
2441 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2442}
2443
2444// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2445// integers of a and saturates.
2446//
2447// r0 := UnsignedSaturate(a0 - b0)
2448// r1 := UnsignedSaturate(a1 - b1)
2449// ...
2450// r15 := UnsignedSaturate(a15 - b15)
2451//
2452// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
2453FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2454{
2455 return vreinterpretq_m128i_u8(
2456 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2457}
2458
2459// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2460// of a and saturates.
2461//
2462// r0 := SignedSaturate(a0 - b0)
2463// r1 := SignedSaturate(a1 - b1)
2464// ...
2465// r15 := SignedSaturate(a15 - b15)
2466//
2467// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
2468FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2469{
2470 return vreinterpretq_m128i_s8(
2471 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2472}
2473
2474// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2475// of a and saturates.
2476//
2477// r0 := SignedSaturate(a0 - b0)
2478// r1 := SignedSaturate(a1 - b1)
2479// ...
2480// r7 := SignedSaturate(a7 - b7)
2481//
2482// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
2483FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2484{
2485 return vreinterpretq_m128i_s16(
2486 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2487}
2488
2489FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2490{
2491 return vreinterpretq_m128i_u16(
2492 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2493}
2494
2495// Negate packed 8-bit integers in a when the corresponding signed
2496// 8-bit integer in b is negative, and store the results in dst.
2497// Elements in dst are zeroed out when the corresponding element
2498// in b is zero.
2499//
2500// for i in 0..15
2501// if b[i] < 0
2502// r[i] := -a[i]
2503// else if b[i] == 0
2504// r[i] := 0
2505// else
2506// r[i] := a[i]
2507// fi
2508// done
2509FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2510{
2511 int8x16_t a = vreinterpretq_s8_m128i(_a);
2512 int8x16_t b = vreinterpretq_s8_m128i(_b);
2513
2514 // signed shift right: faster than vclt
2515 // (b < 0) ? 0xFF : 0
2516 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2517
2518 // (b == 0) ? 0xFF : 0
2519#if defined(__aarch64__)
2520 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2521#else
2522 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2523#endif
2524
2525 // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
2526 // based on ltMask
2527 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2528 // res = masked & (~zeroMask)
2529 int8x16_t res = vbicq_s8(masked, zeroMask);
2530
2531 return vreinterpretq_m128i_s8(res);
2532}
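
// Example (illustrative), element-wise:
//   a = {  5, -3, 7,  9, ... }
//   b = { -1,  0, 4, -2, ... }
//   _mm_sign_epi8(a, b) = { -5, 0, 7, -9, ... }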
2533
2534// Negate packed 16-bit integers in a when the corresponding signed
2535// 16-bit integer in b is negative, and store the results in dst.
2536// Elements in dst are zeroed out when the corresponding element
2537// in b is zero.
2538//
2539// for i in 0..7
2540// if b[i] < 0
2541// r[i] := -a[i]
2542// else if b[i] == 0
2543// r[i] := 0
2544// else
2545// r[i] := a[i]
2546// fi
2547// done
2548FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2549{
2550 int16x8_t a = vreinterpretq_s16_m128i(_a);
2551 int16x8_t b = vreinterpretq_s16_m128i(_b);
2552
2553 // signed shift right: faster than vclt
2554 // (b < 0) ? 0xFFFF : 0
2555 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2556 // (b == 0) ? 0xFFFF : 0
2557#if defined(__aarch64__)
2558 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2559#else
2560 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2561#endif
2562
2563 // bitwise select either a or negative 'a' (vnegq_s16(a) returns negative
2564 // 'a') based on ltMask
2565 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2566 // res = masked & (~zeroMask)
2567 int16x8_t res = vbicq_s16(masked, zeroMask);
2568 return vreinterpretq_m128i_s16(res);
2569}
2570
2571// Negate packed 32-bit integers in a when the corresponding signed
2572// 32-bit integer in b is negative, and store the results in dst.
2573// Elements in dst are zeroed out when the corresponding element
2574// in b is zero.
2575//
2576// for i in 0..3
2577// if b[i] < 0
2578// r[i] := -a[i]
2579// else if b[i] == 0
2580// r[i] := 0
2581// else
2582// r[i] := a[i]
2583// fi
2584// done
2585FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2586{
2587 int32x4_t a = vreinterpretq_s32_m128i(_a);
2588 int32x4_t b = vreinterpretq_s32_m128i(_b);
2589
2590 // signed shift right: faster than vclt
2591 // (b < 0) ? 0xFFFFFFFF : 0
2592 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2593
2594 // (b == 0) ? 0xFFFFFFFF : 0
2595#if defined(__aarch64__)
2596 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2597#else
2598 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2599#endif
2600
2601 // bitwise select either a or negative 'a' (vnegq_s32(a) returns negative
2602 // 'a') based on ltMask
2603 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2604 // res = masked & (~zeroMask)
2605 int32x4_t res = vbicq_s32(masked, zeroMask);
2606 return vreinterpretq_m128i_s32(res);
2607}
2608
2609// Negate packed 16-bit integers in a when the corresponding signed 16-bit
2610// integer in b is negative, and store the results in dst. Elements in dst are
2611// zeroed out when the corresponding element in b is zero.
2612//
2613// FOR j := 0 to 3
2614// i := j*16
2615// IF b[i+15:i] < 0
2616// dst[i+15:i] := -(a[i+15:i])
2617// ELSE IF b[i+15:i] == 0
2618// dst[i+15:i] := 0
2619// ELSE
2620// dst[i+15:i] := a[i+15:i]
2621// FI
2622// ENDFOR
2623//
2624// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
2625FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2626{
2627 int16x4_t a = vreinterpret_s16_m64(_a);
2628 int16x4_t b = vreinterpret_s16_m64(_b);
2629
2630 // signed shift right: faster than vclt
2631 // (b < 0) ? 0xFFFF : 0
2632 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2633
2634 // (b == 0) ? 0xFFFF : 0
2635#if defined(__aarch64__)
2636 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2637#else
2638 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2639#endif
2640
2641 // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
2642 // based on ltMask
2643 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2644 // res = masked & (~zeroMask)
2645 int16x4_t res = vbic_s16(masked, zeroMask);
2646
2647 return vreinterpret_m64_s16(res);
2648}
2649
2650// Negate packed 32-bit integers in a when the corresponding signed 32-bit
2651// integer in b is negative, and store the results in dst. Elements in dst are
2652// zeroed out when the corresponding element in b is zero.
2653//
2654// FOR j := 0 to 1
2655// i := j*32
2656// IF b[i+31:i] < 0
2657// dst[i+31:i] := -(a[i+31:i])
2658// ELSE IF b[i+31:i] == 0
2659// dst[i+31:i] := 0
2660// ELSE
2661// dst[i+31:i] := a[i+31:i]
2662// FI
2663// ENDFOR
2664//
2665// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
2666FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2667{
2668 int32x2_t a = vreinterpret_s32_m64(_a);
2669 int32x2_t b = vreinterpret_s32_m64(_b);
2670
2671 // signed shift right: faster than vclt
2672 // (b < 0) ? 0xFFFFFFFF : 0
2673 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2674
2675 // (b == 0) ? 0xFFFFFFFF : 0
2676#if defined(__aarch64__)
2677 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2678#else
2679 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2680#endif
2681
2682 // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
2683 // based on ltMask
2684 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2685 // res = masked & (~zeroMask)
2686 int32x2_t res = vbic_s32(masked, zeroMask);
2687
2688 return vreinterpret_m64_s32(res);
2689}
2690
2691// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
2692// in b is negative, and store the results in dst. Elements in dst are zeroed out
2693// when the corresponding element in b is zero.
2694//
2695// FOR j := 0 to 7
2696// i := j*8
2697// IF b[i+7:i] < 0
2698// dst[i+7:i] := -(a[i+7:i])
2699// ELSE IF b[i+7:i] == 0
2700// dst[i+7:i] := 0
2701// ELSE
2702// dst[i+7:i] := a[i+7:i]
2703// FI
2704// ENDFOR
2705//
2706// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
2707FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2708{
2709 int8x8_t a = vreinterpret_s8_m64(_a);
2710 int8x8_t b = vreinterpret_s8_m64(_b);
2711
2712 // signed shift right: faster than vclt
2713 // (b < 0) ? 0xFF : 0
2714 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2715
2716 // (b == 0) ? 0xFF : 0
2717#if defined(__aarch64__)
2718 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2719#else
2720 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2721#endif
2722
2723 // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
2724 // based on ltMask
2725 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2726 // res = masked & (~zeroMask)
2727 int8x8_t res = vbic_s8(masked, zeroMask);
2728
2729 return vreinterpret_m64_s8(res);
2730}
2731
2732// Average packed unsigned 16-bit integers in a and b, and store the results in
2733// dst.
2734//
2735// FOR j := 0 to 3
2736// i := j*16
2737// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2738// ENDFOR
2739//
2740// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
2741FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2742{
2743 return vreinterpret_m64_u16(
2744 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2745}
2746
2747// Average packed unsigned 8-bit integers in a and b, and store the results in
2748// dst.
2749//
2750// FOR j := 0 to 7
2751// i := j*8
2752// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2753// ENDFOR
2754//
2755// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
2756FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2757{
2758 return vreinterpret_m64_u8(
2759 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2760}
2761
2762// Average packed unsigned 8-bit integers in a and b, and store the results in
2763// dst.
2764//
2765// FOR j := 0 to 7
2766// i := j*8
2767// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2768// ENDFOR
2769//
2770// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2771#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2772
2773// Average packed unsigned 16-bit integers in a and b, and store the results in
2774// dst.
2775//
2776// FOR j := 0 to 3
2777// i := j*16
2778// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2779// ENDFOR
2780//
2781// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2782#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2783
2784// Computes the average of the 16 unsigned 8-bit integers in a and the 16
2785// unsigned 8-bit integers in b and rounds.
2786//
2787// r0 := (a0 + b0) / 2
2788// r1 := (a1 + b1) / 2
2789// ...
2790// r15 := (a15 + b15) / 2
2791//
2792// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
2793FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2794{
2795 return vreinterpretq_m128i_u8(
2796 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2797}
2798
2799// Computes the average of the 8 unsigned 16-bit integers in a and the 8
2800// unsigned 16-bit integers in b and rounds.
2801//
2802// r0 := (a0 + b0) / 2
2803// r1 := (a1 + b1) / 2
2804// ...
2805// r7 := (a7 + b7) / 2
2806//
2807// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2808FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2809{
2810 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
2811 vreinterpretq_u16_m128i(b));
2812}
2813
2814// Adds the four single-precision, floating-point values of a and b.
2815//
2816// r0 := a0 + b0
2817// r1 := a1 + b1
2818// r2 := a2 + b2
2819// r3 := a3 + b3
2820//
2821// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
2822FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2823{
2824 return vreinterpretq_m128_f32(
2825 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2826}
2827
2828// Add packed double-precision (64-bit) floating-point elements in a and b, and
2829// store the results in dst.
2830// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2831FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2832{
2833#if defined(__aarch64__)
2834 return vreinterpretq_m128d_f64(
2835 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2836#else
2837 double *da = (double *) &a;
2838 double *db = (double *) &b;
2839 double c[2];
2840 c[0] = da[0] + db[0];
2841 c[1] = da[1] + db[1];
2842 return vld1q_f32((float32_t *) c);
2843#endif
2844}
2845
2846// Add 64-bit integers a and b, and store the result in dst.
2847//
2848// dst[63:0] := a[63:0] + b[63:0]
2849//
2850// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2851FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2852{
2853 return vreinterpret_m64_s64(
2854 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2855}
2856
2857// Adds the scalar single-precision, floating-point values of a and b.
2858// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
2859FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2860{
2861 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2862 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2863 // the upper values in the result must be the remnants of <a>.
2864 return vreinterpretq_m128_f32(vaddq_f32(a, value));
2865}
2866
2867// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2868// unsigned 64-bit integers in b.
2869// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2870FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2871{
2872 return vreinterpretq_m128i_s64(
2873 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2874}
2875
2876// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2877// unsigned 32-bit integers in b.
2878//
2879// r0 := a0 + b0
2880// r1 := a1 + b1
2881// r2 := a2 + b2
2882// r3 := a3 + b3
2883//
2884// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2885FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2886{
2887 return vreinterpretq_m128i_s32(
2888 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2889}
2890
2891// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2892// unsigned 16-bit integers in b.
2893// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2894FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2895{
2896 return vreinterpretq_m128i_s16(
2897 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2898}
2899
2900// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2901// unsigned 8-bit integers in b.
2902// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2903FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2904{
2905 return vreinterpretq_m128i_s8(
2906 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2907}
2908
2909// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2910// and saturates.
2911//
2912// r0 := SignedSaturate(a0 + b0)
2913// r1 := SignedSaturate(a1 + b1)
2914// ...
2915// r7 := SignedSaturate(a7 + b7)
2916//
2917// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
2918FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2919{
2920 return vreinterpretq_m128i_s16(
2921 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2922}
2923
2924// Add packed signed 8-bit integers in a and b using saturation, and store the
2925// results in dst.
2926//
2927// FOR j := 0 to 15
2928// i := j*8
2929// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2930// ENDFOR
2931//
2932// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
2933FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2934{
2935 return vreinterpretq_m128i_s8(
2936 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2937}
2938
2939// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
2940// b and saturates.
2941// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
2942FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2943{
2944 return vreinterpretq_m128i_u8(
2945 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2946}
2947
2948// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
2949// unsigned 16-bit integers from b.
2950//
2951// r0 := (a0 * b0)[15:0]
2952// r1 := (a1 * b1)[15:0]
2953// ...
2954// r7 := (a7 * b7)[15:0]
2955//
2956// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
2957FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
2958{
2959 return vreinterpretq_m128i_s16(
2960 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2961}
2962
2963// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
2964// unsigned 32-bit integers from b.
2965// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
2966FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
2967{
2968 return vreinterpretq_m128i_s32(
2969 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2970}
2971
2972// Multiply the packed unsigned 16-bit integers in a and b, producing
2973// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2974// integers in dst.
2975//
2976// FOR j := 0 to 3
2977// i := j*16
2978// tmp[31:0] := a[i+15:i] * b[i+15:i]
2979// dst[i+15:i] := tmp[31:16]
2980// ENDFOR
2981//
2982// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2983#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2984
2985// Multiplies the four single-precision, floating-point values of a and b.
2986//
2987// r0 := a0 * b0
2988// r1 := a1 * b1
2989// r2 := a2 * b2
2990// r3 := a3 * b3
2991//
2992// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2993FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2994{
2995 return vreinterpretq_m128_f32(
2996 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2997}
2998
2999// Multiply the lower single-precision (32-bit) floating-point element in a and
3000// b, store the result in the lower element of dst, and copy the upper 3 packed
3001// elements from a to the upper elements of dst.
3002//
3003// dst[31:0] := a[31:0] * b[31:0]
3004// dst[127:32] := a[127:32]
3005//
3006// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
3007FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3008{
3009 return _mm_move_ss(a, _mm_mul_ps(a, b));
3010}
3011
3012// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3013// a and b, and store the unsigned 64-bit results in dst.
3014//
3015// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3016// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
3017FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3018{
3019 // vmull_u32 upcasts instead of masking, so we downcast.
3020 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3021 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3022 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3023}
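
// Example (illustrative): only the even 32-bit lanes (0 and 2) take part:
//   a = { a0, a1, a2, a3 }, b = { b0, b1, b2, b3 }
//   _mm_mul_epu32(a, b) = { (uint64_t) a0 * b0, (uint64_t) a2 * b2 }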
3024
3025// Multiply the low unsigned 32-bit integers from a and b, and store the
3026// unsigned 64-bit result in dst.
3027//
3028// dst[63:0] := a[31:0] * b[31:0]
3029//
3030// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
3031FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3032{
3033 return vreinterpret_m64_u64(vget_low_u64(
3034 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3035}
3036
3037// Multiply the low signed 32-bit integers from each packed 64-bit element in
3038// a and b, and store the signed 64-bit results in dst.
3039//
3040// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3041// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
3042FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3043{
3044 // vmull_s32 upcasts instead of masking, so we downcast.
3045 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3046 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3047 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3048}
3049
3050// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3051// integers from b.
3052//
3053// r0 := (a0 * b0) + (a1 * b1)
3054// r1 := (a2 * b2) + (a3 * b3)
3055// r2 := (a4 * b4) + (a5 * b5)
3056// r3 := (a6 * b6) + (a7 * b7)
3057// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
3058FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3059{
3060 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3061 vget_low_s16(vreinterpretq_s16_m128i(b)));
3062 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3063 vget_high_s16(vreinterpretq_s16_m128i(b)));
3064
3065 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3066 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3067
3068 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3069}
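
// Example (illustrative): an 8-lane 16-bit dot product, reducing the four
// 32-bit pair sums with shuffles and adds (assuming _mm_cvtsi128_si32 is
// available):
//   __m128i p = _mm_madd_epi16(a, b); // { a0*b0+a1*b1, ..., a6*b6+a7*b7 }
//   __m128i s = _mm_add_epi32(p, _mm_shuffle_epi32(p, _MM_SHUFFLE(1, 0, 3, 2)));
//   s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
//   int dot = _mm_cvtsi128_si32(s);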
3070
3071// Multiply packed signed 16-bit integers in a and b, producing intermediate
3072// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3073// the packed 16-bit integers in dst.
3074//
3075// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3076// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3077// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3078// ...
3079// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
3080FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3081{
3082 // Has issues due to saturation
3083 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3084
3085 // Multiply
3086 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3087 vget_low_s16(vreinterpretq_s16_m128i(b)));
3088 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3089 vget_high_s16(vreinterpretq_s16_m128i(b)));
3090
3091 // Rounding narrowing shift right
3092 // narrow = (int16_t)((mul + 16384) >> 15);
3093 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3094 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3095
3096 // Join together
3097 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3098}
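
// Example (illustrative), treating each lane as a Q15 fixed-point value:
//   0x4000 (0.5) * 0x2000 (0.25):
//   (0x4000 * 0x2000 + 0x4000) >> 15 = 0x1000, i.e. 0.125 in Q15.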
3099
3100// Vertically multiply each unsigned 8-bit integer from a with the corresponding
3101// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3102// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3103// and pack the saturated results in dst.
3104//
3105// FOR j := 0 to 7
3106// i := j*16
3107// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3108// a[i+7:i]*b[i+7:i] )
3109// ENDFOR
3110FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3111{
3112#if defined(__aarch64__)
3113 uint8x16_t a = vreinterpretq_u8_m128i(_a);
3114 int8x16_t b = vreinterpretq_s8_m128i(_b);
3115 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3116 vmovl_s8(vget_low_s8(b)));
3117 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3118 vmovl_s8(vget_high_s8(b)));
3119 return vreinterpretq_m128i_s16(
3120 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3121#else
3122 // This would be much simpler if x86 would choose to zero extend OR sign
3123 // extend, not both. This could probably be optimized better.
3124 uint16x8_t a = vreinterpretq_u16_m128i(_a);
3125 int16x8_t b = vreinterpretq_s16_m128i(_b);
3126
3127 // Zero extend a
3128 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3129 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3130
3131 // Sign extend by shifting left then shifting right.
3132 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3133 int16x8_t b_odd = vshrq_n_s16(b, 8);
3134
3135 // multiply
3136 int16x8_t prod1 = vmulq_s16(a_even, b_even);
3137 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3138
3139 // saturated add
3140 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3141#endif
3142}
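
// Usage sketch (illustrative values): a is treated as unsigned bytes, b as
// signed bytes; adjacent byte products are summed with signed saturation.
//
//   __m128i u = _mm_set1_epi8(10);   // unsigned bytes
//   __m128i s = _mm_set1_epi8(-3);   // signed bytes
//   __m128i r = _mm_maddubs_epi16(u, s);
//   // each 16-bit lane = 10 * (-3) + 10 * (-3) = -60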
3143
3144// Computes the fused multiply-add of 32-bit floating-point numbers.
3145//
3146// Return Value
3147// Multiplies a and b, and adds c to the temporary result before returning it.
3148// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
3149FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3150{
3151#if defined(__aarch64__)
3152 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3153 vreinterpretq_f32_m128(b),
3154 vreinterpretq_f32_m128(a)));
3155#else
3156 return _mm_add_ps(_mm_mul_ps(a, b), c);
3157#endif
3158}
3159
3160// Alternatively add and subtract packed single-precision (32-bit)
3161// floating-point elements in a to/from packed elements in b, and store the
3162// results in dst.
3163//
3164// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
3165FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3166{
3167 __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3168 return _mm_fmadd_ps(b, mask, a);
3169}
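
// Usage sketch (illustrative values): even lanes are subtracted, odd lanes are
// added.
//
//   __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes 1, 2, 3, 4
//   __m128 y = _mm_set1_ps(0.5f);
//   __m128 r = _mm_addsub_ps(x, y);
//   // r = { 1.0-0.5, 2.0+0.5, 3.0-0.5, 4.0+0.5 } = { 0.5, 2.5, 2.5, 4.5 }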
3170
3171// Compute the absolute differences of packed unsigned 8-bit integers in a and
3172// b, then horizontally sum each consecutive 8 differences to produce two
3173// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3174// 16 bits of 64-bit elements in dst.
3175// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
3176FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3177{
3178 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3179 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3180 uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3181 uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3182 return (__m128i) vsetq_lane_u16(r4, r, 4);
3183}
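
// Usage sketch (illustrative values): sum of absolute differences over each
// 8-byte half, a common building block for motion-estimation style metrics.
//
//   __m128i x = _mm_set1_epi8(9);
//   __m128i y = _mm_set1_epi8(5);
//   __m128i r = _mm_sad_epu8(x, y);
//   // both 64-bit lanes of r hold 8 * |9 - 5| = 32 in their low 16 bits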
3184
3185// Compute the absolute differences of packed unsigned 8-bit integers in a and
3186// b, then horizontally sum the 8 differences to produce a single unsigned
3187// 16-bit integer, and store it in the low 16 bits of dst (the upper bits of
3188// dst are zeroed).
3189// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
3190FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3191{
3192 uint16x4_t t =
3193 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3194 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3195 return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3196}
3197
3198// Compute the absolute differences of packed unsigned 8-bit integers in a and
3199// b, then horizontally sum the 8 differences to produce a single unsigned
3200// 16-bit integer, and store it in the low 16 bits of dst (the upper bits of
3201// dst are zeroed).
3202//
3203// FOR j := 0 to 7
3204// i := j*8
3205// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3206// ENDFOR
3207// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
3208//              tmp[47:40] + tmp[55:48] + tmp[63:56]
// dst[63:16] := 0
3209//
3210// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3211#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3212
3213// Divides the four single-precision, floating-point values of a by those of b.
3214//
3215// r0 := a0 / b0
3216// r1 := a1 / b1
3217// r2 := a2 / b2
3218// r3 := a3 / b3
3219//
3220// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
3221FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3222{
3223#if defined(__aarch64__)
3224 return vreinterpretq_m128_f32(
3225 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3226#else
3227 float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3228 float32x4_t recip1 =
3229 vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3230 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3231#endif
3232}
3233
3234// Divides the scalar single-precision floating point value of a by b.
3235// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
3236FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3237{
3238 float32_t value =
3239 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3240 return vreinterpretq_m128_f32(
3241 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3242}
3243
3244// Computes the approximations of reciprocals of the four single-precision,
3245// floating-point values of a.
3246// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
3247FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3248{
3249 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3250 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3251 return vreinterpretq_m128_f32(recip);
3252}
3253
3254// Compute the approximate reciprocal of the lower single-precision (32-bit)
3255// floating-point element in a, store the result in the lower element of dst,
3256// and copy the upper 3 packed elements from a to the upper elements of dst. The
3257// maximum relative error for this approximation is less than 1.5*2^-12.
3258//
3259// dst[31:0] := (1.0 / a[31:0])
3260// dst[127:32] := a[127:32]
3261//
3262// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
3263FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3264{
3265 return _mm_move_ss(a, _mm_rcp_ps(a));
3266}
3267
3268// Computes the approximations of square roots of the four single-precision,
3269// floating-point values of a. On ARMv7 this is approximated by computing the
3270// reciprocal square roots and then taking their reciprocals.
3271//
3272// r0 := sqrt(a0)
3273// r1 := sqrt(a1)
3274// r2 := sqrt(a2)
3275// r3 := sqrt(a3)
3276//
3277// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
3278FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3279{
3280#if defined(__aarch64__)
3281 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3282#else
3283 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3284 float32x4_t sq = vrecpeq_f32(recipsq);
3285    // TODO: refine with vrsqrtsq_f32/vrecpsq_f32 steps for better accuracy.
3286 return vreinterpretq_m128_f32(sq);
3287#endif
3288}
3289
3290// Computes the approximation of the square root of the scalar single-precision
3291// floating point value of in.
3292// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
3293FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3294{
3295 float32_t value =
3296 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3297 return vreinterpretq_m128_f32(
3298 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3299}
3300
3301// Computes the approximations of the reciprocal square roots of the four
3302// single-precision floating point values of in.
3303// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
3304FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3305{
3306 return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3307}
3308
3309// Compute the approximate reciprocal square root of the lower single-precision
3310// (32-bit) floating-point element in a, store the result in the lower element
3311// of dst, and copy the upper 3 packed elements from a to the upper elements of
3312// dst.
3313// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
3314FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3315{
3316 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3317}
3318
3319// Compare packed signed 16-bit integers in a and b, and store packed maximum
3320// values in dst.
3321//
3322// FOR j := 0 to 3
3323// i := j*16
3324// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3325// ENDFOR
3326//
3327// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3328FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3329{
3330 return vreinterpret_m64_s16(
3331 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3332}
3333
3334// Compare packed signed 16-bit integers in a and b, and store packed maximum
3335// values in dst.
3336//
3337// FOR j := 0 to 3
3338// i := j*16
3339// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3340// ENDFOR
3341//
3342// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3343#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3344
3345// Computes the maximums of the four single-precision, floating-point values of
3346// a and b.
3347// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
3348FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3349{
3350#if SSE2NEON_PRECISE_MINMAX
3351 float32x4_t _a = vreinterpretq_f32_m128(a);
3352 float32x4_t _b = vreinterpretq_f32_m128(b);
3353 return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3354#else
3355 return vreinterpretq_m128_f32(
3356 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3357#endif
3358}
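
// Note on NaN handling (illustrative sketch, values are not part of the API):
// with SSE2NEON_PRECISE_MINMAX enabled, _mm_max_ps follows SSE and returns the
// second operand when a comparison involves NaN, whereas the plain vmaxq_f32
// path typically propagates the NaN.
//
//   __m128 n = _mm_set1_ps(NAN);   // NAN comes from <math.h>
//   __m128 v = _mm_set1_ps(1.0f);
//   __m128 r = _mm_max_ps(n, v);   // precise path: every lane is 1.0f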
3359
3360// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3361// values in dst.
3362//
3363// FOR j := 0 to 7
3364// i := j*8
3365// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3366// ENDFOR
3367//
3368// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3369FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3370{
3371 return vreinterpret_m64_u8(
3372 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3373}
3374
3375// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3376// values in dst.
3377//
3378// FOR j := 0 to 7
3379// i := j*8
3380// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3381// ENDFOR
3382//
3383// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3384#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3385
3386// Compare packed signed 16-bit integers in a and b, and store packed minimum
3387// values in dst.
3388//
3389// FOR j := 0 to 3
3390// i := j*16
3391// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3392// ENDFOR
3393//
3394// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3395FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3396{
3397 return vreinterpret_m64_s16(
3398 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3399}
3400
3401// Compare packed signed 16-bit integers in a and b, and store packed minimum
3402// values in dst.
3403//
3404// FOR j := 0 to 3
3405// i := j*16
3406// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3407// ENDFOR
3408//
3409// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3410#define _m_pminsw(a, b) _mm_min_pi16(a, b)
3411
3412// Computes the minima of the four single-precision, floating-point values of a
3413// and b.
3414// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
3415FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3416{
3417#if SSE2NEON_PRECISE_MINMAX
3418 float32x4_t _a = vreinterpretq_f32_m128(a);
3419 float32x4_t _b = vreinterpretq_f32_m128(b);
3420 return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3421#else
3422 return vreinterpretq_m128_f32(
3423 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3424#endif
3425}
3426
3427// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3428// values in dst.
3429//
3430// FOR j := 0 to 7
3431// i := j*8
3432// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3433// ENDFOR
3434//
3435// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3436FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3437{
3438 return vreinterpret_m64_u8(
3439 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3440}
3441
3442// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3443// values in dst.
3444//
3445// FOR j := 0 to 7
3446// i := j*8
3447// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3448// ENDFOR
3449//
3450// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3451#define _m_pminub(a, b) _mm_min_pu8(a, b)
3452
3453// Computes the maximum of the two lower scalar single-precision floating point
3454// values of a and b.
3455// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
3456FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3457{
3458 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3459 return vreinterpretq_m128_f32(
3460 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3461}
3462
3463// Computes the minimum of the two lower scalar single-precision floating point
3464// values of a and b.
3465// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
3466FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3467{
3468 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3469 return vreinterpretq_m128_f32(
3470 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3471}
3472
3473// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3474// 16 unsigned 8-bit integers from b.
3475// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
3476FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3477{
3478 return vreinterpretq_m128i_u8(
3479 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3480}
3481
3482// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3483// 16 unsigned 8-bit integers from b.
3484// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
3485FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3486{
3487 return vreinterpretq_m128i_u8(
3488 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3489}
3490
3491// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3492// signed 16-bit integers from b.
3493// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
3494FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3495{
3496 return vreinterpretq_m128i_s16(
3497 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3498}
3499
3500// Compare packed signed 8-bit integers in a and b, and store packed maximum
3501// values in dst.
3502// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
3503FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3504{
3505 return vreinterpretq_m128i_s8(
3506 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3507}
3508
3509// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3510// signed 16-bit integers from b.
3511// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
3512FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3513{
3514 return vreinterpretq_m128i_s16(
3515 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3516}
3517
3518// epi versions of min/max
3519// Computes the pairwise maximums of the four signed 32-bit integer values of a
3520// and b.
3521//
3522// A 128-bit parameter that can be defined with the following equations:
3523// r0 := (a0 > b0) ? a0 : b0
3524// r1 := (a1 > b1) ? a1 : b1
3525// r2 := (a2 > b2) ? a2 : b2
3526// r3 := (a3 > b3) ? a3 : b3
3527//
3528// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
3529FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3530{
3531 return vreinterpretq_m128i_s32(
3532 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3533}
3534
3535// Computes the pairwise minima of the four signed 32-bit integer values of a
3536// and b.
3537//
3538// A 128-bit parameter that can be defined with the following equations:
3539// r0 := (a0 < b0) ? a0 : b0
3540// r1 := (a1 < b1) ? a1 : b1
3541// r2 := (a2 < b2) ? a2 : b2
3542// r3 := (a3 < b3) ? a3 : b3
3543//
3544// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
3545FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3546{
3547 return vreinterpretq_m128i_s32(
3548 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3549}
3550
3551// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3552// values in dst.
3553// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
3554FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3555{
3556 return vreinterpretq_m128i_u32(
3557 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3558}
3559
3560// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3561// values in dst.
3562// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
3563FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3564{
3565 return vreinterpretq_m128i_u32(
3566 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3567}
3568
3569// Multiply the packed unsigned 16-bit integers in a and b, producing
3570// intermediate 32-bit integers, and store the high 16 bits of the intermediate
3571// integers in dst.
3572// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
3573FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3574{
3575 return vreinterpret_m64_u16(vshrn_n_u32(
3576 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3577}
3578
3579// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3580// integers from b, and stores the upper 16 bits of each 32-bit product.
3581//
3582// r0 := (a0 * b0)[31:16]
3583// r1 := (a1 * b1)[31:16]
3584// ...
3585// r7 := (a7 * b7)[31:16]
3586//
3587// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
3588FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3589{
3590 /* FIXME: issue with large values because of result saturation */
3591 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3592 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3593 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3594 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3595 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3596 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3597 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3598 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3599 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3600 uint16x8x2_t r =
3601 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3602 return vreinterpretq_m128i_u16(r.val[1]);
3603}
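
// Usage sketch (illustrative values): keeps the upper halves of the 32-bit
// products.
//
//   __m128i x = _mm_set1_epi16(1024);   // 1 << 10
//   __m128i y = _mm_set1_epi16(512);    // 1 << 9
//   __m128i r = _mm_mulhi_epi16(x, y);
//   // each lane = (1024 * 512) >> 16 = (1 << 19) >> 16 = 8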
3604
3605// Horizontally adds adjacent pairs of single-precision (32-bit) floating-point
3606// elements in a and b, and packs the results.
3607// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
3608FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3609{
3610#if defined(__aarch64__)
3611 return vreinterpretq_m128_f32(
3612 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3613#else
3614 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3615 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3616 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3617 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3618 return vreinterpretq_m128_f32(
3619 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3620#endif
3621}
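
// Usage sketch (illustrative values): adjacent lanes of each argument are
// summed; the two sums from a land in the low half of the result.
//
//   __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes 1, 2, 3, 4
//   __m128 y = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  // lanes 5, 6, 7, 8
//   __m128 r = _mm_hadd_ps(x, y);
//   // r = { 1+2, 3+4, 5+6, 7+8 } = { 3, 7, 11, 15 }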
3622
3623// Horizontally adds adjacent pairs of 16-bit integers in a and b, and packs
3624// the signed 16-bit results.
3625FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3626{
3627 int16x8_t a = vreinterpretq_s16_m128i(_a);
3628 int16x8_t b = vreinterpretq_s16_m128i(_b);
3629#if defined(__aarch64__)
3630 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3631#else
3632 return vreinterpretq_m128i_s16(
3633 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3634 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3635#endif
3636}
3637
3638// Horizontally subtract adjacent pairs of single-precision (32-bit)
3639// floating-point elements in a and b, and pack the results in dst.
3640// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
3641FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3642{
3643#if defined(__aarch64__)
3644 return vreinterpretq_m128_f32(vsubq_f32(
3645 vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3646 vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3647#else
3648 float32x4x2_t c =
3649 vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3650 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3651#endif
3652}
3653
3654// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3655// signed 16-bit results in dst.
3656// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
3657FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3658{
3659 return vreinterpret_m64_s16(
3660 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3661}
3662
3663// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3664// signed 32-bit results in dst.
3665// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
3666FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3667{
3668 return vreinterpret_m64_s32(
3669 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3670}
3671
3672// Horizontally subtracts adjacent pairs of 16-bit integers in a and b, and
3673// packs the signed 16-bit results.
3674FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3675{
3676 int32x4_t a = vreinterpretq_s32_m128i(_a);
3677 int32x4_t b = vreinterpretq_s32_m128i(_b);
3678 // Interleave using vshrn/vmovn
3679 // [a0|a2|a4|a6|b0|b2|b4|b6]
3680 // [a1|a3|a5|a7|b1|b3|b5|b7]
3681 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3682 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3683 // Subtract
3684 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3685}
3686
3687// Horizontally adds adjacent pairs of 16-bit integers in a and b using
3688// saturation, and packs the signed 16-bit results.
3689FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3690{
3691#if defined(__aarch64__)
3692 int16x8_t a = vreinterpretq_s16_m128i(_a);
3693 int16x8_t b = vreinterpretq_s16_m128i(_b);
3694    return vreinterpretq_m128i_s16(
3695 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3696#else
3697 int32x4_t a = vreinterpretq_s32_m128i(_a);
3698 int32x4_t b = vreinterpretq_s32_m128i(_b);
3699 // Interleave using vshrn/vmovn
3700 // [a0|a2|a4|a6|b0|b2|b4|b6]
3701 // [a1|a3|a5|a7|b1|b3|b5|b7]
3702 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3703 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3704 // Saturated add
3705 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3706#endif
3707}
3708
3709// Horizontally subtracts adjacent pairs of 16-bit integers in a and b using
3710// saturation, and packs the signed 16-bit results.
3711// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
3712FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3713{
3714#if defined(__aarch64__)
3715 int16x8_t a = vreinterpretq_s16_m128i(_a);
3716 int16x8_t b = vreinterpretq_s16_m128i(_b);
3717    return vreinterpretq_m128i_s16(
3718 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3719#else
3720 int32x4_t a = vreinterpretq_s32_m128i(_a);
3721 int32x4_t b = vreinterpretq_s32_m128i(_b);
3722 // Interleave using vshrn/vmovn
3723 // [a0|a2|a4|a6|b0|b2|b4|b6]
3724 // [a1|a3|a5|a7|b1|b3|b5|b7]
3725 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3726 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3727 // Saturated subtract
3728 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3729#endif
3730}
3731
3732// Horizontally adds adjacent pairs of 32-bit integers in a and b, and packs
3733// the signed 32-bit results.
3734FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3735{
3736 int32x4_t a = vreinterpretq_s32_m128i(_a);
3737 int32x4_t b = vreinterpretq_s32_m128i(_b);
3738 return vreinterpretq_m128i_s32(
3739 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3740 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3741}
3742
3743// Horizontally subtracts adjacent pairs of 32-bit integers in a and b, and
3744// packs the signed 32-bit results.
3745FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3746{
3747 int64x2_t a = vreinterpretq_s64_m128i(_a);
3748 int64x2_t b = vreinterpretq_s64_m128i(_b);
3749 // Interleave using vshrn/vmovn
3750 // [a0|a2|b0|b2]
3751    // [a1|a3|b1|b3]
3752 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3753 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3754 // Subtract
3755 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3756}
3757
3758// Kahan summation for accurate summation of floating-point numbers.
3759// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
3760FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3761{
3762 y -= *c;
3763 float t = *sum + y;
3764 *c = (t - *sum) - y;
3765 *sum = t;
3766}
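
// Usage sketch: accumulate a long series with a running compensation term so
// that small addends are not lost (values illustrative only).
//
//   float sum = 0.0f, comp = 0.0f;
//   for (int i = 0; i < 1000000; i++)
//       sse2neon_kadd_f32(&sum, &comp, 1e-4f);
//   // sum (plus comp) tracks 100.0f much more closely than naive accumulation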
3767
3768// Conditionally multiply the packed single-precision (32-bit) floating-point
3769// elements in a and b using the high 4 bits in imm8, sum the four products,
3770// and conditionally store the sum in dst using the low 4 bits of imm.
3771// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
3772FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3773{
3774#if defined(__aarch64__)
3775 /* shortcuts */
3776 if (imm == 0xFF) {
3777 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3778 }
3779 if (imm == 0x7F) {
3780 float32x4_t m = _mm_mul_ps(a, b);
3781 m[3] = 0;
3782 return _mm_set1_ps(vaddvq_f32(m));
3783 }
3784#endif
3785
3786 float s = 0, c = 0;
3787 float32x4_t f32a = vreinterpretq_f32_m128(a);
3788 float32x4_t f32b = vreinterpretq_f32_m128(b);
3789
3790 /* To improve the accuracy of floating-point summation, Kahan algorithm
3791 * is used for each operation.
3792 */
3793 if (imm & (1 << 4))
3794 sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3795 if (imm & (1 << 5))
3796 sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3797 if (imm & (1 << 6))
3798 sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3799 if (imm & (1 << 7))
3800 sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3801 s += c;
3802
3803 float32x4_t res = {
3804 (imm & 0x1) ? s : 0,
3805 (imm & 0x2) ? s : 0,
3806 (imm & 0x4) ? s : 0,
3807 (imm & 0x8) ? s : 0,
3808 };
3809 return vreinterpretq_m128_f32(res);
3810}
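
// Usage sketch (illustrative values): imm 0x71 selects lanes 0-2 for the
// multiply (high nibble 0x7) and stores the sum only in lane 0 (low nibble
// 0x1), a common 3-component dot product pattern.
//
//   __m128 u = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);
//   __m128 v = _mm_set_ps(0.0f, 6.0f, 5.0f, 4.0f);
//   float dot =
//       vgetq_lane_f32(vreinterpretq_f32_m128(_mm_dp_ps(u, v, 0x71)), 0);
//   // dot = 1*4 + 2*5 + 3*6 = 32; lanes 1-3 of the vector result are zero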
3811
3812/* Compare operations */
3813
3814// Compares for less than
3815// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
3816FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3817{
3818 return vreinterpretq_m128_u32(
3819 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3820}
3821
3822// Compares for less than
3823// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
3824FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
3825{
3826 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
3827}
3828
3829// Compares for greater than.
3830//
3831// r0 := (a0 > b0) ? 0xffffffff : 0x0
3832// r1 := (a1 > b1) ? 0xffffffff : 0x0
3833// r2 := (a2 > b2) ? 0xffffffff : 0x0
3834// r3 := (a3 > b3) ? 0xffffffff : 0x0
3835//
3836// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
3837FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
3838{
3839 return vreinterpretq_m128_u32(
3840 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3841}
3842
3843// Compares for greater than.
3844// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
3845FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
3846{
3847 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
3848}
3849
3850// Compares for greater than or equal.
3851// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
3852FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
3853{
3854 return vreinterpretq_m128_u32(
3855 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3856}
3857
3858// Compares for greater than or equal.
3859// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
3860FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
3861{
3862 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
3863}
3864
3865// Compares for less than or equal.
3866//
3867// r0 := (a0 <= b0) ? 0xffffffff : 0x0
3868// r1 := (a1 <= b1) ? 0xffffffff : 0x0
3869// r2 := (a2 <= b2) ? 0xffffffff : 0x0
3870// r3 := (a3 <= b3) ? 0xffffffff : 0x0
3871//
3872// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
3873FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
3874{
3875 return vreinterpretq_m128_u32(
3876 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3877}
3878
3879// Compares for less than or equal.
3880// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
3881FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
3882{
3883 return _mm_move_ss(a, _mm_cmple_ps(a, b));
3884}
3885
3886// Compares for equality.
3887// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
3888FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
3889{
3890 return vreinterpretq_m128_u32(
3891 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3892}
3893
3894// Compares for equality.
3895// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
3896FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
3897{
3898 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
3899}
3900
3901// Compares for inequality.
3902// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
3903FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
3904{
3905 return vreinterpretq_m128_u32(vmvnq_u32(
3906 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
3907}
3908
3909// Compares for inequality.
3910// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
3911FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
3912{
3913 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
3914}
3915
3916// Compares for not greater than or equal.
3917// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
3918FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
3919{
3920 return _mm_cmplt_ps(a, b);
3921}
3922
3923// Compares for not greater than or equal.
3924// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
3925FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
3926{
3927 return _mm_cmplt_ss(a, b);
3928}
3929
3930// Compares for not greater than.
3931// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
3932FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
3933{
3934 return _mm_cmple_ps(a, b);
3935}
3936
3937// Compares for not greater than.
3938// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
3939FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
3940{
3941 return _mm_cmple_ss(a, b);
3942}
3943
3944// Compares for not less than or equal.
3945// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
3946FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
3947{
3948 return _mm_cmpgt_ps(a, b);
3949}
3950
3951// Compares for not less than or equal.
3952// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
3953FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
3954{
3955 return _mm_cmpgt_ss(a, b);
3956}
3957
3958// Compares for not less than.
3959// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
3960FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
3961{
3962 return _mm_cmpge_ps(a, b);
3963}
3964
3965// Compares for not less than.
3966// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
3967FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
3968{
3969 return _mm_cmpge_ss(a, b);
3970}
3971
3972// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3973// unsigned 8-bit integers in b for equality.
3974// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3975FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3976{
3977 return vreinterpretq_m128i_u8(
3978 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3979}
3980
3981// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3982// unsigned 16-bit integers in b for equality.
3983// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3984FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3985{
3986 return vreinterpretq_m128i_u16(
3987 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3988}
3989
3990// Compare packed 32-bit integers in a and b for equality, and store the results
3991// in dst
3992FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3993{
3994 return vreinterpretq_m128i_u32(
3995 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3996}
3997
3998// Compare packed 64-bit integers in a and b for equality, and store the results
3999// in dst
4000FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4001{
4002#if defined(__aarch64__)
4003 return vreinterpretq_m128i_u64(
4004 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4005#else
4006 // ARMv7 lacks vceqq_u64
4007 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4008 uint32x4_t cmp =
4009 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4010 uint32x4_t swapped = vrev64q_u32(cmp);
4011 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4012#endif
4013}
4014
4015// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4016// in b for less than.
4017// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
4018FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4019{
4020 return vreinterpretq_m128i_u8(
4021 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4022}
4023
4024// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4025// in b for greater than.
4026//
4027// r0 := (a0 > b0) ? 0xff : 0x0
4028// r1 := (a1 > b1) ? 0xff : 0x0
4029// ...
4030// r15 := (a15 > b15) ? 0xff : 0x0
4031//
4032// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
4033FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4034{
4035 return vreinterpretq_m128i_u8(
4036 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4037}
4038
4039// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4040// in b for less than.
4041//
4042// r0 := (a0 < b0) ? 0xffff : 0x0
4043// r1 := (a1 < b1) ? 0xffff : 0x0
4044// ...
4045// r7 := (a7 < b7) ? 0xffff : 0x0
4046//
4047// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
4048FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4049{
4050 return vreinterpretq_m128i_u16(
4051 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4052}
4053
4054// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4055// in b for greater than.
4056//
4057// r0 := (a0 > b0) ? 0xffff : 0x0
4058// r1 := (a1 > b1) ? 0xffff : 0x0
4059// ...
4060// r7 := (a7 > b7) ? 0xffff : 0x0
4061//
4062// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
4063FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4064{
4065 return vreinterpretq_m128i_u16(
4066 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4067}
4068
4069
4070// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4071// in b for less than.
4072// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
4073FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4074{
4075 return vreinterpretq_m128i_u32(
4076 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4077}
4078
4079// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4080// in b for greater than.
4081// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
4082FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4083{
4084 return vreinterpretq_m128i_u32(
4085 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4086}
4087
4088// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4089// in b for greater than.
4090FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4091{
4092#if defined(__aarch64__)
4093 return vreinterpretq_m128i_u64(
4094 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4095#else
4096 // ARMv7 lacks vcgtq_s64.
4097 // This is based off of Clang's SSE2 polyfill:
4098 // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4099
4100 // Mask the sign bit out since we need a signed AND an unsigned comparison
4101 // and it is ugly to try and split them.
4102 int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4103 int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4104 int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4105 // Check if a > b
4106 int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4107 // Copy upper mask to lower mask
4108 // a_hi > b_hi
4109 int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4110 // Copy lower mask to upper mask
4111 // a_lo > b_lo
4112 int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4113 // Compare for equality
4114 int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4115 // Copy upper mask to lower mask
4116 // a_hi == b_hi
4117 int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4118 // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4119 int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4120 return vreinterpretq_m128i_s64(ret);
4121#endif
4122}
4123
4124// Compares the four 32-bit floats in a and b to check if any values are NaN.
4125// Ordered compare between each value returns true for "orderable" and false for
4126// "not orderable" (NaN).
4127// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4128// also:
4129// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4130// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
4131FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4132{
4133 // Note: NEON does not have ordered compare builtin
4134 // Need to compare a eq a and b eq b to check for NaN
4135 // Do AND of results to get final
4136 uint32x4_t ceqaa =
4137 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4138 uint32x4_t ceqbb =
4139 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4140 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4141}
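
// Usage sketch (illustrative values): lanes are all-ones only where both
// inputs are orderable, i.e. neither is NaN.
//
//   __m128 x = _mm_set_ps(4.0f, NAN, 2.0f, 1.0f);   // NAN from <math.h>
//   __m128 y = _mm_set1_ps(0.0f);
//   __m128 m = _mm_cmpord_ps(x, y);
//   // the lane holding NAN yields 0x00000000; the other lanes yield 0xFFFFFFFF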
4142
4143// Compares for ordered.
4144// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
4145FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4146{
4147 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4148}
4149
4150// Compares for unordered.
4151// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
4152FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4153{
4154 uint32x4_t f32a =
4155 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4156 uint32x4_t f32b =
4157 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4158 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4159}
4160
4161// Compares for unordered.
4162// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
4163FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4164{
4165 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4166}
4167
4168// Compares the lower single-precision floating point scalar values of a and b
4169// using a less than operation. :
4170// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
4171// Important note: the MSDN documentation is incorrect. If either value is
4172// NaN, this returns 0, even though the docs claim it returns 1.
4173FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4174{
4175 uint32x4_t a_not_nan =
4176 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4177 uint32x4_t b_not_nan =
4178 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4179 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4180 uint32x4_t a_lt_b =
4181 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4182 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4183}
4184
4185// Compares the lower single-precision floating point scalar values of a and b
4186// using a greater than operation. :
4187// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
4188FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4189{
4190 // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4191 // vreinterpretq_f32_m128(b)), 0);
4192 uint32x4_t a_not_nan =
4193 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4194 uint32x4_t b_not_nan =
4195 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4196 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4197 uint32x4_t a_gt_b =
4198 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4199 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4200}
4201
4202// Compares the lower single-precision floating point scalar values of a and b
4203// using a less than or equal operation. :
4204// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
4205FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4206{
4207 // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4208 // vreinterpretq_f32_m128(b)), 0);
4209 uint32x4_t a_not_nan =
4210 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4211 uint32x4_t b_not_nan =
4212 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4213 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4214 uint32x4_t a_le_b =
4215 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4216 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4217}
4218
4219// Compares the lower single-precision floating point scalar values of a and b
4220// using a greater than or equal operation. :
4221// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
4222FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4223{
4224 // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4225 // vreinterpretq_f32_m128(b)), 0);
4226 uint32x4_t a_not_nan =
4227 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4228 uint32x4_t b_not_nan =
4229 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4230 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4231 uint32x4_t a_ge_b =
4232 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4233 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4234}
4235
4236// Compares the lower single-precision floating point scalar values of a and b
4237// using an equality operation. :
4238// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
4239FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4240{
4241 // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4242 // vreinterpretq_f32_m128(b)), 0);
4243 uint32x4_t a_not_nan =
4244 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4245 uint32x4_t b_not_nan =
4246 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4247 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4248 uint32x4_t a_eq_b =
4249 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4250 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4251}
4252
4253// Compares the lower single-precision floating point scalar values of a and b
4254// using an inequality operation. :
4255// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
4256FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4257{
4258 // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4259 // vreinterpretq_f32_m128(b)), 0);
4260 uint32x4_t a_not_nan =
4261 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4262 uint32x4_t b_not_nan =
4263 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4264 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4265 uint32x4_t a_neq_b = vmvnq_u32(
4266 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4267 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4268}
4269
4270// According to the documentation, these intrinsics behave the same as the
4271// non-'u' versions. We'll just alias them here.
4272#define _mm_ucomilt_ss _mm_comilt_ss
4273#define _mm_ucomile_ss _mm_comile_ss
4274#define _mm_ucomigt_ss _mm_comigt_ss
4275#define _mm_ucomige_ss _mm_comige_ss
4276#define _mm_ucomieq_ss _mm_comieq_ss
4277#define _mm_ucomineq_ss _mm_comineq_ss
4278
4279/* Conversions */
4280
4281// Convert packed signed 32-bit integers in b to packed single-precision
4282// (32-bit) floating-point elements, store the results in the lower 2 elements
4283// of dst, and copy the upper 2 packed elements from a to the upper elements of
4284// dst.
4285//
4286// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4287// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4288// dst[95:64] := a[95:64]
4289// dst[127:96] := a[127:96]
4290//
4291// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
4292FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4293{
4294 return vreinterpretq_m128_f32(
4295 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4296 vget_high_f32(vreinterpretq_f32_m128(a))));
4297}
4298
4299// Convert the signed 32-bit integer b to a single-precision (32-bit)
4300// floating-point element, store the result in the lower element of dst, and
4301// copy the upper 3 packed elements from a to the upper elements of dst.
4302//
4303// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4304// dst[127:32] := a[127:32]
4305//
4306// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
4307FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4308{
4309 __m128 ret = a;
4310 return vreinterpretq_m128_f32(
4311 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0));
4312}
4313
4314// Convert the lower single-precision (32-bit) floating-point element in a to a
4315// 32-bit integer, and store the result in dst.
4316// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
4317FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4318{
4319#if defined(__aarch64__)
4320 return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4321#else
4322 float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4323 float32_t diff = data - floor(data);
4324 if (diff > 0.5)
4325 return (int32_t) ceil(data);
4326 if (diff == 0.5) {
4327 int32_t f = (int32_t) floor(data);
4328 int32_t c = (int32_t) ceil(data);
4329 return c & 1 ? f : c;
4330 }
4331 return (int32_t) floor(data);
4332#endif
4333}
4334
4335// Convert packed 16-bit integers in a to packed single-precision (32-bit)
4336// floating-point elements, and store the results in dst.
4337//
4338// FOR j := 0 to 3
4339// i := j*16
4340// m := j*32
4341// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4342// ENDFOR
4343//
4344// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
4345FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4346{
4347 return vreinterpretq_m128_f32(
4348 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4349}
4350
4351// Convert packed 32-bit integers in b to packed single-precision (32-bit)
4352// floating-point elements, store the results in the lower 2 elements of dst,
4353// and copy the upper 2 packed elements from a to the upper elements of dst.
4354//
4355// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4356// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4357// dst[95:64] := a[95:64]
4358// dst[127:96] := a[127:96]
4359//
4360// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
4361FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4362{
4363 return vreinterpretq_m128_f32(
4364 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4365 vget_high_f32(vreinterpretq_f32_m128(a))));
4366}
4367
4368// Convert packed signed 32-bit integers in a to packed single-precision
4369// (32-bit) floating-point elements, store the results in the lower 2 elements
4370// of dst, then convert the packed signed 32-bit integers in b to
4371// single-precision (32-bit) floating-point element, and store the results in
4372// the upper 2 elements of dst.
4373//
4374// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4375// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4376// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4377// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4378//
4379// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
4380FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4381{
4382 return vreinterpretq_m128_f32(vcvtq_f32_s32(
4383 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4384}
4385
4386// Convert the lower packed 8-bit integers in a to packed single-precision
4387// (32-bit) floating-point elements, and store the results in dst.
4388//
4389// FOR j := 0 to 3
4390// i := j*8
4391// m := j*32
4392// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4393// ENDFOR
4394//
4395// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
4396FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4397{
4398 return vreinterpretq_m128_f32(vcvtq_f32_s32(
4399 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4400}
4401
4402// Convert packed unsigned 16-bit integers in a to packed single-precision
4403// (32-bit) floating-point elements, and store the results in dst.
4404//
4405// FOR j := 0 to 3
4406// i := j*16
4407// m := j*32
4408// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4409// ENDFOR
4410//
4411// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
4412FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4413{
4414 return vreinterpretq_m128_f32(
4415 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4416}
4417
4418// Convert the lower packed unsigned 8-bit integers in a to packed
4419// single-precision (32-bit) floating-point elements, and store the results in
4420// dst.
4421//
4422// FOR j := 0 to 3
4423// i := j*8
4424// m := j*32
4425// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4426// ENDFOR
4427//
4428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
4429FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4430{
4431 return vreinterpretq_m128_f32(vcvtq_f32_u32(
4432 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4433}
4434
4435// Converts the four single-precision, floating-point values of a to signed
4436// 32-bit integer values using truncate.
4437// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4438FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4439{
4440 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4441}
4442
4443// Converts the four signed 32-bit integer values of a to single-precision,
4444// floating-point values
4445// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
4446FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4447{
4448 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4449}
4450
4451// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
4452// unsigned 16-bit integers.
4453FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4454{
4455 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4456 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4457 return vreinterpretq_m128i_u16(u16x8);
4458}
4459
4460// Converts the four unsigned 8-bit integers in the lower 32 bits to four
4461// unsigned 32-bit integers.
4462// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
4463FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4464{
4465 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4466 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4467 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4468 return vreinterpretq_m128i_u32(u32x4);
4469}
4470
4471// Converts the two unsigned 8-bit integers in the lower 16 bits to two
4472// unsigned 64-bit integers.
4473FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4474{
4475 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
4476 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4477 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4478 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4479 return vreinterpretq_m128i_u64(u64x2);
4480}
4481
4482// Converts the eight signed 8-bit integers in the lower 64 bits to eight
4483// signed 16-bit integers.
4484FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4485{
4486 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4487 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4488 return vreinterpretq_m128i_s16(s16x8);
4489}
4490
4491// Converts the four signed 8-bit integers in the lower 32 bits to four
4492// signed 32-bit integers.
4493FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4494{
4495 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4496 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4497 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4498 return vreinterpretq_m128i_s32(s32x4);
4499}
4500
4501// Converts the two signed 8-bit integers in the lower 16 bits to two
4502// signed 64-bit integers.
4503FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4504{
4505 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
4506 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4507 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4508 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4509 return vreinterpretq_m128i_s64(s64x2);
4510}
4511
4512// Converts the four signed 16-bit integers in the lower 64 bits to four signed
4513// 32-bit integers.
4514FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4515{
4516 return vreinterpretq_m128i_s32(
4517 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4518}
4519
4520// Converts the two signed 16-bit integers in the lower 32 bits to two
4521// signed 64-bit integers.
4522FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4523{
4524 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4525 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4526 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4527 return vreinterpretq_m128i_s64(s64x2);
4528}
4529
4530// Converts the four unsigned 16-bit integers in the lower 64 bits to four
4531// unsigned 32-bit integers.
4532FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4533{
4534 return vreinterpretq_m128i_u32(
4535 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4536}
4537
4538// Converts the two unsigned 16-bit integers in the lower 32 bits to two
4539// unsigned 64-bit integers.
4540FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4541{
4542 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4543 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4544 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4545 return vreinterpretq_m128i_u64(u64x2);
4546}
4547
4548// Converts the two unsigned 32-bit integers in the lower 64 bits to two
4549// unsigned 64-bit integers.
4550FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4551{
4552 return vreinterpretq_m128i_u64(
4553 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4554}
4555
4556// Converts the two signed 32-bit integers in the lower 64 bits to two signed
4557// 64-bit integers.
4558FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4559{
4560 return vreinterpretq_m128i_s64(
4561 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4562}
4563
4564// Converts the four single-precision, floating-point values of a to signed
4565// 32-bit integer values.
4566//
4567// r0 := (int) a0
4568// r1 := (int) a1
4569// r2 := (int) a2
4570// r3 := (int) a3
4571//
4572// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4573// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4574// does not support! It is supported on ARMv8-A however.
4575FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4576{
4577#if defined(__aarch64__)
4578 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4579#else
4580 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4581 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4582 vdupq_n_f32(0.5f)); /* +/- 0.5 */
4583 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4584 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4585 int32x4_t r_trunc =
4586 vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4587 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4588 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4589 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4590 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4591 float32x4_t delta = vsubq_f32(
4592 vreinterpretq_f32_m128(a),
4593 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4594 uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4595 return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4596#endif
4597}
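
// Illustrative sketch (not from the original header): with the default
// round-to-nearest-even behaviour emulated above,
//   _mm_cvtps_epi32(_mm_setr_ps(1.5f, 2.5f, -2.5f, 3.7f))
// yields {2, 2, -2, 4}; the halfway cases 1.5, 2.5 and -2.5 all round to the
// nearest even integer, matching x86 SSE2 in its default rounding mode.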
4598
4599// Copy the lower 32-bit integer in a to dst.
4600//
4601// dst[31:0] := a[31:0]
4602//
4603// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4604FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4605{
4606 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4607}
4608
4609// Copy the lower 64-bit integer in a to dst.
4610//
4611// dst[63:0] := a[63:0]
4612//
4613// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4614FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4615{
4616 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4617}
4618
4619// Copy the lower 64-bit integer in a to dst.
4620//
4621// dst[63:0] := a[63:0]
4622//
4623// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4624#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4625
// Moves 32-bit integer a to the least significant 32 bits of an __m128i
// object, zero extending the upper bits.
4628//
4629// r0 := a
4630// r1 := 0x0
4631// r2 := 0x0
4632// r3 := 0x0
4633//
4634// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4635FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4636{
4637 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4638}
4639
// Moves 64-bit integer a to the least significant 64 bits of an __m128i
// object, zero extending the upper bits.
4642//
4643// r0 := a
4644// r1 := 0x0
4645FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4646{
4647 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4648}
4649
4650// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4651// compilation and does not generate any instructions, thus it has zero latency.
4652// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
4653FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4654{
4655 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4656}
4657
4658// Applies a type cast to reinterpret four 32-bit floating point values passed
4659// in as a 128-bit parameter as packed 32-bit integers.
4660// https://msdn.microsoft.com/en-us/library/bb514099.aspx
4661FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4662{
4663 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4664}
4665
4666// Applies a type cast to reinterpret four 32-bit integers passed in as a
4667// 128-bit parameter as packed 32-bit floating point values.
4668// https://msdn.microsoft.com/en-us/library/bb514029.aspx
4669FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4670{
4671 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4672}
4673
// Loads a 128-bit value.
4675// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4676FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4677{
4678 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4679}
4680
4681// Load a double-precision (64-bit) floating-point element from memory into both
4682// elements of dst.
4683//
4684// dst[63:0] := MEM[mem_addr+63:mem_addr]
4685// dst[127:64] := MEM[mem_addr+63:mem_addr]
4686//
4687// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4688FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4689{
4690#if defined(__aarch64__)
4691 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4692#else
4693 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4694#endif
4695}
4696
4697// Load a double-precision (64-bit) floating-point element from memory into the
4698// upper element of dst, and copy the lower element from a to dst. mem_addr does
4699// not need to be aligned on any particular boundary.
4700//
4701// dst[63:0] := a[63:0]
4702// dst[127:64] := MEM[mem_addr+63:mem_addr]
4703//
4704// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4705FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4706{
4707#if defined(__aarch64__)
4708 return vreinterpretq_m128d_f64(
4709 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4710#else
4711 return vreinterpretq_m128d_f32(vcombine_f32(
4712 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4713#endif
4714}
4715
4716// Load a double-precision (64-bit) floating-point element from memory into both
4717// elements of dst.
4718//
4719// dst[63:0] := MEM[mem_addr+63:mem_addr]
4720// dst[127:64] := MEM[mem_addr+63:mem_addr]
4721//
4722// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4723#define _mm_load_pd1 _mm_load1_pd
4724
4725// Load a double-precision (64-bit) floating-point element from memory into both
4726// elements of dst.
4727//
4728// dst[63:0] := MEM[mem_addr+63:mem_addr]
4729// dst[127:64] := MEM[mem_addr+63:mem_addr]
4730//
4731// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
4732#define _mm_loaddup_pd _mm_load1_pd
4733
// Loads a 128-bit value from memory that does not need to be 16-byte aligned.
4735// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4736FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4737{
4738 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4739}
4740
4741// Load unaligned 32-bit integer from memory into the first element of dst.
4742//
4743// dst[31:0] := MEM[mem_addr+31:mem_addr]
4744// dst[MAX:32] := 0
4745//
4746// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4747FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4748{
4749 return vreinterpretq_m128i_s32(
4750 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4751}
4752
4753// Convert packed double-precision (64-bit) floating-point elements in a to
4754// packed single-precision (32-bit) floating-point elements, and store the
4755// results in dst.
4756//
4757// FOR j := 0 to 1
4758// i := 32*j
4759// k := 64*j
4760// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
4761// ENDFOR
4762// dst[127:64] := 0
4763//
4764// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
4765FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4766{
4767#if defined(__aarch64__)
4768 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4769 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4770#else
4771 float a0 = (float) ((double *) &a)[0];
4772 float a1 = (float) ((double *) &a)[1];
4773 return _mm_set_ps(0, 0, a1, a0);
4774#endif
4775}
4776
4777// Copy the lower double-precision (64-bit) floating-point element of a to dst.
4778//
4779// dst[63:0] := a[63:0]
4780//
4781// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4782FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4783{
4784#if defined(__aarch64__)
4785 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4786#else
4787 return ((double *) &a)[0];
4788#endif
4789}
4790
4791// Convert packed single-precision (32-bit) floating-point elements in a to
4792// packed double-precision (64-bit) floating-point elements, and store the
4793// results in dst.
4794//
4795// FOR j := 0 to 1
4796// i := 64*j
4797// k := 32*j
4798// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4799// ENDFOR
4800//
4801// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4802FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4803{
4804#if defined(__aarch64__)
4805 return vreinterpretq_m128d_f64(
4806 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4807#else
4808 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4809 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4810 return _mm_set_pd(a1, a0);
4811#endif
4812}
4813
4814// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
4815// compilation and does not generate any instructions, thus it has zero latency.
4816// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
4817FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
4818{
4819 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
4820}
4821
4822// Blend packed single-precision (32-bit) floating-point elements from a and b
4823// using mask, and store the results in dst.
4824// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
4825FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
4826{
4827 return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
4828 vreinterpretq_f32_m128(b),
4829 vreinterpretq_f32_m128(a)));
4830}
4831
4832// Round the packed single-precision (32-bit) floating-point elements in a using
4833// the rounding parameter, and store the results as packed single-precision
4834// floating-point elements in dst.
4835// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
4836FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
4837{
4838#if defined(__aarch64__)
4839 switch (rounding) {
4840 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4841 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
4842 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4843 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
4844 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4845 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
4846 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4847 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
4848 default: //_MM_FROUND_CUR_DIRECTION
4849 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
4850 }
4851#else
4852 float *v_float = (float *) &a;
4853 __m128 zero, neg_inf, pos_inf;
4854
4855 switch (rounding) {
4856 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4857 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
4858 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4859 return (__m128){floorf(v_float[0]), floorf(v_float[1]),
4860 floorf(v_float[2]), floorf(v_float[3])};
4861 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4862 return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
4863 ceilf(v_float[3])};
4864 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4865 zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
4866 neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
4867 floorf(v_float[2]), floorf(v_float[3]));
4868 pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
4869 ceilf(v_float[2]), ceilf(v_float[3]));
4870 return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
4871 default: //_MM_FROUND_CUR_DIRECTION
4872 return (__m128){roundf(v_float[0]), roundf(v_float[1]),
4873 roundf(v_float[2]), roundf(v_float[3])};
4874 }
4875#endif
4876}
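
// Illustrative usage sketch (not from the original header): an explicit
// truncation toward zero looks like
//   __m128 t = _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
// The _mm_ceil_ps and _mm_floor_ps wrappers below simply pass the matching
// rounding-mode constants.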
4877
4878// Round the packed single-precision (32-bit) floating-point elements in a up to
4879// an integer value, and store the results as packed single-precision
4880// floating-point elements in dst.
4881// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
4882FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
4883{
4884 return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4885}
4886
4887// Round the packed single-precision (32-bit) floating-point elements in a down
4888// to an integer value, and store the results as packed single-precision
4889// floating-point elements in dst.
4890// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
4891FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
4892{
4893 return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4894}
4895
4896
4897// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
4898// may perform better than _mm_loadu_si128 when the data crosses a cache line
4899// boundary.
4900//
4901// dst[127:0] := MEM[mem_addr+127:mem_addr]
4902//
4903// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
4904#define _mm_lddqu_si128 _mm_loadu_si128
4905
4906/* Miscellaneous Operations */
4907
4908// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
4909// in the sign bit.
4910//
4911// r0 := a0 >> count
4912// r1 := a1 >> count
4913// ...
4914// r7 := a7 >> count
4915//
4916// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
4917FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
4918{
4919 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4920 if (c > 15)
4921 return _mm_cmplt_epi16(a, _mm_setzero_si128());
4922 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
4923}
4924
4925// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
4926// in the sign bit.
4927//
4928// r0 := a0 >> count
4929// r1 := a1 >> count
4930// r2 := a2 >> count
4931// r3 := a3 >> count
4932//
4933// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
4934FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
4935{
4936 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4937 if (c > 31)
4938 return _mm_cmplt_epi32(a, _mm_setzero_si128());
4939 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
4940}
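
// Illustrative sketch (not from the original header): shift counts of 32 or
// more leave only the sign in every lane, e.g.
//   _mm_sra_epi32(_mm_setr_epi32(-8, 8, -1, 1), _mm_cvtsi32_si128(32))
// yields {-1, 0, -1, 0}.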
4941
4942// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4943// saturates.
4944// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4945FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4946{
4947 return vreinterpretq_m128i_s8(
4948 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4949 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4950}
4951
// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
// integers and saturates.
4954//
4955// r0 := UnsignedSaturate(a0)
4956// r1 := UnsignedSaturate(a1)
4957// ...
4958// r7 := UnsignedSaturate(a7)
4959// r8 := UnsignedSaturate(b0)
4960// r9 := UnsignedSaturate(b1)
4961// ...
4962// r15 := UnsignedSaturate(b7)
4963//
4964// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
4965FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4966{
4967 return vreinterpretq_m128i_u8(
4968 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4969 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4970}
4971
4972// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4973// and saturates.
4974//
4975// r0 := SignedSaturate(a0)
4976// r1 := SignedSaturate(a1)
4977// r2 := SignedSaturate(a2)
4978// r3 := SignedSaturate(a3)
4979// r4 := SignedSaturate(b0)
4980// r5 := SignedSaturate(b1)
4981// r6 := SignedSaturate(b2)
4982// r7 := SignedSaturate(b3)
4983//
4984// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
4985FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4986{
4987 return vreinterpretq_m128i_s16(
4988 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4989 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4990}
4991
4992// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
4993// integers and saturates.
4994//
4995// r0 := UnsignedSaturate(a0)
4996// r1 := UnsignedSaturate(a1)
4997// r2 := UnsignedSaturate(a2)
4998// r3 := UnsignedSaturate(a3)
4999// r4 := UnsignedSaturate(b0)
5000// r5 := UnsignedSaturate(b1)
5001// r6 := UnsignedSaturate(b2)
5002// r7 := UnsignedSaturate(b3)
5003FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5004{
5005 return vreinterpretq_m128i_u16(
5006 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5007 vqmovun_s32(vreinterpretq_s32_m128i(b))));
5008}
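
// Illustrative sketch (not from the original header): unsigned saturation
// clamps each signed 32-bit input into [0, 65535], so an input vector of
// {-1, 70000, 5, 40000} contributes the 16-bit lanes {0, 65535, 5, 40000}.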
5009
5010// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5011// 8 signed or unsigned 8-bit integers in b.
5012//
5013// r0 := a0
5014// r1 := b0
5015// r2 := a1
5016// r3 := b1
5017// ...
5018// r14 := a7
5019// r15 := b7
5020//
5021// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
5022FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5023{
5024#if defined(__aarch64__)
5025 return vreinterpretq_m128i_s8(
5026 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5027#else
5028 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5029 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5030 int8x8x2_t result = vzip_s8(a1, b1);
5031 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5032#endif
5033}
5034
5035// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5036// lower 4 signed or unsigned 16-bit integers in b.
5037//
5038// r0 := a0
5039// r1 := b0
5040// r2 := a1
5041// r3 := b1
5042// r4 := a2
5043// r5 := b2
5044// r6 := a3
5045// r7 := b3
5046//
5047// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
5048FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5049{
5050#if defined(__aarch64__)
5051 return vreinterpretq_m128i_s16(
5052 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5053#else
5054 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5055 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5056 int16x4x2_t result = vzip_s16(a1, b1);
5057 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5058#endif
5059}
5060
// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
5063//
5064// r0 := a0
5065// r1 := b0
5066// r2 := a1
5067// r3 := b1
5068//
5069// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
5070FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5071{
5072#if defined(__aarch64__)
5073 return vreinterpretq_m128i_s32(
5074 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5075#else
5076 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5077 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5078 int32x2x2_t result = vzip_s32(a1, b1);
5079 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5080#endif
5081}
5082
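// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
//
// r0 := a0
// r1 := b0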
5083FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5084{
5085 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5086 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5087 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5088}
5089
5090// Selects and interleaves the lower two single-precision, floating-point values
5091// from a and b.
5092//
5093// r0 := a0
5094// r1 := b0
5095// r2 := a1
5096// r3 := b1
5097//
5098// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
5099FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5100{
5101#if defined(__aarch64__)
5102 return vreinterpretq_m128_f32(
5103 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5104#else
5105 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5106 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5107 float32x2x2_t result = vzip_f32(a1, b1);
5108 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5109#endif
5110}
5111
5112// Selects and interleaves the upper two single-precision, floating-point values
5113// from a and b.
5114//
5115// r0 := a2
5116// r1 := b2
5117// r2 := a3
5118// r3 := b3
5119//
5120// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
5121FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5122{
5123#if defined(__aarch64__)
5124 return vreinterpretq_m128_f32(
5125 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5126#else
5127 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5128 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5129 float32x2x2_t result = vzip_f32(a1, b1);
5130 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5131#endif
5132}
5133
5134// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5135// 8 signed or unsigned 8-bit integers in b.
5136//
5137// r0 := a8
5138// r1 := b8
5139// r2 := a9
5140// r3 := b9
5141// ...
5142// r14 := a15
5143// r15 := b15
5144//
5145// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
5146FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5147{
5148#if defined(__aarch64__)
5149 return vreinterpretq_m128i_s8(
5150 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5151#else
5152 int8x8_t a1 =
5153 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5154 int8x8_t b1 =
5155 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5156 int8x8x2_t result = vzip_s8(a1, b1);
5157 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5158#endif
5159}
5160
5161// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5162// upper 4 signed or unsigned 16-bit integers in b.
5163//
5164// r0 := a4
5165// r1 := b4
5166// r2 := a5
5167// r3 := b5
5168// r4 := a6
5169// r5 := b6
5170// r6 := a7
5171// r7 := b7
5172//
5173// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5174FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5175{
5176#if defined(__aarch64__)
5177 return vreinterpretq_m128i_s16(
5178 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5179#else
5180 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5181 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5182 int16x4x2_t result = vzip_s16(a1, b1);
5183 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5184#endif
5185}
5186
5187// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5188// upper 2 signed or unsigned 32-bit integers in b.
5189// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
5190FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5191{
5192#if defined(__aarch64__)
5193 return vreinterpretq_m128i_s32(
5194 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5195#else
5196 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5197 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5198 int32x2x2_t result = vzip_s32(a1, b1);
5199 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5200#endif
5201}
5202
5203// Interleaves the upper signed or unsigned 64-bit integer in a with the
5204// upper signed or unsigned 64-bit integer in b.
5205//
5206// r0 := a1
5207// r1 := b1
5208FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5209{
5210 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5211 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5212 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5213}
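
// Illustrative sketch (not from the original header): the lo/hi pairs above
// together form a full interleave. For 8-bit lanes,
//   _mm_unpacklo_epi8(a, b) -> {a0,b0,a1,b1, ... ,a7,b7}
//   _mm_unpackhi_epi8(a, b) -> {a8,b8,a9,b9, ... ,a15,b15}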
5214
5215// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5216// in a, store the minimum and index in dst, and zero the remaining bits in dst.
5217//
5218// index[2:0] := 0
5219// min[15:0] := a[15:0]
5220// FOR j := 0 to 7
5221// i := j*16
5222// IF a[i+15:i] < min[15:0]
5223// index[2:0] := j
5224// min[15:0] := a[i+15:i]
5225// FI
5226// ENDFOR
5227// dst[15:0] := min[15:0]
5228// dst[18:16] := index[2:0]
5229// dst[127:19] := 0
5230//
5231// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
5232FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5233{
5234 __m128i dst;
5235 uint16_t min, idx = 0;
5236 // Find the minimum value
5237#if defined(__aarch64__)
5238 min = vminvq_u16(vreinterpretq_u16_m128i(a));
5239#else
5240 __m64 tmp;
5241 tmp = vreinterpret_m64_u16(
5242 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5243 vget_high_u16(vreinterpretq_u16_m128i(a))));
5244 tmp = vreinterpret_m64_u16(
5245 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5246 tmp = vreinterpret_m64_u16(
5247 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5248 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5249#endif
5250 // Get the index of the minimum value
5251 int i;
5252 for (i = 0; i < 8; i++) {
5253 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5254 idx = (uint16_t) i;
5255 break;
5256 }
5257 a = _mm_srli_si128(a, 2);
5258 }
5259 // Generate result
5260 dst = _mm_setzero_si128();
5261 dst = vreinterpretq_m128i_u16(
5262 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5263 dst = vreinterpretq_m128i_u16(
5264 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5265 return dst;
5266}
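
// Illustrative sketch (not from the original header):
//   __m128i v = _mm_setr_epi16(3, 7, 1, 9, 1, 5, 6, 2);
//   __m128i r = _mm_minpos_epu16(v);
// then _mm_extract_epi16(r, 0) == 1 (the minimum) and
// _mm_extract_epi16(r, 1) == 2 (the index of its first occurrence).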
5267
// Concatenates a and b, then shifts the 256-bit composite right by c bytes.
5269// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
5270// http://blog.csdn.net/hemmingway/article/details/44828303
5271// Clang requires a macro here, as it is extremely picky about c being a
5272// literal.
5273#define _mm_alignr_epi8(a, b, c) \
5274 ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
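
// Illustrative sketch (not from the original header): the concatenation (a:b)
// is shifted right by c bytes and truncated to 128 bits, so
//   _mm_alignr_epi8(a, b, 4)
// returns bytes b[4..15] followed by a[0..3].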
5275
5276// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5277// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5278// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5279// otherwise set CF to 0. Return the CF value.
5280// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
5281FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5282{
5283 int64x2_t s64 =
5284 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5285 vreinterpretq_s64_m128i(b));
5286 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5287}
5288
5289// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5290// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5291// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5292// otherwise set CF to 0. Return the ZF value.
5293// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
5294FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5295{
5296 int64x2_t s64 =
5297 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5298 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5299}
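
// Illustrative sketch (not from the original header):
//   _mm_testz_si128(a, b) == 1  whenever (a & b) is all zeros;
//   _mm_testc_si128(a, b) == 1  whenever every bit set in b is also set in a,
// e.g. _mm_testc_si128(_mm_set1_epi32(-1), x) is 1 for any x.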
5300
5301// Extracts the selected signed or unsigned 8-bit integer from a and zero
5302// extends.
5303// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
5304#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
5305
5306// Inserts the least significant 8 bits of b into the selected 8-bit integer
5307// of a.
5308// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
5309// __constrange(0,16) int imm)
5310#define _mm_insert_epi8(a, b, imm) \
5311 __extension__({ \
5312 vreinterpretq_m128i_s8( \
5313 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
5314 })
5315
5316// Extracts the selected signed or unsigned 16-bit integer from a and zero
5317// extends.
5318// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
5319// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
5320#define _mm_extract_epi16(a, imm) \
5321 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
5322
5323// Inserts the least significant 16 bits of b into the selected 16-bit integer
5324// of a.
5325// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
5326// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
5327// __constrange(0,8) int imm)
5328#define _mm_insert_epi16(a, b, imm) \
5329 __extension__({ \
5330 vreinterpretq_m128i_s16( \
5331 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
5332 })
5333
5334// Extracts the selected signed or unsigned 32-bit integer from a and zero
5335// extends.
5336// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
5337#define _mm_extract_epi32(a, imm) \
5338 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
5339
// Extracts the selected single-precision (32-bit) floating-point value from a
// as its raw 32-bit integer bit pattern.
5341// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
5342#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
5343
5344// Inserts the least significant 32 bits of b into the selected 32-bit integer
5345// of a.
5346// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
5347// __constrange(0,4) int imm)
5348#define _mm_insert_epi32(a, b, imm) \
5349 __extension__({ \
5350 vreinterpretq_m128i_s32( \
5351 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
5352 })
5353
5354// Extracts the selected signed or unsigned 64-bit integer from a and zero
5355// extends.
5356// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
5357#define _mm_extract_epi64(a, imm) \
5358 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
5359
5360// Inserts the least significant 64 bits of b into the selected 64-bit integer
5361// of a.
5362// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
5363// __constrange(0,2) int imm)
5364#define _mm_insert_epi64(a, b, imm) \
5365 __extension__({ \
5366 vreinterpretq_m128i_s64( \
5367 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
5368 })
5369
5370// Count the number of bits set to 1 in unsigned 32-bit integer a, and
5371// return that count in dst.
5372// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
5373FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5374{
5375#if defined(__aarch64__)
5376#if __has_builtin(__builtin_popcount)
5377 return __builtin_popcount(a);
5378#else
5379 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5380#endif
5381#else
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    /* Widen the 32-bit input into an 8-byte vector (upper bytes zero) so the
     * count never reads or writes memory outside the argument. */
    input_val = vcreate_u8((uint64_t) a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    return vget_lane_u32(count32x2_val, 0);
5394#endif
5395}
5396
5397// Count the number of bits set to 1 in unsigned 64-bit integer a, and
5398// return that count in dst.
5399// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
5400FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5401{
5402#if defined(__aarch64__)
5403#if __has_builtin(__builtin_popcountll)
5404 return __builtin_popcountll(a);
5405#else
5406 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5407#endif
5408#else
5409 uint64_t count = 0;
5410 uint8x8_t input_val, count8x8_val;
5411 uint16x4_t count16x4_val;
5412 uint32x2_t count32x2_val;
5413 uint64x1_t count64x1_val;
5414
5415 input_val = vld1_u8((uint8_t *) &a);
5416 count8x8_val = vcnt_u8(input_val);
5417 count16x4_val = vpaddl_u8(count8x8_val);
5418 count32x2_val = vpaddl_u16(count16x4_val);
5419 count64x1_val = vpaddl_u32(count32x2_val);
5420 vst1_u64(&count, count64x1_val);
5421 return count;
5422#endif
5423}
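
// Illustrative sketch (not from the original header):
//   _mm_popcnt_u32(0xF0F0F0F0u) == 16 and _mm_popcnt_u64(~0ULL) == 64.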
5424
5425// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
5426// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
5427// transposed matrix in these vectors (row0 now contains column 0, etc.).
5428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
5429#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
5430 do { \
5431 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
5432 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
5433 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
5434 vget_low_f32(ROW23.val[0])); \
5435 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
5436 vget_low_f32(ROW23.val[1])); \
5437 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
5438 vget_high_f32(ROW23.val[0])); \
5439 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
5440 vget_high_f32(ROW23.val[1])); \
5441 } while (0)
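
// Illustrative sketch (not from the original header): given
//   row0 = {a0,a1,a2,a3}, row1 = {b0,b1,b2,b3},
//   row2 = {c0,c1,c2,c3}, row3 = {d0,d1,d2,d3}
// the macro rewrites the rows in place as columns:
//   row0 = {a0,b0,c0,d0}, row1 = {a1,b1,c1,d1},
//   row2 = {a2,b2,c2,d2}, row3 = {a3,b3,c3,d3}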
5442
5443/* Crypto Extensions */
5444
5445#if defined(__ARM_FEATURE_CRYPTO)
5446// Wraps vmull_p64
5447FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5448{
5449 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5450 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5451 return vreinterpretq_u64_p128(vmull_p64(a, b));
5452}
5453#else // ARMv7 polyfill
// ARMv7 and some AArch64 cores lack vmull_p64, but they do have vmull_p8.
5455//
5456// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5457// 64-bit->128-bit polynomial multiply.
5458//
5459// It needs some work and is somewhat slow, but it is still faster than all
5460// known scalar methods.
5461//
5462// Algorithm adapted to C from
5463// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5464// from "Fast Software Polynomial Multiplication on ARM Processors Using the
5465// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5466// (https://hal.inria.fr/hal-01506572)
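//
// Carry-less (polynomial) multiplication is ordinary long multiplication with
// the additions replaced by XOR, e.g.
//   0b11 * 0b11 = (0b11 << 1) ^ (0b11 << 0) = 0b110 ^ 0b011 = 0b101
// (ordinary multiplication would give 0b1001; the carries are simply dropped).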
5467static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5468{
5469 poly8x8_t a = vreinterpret_p8_u64(_a);
5470 poly8x8_t b = vreinterpret_p8_u64(_b);
5471
5472 // Masks
5473 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5474 vcreate_u8(0x00000000ffffffff));
5475 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5476 vcreate_u8(0x0000000000000000));
5477
5478 // Do the multiplies, rotating with vext to get all combinations
5479 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
5480 uint8x16_t e =
5481 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
5482 uint8x16_t f =
5483 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
5484 uint8x16_t g =
5485 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
5486 uint8x16_t h =
5487 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
5488 uint8x16_t i =
5489 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
5490 uint8x16_t j =
5491 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
5492 uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
5494
5495 // Add cross products
5496 uint8x16_t l = veorq_u8(e, f); // L = E + F
5497 uint8x16_t m = veorq_u8(g, h); // M = G + H
5498 uint8x16_t n = veorq_u8(i, j); // N = I + J
5499
5500 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5501 // instructions.
5502#if defined(__aarch64__)
5503 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5504 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5505 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5506 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5507 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5508 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5509 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5510 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5511#else
5512 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5513 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5514 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5515 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5516#endif
5517 // t0 = (L) (P0 + P1) << 8
5518 // t1 = (M) (P2 + P3) << 16
5519 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5520 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5521 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5522
5523 // t2 = (N) (P4 + P5) << 24
5524 // t3 = (K) (P6 + P7) << 32
5525 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5526 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5527 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5528
5529 // De-interleave
5530#if defined(__aarch64__)
5531 uint8x16_t t0 = vreinterpretq_u8_u64(
5532 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5533 uint8x16_t t1 = vreinterpretq_u8_u64(
5534 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5535 uint8x16_t t2 = vreinterpretq_u8_u64(
5536 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5537 uint8x16_t t3 = vreinterpretq_u8_u64(
5538 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5539#else
5540 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5541 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5542 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5543 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5544#endif
5545 // Shift the cross products
5546 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
5547 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
5548 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
5549 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
5550
5551 // Accumulate the products
5552 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5553 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5554 uint8x16_t mix = veorq_u8(d, cross1);
5555 uint8x16_t r = veorq_u8(mix, cross2);
5556 return vreinterpretq_u64_u8(r);
5557}
5558#endif // ARMv7 polyfill
5559
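// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm, and store the 128-bit result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128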
5560FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5561{
5562 uint64x2_t a = vreinterpretq_u64_m128i(_a);
5563 uint64x2_t b = vreinterpretq_u64_m128i(_b);
5564 switch (imm & 0x11) {
5565 case 0x00:
5566 return vreinterpretq_m128i_u64(
5567 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5568 case 0x01:
5569 return vreinterpretq_m128i_u64(
5570 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5571 case 0x10:
5572 return vreinterpretq_m128i_u64(
5573 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5574 case 0x11:
5575 return vreinterpretq_m128i_u64(
5576 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5577 default:
5578 abort();
5579 }
5580}
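
// Illustrative sketch (not from the original header): bit 0 of imm selects the
// 64-bit half of a and bit 4 selects the half of b, so
//   _mm_clmulepi64_si128(a, b, 0x10)
// carry-lessly multiplies the low half of a by the high half of b.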
5581
5582#if !defined(__ARM_FEATURE_CRYPTO)
5583/* clang-format off */
5584#define SSE2NEON_AES_DATA(w) \
5585 { \
5586 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
5587 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
5588 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
5589 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
5590 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
5591 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
5592 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
5593 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
5594 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
5595 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
5596 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
5597 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
5598 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
5599 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
5600 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
5601 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
5602 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
5603 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
5604 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
5605 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
5606 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
5607 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
5608 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
5609 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
5610 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
5611 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
5612 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
5613 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
5614 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
5615 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
5616 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
5617 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
5618 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
5619 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
5620 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
5621 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
5622 w(0xb0), w(0x54), w(0xbb), w(0x16) \
5623 }
5624/* clang-format on */
5625
5626/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
5627#define SSE2NEON_AES_H0(x) (x)
5628static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
5629#undef SSE2NEON_AES_H0
5630
5631// In the absence of crypto extensions, implement aesenc using regular neon
5632// intrinsics instead. See:
5633// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5634// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5635// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
5637FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5638{
5639#if defined(__aarch64__)
5640 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5641 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5642 0xc, 0x1, 0x6, 0xb};
5643 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5644 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5645
5646 uint8x16_t v;
5647 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5648
5649 // shift rows
5650 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5651
5652 // sub bytes
5653 v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5654 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5655 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5656 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5657
5658 // mix columns
5659 w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5660 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5661 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5662
5663 // add round key
5664 return vreinterpretq_m128i_u8(w) ^ RoundKey;
5665
5666#else /* ARMv7-A NEON implementation */
5667#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
5668 (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5669 (b0))
5670#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5671#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5672#define SSE2NEON_AES_U0(p) \
5673 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5674#define SSE2NEON_AES_U1(p) \
5675 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5676#define SSE2NEON_AES_U2(p) \
5677 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5678#define SSE2NEON_AES_U3(p) \
5679 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5680 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5681 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5682 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5683 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5684 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5685 };
5686#undef SSE2NEON_AES_B2W
5687#undef SSE2NEON_AES_F2
5688#undef SSE2NEON_AES_F3
5689#undef SSE2NEON_AES_U0
5690#undef SSE2NEON_AES_U1
5691#undef SSE2NEON_AES_U2
5692#undef SSE2NEON_AES_U3
5693
5694 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5695 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5696 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5697 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5698
5699 __m128i out = _mm_set_epi32(
5700 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5701 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5702 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5703 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5704 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5705 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5706 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5707 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5708
5709 return _mm_xor_si128(out, RoundKey);
5710#endif
5711}
5712
5713FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5714{
    /* FIXME: optimize this for NEON */
5716 uint8_t v[4][4] = {
5717 [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5718 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5719 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5720 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5721 [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5722 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5723 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5724 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5725 [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5726 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5727 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5728 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5729 [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5730 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5731 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5732 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5733 };
5734 for (int i = 0; i < 16; i++)
5735 vreinterpretq_nth_u8_m128i(a, i) =
5736 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5737 return a;
5738}
5739
5740// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5741// This instruction generates a round key for AES encryption. See
5742// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5743// for details.
5744//
5745// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
5746FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5747{
5748 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5749 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5750 for (int i = 0; i < 4; ++i) {
5751 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5752 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
5753 }
5754 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
5755 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
5756}
5757#undef SSE2NEON_AES_DATA
5758
5759#else /* __ARM_FEATURE_CRYPTO */
5760// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
5761// AESMC and then manually applying the real key as an xor operation. This
5762// unfortunately means an additional xor op; the compiler should be able to
5763// optimize this away for repeated calls however. See
5764// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
5765// for more details.
5766FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
5767{
5768 return vreinterpretq_m128i_u8(
5769 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
5770 vreinterpretq_u8_m128i(b));
5771}
5772
5773// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
5774FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5775{
5776 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
5777 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
5778 RoundKey);
5779}
5780
5781FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
5782{
5783 // AESE does ShiftRows and SubBytes on A
5784 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
5785
5786 uint8x16_t dest = {
5787 // Undo ShiftRows step from AESE and extract X1 and X3
5788 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
5789 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
5790 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
5791 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
5792 };
5793 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
5794 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
5795}
5796#endif
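
// Illustrative usage sketch (not from the original header): encrypting one
// block with an already-expanded AES-128 key schedule rk[0..10] (hypothetical
// array) follows the usual pattern:
//   __m128i s = _mm_xor_si128(block, rk[0]);
//   for (int r = 1; r < 10; r++)
//       s = _mm_aesenc_si128(s, rk[r]);
//   s = _mm_aesenclast_si128(s, rk[10]);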
5797
5798/* Streaming Extensions */
5799
5800// Guarantees that every preceding store is globally visible before any
5801// subsequent store.
5802// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
5803FORCE_INLINE void _mm_sfence(void)
5804{
5805 __sync_synchronize();
5806}
5807
5808// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
5809// point elements) from a into memory using a non-temporal memory hint.
5810// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
5811FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
5812{
5813#if __has_builtin(__builtin_nontemporal_store)
5814 __builtin_nontemporal_store(a, (float32x4_t *) p);
5815#else
5816 vst1q_f32(p, vreinterpretq_f32_m128(a));
5817#endif
5818}
5819
5820// Stores the data in a to the address p without polluting the caches. If the
5821// cache line containing address p is already in the cache, the cache will be
5822// updated.
5823// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
5824FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5825{
5826#if __has_builtin(__builtin_nontemporal_store)
5827 __builtin_nontemporal_store(a, p);
5828#else
5829 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5830#endif
5831}
5832
5833// Load 128-bits of integer data from memory into dst using a non-temporal
5834// memory hint. mem_addr must be aligned on a 16-byte boundary or a
5835// general-protection exception may be generated.
5836//
5837// dst[127:0] := MEM[mem_addr+127:mem_addr]
5838//
5839// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
5840FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
5841{
#if __has_builtin(__builtin_nontemporal_load)
5843 return __builtin_nontemporal_load(p);
5844#else
5845 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
5846#endif
5847}
5848
// The cache line containing p is flushed and invalidated from all caches in
// the coherency domain.
5851// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
5852FORCE_INLINE void _mm_clflush(void const *p)
5853{
5854 (void) p;
    // no direct NEON/ACLE equivalent, so this is a no-op
5856}
5857
5858// Allocate aligned blocks of memory.
5859// https://software.intel.com/en-us/
5860// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
5861FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
5862{
5863 void *ptr;
5864 if (align == 1)
5865 return malloc(size);
5866 if (align == 2 || (sizeof(void *) == 8 && align == 4))
5867 align = sizeof(void *);
5868 if (!posix_memalign(&ptr, align, size))
5869 return ptr;
5870 return NULL;
5871}
5872
5873FORCE_INLINE void _mm_free(void *addr)
5874{
5875 free(addr);
5876}
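
// Illustrative usage sketch (not from the original header):
//   float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//   if (buf) {
//       /* buf is 16-byte aligned, so e.g. _mm_load_ps(buf) is safe */
//       _mm_free(buf);
//   }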
5877
5878// Starting with the initial value in crc, accumulates a CRC32 value for
5879// unsigned 8-bit integer v.
5880// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
5881FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
5882{
5883#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5884 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
5885 : [c] "+r"(crc)
5886 : [v] "r"(v));
5887#else
5888 crc ^= v;
5889 for (int bit = 0; bit < 8; bit++) {
5890 if (crc & 1)
5891 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
5892 else
5893 crc = (crc >> 1);
5894 }
5895#endif
5896 return crc;
5897}
5898
5899// Starting with the initial value in crc, accumulates a CRC32 value for
5900// unsigned 16-bit integer v.
5901// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
5902FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
5903{
5904#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5905 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
5906 : [c] "+r"(crc)
5907 : [v] "r"(v));
5908#else
5909 crc = _mm_crc32_u8(crc, v & 0xff);
5910 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
5911#endif
5912 return crc;
5913}
5914
5915// Starting with the initial value in crc, accumulates a CRC32 value for
5916// unsigned 32-bit integer v.
5917// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
5918FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
5919{
5920#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5921 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
5922 : [c] "+r"(crc)
5923 : [v] "r"(v));
5924#else
5925 crc = _mm_crc32_u16(crc, v & 0xffff);
5926 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
5927#endif
5928 return crc;
5929}
5930
5931// Starting with the initial value in crc, accumulates a CRC32 value for
5932// unsigned 64-bit integer v.
5933// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
5934FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
5935{
5936#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5937 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
5938 : [c] "+r"(crc)
5939 : [v] "r"(v));
5940#else
5941 crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
5942 crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
5943#endif
5944 return crc;
5945}
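
// Illustrative usage sketch (not from the original header): accumulating a
// CRC32-C over a byte buffer (hypothetical buf/len) with the customary initial
// value and final inversion, neither of which the intrinsics apply themselves:
//   uint32_t crc = 0xFFFFFFFF;
//   for (size_t i = 0; i < len; i++)
//       crc = _mm_crc32_u8(crc, buf[i]);
//   crc = ~crc;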
5946
5947#if defined(__GNUC__) || defined(__clang__)
5948#pragma pop_macro("ALIGN_STRUCT")
5949#pragma pop_macro("FORCE_INLINE")
5950#endif
5951
5952#if defined(__GNUC__)
5953#pragma GCC pop_options
5954#endif
5955
5956#endif