blob: b8aede6f21a2634d9399001ba9e470ef35b47b0f [file] [log] [blame]
Marat Dukhan0525a852014-12-13 15:48:12 -05001#pragma once
2#ifndef PSIMD_H
3#define PSIMD_H
4
5#if defined(__CUDA_ARCH__)
6 /* CUDA compiler */
7 #define PSIMD_INTRINSIC __forceinline__ __device__
8#elif defined(__OPENCL_VERSION__)
9 /* OpenCL compiler */
10 #define PSIMD_INTRINSIC inline static
11#elif defined(__INTEL_COMPILER)
12 /* Intel compiler, even on Windows */
Marat Dukhanc81e1a82017-02-22 09:16:09 -050013 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
Marat Dukhan0525a852014-12-13 15:48:12 -050014#elif defined(__GNUC__)
15 /* GCC-compatible compiler (gcc/clang/icc) */
Marat Dukhanc81e1a82017-02-22 09:16:09 -050016 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
Marat Dukhan0525a852014-12-13 15:48:12 -050017#elif defined(_MSC_VER)
18 /* MSVC-compatible compiler (cl/icl/clang-cl) */
19 #define PSIMD_INTRINSIC __forceinline static
20#elif defined(__cplusplus)
21 /* Generic C++ compiler */
22 #define PSIMD_INTRINSIC inline static
23#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
24 /* Generic C99 compiler */
25 #define PSIMD_INTRINSIC inline static
26#else
27 /* Generic C compiler */
28 #define PSIMD_INTRINSIC static
29#endif
30
Marat Dukhan90a938f2018-09-06 19:11:46 +030031#if defined(__GNUC__)
32 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
33 #include <arm_neon.h>
34 #endif
35
36 #if defined(__SSE2__)
37 #include <emmintrin.h>
38 #endif
39
40 #if defined(__SSE3__)
41 #include <pmmintrin.h>
42 #endif
43
44 #if defined(__SSSE3__)
45 #include <tmmintrin.h>
46 #endif
47
48 #if defined(__SSE4_1__)
49 #include <smmintrin.h>
50 #endif
51
52 #if defined(__SSE4_2__)
53 #include <nmmintrin.h>
54 #endif
55
56 #if defined(__AVX__)
57 #include <immintrin.h>
58 #endif
59#elif defined(_MSC_VER)
60 #include <intrin.h>
Marat Dukhan287f0702017-03-23 16:29:17 +000061#endif
62
Marat Dukhan0525a852014-12-13 15:48:12 -050063#if defined(__cplusplus)
64 #define PSIMD_CXX_SYNTAX
65#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
66 #define PSIMD_C11_SYNTAX
67#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
68 #define PSIMD_C99_SYNTAX
69#else
70 #define PSIMD_C89_SYNTAX
71#endif
72
Marat Dukhan663e0aa2017-03-01 03:59:55 -050073#if defined(__cplusplus) && (__cplusplus >= 201103L)
74 #include <cstddef>
75 #include <cstdint>
76#elif !defined(__OPENCL_VERSION__)
77 #include <stddef.h>
Marat Dukhan0525a852014-12-13 15:48:12 -050078 #include <stdint.h>
79#endif
80
81#if defined(__GNUC__)
82 #define PSIMD_HAVE_F64 0
83 #define PSIMD_HAVE_F32 1
84 #define PSIMD_HAVE_U8 1
85 #define PSIMD_HAVE_S8 1
86 #define PSIMD_HAVE_U16 1
87 #define PSIMD_HAVE_S16 1
88 #define PSIMD_HAVE_U32 1
89 #define PSIMD_HAVE_S32 1
90 #define PSIMD_HAVE_U64 0
91 #define PSIMD_HAVE_S64 0
92
Marat Dukhan0525a852014-12-13 15:48:12 -050093 typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
Marat Dukhan0e589252017-02-17 02:57:08 -050094 typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
Marat Dukhan0525a852014-12-13 15:48:12 -050095 typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
Marat Dukhan0e589252017-02-17 02:57:08 -050096 typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
Marat Dukhan0525a852014-12-13 15:48:12 -050097 typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
Marat Dukhan0e589252017-02-17 02:57:08 -050098 typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
99 typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));
100
101 typedef struct {
102 psimd_s8 lo;
103 psimd_s8 hi;
104 } psimd_s8x2;
105
106 typedef struct {
107 psimd_u8 lo;
108 psimd_u8 hi;
109 } psimd_u8x2;
110
111 typedef struct {
112 psimd_s16 lo;
113 psimd_s16 hi;
114 } psimd_s16x2;
115
116 typedef struct {
117 psimd_u16 lo;
118 psimd_u16 hi;
119 } psimd_u16x2;
120
121 typedef struct {
122 psimd_s32 lo;
123 psimd_s32 hi;
124 } psimd_s32x2;
125
126 typedef struct {
127 psimd_u32 lo;
128 psimd_u32 hi;
129 } psimd_u32x2;
130
131 typedef struct {
132 psimd_f32 lo;
133 psimd_f32 hi;
134 } psimd_f32x2;
135
136 /* Bit casts */
137 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
138 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
139 }
140
141 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
142 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
143 }
144
145 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
146 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
147 }
148
149 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
150 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
151 }
152
153 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
154 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
155 }
156
157 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
158 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
159 }
Marat Dukhan0525a852014-12-13 15:48:12 -0500160
Marat Dukhan69138482017-02-22 09:21:10 -0500161 /* Swap */
162 PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
163 const psimd_s8 new_a = *b;
164 const psimd_s8 new_b = *a;
165 *a = new_a;
166 *b = new_b;
167 }
168
169 PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
170 const psimd_u8 new_a = *b;
171 const psimd_u8 new_b = *a;
172 *a = new_a;
173 *b = new_b;
174 }
175
176 PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
177 const psimd_s16 new_a = *b;
178 const psimd_s16 new_b = *a;
179 *a = new_a;
180 *b = new_b;
181 }
182
183 PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
184 const psimd_u16 new_a = *b;
185 const psimd_u16 new_b = *a;
186 *a = new_a;
187 *b = new_b;
188 }
189
190 PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
191 const psimd_s32 new_a = *b;
192 const psimd_s32 new_b = *a;
193 *a = new_a;
194 *b = new_b;
195 }
196
197 PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
198 const psimd_u32 new_a = *b;
199 const psimd_u32 new_b = *a;
200 *a = new_a;
201 *b = new_b;
202 }
203
204 PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
205 const psimd_f32 new_a = *b;
206 const psimd_f32 new_b = *a;
207 *a = new_a;
208 *b = new_b;
209 }
210
Marat Dukhan0525a852014-12-13 15:48:12 -0500211 /* Zero-initialization */
212 PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
213 return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
214 }
215
216 PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
217 return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
218 }
219
220 PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
221 return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
222 }
223
224 PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
225 return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
226 }
227
228 PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
229 return (psimd_s32) { 0, 0, 0, 0 };
230 }
231
232 PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
233 return (psimd_u32) { 0, 0, 0, 0 };
234 }
235
236 PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
237 return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
238 }
239
240 /* Initialization to the same constant */
241 PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
242 return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
243 }
244
245 PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
246 return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
247 }
248
249 PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
250 return (psimd_s16) { c, c, c, c, c, c, c, c };
251 }
252
253 PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
254 return (psimd_u16) { c, c, c, c, c, c, c, c };
255 }
256
257 PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
258 return (psimd_s32) { c, c, c, c };
259 }
260
261 PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
262 return (psimd_u32) { c, c, c, c };
263 }
264
265 PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
266 return (psimd_f32) { c, c, c, c };
267 }
268
269 /* Load vector */
270 PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
271 return *((const psimd_s8*) address);
272 }
273
274 PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
275 return *((const psimd_u8*) address);
276 }
277
278 PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
279 return *((const psimd_s16*) address);
280 }
281
282 PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
283 return *((const psimd_u16*) address);
284 }
285
286 PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
287 return *((const psimd_s32*) address);
288 }
289
290 PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
291 return *((const psimd_u32*) address);
292 }
293
294 PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
295 return *((const psimd_f32*) address);
296 }
297
Marat Dukhan19a380c2017-02-26 16:56:04 -0500298 PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
299 return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
300 }
301
302 PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
303 const float* address_f32 = (const float*) address;
304 return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
305 }
306
307 PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
308 const float* address_f32 = (const float*) address;
309 return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
310 }
311
312 PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
313 return psimd_load_f32(address);
314 }
315
316 PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
317 const psimd_f32 v0x1x = psimd_load_f32(address);
Marat Dukhan94f61c02017-03-22 09:49:27 -0400318 const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
Marat Dukhan19a380c2017-02-26 16:56:04 -0500319 #if defined(__clang__)
320 return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
321 #else
322 return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
323 #endif
324 }
325
326 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
327 return psimd_load_f32(address);
328 }
329
330 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
331 const float* address_f32 = (const float*) address;
332 return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
333 }
334
335 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
336 const psimd_f32 v0x1x = psimd_load_f32(address);
Marat Dukhan94f61c02017-03-22 09:49:27 -0400337 const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
Marat Dukhan19a380c2017-02-26 16:56:04 -0500338 #if defined(__clang__)
339 return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
340 #else
341 return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
342 #endif
343 }
344
345 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
346 return psimd_load_stride2_f32(address);
347 }
348
349 PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
350 const float* address0_f32 = (const float*) address;
351 const float* address1_f32 = address0_f32 + stride;
352 const float* address2_f32 = address1_f32 + stride;
353 const float* address3_f32 = address2_f32 + stride;
354 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
355 }
356
357 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
358 return psimd_load1_f32(address);
359 }
360
361 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
362 const float* address_f32 = (const float*) address;
363 return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
364 }
365
366 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
367 const float* address0_f32 = (const float*) address;
368 const float* address1_f32 = address0_f32 + stride;
369 const float* address2_f32 = address1_f32 + stride;
370 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
371 }
372
373 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
374 return psimd_load_stride_f32(address, stride);
375 }
376
Marat Dukhan0525a852014-12-13 15:48:12 -0500377 /* Store vector */
378 PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
379 *((psimd_s8*) address) = value;
380 }
381
382 PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
383 *((psimd_u8*) address) = value;
384 }
385
386 PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
387 *((psimd_s16*) address) = value;
388 }
389
390 PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
391 *((psimd_u16*) address) = value;
392 }
393
394 PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
395 *((psimd_s32*) address) = value;
396 }
397
398 PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
399 *((psimd_u32*) address) = value;
400 }
401
402 PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
403 *((psimd_f32*) address) = value;
404 }
405
Marat Dukhan19a380c2017-02-26 16:56:04 -0500406 PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
407 *((float*) address) = value[0];
408 }
409
410 PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
411 float* address_f32 = (float*) address;
412 address_f32[0] = value[0];
413 address_f32[1] = value[1];
414 }
415
416 PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
417 float* address_f32 = (float*) address;
418 address_f32[0] = value[0];
419 address_f32[1] = value[1];
420 address_f32[2] = value[2];
421 }
422
423 PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
424 psimd_store_f32(address, value);
425 }
426
427 PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
428 float* address0_f32 = (float*) address;
429 float* address1_f32 = address0_f32 + stride;
430 float* address2_f32 = address1_f32 + stride;
431 float* address3_f32 = address2_f32 + stride;
432 *address0_f32 = value[0];
433 *address1_f32 = value[1];
434 *address2_f32 = value[2];
435 *address3_f32 = value[3];
436 }
437
438 PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
439 psimd_store1_f32(address, value);
440 }
441
442 PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
443 float* address_f32 = (float*) address;
444 address_f32[0] = value[0];
445 address_f32[stride] = value[1];
446 }
447
448 PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
449 float* address0_f32 = (float*) address;
450 float* address1_f32 = address0_f32 + stride;
451 float* address2_f32 = address1_f32 + stride;
452 *address0_f32 = value[0];
453 *address1_f32 = value[1];
454 *address2_f32 = value[2];
455 }
456
Marat Dukhan0525a852014-12-13 15:48:12 -0500457 /* Vector addition */
458 PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
459 return a + b;
460 }
461
462 PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
463 return a + b;
464 }
465
466 PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
467 return a + b;
468 }
469
470 PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
471 return a + b;
472 }
473
474 PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
475 return a + b;
476 }
477
478 PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
479 return a + b;
480 }
481
482 PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan287f0702017-03-23 16:29:17 +0000483 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
484 return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
485 #else
486 return a + b;
487 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500488 }
489
490 /* Vector subtraction */
491 PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
492 return a - b;
493 }
494
495 PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
496 return a - b;
497 }
498
499 PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
500 return a - b;
501 }
502
503 PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
504 return a - b;
505 }
506
507 PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
508 return a - b;
509 }
510
511 PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
512 return a - b;
513 }
514
515 PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan287f0702017-03-23 16:29:17 +0000516 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
517 return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
518 #else
519 return a - b;
520 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500521 }
522
523 /* Vector multiplication */
524 PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
525 return a * b;
526 }
527
528 PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
529 return a * b;
530 }
531
532 PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
533 return a * b;
534 }
535
536 PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
537 return a * b;
538 }
539
540 PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
541 return a * b;
542 }
543
544 PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
545 return a * b;
546 }
547
548 PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan287f0702017-03-23 16:29:17 +0000549 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
550 return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
551 #else
552 return a * b;
553 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500554 }
555
Marat Dukhan4a916292017-02-22 09:23:23 -0500556 /* Vector and */
557 PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
558 return (psimd_f32) (mask & (psimd_s32) v);
559 }
560
Marat Dukhan0525a852014-12-13 15:48:12 -0500561 /* Vector blend */
562 PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300563 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000564 return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
565 #else
566 return (mask & a) | (~mask & b);
567 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500568 }
569
570 PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_u8 mask, psimd_u8 a, psimd_u8 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300571 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000572 return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
573 #else
574 return (mask & a) | (~mask & b);
575 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500576 }
577
578 PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300579 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000580 return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
581 #else
582 return (mask & a) | (~mask & b);
583 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500584 }
585
586 PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_u16 mask, psimd_u16 a, psimd_u16 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300587 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000588 return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
589 #else
590 return (mask & a) | (~mask & b);
591 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500592 }
593
594 PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300595 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000596 return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
597 #else
598 return (mask & a) | (~mask & b);
599 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500600 }
601
602 PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_u32 mask, psimd_u32 a, psimd_u32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300603 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000604 return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
605 #else
606 return (mask & a) | (~mask & b);
607 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500608 }
609
610 PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300611 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhand9dff1f2017-03-23 18:07:02 +0000612 return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
613 #else
614 return (psimd_f32) psimd_blend_s32(mask, (psimd_s32) a, (psimd_s32) b);
615 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500616 }
617
Marat Dukhan95c5be72017-02-22 09:23:06 -0500618 /* Vector blend on sign */
619 PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
620 return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
621 }
622
623 PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
624 return psimd_blend_u8((psimd_u8) (x >> psimd_splat_s8(7)), a, b);
625 }
626
627 PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
628 return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
629 }
630
631 PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
632 return psimd_blend_u16((psimd_u16) (x >> psimd_splat_s16(15)), a, b);
633 }
634
635 PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
636 return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
637 }
638
639 PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
640 return psimd_blend_u32((psimd_u32) (x >> psimd_splat_s32(31)), a, b);
641 }
642
643 PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
644 const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
645 return psimd_blend_f32(mask, a, b);
646 }
647
Marat Dukhan0525a852014-12-13 15:48:12 -0500648 /* Vector absolute value */
649 PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
650 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
651 return (psimd_f32) ((psimd_s32) v & mask);
652 }
653
654 /* Vector negation */
655 PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
656 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
657 return (psimd_f32) ((psimd_s32) v ^ mask);
658 }
659
660 /* Vector maximum */
661 PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300662 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000663 return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
664 #else
665 return psimd_blend_s8(a > b, a, b);
666 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500667 }
668
669 PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300670 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000671 return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
672 #else
673 return psimd_blend_u8(a > b, a, b);
674 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500675 }
676
677 PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300678 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000679 return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
680 #else
681 return psimd_blend_s16(a > b, a, b);
682 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500683 }
684
685 PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300686 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000687 return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
688 #else
689 return psimd_blend_u16(a > b, a, b);
690 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500691 }
692
693 PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300694 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000695 return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
696 #else
697 return psimd_blend_s32(a > b, a, b);
698 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500699 }
700
701 PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300702 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000703 return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
704 #else
705 return psimd_blend_u32(a > b, a, b);
706 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500707 }
708
709 PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300710 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000711 return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
712 #else
713 return psimd_blend_f32(a > b, a, b);
714 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500715 }
716
717 /* Vector minimum */
718 PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300719 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000720 return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
721 #else
722 return psimd_blend_s8(a < b, a, b);
723 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500724 }
725
726 PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300727 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000728 return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
729 #else
730 return psimd_blend_u8(a < b, a, b);
731 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500732 }
733
734 PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300735 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000736 return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
737 #else
738 return psimd_blend_s16(a < b, a, b);
739 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500740 }
741
742 PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300743 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000744 return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
745 #else
746 return psimd_blend_u16(a < b, a, b);
747 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500748 }
749
750 PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300751 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000752 return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
753 #else
754 return psimd_blend_s32(a < b, a, b);
755 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500756 }
757
758 PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300759 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000760 return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
761 #else
762 return psimd_blend_u32(a < b, a, b);
763 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500764 }
765
766 PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan0abf73c2018-09-06 18:40:13 +0300767 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
Marat Dukhan287f0702017-03-23 16:29:17 +0000768 return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
769 #else
770 return psimd_blend_f32(a < b, a, b);
771 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500772 }
773
Marat Dukhan9efe01e2018-09-06 18:49:25 +0300774 PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
775 #if defined(__clang__)
776 return __builtin_convertvector(v, psimd_f32);
777 #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
778 return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
779 #elif defined(__SSE2__)
780 return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
781 #else
782 return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
783 #endif
784 }
785
Marat Dukhan0b26a3f2017-04-15 05:51:43 -0400786 /* Broadcast vector element */
787 #if defined(__clang__)
788 PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
789 return __builtin_shufflevector(v, v, 0, 0, 0, 0);
790 }
791
792 PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
793 return __builtin_shufflevector(v, v, 1, 1, 1, 1);
794 }
795
796 PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
797 return __builtin_shufflevector(v, v, 2, 2, 2, 2);
798 }
799
800 PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
801 return __builtin_shufflevector(v, v, 3, 3, 3, 3);
802 }
803 #else
804 PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
805 return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
806 }
807
808 PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
809 return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
810 }
811
812 PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
813 return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
814 }
815
816 PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
817 return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
818 }
819 #endif
820
Marat Dukhan6d9ce9d2017-04-14 01:20:18 +0000821 /* Reversal of vector elements */
822 #if defined(__clang__)
823 PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
824 return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
825 }
826
827 PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
828 return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
829 }
830
831 PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
832 return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
833 }
834
835 PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
836 return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
837 }
838
839 PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
840 return __builtin_shufflevector(v, v, 3, 2, 1, 0);
841 }
842
843 PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
844 return __builtin_shufflevector(v, v, 3, 2, 1, 0);
845 }
846
847 PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
848 return __builtin_shufflevector(v, v, 3, 2, 1, 0);
849 }
850 #else
851 PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
852 return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
853 }
854
855 PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
856 return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
857 }
858
859 PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
860 return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
861 }
862
863 PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
864 return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
865 }
866
867 PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
868 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
869 }
870
871 PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
872 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
873 }
874
875 PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
876 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
877 }
878 #endif
879
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000880 /* Interleaving of vector elements */
Marat Dukhan0525a852014-12-13 15:48:12 -0500881 #if defined(__clang__)
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000882 PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500883 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
884 }
885
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000886 PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500887 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
888 }
889
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000890 PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500891 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
892 }
893
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000894 PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500895 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
896 }
897
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000898 PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500899 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
900 }
901
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000902 PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500903 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
904 }
905
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000906 PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500907 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
908 }
909
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000910 PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500911 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
912 }
913
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000914 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500915 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
916 }
917
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000918 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500919 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
920 }
921 #else
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000922 PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500923 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
924 }
925
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000926 PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500927 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
928 }
929
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000930 PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500931 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
932 }
933
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000934 PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500935 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
936 }
937
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000938 PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500939 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
940 }
941
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000942 PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500943 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
944 }
945
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000946 PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500947 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
948 }
949
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000950 PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500951 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
952 }
953
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000954 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500955 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
956 }
957
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000958 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
Marat Dukhan0525a852014-12-13 15:48:12 -0500959 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
960 }
961 #endif
Marat Dukhan457042c2017-02-22 04:23:57 -0500962
Marat Dukhan0be0b6e2017-04-14 01:09:22 +0000963 /* Concatenation of low/high vector elements */
964 #if defined(__clang__)
965 PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
966 return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
967 }
968
969 PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
970 return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
971 }
972
973 PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
974 return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
975 }
976
977 PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
978 return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
979 }
980
981 PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
982 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
983 }
984
985 PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
986 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
987 }
988
989 PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
990 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
991 }
992
993 PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
994 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
995 }
996
997 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
998 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
999 }
1000
1001 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1002 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1003 }
1004 #else
1005 PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1006 return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1007 }
1008
1009 PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1010 return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1011 }
1012
1013 PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1014 return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1015 }
1016
1017 PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1018 return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1019 }
1020
1021 PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1022 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1023 }
1024
1025 PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1026 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1027 }
1028
1029 PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1030 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1031 }
1032
1033 PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1034 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1035 }
1036
1037 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1038 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1039 }
1040
1041 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1042 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1043 }
1044 #endif
1045
1046 /* Concatenation of even/odd vector elements */
1047 #if defined(__clang__)
Marat Dukhanc7f1f992018-09-06 18:56:18 +03001048 PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1049 return __builtin_shufflevector(a, b,
1050 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1051 }
1052
1053 PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1054 return __builtin_shufflevector(a, b,
1055 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1056 }
1057
1058 PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1059 return __builtin_shufflevector(a, b,
1060 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1061 }
1062
1063 PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1064 return __builtin_shufflevector(a, b,
1065 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1066 }
1067
Marat Dukhan0be0b6e2017-04-14 01:09:22 +00001068 PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1069 return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1070 }
1071
1072 PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1073 return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1074 }
1075
1076 PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1077 return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1078 }
1079
1080 PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1081 return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1082 }
1083
1084 PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1085 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1086 }
1087
1088 PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1089 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1090 }
1091
1092 PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1093 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1094 }
1095
1096 PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1097 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1098 }
1099
1100 PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1101 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1102 }
1103
1104 PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1105 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1106 }
1107 #else
Marat Dukhanc7f1f992018-09-06 18:56:18 +03001108 PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1109 return __builtin_shuffle(a, b,
1110 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1111 }
1112
1113 PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1114 return __builtin_shuffle(a, b,
1115 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1116 }
1117
1118 PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1119 return __builtin_shuffle(a, b,
1120 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1121 }
1122
1123 PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1124 return __builtin_shuffle(a, b,
1125 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1126 }
1127
Marat Dukhan0be0b6e2017-04-14 01:09:22 +00001128 PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1129 return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1130 }
1131
1132 PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1133 return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1134 }
1135
1136 PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1137 return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1138 }
1139
1140 PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1141 return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1142 }
1143
1144 PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1145 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1146 }
1147
1148 PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1149 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1150 }
1151
1152 PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1153 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1154 }
1155
1156 PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1157 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1158 }
1159
1160 PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1161 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1162 }
1163
1164 PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1165 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1166 }
1167 #endif
1168
Marat Dukhan457042c2017-02-22 04:23:57 -05001169 /* Vector reduce */
1170 #if defined(__clang__)
1171 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1172 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
1173 return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
1174 }
1175
1176 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1177 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1178 return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1179 }
1180
1181 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1182 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1183 return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1184 }
1185
1186 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1187 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
1188 const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
1189 return result[0];
1190 }
1191
1192 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1193 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1194 const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1195 return result[0];
1196 }
1197
1198 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1199 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1200 const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1201 return result[0];
1202 }
1203 #else
1204 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1205 const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
1206 return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
1207 }
1208
1209 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1210 const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1211 return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1212 }
1213
1214 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1215 const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1216 return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1217 }
1218
1219 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1220 const psimd_f32 result = psimd_allreduce_sum_f32(v);
1221 return result[0];
1222 }
1223
1224 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1225 const psimd_f32 result = psimd_allreduce_max_f32(v);
1226 return result[0];
1227 }
1228
1229 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1230 const psimd_f32 result = psimd_allreduce_min_f32(v);
1231 return result[0];
1232 }
1233 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -05001234#endif
1235
1236#endif /* PSIMD_H */