blob: 807a65c9f5ce70d3ff212ade65f03b4265258bd0 [file] [log] [blame]
Marat Dukhan0525a852014-12-13 15:48:12 -05001#pragma once
2#ifndef PSIMD_H
3#define PSIMD_H
4
/*
 * PSIMD_INTRINSIC: storage-class / inlining specifiers placed in front of
 * every function defined in this header. The first matching toolchain wins,
 * so the order of the checks below is significant.
 */
#if defined(__CUDA_ARCH__)
	/* CUDA device compilation */
	#define PSIMD_INTRINSIC __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
	/* OpenCL C */
	#define PSIMD_INTRINSIC inline static
#elif defined(__INTEL_COMPILER) || defined(__GNUC__)
	/* Intel compiler (even on Windows) and GCC-compatible compilers (gcc/clang/icc) */
	#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(_MSC_VER)
	/* MSVC-compatible compilers (cl/icl/clang-cl) */
	#define PSIMD_INTRINSIC __forceinline static
#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))
	/* Any other C++ compiler, or any other C compiler with C99 inline */
	#define PSIMD_INTRINSIC inline static
#else
	/* Pre-C99 C compiler: no inline keyword available */
	#define PSIMD_INTRINSIC static
#endif
30
/* Exactly one PSIMD_*_SYNTAX macro is defined, naming the language dialect in use. */
#if defined(__cplusplus)
	#define PSIMD_CXX_SYNTAX
#elif !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L)
	#define PSIMD_C89_SYNTAX
#elif __STDC_VERSION__ < 201112L
	#define PSIMD_C99_SYNTAX
#else
	#define PSIMD_C11_SYNTAX
#endif
40
#include <stddef.h>

#ifndef _MSC_VER
	#include <stdint.h>
#endif
44
45#if defined(__GNUC__)
46 #define PSIMD_HAVE_F64 0
47 #define PSIMD_HAVE_F32 1
48 #define PSIMD_HAVE_U8 1
49 #define PSIMD_HAVE_S8 1
50 #define PSIMD_HAVE_U16 1
51 #define PSIMD_HAVE_S16 1
52 #define PSIMD_HAVE_U32 1
53 #define PSIMD_HAVE_S32 1
54 #define PSIMD_HAVE_U64 0
55 #define PSIMD_HAVE_S64 0
56
Marat Dukhan0525a852014-12-13 15:48:12 -050057 typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
Marat Dukhan0e589252017-02-17 02:57:08 -050058 typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
Marat Dukhan0525a852014-12-13 15:48:12 -050059 typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
Marat Dukhan0e589252017-02-17 02:57:08 -050060 typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
Marat Dukhan0525a852014-12-13 15:48:12 -050061 typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
Marat Dukhan0e589252017-02-17 02:57:08 -050062 typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
63 typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));
64
65 typedef struct {
66 psimd_s8 lo;
67 psimd_s8 hi;
68 } psimd_s8x2;
69
70 typedef struct {
71 psimd_u8 lo;
72 psimd_u8 hi;
73 } psimd_u8x2;
74
75 typedef struct {
76 psimd_s16 lo;
77 psimd_s16 hi;
78 } psimd_s16x2;
79
80 typedef struct {
81 psimd_u16 lo;
82 psimd_u16 hi;
83 } psimd_u16x2;
84
85 typedef struct {
86 psimd_s32 lo;
87 psimd_s32 hi;
88 } psimd_s32x2;
89
90 typedef struct {
91 psimd_u32 lo;
92 psimd_u32 hi;
93 } psimd_u32x2;
94
95 typedef struct {
96 psimd_f32 lo;
97 psimd_f32 hi;
98 } psimd_f32x2;
99
100 /* Bit casts */
101 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
102 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
103 }
104
105 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
106 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
107 }
108
109 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
110 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
111 }
112
113 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
114 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
115 }
116
117 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
118 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
119 }
120
121 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
122 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
123 }
Marat Dukhan0525a852014-12-13 15:48:12 -0500124
Marat Dukhan69138482017-02-22 09:21:10 -0500125 /* Swap */
126 PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
127 const psimd_s8 new_a = *b;
128 const psimd_s8 new_b = *a;
129 *a = new_a;
130 *b = new_b;
131 }
132
133 PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
134 const psimd_u8 new_a = *b;
135 const psimd_u8 new_b = *a;
136 *a = new_a;
137 *b = new_b;
138 }
139
140 PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
141 const psimd_s16 new_a = *b;
142 const psimd_s16 new_b = *a;
143 *a = new_a;
144 *b = new_b;
145 }
146
147 PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
148 const psimd_u16 new_a = *b;
149 const psimd_u16 new_b = *a;
150 *a = new_a;
151 *b = new_b;
152 }
153
154 PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
155 const psimd_s32 new_a = *b;
156 const psimd_s32 new_b = *a;
157 *a = new_a;
158 *b = new_b;
159 }
160
161 PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
162 const psimd_u32 new_a = *b;
163 const psimd_u32 new_b = *a;
164 *a = new_a;
165 *b = new_b;
166 }
167
168 PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
169 const psimd_f32 new_a = *b;
170 const psimd_f32 new_b = *a;
171 *a = new_a;
172 *b = new_b;
173 }
174
Marat Dukhan0525a852014-12-13 15:48:12 -0500175 /* Zero-initialization */
176 PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
177 return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
178 }
179
180 PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
181 return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
182 }
183
184 PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
185 return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
186 }
187
188 PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
189 return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
190 }
191
192 PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
193 return (psimd_s32) { 0, 0, 0, 0 };
194 }
195
196 PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
197 return (psimd_u32) { 0, 0, 0, 0 };
198 }
199
200 PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
201 return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
202 }
203
204 /* Initialization to the same constant */
205 PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
206 return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
207 }
208
209 PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
210 return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
211 }
212
213 PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
214 return (psimd_s16) { c, c, c, c, c, c, c, c };
215 }
216
217 PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
218 return (psimd_u16) { c, c, c, c, c, c, c, c };
219 }
220
221 PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
222 return (psimd_s32) { c, c, c, c };
223 }
224
225 PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
226 return (psimd_u32) { c, c, c, c };
227 }
228
229 PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
230 return (psimd_f32) { c, c, c, c };
231 }
232
233 /* Load vector */
234 PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
235 return *((const psimd_s8*) address);
236 }
237
238 PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
239 return *((const psimd_u8*) address);
240 }
241
242 PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
243 return *((const psimd_s16*) address);
244 }
245
246 PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
247 return *((const psimd_u16*) address);
248 }
249
250 PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
251 return *((const psimd_s32*) address);
252 }
253
254 PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
255 return *((const psimd_u32*) address);
256 }
257
258 PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
259 return *((const psimd_f32*) address);
260 }
261
Marat Dukhan19a380c2017-02-26 16:56:04 -0500262 PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
263 return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
264 }
265
266 PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
267 const float* address_f32 = (const float*) address;
268 return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
269 }
270
271 PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
272 const float* address_f32 = (const float*) address;
273 return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
274 }
275
276 PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
277 return psimd_load_f32(address);
278 }
279
280 PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
281 const psimd_f32 v0x1x = psimd_load_f32(address);
282 const psimd_f32 vx2x3 = psimd_load_f32(address + 3 * sizeof(float));
283 #if defined(__clang__)
284 return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
285 #else
286 return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
287 #endif
288 }
289
290 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
291 return psimd_load_f32(address);
292 }
293
294 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
295 const float* address_f32 = (const float*) address;
296 return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
297 }
298
299 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
300 const psimd_f32 v0x1x = psimd_load_f32(address);
301 const psimd_f32 v2zzz = psimd_load1_f32(address + 2 * sizeof(float));
302 #if defined(__clang__)
303 return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
304 #else
305 return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
306 #endif
307 }
308
309 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
310 return psimd_load_stride2_f32(address);
311 }
312
313 PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
314 const float* address0_f32 = (const float*) address;
315 const float* address1_f32 = address0_f32 + stride;
316 const float* address2_f32 = address1_f32 + stride;
317 const float* address3_f32 = address2_f32 + stride;
318 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
319 }
320
321 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
322 return psimd_load1_f32(address);
323 }
324
325 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
326 const float* address_f32 = (const float*) address;
327 return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
328 }
329
330 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
331 const float* address0_f32 = (const float*) address;
332 const float* address1_f32 = address0_f32 + stride;
333 const float* address2_f32 = address1_f32 + stride;
334 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
335 }
336
337 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
338 return psimd_load_stride_f32(address, stride);
339 }
340
Marat Dukhan0525a852014-12-13 15:48:12 -0500341 /* Store vector */
342 PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
343 *((psimd_s8*) address) = value;
344 }
345
346 PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
347 *((psimd_u8*) address) = value;
348 }
349
350 PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
351 *((psimd_s16*) address) = value;
352 }
353
354 PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
355 *((psimd_u16*) address) = value;
356 }
357
358 PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
359 *((psimd_s32*) address) = value;
360 }
361
362 PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
363 *((psimd_u32*) address) = value;
364 }
365
366 PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
367 *((psimd_f32*) address) = value;
368 }
369
Marat Dukhan19a380c2017-02-26 16:56:04 -0500370 PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
371 *((float*) address) = value[0];
372 }
373
374 PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
375 float* address_f32 = (float*) address;
376 address_f32[0] = value[0];
377 address_f32[1] = value[1];
378 }
379
380 PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
381 float* address_f32 = (float*) address;
382 address_f32[0] = value[0];
383 address_f32[1] = value[1];
384 address_f32[2] = value[2];
385 }
386
387 PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
388 psimd_store_f32(address, value);
389 }
390
391 PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
392 float* address0_f32 = (float*) address;
393 float* address1_f32 = address0_f32 + stride;
394 float* address2_f32 = address1_f32 + stride;
395 float* address3_f32 = address2_f32 + stride;
396 *address0_f32 = value[0];
397 *address1_f32 = value[1];
398 *address2_f32 = value[2];
399 *address3_f32 = value[3];
400 }
401
402 PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
403 psimd_store1_f32(address, value);
404 }
405
406 PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
407 float* address_f32 = (float*) address;
408 address_f32[0] = value[0];
409 address_f32[stride] = value[1];
410 }
411
412 PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
413 float* address0_f32 = (float*) address;
414 float* address1_f32 = address0_f32 + stride;
415 float* address2_f32 = address1_f32 + stride;
416 *address0_f32 = value[0];
417 *address1_f32 = value[1];
418 *address2_f32 = value[2];
419 }
420
Marat Dukhan0525a852014-12-13 15:48:12 -0500421 /* Vector addition */
422 PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
423 return a + b;
424 }
425
426 PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
427 return a + b;
428 }
429
430 PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
431 return a + b;
432 }
433
434 PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
435 return a + b;
436 }
437
438 PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
439 return a + b;
440 }
441
442 PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
443 return a + b;
444 }
445
446 PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
447 return a + b;
448 }
449
450 /* Vector subtraction */
451 PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
452 return a - b;
453 }
454
455 PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
456 return a - b;
457 }
458
459 PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
460 return a - b;
461 }
462
463 PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
464 return a - b;
465 }
466
467 PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
468 return a - b;
469 }
470
471 PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
472 return a - b;
473 }
474
475 PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
476 return a - b;
477 }
478
479 /* Vector multiplication */
480 PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
481 return a * b;
482 }
483
484 PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
485 return a * b;
486 }
487
488 PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
489 return a * b;
490 }
491
492 PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
493 return a * b;
494 }
495
496 PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
497 return a * b;
498 }
499
500 PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
501 return a * b;
502 }
503
504 PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
505 return a * b;
506 }
507
Marat Dukhan4a916292017-02-22 09:23:23 -0500508 /* Vector and */
509 PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
510 return (psimd_f32) (mask & (psimd_s32) v);
511 }
512
Marat Dukhan0525a852014-12-13 15:48:12 -0500513 /* Vector blend */
514 PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
515 return (mask & a) | (~mask & b);
516 }
517
518 PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_u8 mask, psimd_u8 a, psimd_u8 b) {
519 return (mask & a) | (~mask & b);
520 }
521
522 PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
523 return (mask & a) | (~mask & b);
524 }
525
526 PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_u16 mask, psimd_u16 a, psimd_u16 b) {
527 return (mask & a) | (~mask & b);
528 }
529
530 PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
531 return (mask & a) | (~mask & b);
532 }
533
534 PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_u32 mask, psimd_u32 a, psimd_u32 b) {
535 return (mask & a) | (~mask & b);
536 }
537
538 PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
539 return (psimd_f32) psimd_blend_s32(mask, (psimd_s32) a, (psimd_s32) b);
540 }
541
Marat Dukhan95c5be72017-02-22 09:23:06 -0500542 /* Vector blend on sign */
543 PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
544 return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
545 }
546
547 PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
548 return psimd_blend_u8((psimd_u8) (x >> psimd_splat_s8(7)), a, b);
549 }
550
551 PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
552 return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
553 }
554
555 PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
556 return psimd_blend_u16((psimd_u16) (x >> psimd_splat_s16(15)), a, b);
557 }
558
559 PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
560 return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
561 }
562
563 PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
564 return psimd_blend_u32((psimd_u32) (x >> psimd_splat_s32(31)), a, b);
565 }
566
567 PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
568 const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
569 return psimd_blend_f32(mask, a, b);
570 }
571
Marat Dukhan0525a852014-12-13 15:48:12 -0500572 /* Vector absolute value */
573 PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
574 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
575 return (psimd_f32) ((psimd_s32) v & mask);
576 }
577
578 /* Vector negation */
579 PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
580 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
581 return (psimd_f32) ((psimd_s32) v ^ mask);
582 }
583
584 /* Vector maximum */
585 PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
586 return psimd_blend_s8(a > b, a, b);
587 }
588
589 PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
590 return psimd_blend_u8(a > b, a, b);
591 }
592
593 PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
594 return psimd_blend_s16(a > b, a, b);
595 }
596
597 PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
598 return psimd_blend_u16(a > b, a, b);
599 }
600
601 PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
602 return psimd_blend_s32(a > b, a, b);
603 }
604
605 PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
606 return psimd_blend_u32(a > b, a, b);
607 }
608
609 PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
610 return psimd_blend_f32(a > b, a, b);
611 }
612
613 /* Vector minimum */
614 PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
615 return psimd_blend_s8(a < b, a, b);
616 }
617
618 PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
619 return psimd_blend_u8(a < b, a, b);
620 }
621
622 PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
623 return psimd_blend_s16(a < b, a, b);
624 }
625
626 PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
627 return psimd_blend_u16(a < b, a, b);
628 }
629
630 PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
631 return psimd_blend_s32(a < b, a, b);
632 }
633
634 PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
635 return psimd_blend_u32(a < b, a, b);
636 }
637
638 PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
639 return psimd_blend_f32(a < b, a, b);
640 }
641
642 /* Vector unpack */
643 #if defined(__clang__)
644 PSIMD_INTRINSIC psimd_s16 psimd_unpacklo_s16(psimd_s16 a, psimd_s16 b) {
645 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
646 }
647
648 PSIMD_INTRINSIC psimd_s16 psimd_unpackhi_s16(psimd_s16 a, psimd_s16 b) {
649 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
650 }
651
652 PSIMD_INTRINSIC psimd_u16 psimd_unpacklo_u16(psimd_u16 a, psimd_u16 b) {
653 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
654 }
655
656 PSIMD_INTRINSIC psimd_u16 psimd_unpackhi_u16(psimd_u16 a, psimd_u16 b) {
657 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
658 }
659
660 PSIMD_INTRINSIC psimd_s32 psimd_unpacklo_s32(psimd_s32 a, psimd_s32 b) {
661 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
662 }
663
664 PSIMD_INTRINSIC psimd_s32 psimd_unpackhi_s32(psimd_s32 a, psimd_s32 b) {
665 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
666 }
667
668 PSIMD_INTRINSIC psimd_u32 psimd_unpacklo_u32(psimd_u32 a, psimd_u32 b) {
669 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
670 }
671
672 PSIMD_INTRINSIC psimd_u32 psimd_unpackhi_u32(psimd_u32 a, psimd_u32 b) {
673 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
674 }
675
676 PSIMD_INTRINSIC psimd_f32 psimd_unpacklo_f32(psimd_f32 a, psimd_f32 b) {
677 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
678 }
679
680 PSIMD_INTRINSIC psimd_f32 psimd_unpackhi_f32(psimd_f32 a, psimd_f32 b) {
681 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
682 }
683 #else
684 PSIMD_INTRINSIC psimd_s16 psimd_unpacklo_s16(psimd_s16 a, psimd_s16 b) {
685 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
686 }
687
688 PSIMD_INTRINSIC psimd_s16 psimd_unpackhi_s16(psimd_s16 a, psimd_s16 b) {
689 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
690 }
691
692 PSIMD_INTRINSIC psimd_u16 psimd_unpacklo_u16(psimd_u16 a, psimd_u16 b) {
693 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
694 }
695
696 PSIMD_INTRINSIC psimd_u16 psimd_unpackhi_u16(psimd_u16 a, psimd_u16 b) {
697 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
698 }
699
700 PSIMD_INTRINSIC psimd_s32 psimd_unpacklo_s32(psimd_s32 a, psimd_s32 b) {
701 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
702 }
703
704 PSIMD_INTRINSIC psimd_s32 psimd_unpackhi_s32(psimd_s32 a, psimd_s32 b) {
705 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
706 }
707
708 PSIMD_INTRINSIC psimd_u32 psimd_unpacklo_u32(psimd_u32 a, psimd_u32 b) {
709 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
710 }
711
712 PSIMD_INTRINSIC psimd_u32 psimd_unpackhi_u32(psimd_u32 a, psimd_u32 b) {
713 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
714 }
715
716 PSIMD_INTRINSIC psimd_f32 psimd_unpacklo_f32(psimd_f32 a, psimd_f32 b) {
717 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
718 }
719
720 PSIMD_INTRINSIC psimd_f32 psimd_unpackhi_f32(psimd_f32 a, psimd_f32 b) {
721 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
722 }
723 #endif
Marat Dukhan457042c2017-02-22 04:23:57 -0500724
725 /* Vector reduce */
726 #if defined(__clang__)
727 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
728 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
729 return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
730 }
731
732 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
733 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
734 return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
735 }
736
737 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
738 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
739 return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
740 }
741
742 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
743 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
744 const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
745 return result[0];
746 }
747
748 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
749 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
750 const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
751 return result[0];
752 }
753
754 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
755 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
756 const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
757 return result[0];
758 }
759 #else
760 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
761 const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
762 return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
763 }
764
765 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
766 const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
767 return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
768 }
769
770 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
771 const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
772 return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
773 }
774
775 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
776 const psimd_f32 result = psimd_allreduce_sum_f32(v);
777 return result[0];
778 }
779
780 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
781 const psimd_f32 result = psimd_allreduce_max_f32(v);
782 return result[0];
783 }
784
785 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
786 const psimd_f32 result = psimd_allreduce_min_f32(v);
787 return result[0];
788 }
789 #endif
Marat Dukhan0525a852014-12-13 15:48:12 -0500790#endif
791
792#endif /* PSIMD_H */