Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1 | #pragma once |
| 2 | #ifndef PSIMD_H |
| 3 | #define PSIMD_H |
| 4 | |
| 5 | #if defined(__CUDA_ARCH__) |
| 6 | /* CUDA compiler */ |
| 7 | #define PSIMD_INTRINSIC __forceinline__ __device__ |
| 8 | #elif defined(__OPENCL_VERSION__) |
| 9 | /* OpenCL compiler */ |
| 10 | #define PSIMD_INTRINSIC inline static |
| 11 | #elif defined(__INTEL_COMPILER) |
| 12 | /* Intel compiler, even on Windows */ |
Marat Dukhan | c81e1a8 | 2017-02-22 09:16:09 -0500 | [diff] [blame] | 13 | #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 14 | #elif defined(__GNUC__) |
| 15 | /* GCC-compatible compiler (gcc/clang/icc) */ |
Marat Dukhan | c81e1a8 | 2017-02-22 09:16:09 -0500 | [diff] [blame] | 16 | #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 17 | #elif defined(_MSC_VER) |
| 18 | /* MSVC-compatible compiler (cl/icl/clang-cl) */ |
| 19 | #define PSIMD_INTRINSIC __forceinline static |
| 20 | #elif defined(__cplusplus) |
| 21 | /* Generic C++ compiler */ |
| 22 | #define PSIMD_INTRINSIC inline static |
| 23 | #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) |
| 24 | /* Generic C99 compiler */ |
| 25 | #define PSIMD_INTRINSIC inline static |
| 26 | #else |
| 27 | /* Generic C compiler */ |
| 28 | #define PSIMD_INTRINSIC static |
| 29 | #endif |
| 30 | |
Marat Dukhan | 85427dd | 2020-04-16 23:58:47 -0700 | [diff] [blame] | 31 | #if defined(__GNUC__) || defined(__clang__) |
Marat Dukhan | 90a938f | 2018-09-06 19:11:46 +0300 | [diff] [blame] | 32 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| 33 | #include <arm_neon.h> |
| 34 | #endif |
| 35 | |
| 36 | #if defined(__SSE2__) |
| 37 | #include <emmintrin.h> |
| 38 | #endif |
| 39 | |
| 40 | #if defined(__SSE3__) |
| 41 | #include <pmmintrin.h> |
| 42 | #endif |
| 43 | |
| 44 | #if defined(__SSSE3__) |
| 45 | #include <tmmintrin.h> |
| 46 | #endif |
| 47 | |
| 48 | #if defined(__SSE4_1__) |
| 49 | #include <smmintrin.h> |
| 50 | #endif |
| 51 | |
| 52 | #if defined(__SSE4_2__) |
| 53 | #include <nmmintrin.h> |
| 54 | #endif |
| 55 | |
| 56 | #if defined(__AVX__) |
| 57 | #include <immintrin.h> |
| 58 | #endif |
| 59 | #elif defined(_MSC_VER) |
| 60 | #include <intrin.h> |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 61 | #endif |
| 62 | |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 63 | #if defined(__cplusplus) |
| 64 | #define PSIMD_CXX_SYNTAX |
| 65 | #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) |
| 66 | #define PSIMD_C11_SYNTAX |
| 67 | #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) |
| 68 | #define PSIMD_C99_SYNTAX |
| 69 | #else |
| 70 | #define PSIMD_C89_SYNTAX |
| 71 | #endif |
| 72 | |
Marat Dukhan | 663e0aa | 2017-03-01 03:59:55 -0500 | [diff] [blame] | 73 | #if defined(__cplusplus) && (__cplusplus >= 201103L) |
| 74 | #include <cstddef> |
| 75 | #include <cstdint> |
| 76 | #elif !defined(__OPENCL_VERSION__) |
| 77 | #include <stddef.h> |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 78 | #include <stdint.h> |
| 79 | #endif |
| 80 | |
Marat Dukhan | 85427dd | 2020-04-16 23:58:47 -0700 | [diff] [blame] | 81 | #if defined(__GNUC__) || defined(__clang__) |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 82 | #define PSIMD_HAVE_F64 0 |
| 83 | #define PSIMD_HAVE_F32 1 |
| 84 | #define PSIMD_HAVE_U8 1 |
| 85 | #define PSIMD_HAVE_S8 1 |
| 86 | #define PSIMD_HAVE_U16 1 |
| 87 | #define PSIMD_HAVE_S16 1 |
| 88 | #define PSIMD_HAVE_U32 1 |
| 89 | #define PSIMD_HAVE_S32 1 |
| 90 | #define PSIMD_HAVE_U64 0 |
| 91 | #define PSIMD_HAVE_S64 0 |
| 92 | |
/*
 * 16-byte vector types built on the GCC/Clang vector_size extension.
 * Each type's alignment is relaxed to that of a single element, so a
 * vector may be accessed at any element-aligned address.
 */
typedef int8_t   psimd_s8  __attribute__((vector_size(16), aligned(1))); /* 16 x int8_t   */
typedef uint8_t  psimd_u8  __attribute__((vector_size(16), aligned(1))); /* 16 x uint8_t  */
typedef int16_t  psimd_s16 __attribute__((vector_size(16), aligned(2))); /*  8 x int16_t  */
typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2))); /*  8 x uint16_t */
typedef int32_t  psimd_s32 __attribute__((vector_size(16), aligned(4))); /*  4 x int32_t  */
typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4))); /*  4 x uint32_t */
typedef float    psimd_f32 __attribute__((vector_size(16), aligned(4))); /*  4 x float    */
| 100 | |
| 101 | typedef struct { |
| 102 | psimd_s8 lo; |
| 103 | psimd_s8 hi; |
| 104 | } psimd_s8x2; |
| 105 | |
| 106 | typedef struct { |
| 107 | psimd_u8 lo; |
| 108 | psimd_u8 hi; |
| 109 | } psimd_u8x2; |
| 110 | |
| 111 | typedef struct { |
| 112 | psimd_s16 lo; |
| 113 | psimd_s16 hi; |
| 114 | } psimd_s16x2; |
| 115 | |
| 116 | typedef struct { |
| 117 | psimd_u16 lo; |
| 118 | psimd_u16 hi; |
| 119 | } psimd_u16x2; |
| 120 | |
| 121 | typedef struct { |
| 122 | psimd_s32 lo; |
| 123 | psimd_s32 hi; |
| 124 | } psimd_s32x2; |
| 125 | |
| 126 | typedef struct { |
| 127 | psimd_u32 lo; |
| 128 | psimd_u32 hi; |
| 129 | } psimd_u32x2; |
| 130 | |
| 131 | typedef struct { |
| 132 | psimd_f32 lo; |
| 133 | psimd_f32 hi; |
| 134 | } psimd_f32x2; |
| 135 | |
| 136 | /* Bit casts */ |
| 137 | PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) { |
| 138 | return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi }; |
| 139 | } |
| 140 | |
| 141 | PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) { |
| 142 | return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi }; |
| 143 | } |
| 144 | |
| 145 | PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) { |
| 146 | return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi }; |
| 147 | } |
| 148 | |
| 149 | PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) { |
| 150 | return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi }; |
| 151 | } |
| 152 | |
| 153 | PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) { |
| 154 | return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi }; |
| 155 | } |
| 156 | |
| 157 | PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) { |
| 158 | return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi }; |
| 159 | } |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 160 | |
Marat Dukhan | 6913848 | 2017-02-22 09:21:10 -0500 | [diff] [blame] | 161 | /* Swap */ |
| 162 | PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) { |
| 163 | const psimd_s8 new_a = *b; |
| 164 | const psimd_s8 new_b = *a; |
| 165 | *a = new_a; |
| 166 | *b = new_b; |
| 167 | } |
| 168 | |
| 169 | PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) { |
| 170 | const psimd_u8 new_a = *b; |
| 171 | const psimd_u8 new_b = *a; |
| 172 | *a = new_a; |
| 173 | *b = new_b; |
| 174 | } |
| 175 | |
| 176 | PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) { |
| 177 | const psimd_s16 new_a = *b; |
| 178 | const psimd_s16 new_b = *a; |
| 179 | *a = new_a; |
| 180 | *b = new_b; |
| 181 | } |
| 182 | |
| 183 | PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) { |
| 184 | const psimd_u16 new_a = *b; |
| 185 | const psimd_u16 new_b = *a; |
| 186 | *a = new_a; |
| 187 | *b = new_b; |
| 188 | } |
| 189 | |
| 190 | PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) { |
| 191 | const psimd_s32 new_a = *b; |
| 192 | const psimd_s32 new_b = *a; |
| 193 | *a = new_a; |
| 194 | *b = new_b; |
| 195 | } |
| 196 | |
| 197 | PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) { |
| 198 | const psimd_u32 new_a = *b; |
| 199 | const psimd_u32 new_b = *a; |
| 200 | *a = new_a; |
| 201 | *b = new_b; |
| 202 | } |
| 203 | |
| 204 | PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) { |
| 205 | const psimd_f32 new_a = *b; |
| 206 | const psimd_f32 new_b = *a; |
| 207 | *a = new_a; |
| 208 | *b = new_b; |
| 209 | } |
| 210 | |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 211 | /* Zero-initialization */ |
| 212 | PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) { |
| 213 | return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 214 | } |
| 215 | |
| 216 | PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) { |
| 217 | return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 218 | } |
| 219 | |
| 220 | PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) { |
| 221 | return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 222 | } |
| 223 | |
| 224 | PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) { |
| 225 | return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 226 | } |
| 227 | |
| 228 | PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) { |
| 229 | return (psimd_s32) { 0, 0, 0, 0 }; |
| 230 | } |
| 231 | |
| 232 | PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) { |
| 233 | return (psimd_u32) { 0, 0, 0, 0 }; |
| 234 | } |
| 235 | |
| 236 | PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) { |
| 237 | return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f }; |
| 238 | } |
| 239 | |
| 240 | /* Initialization to the same constant */ |
| 241 | PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) { |
| 242 | return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }; |
| 243 | } |
| 244 | |
| 245 | PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) { |
| 246 | return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }; |
| 247 | } |
| 248 | |
| 249 | PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) { |
| 250 | return (psimd_s16) { c, c, c, c, c, c, c, c }; |
| 251 | } |
| 252 | |
| 253 | PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) { |
| 254 | return (psimd_u16) { c, c, c, c, c, c, c, c }; |
| 255 | } |
| 256 | |
| 257 | PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) { |
| 258 | return (psimd_s32) { c, c, c, c }; |
| 259 | } |
| 260 | |
| 261 | PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) { |
| 262 | return (psimd_u32) { c, c, c, c }; |
| 263 | } |
| 264 | |
| 265 | PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) { |
| 266 | return (psimd_f32) { c, c, c, c }; |
| 267 | } |
| 268 | |
| 269 | /* Load vector */ |
| 270 | PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) { |
| 271 | return *((const psimd_s8*) address); |
| 272 | } |
| 273 | |
| 274 | PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) { |
| 275 | return *((const psimd_u8*) address); |
| 276 | } |
| 277 | |
| 278 | PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) { |
| 279 | return *((const psimd_s16*) address); |
| 280 | } |
| 281 | |
| 282 | PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) { |
| 283 | return *((const psimd_u16*) address); |
| 284 | } |
| 285 | |
| 286 | PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) { |
| 287 | return *((const psimd_s32*) address); |
| 288 | } |
| 289 | |
| 290 | PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) { |
| 291 | return *((const psimd_u32*) address); |
| 292 | } |
| 293 | |
| 294 | PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) { |
| 295 | return *((const psimd_f32*) address); |
| 296 | } |
| 297 | |
Marat Dukhan | 4822bd4 | 2019-08-29 22:18:57 -0700 | [diff] [blame] | 298 | PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) { |
| 299 | return psimd_splat_s8(*((const int8_t*) address)); |
| 300 | } |
| 301 | |
| 302 | PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) { |
| 303 | return psimd_splat_u8(*((const uint8_t*) address)); |
| 304 | } |
| 305 | |
| 306 | PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) { |
| 307 | return psimd_splat_s16(*((const int16_t*) address)); |
| 308 | } |
| 309 | |
| 310 | PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) { |
| 311 | return psimd_splat_u16(*((const uint16_t*) address)); |
| 312 | } |
| 313 | |
| 314 | PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) { |
| 315 | return psimd_splat_s32(*((const int32_t*) address)); |
| 316 | } |
| 317 | |
| 318 | PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) { |
| 319 | return psimd_splat_u32(*((const uint32_t*) address)); |
| 320 | } |
| 321 | |
| 322 | PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) { |
| 323 | return psimd_splat_f32(*((const float*) address)); |
| 324 | } |
| 325 | |
Marat Dukhan | daa4ba3 | 2019-08-29 22:29:32 -0700 | [diff] [blame] | 326 | PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) { |
| 327 | return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 }; |
| 328 | } |
| 329 | |
| 330 | PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) { |
| 331 | return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 }; |
| 332 | } |
| 333 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 334 | PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) { |
| 335 | return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f }; |
| 336 | } |
| 337 | |
Marat Dukhan | daa4ba3 | 2019-08-29 22:29:32 -0700 | [diff] [blame] | 338 | PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) { |
| 339 | const int32_t* address_s32 = (const int32_t*) address; |
| 340 | return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 }; |
| 341 | } |
| 342 | |
| 343 | PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) { |
| 344 | const uint32_t* address_u32 = (const uint32_t*) address; |
| 345 | return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 }; |
| 346 | } |
| 347 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 348 | PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) { |
| 349 | const float* address_f32 = (const float*) address; |
| 350 | return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f }; |
| 351 | } |
| 352 | |
Marat Dukhan | daa4ba3 | 2019-08-29 22:29:32 -0700 | [diff] [blame] | 353 | PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) { |
| 354 | const int32_t* address_s32 = (const int32_t*) address; |
| 355 | return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 }; |
| 356 | } |
| 357 | |
| 358 | PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) { |
| 359 | const uint32_t* address_u32 = (const uint32_t*) address; |
| 360 | return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 }; |
| 361 | } |
| 362 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 363 | PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) { |
| 364 | const float* address_f32 = (const float*) address; |
| 365 | return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f }; |
| 366 | } |
| 367 | |
Marat Dukhan | daa4ba3 | 2019-08-29 22:29:32 -0700 | [diff] [blame] | 368 | PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) { |
| 369 | return psimd_load_s32(address); |
| 370 | } |
| 371 | |
| 372 | PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) { |
| 373 | return psimd_load_u32(address); |
| 374 | } |
| 375 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 376 | PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) { |
| 377 | return psimd_load_f32(address); |
| 378 | } |
| 379 | |
| 380 | PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) { |
| 381 | const psimd_f32 v0x1x = psimd_load_f32(address); |
Marat Dukhan | 94f61c0 | 2017-03-22 09:49:27 -0400 | [diff] [blame] | 382 | const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3); |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 383 | #if defined(__clang__) |
| 384 | return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7); |
| 385 | #else |
| 386 | return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 }); |
| 387 | #endif |
| 388 | } |
| 389 | |
| 390 | PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) { |
| 391 | return psimd_load_f32(address); |
| 392 | } |
| 393 | |
| 394 | PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) { |
| 395 | const float* address_f32 = (const float*) address; |
| 396 | return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f }; |
| 397 | } |
| 398 | |
| 399 | PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) { |
| 400 | const psimd_f32 v0x1x = psimd_load_f32(address); |
Marat Dukhan | 94f61c0 | 2017-03-22 09:49:27 -0400 | [diff] [blame] | 401 | const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2); |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 402 | #if defined(__clang__) |
| 403 | return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6); |
| 404 | #else |
| 405 | return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 }); |
| 406 | #endif |
| 407 | } |
| 408 | |
| 409 | PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) { |
| 410 | return psimd_load_stride2_f32(address); |
| 411 | } |
| 412 | |
| 413 | PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) { |
| 414 | const float* address0_f32 = (const float*) address; |
| 415 | const float* address1_f32 = address0_f32 + stride; |
| 416 | const float* address2_f32 = address1_f32 + stride; |
| 417 | const float* address3_f32 = address2_f32 + stride; |
| 418 | return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 }; |
| 419 | } |
| 420 | |
| 421 | PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) { |
| 422 | return psimd_load1_f32(address); |
| 423 | } |
| 424 | |
| 425 | PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) { |
| 426 | const float* address_f32 = (const float*) address; |
| 427 | return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f }; |
| 428 | } |
| 429 | |
| 430 | PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) { |
| 431 | const float* address0_f32 = (const float*) address; |
| 432 | const float* address1_f32 = address0_f32 + stride; |
| 433 | const float* address2_f32 = address1_f32 + stride; |
| 434 | return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f }; |
| 435 | } |
| 436 | |
| 437 | PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) { |
| 438 | return psimd_load_stride_f32(address, stride); |
| 439 | } |
| 440 | |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 441 | /* Store vector */ |
| 442 | PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) { |
| 443 | *((psimd_s8*) address) = value; |
| 444 | } |
| 445 | |
| 446 | PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) { |
| 447 | *((psimd_u8*) address) = value; |
| 448 | } |
| 449 | |
| 450 | PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) { |
| 451 | *((psimd_s16*) address) = value; |
| 452 | } |
| 453 | |
| 454 | PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) { |
| 455 | *((psimd_u16*) address) = value; |
| 456 | } |
| 457 | |
| 458 | PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) { |
| 459 | *((psimd_s32*) address) = value; |
| 460 | } |
| 461 | |
| 462 | PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) { |
| 463 | *((psimd_u32*) address) = value; |
| 464 | } |
| 465 | |
| 466 | PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) { |
| 467 | *((psimd_f32*) address) = value; |
| 468 | } |
| 469 | |
Marat Dukhan | 40183e6 | 2019-08-29 22:16:24 -0700 | [diff] [blame] | 470 | PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) { |
| 471 | *((int32_t*) address) = value[0]; |
| 472 | } |
| 473 | |
| 474 | PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) { |
| 475 | *((uint32_t*) address) = value[0]; |
| 476 | } |
| 477 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 478 | PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) { |
| 479 | *((float*) address) = value[0]; |
| 480 | } |
| 481 | |
Marat Dukhan | 40183e6 | 2019-08-29 22:16:24 -0700 | [diff] [blame] | 482 | PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) { |
| 483 | int32_t* address_s32 = (int32_t*) address; |
| 484 | address_s32[0] = value[0]; |
| 485 | address_s32[1] = value[1]; |
| 486 | } |
| 487 | |
| 488 | PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) { |
| 489 | uint32_t* address_u32 = (uint32_t*) address; |
| 490 | address_u32[0] = value[0]; |
| 491 | address_u32[1] = value[1]; |
| 492 | } |
| 493 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 494 | PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) { |
| 495 | float* address_f32 = (float*) address; |
| 496 | address_f32[0] = value[0]; |
| 497 | address_f32[1] = value[1]; |
| 498 | } |
| 499 | |
Marat Dukhan | 40183e6 | 2019-08-29 22:16:24 -0700 | [diff] [blame] | 500 | PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) { |
| 501 | int32_t* address_s32 = (int32_t*) address; |
| 502 | address_s32[0] = value[0]; |
| 503 | address_s32[1] = value[1]; |
| 504 | address_s32[2] = value[2]; |
| 505 | } |
| 506 | |
| 507 | PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) { |
| 508 | uint32_t* address_u32 = (uint32_t*) address; |
| 509 | address_u32[0] = value[0]; |
| 510 | address_u32[1] = value[1]; |
| 511 | address_u32[2] = value[2]; |
| 512 | } |
| 513 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 514 | PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) { |
| 515 | float* address_f32 = (float*) address; |
| 516 | address_f32[0] = value[0]; |
| 517 | address_f32[1] = value[1]; |
| 518 | address_f32[2] = value[2]; |
| 519 | } |
| 520 | |
Marat Dukhan | 40183e6 | 2019-08-29 22:16:24 -0700 | [diff] [blame] | 521 | PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) { |
| 522 | psimd_store_s32(address, value); |
| 523 | } |
| 524 | |
| 525 | PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) { |
| 526 | psimd_store_u32(address, value); |
| 527 | } |
| 528 | |
Marat Dukhan | 19a380c | 2017-02-26 16:56:04 -0500 | [diff] [blame] | 529 | PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) { |
| 530 | psimd_store_f32(address, value); |
| 531 | } |
| 532 | |
| 533 | PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) { |
| 534 | float* address0_f32 = (float*) address; |
| 535 | float* address1_f32 = address0_f32 + stride; |
| 536 | float* address2_f32 = address1_f32 + stride; |
| 537 | float* address3_f32 = address2_f32 + stride; |
| 538 | *address0_f32 = value[0]; |
| 539 | *address1_f32 = value[1]; |
| 540 | *address2_f32 = value[2]; |
| 541 | *address3_f32 = value[3]; |
| 542 | } |
| 543 | |
| 544 | PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) { |
| 545 | psimd_store1_f32(address, value); |
| 546 | } |
| 547 | |
| 548 | PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) { |
| 549 | float* address_f32 = (float*) address; |
| 550 | address_f32[0] = value[0]; |
| 551 | address_f32[stride] = value[1]; |
| 552 | } |
| 553 | |
| 554 | PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) { |
| 555 | float* address0_f32 = (float*) address; |
| 556 | float* address1_f32 = address0_f32 + stride; |
| 557 | float* address2_f32 = address1_f32 + stride; |
| 558 | *address0_f32 = value[0]; |
| 559 | *address1_f32 = value[1]; |
| 560 | *address2_f32 = value[2]; |
| 561 | } |
| 562 | |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 563 | /* Vector addition */ |
| 564 | PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) { |
| 565 | return a + b; |
| 566 | } |
| 567 | |
| 568 | PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) { |
| 569 | return a + b; |
| 570 | } |
| 571 | |
| 572 | PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) { |
| 573 | return a + b; |
| 574 | } |
| 575 | |
| 576 | PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) { |
| 577 | return a + b; |
| 578 | } |
| 579 | |
| 580 | PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) { |
| 581 | return a + b; |
| 582 | } |
| 583 | |
| 584 | PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) { |
| 585 | return a + b; |
| 586 | } |
| 587 | |
| 588 | PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 589 | #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) |
| 590 | return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b); |
| 591 | #else |
| 592 | return a + b; |
| 593 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 594 | } |
| 595 | |
| 596 | /* Vector subtraction */ |
| 597 | PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) { |
| 598 | return a - b; |
| 599 | } |
| 600 | |
| 601 | PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) { |
| 602 | return a - b; |
| 603 | } |
| 604 | |
| 605 | PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) { |
| 606 | return a - b; |
| 607 | } |
| 608 | |
| 609 | PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) { |
| 610 | return a - b; |
| 611 | } |
| 612 | |
| 613 | PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) { |
| 614 | return a - b; |
| 615 | } |
| 616 | |
| 617 | PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) { |
| 618 | return a - b; |
| 619 | } |
| 620 | |
| 621 | PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 622 | #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) |
| 623 | return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b); |
| 624 | #else |
| 625 | return a - b; |
| 626 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 627 | } |
| 628 | |
| 629 | /* Vector multiplication */ |
| 630 | PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) { |
| 631 | return a * b; |
| 632 | } |
| 633 | |
| 634 | PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) { |
| 635 | return a * b; |
| 636 | } |
| 637 | |
| 638 | PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) { |
| 639 | return a * b; |
| 640 | } |
| 641 | |
| 642 | PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) { |
| 643 | return a * b; |
| 644 | } |
| 645 | |
| 646 | PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) { |
| 647 | return a * b; |
| 648 | } |
| 649 | |
| 650 | PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) { |
| 651 | return a * b; |
| 652 | } |
| 653 | |
| 654 | PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 655 | #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__) |
| 656 | return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b); |
| 657 | #else |
| 658 | return a * b; |
| 659 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 660 | } |
| 661 | |
Marat Dukhan | 58f1cea | 2019-09-27 14:51:35 -0700 | [diff] [blame] | 662 | /* Quasi-Fused Multiply-Add */ |
| 663 | PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) { |
| 664 | #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA) |
| 665 | return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c); |
| 666 | #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__) |
Marat Dukhan | 072586a | 2020-05-17 03:58:41 -0700 | [diff] [blame] | 667 | return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a); |
Marat Dukhan | 58f1cea | 2019-09-27 14:51:35 -0700 | [diff] [blame] | 668 | #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__) |
Marat Dukhan | 072586a | 2020-05-17 03:58:41 -0700 | [diff] [blame] | 669 | return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a); |
Marat Dukhan | 88882f6 | 2020-02-21 05:46:31 -0800 | [diff] [blame] | 670 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA |
Marat Dukhan | 58f1cea | 2019-09-27 14:51:35 -0700 | [diff] [blame] | 671 | return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c); |
| 672 | #else |
| 673 | return a + b * c; |
| 674 | #endif |
| 675 | } |
| 676 | |
Marat Dukhan | 8fd2884 | 2019-12-06 12:23:07 -0800 | [diff] [blame] | 677 | PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) { |
| 678 | return a / b; |
| 679 | } |
| 680 | |
Marat Dukhan | 4a91629 | 2017-02-22 09:23:23 -0500 | [diff] [blame] | 681 | /* Vector and */ |
| 682 | PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) { |
| 683 | return (psimd_f32) (mask & (psimd_s32) v); |
| 684 | } |
| 685 | |
Marat Dukhan | 10b4ffc | 2019-12-26 22:22:39 +0300 | [diff] [blame] | 686 | /* Vector and-not */ |
| 687 | PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) { |
| 688 | return (psimd_f32) (~mask & (psimd_s32) v); |
| 689 | } |
| 690 | |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 691 | /* Vector blend */ |
| 692 | PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 693 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 694 | return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 695 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 696 | return (psimd_s8) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 697 | #else |
| 698 | return (mask & a) | (~mask & b); |
| 699 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 700 | } |
| 701 | |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 702 | PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 703 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 704 | return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 705 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 706 | return (psimd_u8) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 707 | #else |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 708 | return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b)); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 709 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 710 | } |
| 711 | |
| 712 | PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 713 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 714 | return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 715 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 716 | return (psimd_s16) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 717 | #else |
| 718 | return (mask & a) | (~mask & b); |
| 719 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 720 | } |
| 721 | |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 722 | PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 723 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 724 | return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 725 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 726 | return (psimd_u16) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 727 | #else |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 728 | return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b)); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 729 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 730 | } |
| 731 | |
| 732 | PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 733 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 734 | return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 735 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 736 | return (psimd_s32) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 737 | #else |
| 738 | return (mask & a) | (~mask & b); |
| 739 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 740 | } |
| 741 | |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 742 | PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 743 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 744 | return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 745 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 746 | return (psimd_u32) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 747 | #else |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 748 | return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b)); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 749 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 750 | } |
| 751 | |
| 752 | PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 753 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 754 | return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 755 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 756 | return (psimd_f32) __builtin_wasm_bitselect(a, b, mask); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 757 | #else |
Marat Dukhan | 4afd402 | 2019-09-30 21:24:46 -0700 | [diff] [blame] | 758 | return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b)); |
Marat Dukhan | d9dff1f | 2017-03-23 18:07:02 +0000 | [diff] [blame] | 759 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 760 | } |
| 761 | |
Marat Dukhan | 95c5be7 | 2017-02-22 09:23:06 -0500 | [diff] [blame] | 762 | /* Vector blend on sign */ |
| 763 | PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) { |
| 764 | return psimd_blend_s8(x >> psimd_splat_s8(7), a, b); |
| 765 | } |
| 766 | |
| 767 | PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) { |
Marat Dukhan | 4f2c539 | 2019-09-30 21:30:06 -0700 | [diff] [blame] | 768 | return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b); |
Marat Dukhan | 95c5be7 | 2017-02-22 09:23:06 -0500 | [diff] [blame] | 769 | } |
| 770 | |
| 771 | PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) { |
| 772 | return psimd_blend_s16(x >> psimd_splat_s16(15), a, b); |
| 773 | } |
| 774 | |
| 775 | PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 4f2c539 | 2019-09-30 21:30:06 -0700 | [diff] [blame] | 776 | return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b); |
Marat Dukhan | 95c5be7 | 2017-02-22 09:23:06 -0500 | [diff] [blame] | 777 | } |
| 778 | |
| 779 | PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) { |
| 780 | return psimd_blend_s32(x >> psimd_splat_s32(31), a, b); |
| 781 | } |
| 782 | |
| 783 | PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 4f2c539 | 2019-09-30 21:30:06 -0700 | [diff] [blame] | 784 | return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b); |
Marat Dukhan | 95c5be7 | 2017-02-22 09:23:06 -0500 | [diff] [blame] | 785 | } |
| 786 | |
| 787 | PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) { |
| 788 | const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31); |
| 789 | return psimd_blend_f32(mask, a, b); |
| 790 | } |
| 791 | |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 792 | /* Vector absolute value */ |
| 793 | PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) { |
| 794 | const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f); |
Marat Dukhan | 56620a9 | 2019-12-26 22:20:57 +0300 | [diff] [blame] | 795 | return (psimd_f32) ((psimd_s32) v & ~mask); |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 796 | } |
| 797 | |
| 798 | /* Vector negation */ |
| 799 | PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) { |
| 800 | const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f); |
| 801 | return (psimd_f32) ((psimd_s32) v ^ mask); |
| 802 | } |
| 803 | |
| 804 | /* Vector maximum */ |
| 805 | PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 806 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 807 | return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b); |
| 808 | #else |
| 809 | return psimd_blend_s8(a > b, a, b); |
| 810 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 811 | } |
| 812 | |
| 813 | PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 814 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 815 | return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b); |
| 816 | #else |
| 817 | return psimd_blend_u8(a > b, a, b); |
| 818 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 819 | } |
| 820 | |
| 821 | PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 822 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 823 | return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b); |
| 824 | #else |
| 825 | return psimd_blend_s16(a > b, a, b); |
| 826 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 827 | } |
| 828 | |
| 829 | PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 830 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 831 | return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b); |
| 832 | #else |
| 833 | return psimd_blend_u16(a > b, a, b); |
| 834 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 835 | } |
| 836 | |
| 837 | PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 838 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 839 | return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b); |
| 840 | #else |
| 841 | return psimd_blend_s32(a > b, a, b); |
| 842 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 843 | } |
| 844 | |
| 845 | PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 846 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 847 | return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b); |
| 848 | #else |
| 849 | return psimd_blend_u32(a > b, a, b); |
| 850 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 851 | } |
| 852 | |
| 853 | PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 854 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 855 | return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 856 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 857 | return __builtin_wasm_max_f32x4(a, b); |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 858 | #else |
| 859 | return psimd_blend_f32(a > b, a, b); |
| 860 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 861 | } |
| 862 | |
| 863 | /* Vector minimum */ |
| 864 | PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 865 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 866 | return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b); |
| 867 | #else |
| 868 | return psimd_blend_s8(a < b, a, b); |
| 869 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 870 | } |
| 871 | |
| 872 | PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 873 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 874 | return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b); |
| 875 | #else |
| 876 | return psimd_blend_u8(a < b, a, b); |
| 877 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 878 | } |
| 879 | |
| 880 | PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 881 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 882 | return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b); |
| 883 | #else |
| 884 | return psimd_blend_s16(a < b, a, b); |
| 885 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 886 | } |
| 887 | |
| 888 | PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 889 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 890 | return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b); |
| 891 | #else |
| 892 | return psimd_blend_u16(a < b, a, b); |
| 893 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 894 | } |
| 895 | |
| 896 | PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 897 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 898 | return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b); |
| 899 | #else |
| 900 | return psimd_blend_s32(a < b, a, b); |
| 901 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 902 | } |
| 903 | |
| 904 | PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 905 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 906 | return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b); |
| 907 | #else |
| 908 | return psimd_blend_u32(a < b, a, b); |
| 909 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 910 | } |
| 911 | |
| 912 | PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0abf73c | 2018-09-06 18:40:13 +0300 | [diff] [blame] | 913 | #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 914 | return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b); |
Marat Dukhan | 363d461 | 2019-08-29 22:26:05 -0700 | [diff] [blame] | 915 | #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) |
| 916 | return __builtin_wasm_min_f32x4(a, b); |
Marat Dukhan | 287f070 | 2017-03-23 16:29:17 +0000 | [diff] [blame] | 917 | #else |
| 918 | return psimd_blend_f32(a < b, a, b); |
| 919 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 920 | } |
| 921 | |
Marat Dukhan | 9efe01e | 2018-09-06 18:49:25 +0300 | [diff] [blame] | 922 | PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) { |
| 923 | #if defined(__clang__) |
| 924 | return __builtin_convertvector(v, psimd_f32); |
| 925 | #elif defined(__ARM_NEON__) || defined(__ARM_NEON) |
| 926 | return (psimd_f32) vcvtq_f32_s32((int32x4_t) v); |
| 927 | #elif defined(__SSE2__) |
| 928 | return (psimd_f32) _mm_cvtepi32_ps((__m128i) v); |
| 929 | #else |
| 930 | return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] }; |
| 931 | #endif |
| 932 | } |
| 933 | |
Marat Dukhan | 0b26a3f | 2017-04-15 05:51:43 -0400 | [diff] [blame] | 934 | /* Broadcast vector element */ |
| 935 | #if defined(__clang__) |
| 936 | PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) { |
| 937 | return __builtin_shufflevector(v, v, 0, 0, 0, 0); |
| 938 | } |
| 939 | |
| 940 | PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) { |
| 941 | return __builtin_shufflevector(v, v, 1, 1, 1, 1); |
| 942 | } |
| 943 | |
| 944 | PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) { |
| 945 | return __builtin_shufflevector(v, v, 2, 2, 2, 2); |
| 946 | } |
| 947 | |
| 948 | PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) { |
| 949 | return __builtin_shufflevector(v, v, 3, 3, 3, 3); |
| 950 | } |
| 951 | #else |
| 952 | PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) { |
| 953 | return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 }); |
| 954 | } |
| 955 | |
| 956 | PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) { |
| 957 | return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 }); |
| 958 | } |
| 959 | |
| 960 | PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) { |
| 961 | return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 }); |
| 962 | } |
| 963 | |
| 964 | PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) { |
| 965 | return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 }); |
| 966 | } |
| 967 | #endif |
| 968 | |
Marat Dukhan | 6d9ce9d | 2017-04-14 01:20:18 +0000 | [diff] [blame] | 969 | /* Reversal of vector elements */ |
| 970 | #if defined(__clang__) |
| 971 | PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) { |
| 972 | return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
| 973 | } |
| 974 | |
| 975 | PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) { |
| 976 | return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
| 977 | } |
| 978 | |
| 979 | PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) { |
| 980 | return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0); |
| 981 | } |
| 982 | |
| 983 | PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) { |
| 984 | return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0); |
| 985 | } |
| 986 | |
| 987 | PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) { |
| 988 | return __builtin_shufflevector(v, v, 3, 2, 1, 0); |
| 989 | } |
| 990 | |
| 991 | PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) { |
| 992 | return __builtin_shufflevector(v, v, 3, 2, 1, 0); |
| 993 | } |
| 994 | |
| 995 | PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) { |
| 996 | return __builtin_shufflevector(v, v, 3, 2, 1, 0); |
| 997 | } |
| 998 | #else |
| 999 | PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) { |
| 1000 | return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }); |
| 1001 | } |
| 1002 | |
| 1003 | PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) { |
| 1004 | return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }); |
| 1005 | } |
| 1006 | |
| 1007 | PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) { |
| 1008 | return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 }); |
| 1009 | } |
| 1010 | |
| 1011 | PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) { |
| 1012 | return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 }); |
| 1013 | } |
| 1014 | |
| 1015 | PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) { |
| 1016 | return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); |
| 1017 | } |
| 1018 | |
| 1019 | PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) { |
| 1020 | return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); |
| 1021 | } |
| 1022 | |
| 1023 | PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) { |
| 1024 | return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); |
| 1025 | } |
| 1026 | #endif |
| 1027 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1028 | /* Interleaving of vector elements */ |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1029 | #if defined(__clang__) |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1030 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1031 | return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); |
| 1032 | } |
| 1033 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1034 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1035 | return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); |
| 1036 | } |
| 1037 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1038 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1039 | return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); |
| 1040 | } |
| 1041 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1042 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1043 | return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); |
| 1044 | } |
| 1045 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1046 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1047 | return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); |
| 1048 | } |
| 1049 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1050 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1051 | return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); |
| 1052 | } |
| 1053 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1054 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1055 | return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); |
| 1056 | } |
| 1057 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1058 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1059 | return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); |
| 1060 | } |
| 1061 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1062 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1063 | return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); |
| 1064 | } |
| 1065 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1066 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1067 | return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); |
| 1068 | } |
| 1069 | #else |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1070 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1071 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 }); |
| 1072 | } |
| 1073 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1074 | PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1075 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 }); |
| 1076 | } |
| 1077 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1078 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1079 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 }); |
| 1080 | } |
| 1081 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1082 | PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1083 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 }); |
| 1084 | } |
| 1085 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1086 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1087 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); |
| 1088 | } |
| 1089 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1090 | PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1091 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); |
| 1092 | } |
| 1093 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1094 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1095 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); |
| 1096 | } |
| 1097 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1098 | PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1099 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); |
| 1100 | } |
| 1101 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1102 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1103 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); |
| 1104 | } |
| 1105 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1106 | PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) { |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1107 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); |
| 1108 | } |
| 1109 | #endif |
Marat Dukhan | 457042c | 2017-02-22 04:23:57 -0500 | [diff] [blame] | 1110 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1111 | /* Concatenation of low/high vector elements */ |
| 1112 | #if defined(__clang__) |
| 1113 | PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) { |
| 1114 | return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3); |
| 1115 | } |
| 1116 | |
| 1117 | PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) { |
| 1118 | return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7); |
| 1119 | } |
| 1120 | |
| 1121 | PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) { |
| 1122 | return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3); |
| 1123 | } |
| 1124 | |
| 1125 | PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) { |
| 1126 | return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7); |
| 1127 | } |
| 1128 | |
| 1129 | PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) { |
| 1130 | return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); |
| 1131 | } |
| 1132 | |
| 1133 | PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) { |
| 1134 | return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); |
| 1135 | } |
| 1136 | |
| 1137 | PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) { |
| 1138 | return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); |
| 1139 | } |
| 1140 | |
| 1141 | PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) { |
| 1142 | return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); |
| 1143 | } |
| 1144 | |
| 1145 | PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) { |
| 1146 | return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); |
| 1147 | } |
| 1148 | |
| 1149 | PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) { |
| 1150 | return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); |
| 1151 | } |
| 1152 | #else |
| 1153 | PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) { |
| 1154 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 }); |
| 1155 | } |
| 1156 | |
| 1157 | PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) { |
| 1158 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 }); |
| 1159 | } |
| 1160 | |
| 1161 | PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) { |
| 1162 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 }); |
| 1163 | } |
| 1164 | |
| 1165 | PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) { |
| 1166 | return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 }); |
| 1167 | } |
| 1168 | |
| 1169 | PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) { |
| 1170 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); |
| 1171 | } |
| 1172 | |
| 1173 | PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) { |
| 1174 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); |
| 1175 | } |
| 1176 | |
| 1177 | PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) { |
| 1178 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); |
| 1179 | } |
| 1180 | |
| 1181 | PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) { |
| 1182 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); |
| 1183 | } |
| 1184 | |
| 1185 | PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) { |
| 1186 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); |
| 1187 | } |
| 1188 | |
| 1189 | PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) { |
| 1190 | return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); |
| 1191 | } |
| 1192 | #endif |
| 1193 | |
| 1194 | /* Concatenation of even/odd vector elements */ |
| 1195 | #if defined(__clang__) |
Marat Dukhan | c7f1f99 | 2018-09-06 18:56:18 +0300 | [diff] [blame] | 1196 | PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) { |
| 1197 | return __builtin_shufflevector(a, b, |
| 1198 | 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14); |
| 1199 | } |
| 1200 | |
| 1201 | PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) { |
| 1202 | return __builtin_shufflevector(a, b, |
| 1203 | 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15); |
| 1204 | } |
| 1205 | |
| 1206 | PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) { |
| 1207 | return __builtin_shufflevector(a, b, |
| 1208 | 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14); |
| 1209 | } |
| 1210 | |
| 1211 | PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) { |
| 1212 | return __builtin_shufflevector(a, b, |
| 1213 | 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15); |
| 1214 | } |
| 1215 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1216 | PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) { |
| 1217 | return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6); |
| 1218 | } |
| 1219 | |
| 1220 | PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) { |
| 1221 | return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7); |
| 1222 | } |
| 1223 | |
| 1224 | PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) { |
| 1225 | return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6); |
| 1226 | } |
| 1227 | |
| 1228 | PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) { |
| 1229 | return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7); |
| 1230 | } |
| 1231 | |
| 1232 | PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) { |
| 1233 | return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); |
| 1234 | } |
| 1235 | |
| 1236 | PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) { |
| 1237 | return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); |
| 1238 | } |
| 1239 | |
| 1240 | PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) { |
| 1241 | return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); |
| 1242 | } |
| 1243 | |
| 1244 | PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) { |
| 1245 | return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); |
| 1246 | } |
| 1247 | |
| 1248 | PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) { |
| 1249 | return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); |
| 1250 | } |
| 1251 | |
| 1252 | PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) { |
| 1253 | return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); |
| 1254 | } |
| 1255 | #else |
Marat Dukhan | c7f1f99 | 2018-09-06 18:56:18 +0300 | [diff] [blame] | 1256 | PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) { |
| 1257 | return __builtin_shuffle(a, b, |
| 1258 | (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 }); |
| 1259 | } |
| 1260 | |
| 1261 | PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) { |
| 1262 | return __builtin_shuffle(a, b, |
| 1263 | (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 }); |
| 1264 | } |
| 1265 | |
| 1266 | PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) { |
| 1267 | return __builtin_shuffle(a, b, |
| 1268 | (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 }); |
| 1269 | } |
| 1270 | |
| 1271 | PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) { |
| 1272 | return __builtin_shuffle(a, b, |
| 1273 | (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 }); |
| 1274 | } |
| 1275 | |
Marat Dukhan | 0be0b6e | 2017-04-14 01:09:22 +0000 | [diff] [blame] | 1276 | PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) { |
| 1277 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 }); |
| 1278 | } |
| 1279 | |
| 1280 | PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) { |
| 1281 | return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 }); |
| 1282 | } |
| 1283 | |
| 1284 | PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) { |
| 1285 | return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 }); |
| 1286 | } |
| 1287 | |
| 1288 | PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) { |
| 1289 | return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 }); |
| 1290 | } |
| 1291 | |
| 1292 | PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) { |
| 1293 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); |
| 1294 | } |
| 1295 | |
| 1296 | PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) { |
| 1297 | return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); |
| 1298 | } |
| 1299 | |
| 1300 | PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) { |
| 1301 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); |
| 1302 | } |
| 1303 | |
| 1304 | PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) { |
| 1305 | return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); |
| 1306 | } |
| 1307 | |
| 1308 | PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) { |
| 1309 | return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); |
| 1310 | } |
| 1311 | |
| 1312 | PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) { |
| 1313 | return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); |
| 1314 | } |
| 1315 | #endif |
| 1316 | |
Marat Dukhan | 457042c | 2017-02-22 04:23:57 -0500 | [diff] [blame] | 1317 | /* Vector reduce */ |
| 1318 | #if defined(__clang__) |
| 1319 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) { |
| 1320 | const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1); |
| 1321 | return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2); |
| 1322 | } |
| 1323 | |
| 1324 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) { |
| 1325 | const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1)); |
| 1326 | return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2)); |
| 1327 | } |
| 1328 | |
| 1329 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) { |
| 1330 | const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1)); |
| 1331 | return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2)); |
| 1332 | } |
| 1333 | |
| 1334 | PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) { |
| 1335 | const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1); |
| 1336 | const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1); |
| 1337 | return result[0]; |
| 1338 | } |
| 1339 | |
| 1340 | PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) { |
| 1341 | const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1)); |
| 1342 | const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1)); |
| 1343 | return result[0]; |
| 1344 | } |
| 1345 | |
| 1346 | PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) { |
| 1347 | const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1)); |
| 1348 | const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1)); |
| 1349 | return result[0]; |
| 1350 | } |
| 1351 | #else |
| 1352 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) { |
| 1353 | const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }); |
| 1354 | return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }); |
| 1355 | } |
| 1356 | |
| 1357 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) { |
| 1358 | const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 })); |
| 1359 | return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 })); |
| 1360 | } |
| 1361 | |
| 1362 | PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) { |
| 1363 | const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 })); |
| 1364 | return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 })); |
| 1365 | } |
| 1366 | |
| 1367 | PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) { |
| 1368 | const psimd_f32 result = psimd_allreduce_sum_f32(v); |
| 1369 | return result[0]; |
| 1370 | } |
| 1371 | |
| 1372 | PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) { |
| 1373 | const psimd_f32 result = psimd_allreduce_max_f32(v); |
| 1374 | return result[0]; |
| 1375 | } |
| 1376 | |
| 1377 | PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) { |
| 1378 | const psimd_f32 result = psimd_allreduce_min_f32(v); |
| 1379 | return result[0]; |
| 1380 | } |
| 1381 | #endif |
Marat Dukhan | 0525a85 | 2014-12-13 15:48:12 -0500 | [diff] [blame] | 1382 | #endif |
| 1383 | |
| 1384 | #endif /* PSIMD_H */ |