/*
 * Copyright 2019 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SKVX_DEFINED
#define SKVX_DEFINED

// skvx::Vec<N,T> are SIMD vectors of N T's, a v1.5 successor to SkNx<N,T>.
//
// This time we're leaning a bit less on platform-specific intrinsics and a bit
// more on Clang/GCC vector extensions, but still keeping the option open to
// drop in platform-specific intrinsics, actually more easily than before.
//
// We've also fixed a few of the caveats that used to make SkNx awkward to work
// with across translation units. skvx::Vec<N,T> always has N*sizeof(T) size
// and alignment and is safe to use across translation units freely.
// (Ideally we'd only align to T, but that tanks ARMv7 NEON codegen.)
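//
// A rough sketch of typical usage (the variable names here are just illustrative):
//    skvx::Vec<4,float> x = {1,2,3,4},
//                       y = 2.0f;           // scalars splat across all lanes
//    skvx::Vec<4,float> z = x*y + 1;        // arithmetic is lane-wise
//    float first = z[0];                    // lanes are indexable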

// Please try to keep this file independent of Skia headers.
#include <algorithm>         // std::min, std::max
#include <cmath>             // ceilf, floorf, truncf, roundf, sqrtf, etc.
#include <cstdint>           // intXX_t
#include <cstring>           // memcpy()
#include <initializer_list>  // std::initializer_list
#include <utility>           // std::index_sequence

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__)
    #include <immintrin.h>
#elif defined(__ARM_NEON)
    #include <arm_neon.h>
#elif defined(__wasm_simd128__)
    #include <wasm_simd128.h>
#endif

// To avoid ODR violations, all methods must be force-inlined...
#if defined(_MSC_VER)
    #define SKVX_ALWAYS_INLINE __forceinline
#else
    #define SKVX_ALWAYS_INLINE __attribute__((always_inline))
#endif

// ... and all standalone functions must be static. Please use these helpers:
#define SI    static inline
#define SIT   template <       typename T> SI
#define SIN   template <int N             > SI
#define SINT  template <int N, typename T> SI
#define SINTU template <int N, typename T, typename U, \
                        typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI

namespace skvx {

// All Vec have the same simple memory layout, the same as `T vec[N]`.
template <int N, typename T>
struct alignas(N*sizeof(T)) Vec {
    static_assert((N & (N-1)) == 0,        "N must be a power of 2.");
    static_assert(sizeof(T) >= alignof(T), "What kind of crazy T is this?");

    Vec<N/2,T> lo, hi;

    // Methods belong here in the class declaration of Vec only if:
    //   - they must be here, like constructors or operator[];
    //   - they'll definitely never want a specialized implementation.
    // Other operations on Vec should be defined outside the type.

    SKVX_ALWAYS_INLINE Vec() = default;

    template <typename U, typename=std::enable_if_t<std::is_convertible<U,T>::value>>
    SKVX_ALWAYS_INLINE
    Vec(U x) : lo(x), hi(x) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[N] = {0};
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));

        lo = Vec<N/2,T>::Load(vals +   0);
        hi = Vec<N/2,T>::Load(vals + N/2);
    }

    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i < N/2 ? lo[i] : hi[i-N/2]; }
    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i < N/2 ? lo[i] : hi[i-N/2]; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
        Vec v;
        memcpy(&v, ptr, sizeof(Vec));
        return v;
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }
};

template <typename T>
struct Vec<1,T> {
    T val;

    SKVX_ALWAYS_INLINE Vec() = default;

    template <typename U, typename=std::enable_if_t<std::is_convertible<U,T>::value>>
    SKVX_ALWAYS_INLINE
    Vec(U x) : val(x) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : 0) {}

    SKVX_ALWAYS_INLINE T  operator[](int) const { return val; }
    SKVX_ALWAYS_INLINE T& operator[](int)       { return val; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
        Vec v;
        memcpy(&v, ptr, sizeof(Vec));
        return v;
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }
};
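
// A quick sketch of moving data in and out of a Vec (the buffer here is just illustrative):
//    float buf[] = {1,2,3,4};
//    auto v = skvx::Vec<4,float>::Load(buf);   // bulk load via memcpy
//    v[3] = 5;                                 // lanes are addressable
//    v.store(buf);                             // bulk store via memcpy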

// Ideally we'd only use bit_pun(), but until this file is always built as C++17 with constexpr if,
// we'll sometimes find need to use unchecked_bit_pun(). Please do check the call sites yourself!
template <typename D, typename S>
SI D unchecked_bit_pun(const S& s) {
    D d;
    memcpy(&d, &s, sizeof(D));
    return d;
}

template <typename D, typename S>
SI D bit_pun(const S& s) {
    static_assert(sizeof(D) == sizeof(S), "");
    return unchecked_bit_pun<D>(s);
}
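
// For example, a rough sketch of reinterpreting lanes bit-for-bit (sizes must match exactly):
//    skvx::Vec<4,float> f = {1,2,3,4};
//    auto bits = bit_pun<skvx::Vec<4,uint32_t>>(f);   // 0x3f800000, 0x40000000, ...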

// Translate from a value type T to its corresponding Mask, the result of a comparison.
template <typename T> struct Mask { using type = T; };
template <> struct Mask<float > { using type = int32_t; };
template <> struct Mask<double> { using type = int64_t; };
template <typename T> using M = typename Mask<T>::type;
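
// E.g. comparing two Vec<4,float> x,y yields a Vec<4,int32_t> mask: all-1 bits where true, all-0 where false.
//    skvx::Vec<4,int32_t> m = (x < y);   // ready to feed into if_then_else() below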

// Join two Vec<N,T> into one Vec<2N,T>.
SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
    Vec<2*N,T> v;
    v.lo = lo;
    v.hi = hi;
    return v;
}
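
// E.g. join({1,2}, {3,4}) ~> {1,2,3,4}.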

// We have three strategies for implementing Vec operations:
//    1) lean on Clang/GCC vector extensions when available;
//    2) use map() to apply a scalar function lane-wise;
//    3) recurse on lo/hi to scalar portable implementations.
// We can slot in platform-specific implementations as overloads for particular Vec<N,T>,
// or often integrate them directly into the recursion of style 3), allowing fine control.
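//
// For instance, a hypothetical SSE-specific overload might look roughly like this sketch
// (not part of this file; shown only to illustrate how an overload slots in):
//    SI Vec<4,float> rsqrt(const Vec<4,float>& x) {
//        return bit_pun<Vec<4,float>>(_mm_rsqrt_ps(bit_pun<__m128>(x)));
//    }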

#if !defined(SKNX_NO_SIMD) && (defined(__clang__) || defined(__GNUC__))

    // VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
    #if defined(__clang__)
        template <int N, typename T>
        using VExt = T __attribute__((ext_vector_type(N)));

    #elif defined(__GNUC__)
        template <int N, typename T>
        struct VExtHelper {
            typedef T __attribute__((vector_size(N*sizeof(T)))) type;
        };

        template <int N, typename T>
        using VExt = typename VExtHelper<N,T>::type;

        // For some reason some (new!) versions of GCC cannot seem to deduce N in the generic
        // to_vec<N,T>() below for N=4 and T=float. This workaround seems to help...
        SI Vec<4,float> to_vec(VExt<4,float> v) { return bit_pun<Vec<4,float>>(v); }
    #endif

    SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return bit_pun<VExt<N,T>>(v); }
    SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return bit_pun<Vec <N,T>>(v); }

    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) + to_vext(y));
    }
    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) - to_vext(y));
    }
    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) * to_vext(y));
    }
    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) / to_vext(y));
    }

    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) ^ to_vext(y));
    }
    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) & to_vext(y));
    }
    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
        return to_vec<N,T>(to_vext(x) | to_vext(y));
    }

    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }

    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }
    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }

    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
        return bit_pun<Vec<N,M<T>>>(to_vext(x) == to_vext(y));
    }
    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return bit_pun<Vec<N,M<T>>>(to_vext(x) != to_vext(y));
    }
    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return bit_pun<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));
    }
    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return bit_pun<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));
    }
    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
        return bit_pun<Vec<N,M<T>>>(to_vext(x) <  to_vext(y));
    }
    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
        return bit_pun<Vec<N,M<T>>>(to_vext(x) >  to_vext(y));
    }

#else

    // Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
    // We'll implement things portably with N==1 scalar implementations and recursion onto them.

    // N == 1 scalar implementations.
    SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }
    SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }
    SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }
    SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }

    SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }
    SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }
    SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }

    SIT Vec<1,T> operator!(const Vec<1,T>& x) { return !x.val; }
    SIT Vec<1,T> operator-(const Vec<1,T>& x) { return -x.val; }
    SIT Vec<1,T> operator~(const Vec<1,T>& x) { return ~x.val; }

    SIT Vec<1,T> operator<<(const Vec<1,T>& x, int k) { return x.val << k; }
    SIT Vec<1,T> operator>>(const Vec<1,T>& x, int k) { return x.val >> k; }

    SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) {
        return x.val == y.val ? ~0 : 0;
    }
    SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) {
        return x.val != y.val ? ~0 : 0;
    }
    SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) {
        return x.val <= y.val ? ~0 : 0;
    }
    SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) {
        return x.val >= y.val ? ~0 : 0;
    }
    SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) {
        return x.val <  y.val ? ~0 : 0;
    }
    SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) {
        return x.val >  y.val ? ~0 : 0;
    }

    // Recurse on lo/hi down to N==1 scalar implementations.
    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo + y.lo, x.hi + y.hi);
    }
    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo - y.lo, x.hi - y.hi);
    }
    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo * y.lo, x.hi * y.hi);
    }
    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo / y.lo, x.hi / y.hi);
    }

    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo ^ y.lo, x.hi ^ y.hi);
    }
    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo & y.lo, x.hi & y.hi);
    }
    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo | y.lo, x.hi | y.hi);
    }

    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return join(!x.lo, !x.hi); }
    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return join(-x.lo, -x.hi); }
    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return join(~x.lo, ~x.hi); }

    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return join(x.lo << k, x.hi << k); }
    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return join(x.lo >> k, x.hi >> k); }

    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo == y.lo, x.hi == y.hi);
    }
    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo != y.lo, x.hi != y.hi);
    }
    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo <= y.lo, x.hi <= y.hi);
    }
    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo >= y.lo, x.hi >= y.hi);
    }
    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo <  y.lo, x.hi <  y.hi);
    }
    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
        return join(x.lo >  y.lo, x.hi >  y.hi);
    }
#endif

// Scalar/vector operations splat the scalar to a vector.
SINTU Vec<N,T>    operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) +  y; }
SINTU Vec<N,T>    operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) -  y; }
SINTU Vec<N,T>    operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) *  y; }
SINTU Vec<N,T>    operator/ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) /  y; }
SINTU Vec<N,T>    operator^ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) ^  y; }
SINTU Vec<N,T>    operator& (U x, const Vec<N,T>& y) { return Vec<N,T>(x) &  y; }
SINTU Vec<N,T>    operator| (U x, const Vec<N,T>& y) { return Vec<N,T>(x) |  y; }
SINTU Vec<N,M<T>> operator==(U x, const Vec<N,T>& y) { return Vec<N,T>(x) == y; }
SINTU Vec<N,M<T>> operator!=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) != y; }
SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y; }
SINTU Vec<N,M<T>> operator>=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }
SINTU Vec<N,M<T>> operator< (U x, const Vec<N,T>& y) { return Vec<N,T>(x) <  y; }
SINTU Vec<N,M<T>> operator> (U x, const Vec<N,T>& y) { return Vec<N,T>(x) >  y; }

SINTU Vec<N,T>    operator+ (const Vec<N,T>& x, U y) { return x +  Vec<N,T>(y); }
SINTU Vec<N,T>    operator- (const Vec<N,T>& x, U y) { return x -  Vec<N,T>(y); }
SINTU Vec<N,T>    operator* (const Vec<N,T>& x, U y) { return x *  Vec<N,T>(y); }
SINTU Vec<N,T>    operator/ (const Vec<N,T>& x, U y) { return x /  Vec<N,T>(y); }
SINTU Vec<N,T>    operator^ (const Vec<N,T>& x, U y) { return x ^  Vec<N,T>(y); }
SINTU Vec<N,T>    operator& (const Vec<N,T>& x, U y) { return x &  Vec<N,T>(y); }
SINTU Vec<N,T>    operator| (const Vec<N,T>& x, U y) { return x |  Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator==(const Vec<N,T>& x, U y) { return x == Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator!=(const Vec<N,T>& x, U y) { return x != Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator>=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator< (const Vec<N,T>& x, U y) { return x <  Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator> (const Vec<N,T>& x, U y) { return x >  Vec<N,T>(y); }

SINT Vec<N,T>& operator+=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }
SINT Vec<N,T>& operator-=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x - y); }
SINT Vec<N,T>& operator*=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x * y); }
SINT Vec<N,T>& operator/=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x / y); }
SINT Vec<N,T>& operator^=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x ^ y); }
SINT Vec<N,T>& operator&=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x & y); }
SINT Vec<N,T>& operator|=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x | y); }

SINTU Vec<N,T>& operator+=(Vec<N,T>& x, U y) { return (x = x + Vec<N,T>(y)); }
SINTU Vec<N,T>& operator-=(Vec<N,T>& x, U y) { return (x = x - Vec<N,T>(y)); }
SINTU Vec<N,T>& operator*=(Vec<N,T>& x, U y) { return (x = x * Vec<N,T>(y)); }
SINTU Vec<N,T>& operator/=(Vec<N,T>& x, U y) { return (x = x / Vec<N,T>(y)); }
SINTU Vec<N,T>& operator^=(Vec<N,T>& x, U y) { return (x = x ^ Vec<N,T>(y)); }
SINTU Vec<N,T>& operator&=(Vec<N,T>& x, U y) { return (x = x & Vec<N,T>(y)); }
SINTU Vec<N,T>& operator|=(Vec<N,T>& x, U y) { return (x = x | Vec<N,T>(y)); }

SINT Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
SINT Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }

// Some operations we want are not expressible with Clang/GCC vector extensions.

// Clang can reason about naive_if_then_else() and optimize through it better
// than if_then_else(), so it's sometimes useful to call it directly when we
// think an entire expression should optimize away, e.g. min()/max().
SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
    return bit_pun<Vec<N,T>>(( cond & bit_pun<Vec<N, M<T>>>(t)) |
                             (~cond & bit_pun<Vec<N, M<T>>>(e)) );
}

SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
    // In practice this scalar implementation is unlikely to be used. See next if_then_else().
    return bit_pun<Vec<1,T>>(( cond & bit_pun<Vec<1, M<T>>>(t)) |
                             (~cond & bit_pun<Vec<1, M<T>>>(e)) );
}
SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
    // Specializations inline here so they can generalize what types they apply to.
    // (This header is used in C++14 contexts, so we have to kind of fake constexpr if.)
#if defined(__AVX2__)
    if /*constexpr*/ (N*sizeof(T) == 32) {
        return unchecked_bit_pun<Vec<N,T>>(_mm256_blendv_epi8(unchecked_bit_pun<__m256i>(e),
                                                              unchecked_bit_pun<__m256i>(t),
                                                              unchecked_bit_pun<__m256i>(cond)));
    }
#endif
#if defined(__SSE4_1__)
    if /*constexpr*/ (N*sizeof(T) == 16) {
        return unchecked_bit_pun<Vec<N,T>>(_mm_blendv_epi8(unchecked_bit_pun<__m128i>(e),
                                                           unchecked_bit_pun<__m128i>(t),
                                                           unchecked_bit_pun<__m128i>(cond)));
    }
#endif
#if defined(__ARM_NEON)
    if /*constexpr*/ (N*sizeof(T) == 16) {
        return unchecked_bit_pun<Vec<N,T>>(vbslq_u8(unchecked_bit_pun<uint8x16_t>(cond),
                                                    unchecked_bit_pun<uint8x16_t>(t),
                                                    unchecked_bit_pun<uint8x16_t>(e)));
    }
#endif
    // Recurse for large vectors to try to hit the specializations above.
    if /*constexpr*/ (N*sizeof(T) > 16) {
        return join(if_then_else(cond.lo, t.lo, e.lo),
                    if_then_else(cond.hi, t.hi, e.hi));
    }
    // This default can lead to better code than recursing onto scalars.
    return naive_if_then_else(cond, t, e);
}
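
// For example, a rough sketch of lane-wise selection using a comparison mask (values illustrative):
//    skvx::Vec<4,float> x = {1,2,3,4},
//                       y = {4,3,2,1};
//    if_then_else(x < y, x, y)   ~> {1,2,2,1}   // picks the smaller lane, like min(x,y)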

SIT  bool any(const Vec<1,T>& x) { return x.val != 0; }
SINT bool any(const Vec<N,T>& x) {
#if defined(__wasm_simd128__)
    if /*constexpr*/ (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_any_true(unchecked_bit_pun<VExt<4,int>>(x));
    }
#endif
    return any(x.lo)
        || any(x.hi);
}

SIT  bool all(const Vec<1,T>& x) { return x.val != 0; }
SINT bool all(const Vec<N,T>& x) {
#if defined(__AVX2__)
    if /*constexpr*/ (N*sizeof(T) == 32) {
        return _mm256_testc_si256(unchecked_bit_pun<__m256i>(x),
                                  _mm256_set1_epi32(-1));
    }
#endif
#if defined(__SSE4_1__)
    if /*constexpr*/ (N*sizeof(T) == 16) {
        return _mm_testc_si128(unchecked_bit_pun<__m128i>(x),
                               _mm_set1_epi32(-1));
    }
#endif
#if defined(__wasm_simd128__)
    if /*constexpr*/ (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_all_true(unchecked_bit_pun<VExt<4,int>>(x));
    }
#endif
    return all(x.lo)
        && all(x.hi);
}
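
// E.g. any({0,0,3,0}) is true while all({0,0,3,0}) is false; handy for testing comparison masks:
//    if (any(x < 0)) { /* some lane is negative */ }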

// cast() Vec<N,S> to Vec<N,D>, as if applying a C-cast to each lane.
// TODO: implement with map()?
template <typename D, typename S>
SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }

template <typename D, int N, typename S>
SI Vec<N,D> cast(const Vec<N,S>& src) {
#if !defined(SKNX_NO_SIMD) && defined(__clang__)
    return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
#else
    return join(cast<D>(src.lo), cast<D>(src.hi));
#endif
}
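
// E.g. cast<int>(skvx::Vec<4,float>{1.5f, 2.5f, -3.5f, 4.0f}) ~> {1, 2, -3, 4} (truncating, like a C-cast).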

// min/max match logic of std::min/std::max, which is important when NaN is involved.
SIT T min(const Vec<1,T>& x) { return x.val; }
SIT T max(const Vec<1,T>& x) { return x.val; }
SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }

SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }

SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
SINTU Vec<N,T> min(U x, const Vec<N,T>& y) { return min(Vec<N,T>(x), y); }
SINTU Vec<N,T> max(U x, const Vec<N,T>& y) { return max(Vec<N,T>(x), y); }

// pin matches the logic of SkTPin, which is important when NaN is involved. It always returns
// values in the range lo..hi, and if x is NaN, it returns lo.
SINT Vec<N,T> pin(const Vec<N,T>& x, const Vec<N,T>& lo, const Vec<N,T>& hi) {
    return max(lo, min(x, hi));
}
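
// E.g. the one-lane reduction min() of {3,1,4,1} is 1, the lane-wise min({1,5},{4,2}) ~> {1,2},
// and pin(x, Vec<4,float>(0), Vec<4,float>(1)) clamps each lane of x to [0,1], with NaN lanes pinned to 0.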

// Shuffle values from a vector pretty arbitrarily:
//    skvx::Vec<4,float> rgba = {R,G,B,A};
//    shuffle<2,1,0,3>        (rgba) ~> {B,G,R,A}
//    shuffle<2,1>            (rgba) ~> {B,G}
//    shuffle<2,1,2,1,2,1,2,1>(rgba) ~> {B,G,B,G,B,G,B,G}
//    shuffle<3,3,3,3>        (rgba) ~> {A,A,A,A}
// The only real restriction is that the output also be a legal N=power-of-two skvx::Vec.
template <int... Ix, int N, typename T>
SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
#if !defined(SKNX_NO_SIMD) && defined(__clang__)
    // TODO: can we just always use { x[Ix]... }?
    return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
#else
    return { x[Ix]... };
#endif
}

// Call map(fn, x) for a vector with fn() applied to each lane of x, { fn(x[0]), fn(x[1]), ... },
// or map(fn, x,y) for a vector of fn(x[i], y[i]), etc.

template <typename Fn, typename... Args, size_t... I>
SI auto map(std::index_sequence<I...>,
            Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {
    auto lane = [&](size_t i)
#if defined(__clang__)
                // CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
                // with errors like "control flow integrity check for type 'float (float)
                // noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
                // here". But we can be quite sure fn is the right type: it's all inferred!
                // So, stifle CFI in this function.
                __attribute__((no_sanitize("cfi")))
#endif
                { return fn(args[i]...); };

    return { lane(I)... };
}

template <typename Fn, int N, typename T, typename... Rest>
auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
    // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.
    return map(std::make_index_sequence<N>{}, fn, first,rest...);
}
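
// For example, a rough sketch of mapping a custom scalar function over each lane (fn and v are illustrative):
//    auto clamped_sqrt = [](float x) { return x < 0 ? 0.0f : sqrtf(x); };
//    skvx::Vec<4,float> r = map(clamped_sqrt, v);   // r[i] == clamped_sqrt(v[i])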

SIN Vec<N,float>  ceil(const Vec<N,float>& x) { return map( ceilf, x); }
SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(floorf, x); }
SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(truncf, x); }
SIN Vec<N,float> round(const Vec<N,float>& x) { return map(roundf, x); }
SIN Vec<N,float>  sqrt(const Vec<N,float>& x) { return map( sqrtf, x); }
SIN Vec<N,float>   abs(const Vec<N,float>& x) { return map( fabsf, x); }
SIN Vec<N,float>   fma(const Vec<N,float>& x,
                       const Vec<N,float>& y,
                       const Vec<N,float>& z) {
    // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
    auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
    return map(fn, x,y,z);
}

SI Vec<1,int> lrint(const Vec<1,float>& x) {
    return (int)lrintf(x.val);
}
SIN Vec<N,int> lrint(const Vec<N,float>& x) {
#if defined(__AVX__)
    if /*constexpr*/ (N == 8) {
        return unchecked_bit_pun<Vec<N,int>>(_mm256_cvtps_epi32(unchecked_bit_pun<__m256>(x)));
    }
#endif
#if defined(__SSE__)
    if /*constexpr*/ (N == 4) {
        return unchecked_bit_pun<Vec<N,int>>(_mm_cvtps_epi32(unchecked_bit_pun<__m128>(x)));
    }
#endif
    return join(lrint(x.lo),
                lrint(x.hi));
}

SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }

// The default logic for to_half/from_half is borrowed from skcms,
// and assumes inputs are finite and treat/flush denorm half floats as/to zero.
// Key constants to watch for:
//   - a float is 32-bit, 1-8-23 sign-exponent-mantissa, with 127 exponent bias;
//   - a half  is 16-bit, 1-5-10 sign-exponent-mantissa, with  15 exponent bias.
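// As a sanity-check example, 1.0f is 0x3F80'0000 (s=0, e=127, m=0), and the half it maps to is
// 0x3C00 (s=0, e=15, m=0): (0x3F80'0000>>13) - ((127-15)<<10) == 0x1FC00 - 0x1C000 == 0x3C00.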
SIN Vec<N,uint16_t> to_half_finite_ftz(const Vec<N,float>& x) {
    Vec<N,uint32_t> sem = bit_pun<Vec<N,uint32_t>>(x),
                    s   = sem & 0x8000'0000,
                    em  = sem ^ s,
                    is_denorm = em < 0x3880'0000;
    return cast<uint16_t>(if_then_else(is_denorm, Vec<N,uint32_t>(0)
                                                , (s>>16) + (em>>13) - ((127-15)<<10)));
}
SIN Vec<N,float> from_half_finite_ftz(const Vec<N,uint16_t>& x) {
    Vec<N,uint32_t> wide = cast<uint32_t>(x),
                    s    = wide & 0x8000,
                    em   = wide ^ s;
    auto is_denorm = bit_pun<Vec<N,int32_t>>(em < 0x0400);
    return if_then_else(is_denorm, Vec<N,float>(0)
                                 , bit_pun<Vec<N,float>>( (s<<16) + (em<<13) + ((127-15)<<23) ));
}

// Like if_then_else(), these N=1 base cases won't actually be used unless explicitly called.
SI Vec<1,uint16_t> to_half(const Vec<1,float>& x) { return to_half_finite_ftz(x); }
SI Vec<1,float> from_half(const Vec<1,uint16_t>& x) { return from_half_finite_ftz(x); }

SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
#if defined(__F16C__)
    if /*constexpr*/ (N == 8) {
        return unchecked_bit_pun<Vec<N,uint16_t>>(_mm256_cvtps_ph(unchecked_bit_pun<__m256>(x),
                                                                  _MM_FROUND_CUR_DIRECTION));
    }
#endif
#if defined(__aarch64__)
    if /*constexpr*/ (N == 4) {
        return unchecked_bit_pun<Vec<N,uint16_t>>(vcvt_f16_f32(unchecked_bit_pun<float32x4_t>(x)));
    }
#endif
    if /*constexpr*/ (N > 4) {
        return join(to_half(x.lo),
                    to_half(x.hi));
    }
    return to_half_finite_ftz(x);
}

SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
#if defined(__F16C__)
    if /*constexpr*/ (N == 8) {
        return unchecked_bit_pun<Vec<N,float>>(_mm256_cvtph_ps(unchecked_bit_pun<__m128i>(x)));
    }
#endif
#if defined(__aarch64__)
    if /*constexpr*/ (N == 4) {
        return unchecked_bit_pun<Vec<N,float>>(vcvt_f32_f16(unchecked_bit_pun<float16x4_t>(x)));
    }
#endif
    if /*constexpr*/ (N > 4) {
        return join(from_half(x.lo),
                    from_half(x.hi));
    }
    return from_half_finite_ftz(x);
}

// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
    return cast<uint8_t>( (x+127)/255 );
}
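
// E.g. div255(Vec<N,uint16_t>(510)) ~> 2 in every lane, since (510+127)/255 == 2.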

// approx_scale(x,y) approximates div255(cast<uint16_t>(x)*cast<uint16_t>(y)) within a bit,
// and is always perfect when x or y is 0 or 255.
SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
    // All of (x*y+x)/256, (x*y+y)/256, and (x*y+255)/256 meet the criteria above.
    // We happen to have historically picked (x*y+x)/256.
    auto X = cast<uint16_t>(x),
         Y = cast<uint16_t>(y);
    return cast<uint8_t>( (X*Y+X)/256 );
}

#if !defined(SKNX_NO_SIMD) && defined(__ARM_NEON)
    // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
    SI Vec<8,uint16_t> mull(const Vec<8,uint8_t>& x,
                            const Vec<8,uint8_t>& y) {
        return to_vec<8,uint16_t>(vmull_u8(to_vext(x),
                                           to_vext(y)));
    }

    SIN std::enable_if_t<(N < 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
                                                        const Vec<N,uint8_t>& y) {
        // N < 8 --> double up data until N == 8, returning the part we need.
        return mull(join(x,x),
                    join(y,y)).lo;
    }

    SIN std::enable_if_t<(N > 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
                                                        const Vec<N,uint8_t>& y) {
        // N > 8 --> usual join(lo,hi) strategy to recurse down to N == 8.
        return join(mull(x.lo, y.lo),
                    mull(x.hi, y.hi));
    }
#else
    // Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
    SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
                             const Vec<N,uint8_t>& y) {
        return cast<uint16_t>(x)
             * cast<uint16_t>(y);
    }
#endif

}  // namespace skvx

#undef SINTU
#undef SINT
#undef SIN
#undef SIT
#undef SI
#undef SKVX_ALWAYS_INLINE

#endif//SKVX_DEFINED