Add SkNx_cast().

SkNx_cast() can cast between any of our vector types,
provided they have the same number of elements.

Any pair of types should work with the default implementation,
and we can drop in specializations as needed, like the SSE and
NEON Sk4f -> Sk4i specializations I included here as examples.
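
For example, casting four floats to four ints via the Sk4f and
Sk4i aliases (a sketch; the expected values match the new
SkNx_cast test in this CL):

    Sk4f fs(-1.7f, -1.4f, 0.5f, 1.9f);
    Sk4i is = SkNx_cast<int>(fs);  // truncates toward zero: -1, -1, 0, 1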

To make this work, I made some internal name changes:
    SkNi<N,T> -> SkNx<N, T>
    SkNf<N>   -> SkNx<N, float>
User aliases (Sk4f, Sk16b, etc.) stay the same.
We can land this first (it's PS1) if that makes things easier.
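
A quick sketch of that source compatibility, assuming only the
rename lands (the aliases point at the new type, so user code is
untouched):

    Sk4f v(1, 2, 3, 4);      // user spelling, same as before
    SkNx<4, float> w = v;    // the same type under its new name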

BUG=skia:
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1464623002
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 89ffa4a..b9b6770 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -27,19 +27,19 @@
 // The default implementations just fall back on a pair of size N/2.
 
 template <int N, typename T>
-class SkNi {
+class SkNx {
 public:
-    SkNi() {}
-    SkNi(const SkNi<N/2, T>& lo, const SkNi<N/2, T>& hi) : fLo(lo), fHi(hi) {}
-    SkNi(T val) : fLo(val), fHi(val) {}
-    static SkNi Load(const T vals[N]) {
-        return SkNi(SkNi<N/2,T>::Load(vals), SkNi<N/2,T>::Load(vals+N/2));
+    SkNx() {}
+    SkNx(const SkNx<N/2, T>& lo, const SkNx<N/2, T>& hi) : fLo(lo), fHi(hi) {}
+    SkNx(T val) : fLo(val), fHi(val) {}
+    static SkNx Load(const T vals[N]) {
+        return SkNx(SkNx<N/2,T>::Load(vals), SkNx<N/2,T>::Load(vals+N/2));
     }
 
-    SkNi(T a, T b)                                : fLo(a),       fHi(b)       { REQUIRE(N==2); }
-    SkNi(T a, T b, T c, T d)                      : fLo(a,b),     fHi(c,d)     { REQUIRE(N==4); }
-    SkNi(T a, T b, T c, T d,  T e, T f, T g, T h) : fLo(a,b,c,d), fHi(e,f,g,h) { REQUIRE(N==8); }
-    SkNi(T a, T b, T c, T d,  T e, T f, T g, T h,
+    SkNx(T a, T b)                                : fLo(a),       fHi(b)       { REQUIRE(N==2); }
+    SkNx(T a, T b, T c, T d)                      : fLo(a,b),     fHi(c,d)     { REQUIRE(N==4); }
+    SkNx(T a, T b, T c, T d,  T e, T f, T g, T h) : fLo(a,b,c,d), fHi(e,f,g,h) { REQUIRE(N==8); }
+    SkNx(T a, T b, T c, T d,  T e, T f, T g, T h,
          T i, T j, T k, T l,  T m, T n, T o, T p)
         : fLo(a,b,c,d, e,f,g,h), fHi(i,j,k,l, m,n,o,p) { REQUIRE(N==16); }
 
@@ -48,21 +48,21 @@
         fHi.store(vals+N/2);
     }
 
-    SkNi saturatedAdd(const SkNi& o) const {
-        return SkNi(fLo.saturatedAdd(o.fLo), fHi.saturatedAdd(o.fHi));
+    SkNx saturatedAdd(const SkNx& o) const {
+        return SkNx(fLo.saturatedAdd(o.fLo), fHi.saturatedAdd(o.fHi));
     }
 
-    SkNi operator + (const SkNi& o) const { return SkNi(fLo + o.fLo, fHi + o.fHi); }
-    SkNi operator - (const SkNi& o) const { return SkNi(fLo - o.fLo, fHi - o.fHi); }
-    SkNi operator * (const SkNi& o) const { return SkNi(fLo * o.fLo, fHi * o.fHi); }
+    SkNx operator + (const SkNx& o) const { return SkNx(fLo + o.fLo, fHi + o.fHi); }
+    SkNx operator - (const SkNx& o) const { return SkNx(fLo - o.fLo, fHi - o.fHi); }
+    SkNx operator * (const SkNx& o) const { return SkNx(fLo * o.fLo, fHi * o.fHi); }
 
-    SkNi operator << (int bits) const { return SkNi(fLo << bits, fHi << bits); }
-    SkNi operator >> (int bits) const { return SkNi(fLo >> bits, fHi >> bits); }
+    SkNx operator << (int bits) const { return SkNx(fLo << bits, fHi << bits); }
+    SkNx operator >> (int bits) const { return SkNx(fLo >> bits, fHi >> bits); }
 
-    static SkNi Min(const SkNi& a, const SkNi& b) {
-        return SkNi(SkNi<N/2, T>::Min(a.fLo, b.fLo), SkNi<N/2, T>::Min(a.fHi, b.fHi));
+    static SkNx Min(const SkNx& a, const SkNx& b) {
+        return SkNx(SkNx<N/2, T>::Min(a.fLo, b.fLo), SkNx<N/2, T>::Min(a.fHi, b.fHi));
     }
-    SkNi operator < (const SkNi& o) const { return SkNi(fLo < o.fLo, fHi < o.fHi); }
+    SkNx operator < (const SkNx& o) const { return SkNx(fLo < o.fLo, fHi < o.fHi); }
 
     template <int k> T kth() const {
         SkASSERT(0 <= k && k < N);
@@ -71,34 +71,34 @@
 
     bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
     bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
-    SkNi thenElse(const SkNi& t, const SkNi& e) const {
-        return SkNi(fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi));
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
+        return SkNx(fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi));
     }
 
 protected:
     REQUIRE(0 == (N & (N-1)));
 
-    SkNi<N/2, T> fLo, fHi;
+    SkNx<N/2, T> fLo, fHi;
 };
 
 template <int N>
-class SkNf {
+class SkNx<N,float> {
 public:
-    SkNf() {}
-    SkNf(float val) : fLo(val),  fHi(val) {}
-    static SkNf Load(const float vals[N]) {
-        return SkNf(SkNf<N/2>::Load(vals), SkNf<N/2>::Load(vals+N/2));
+    SkNx() {}
+    SkNx(float val) : fLo(val),  fHi(val) {}
+    static SkNx Load(const float vals[N]) {
+        return SkNx(SkNx<N/2, float>::Load(vals), SkNx<N/2, float>::Load(vals+N/2));
     }
     // FromBytes() and toBytes() specializations may assume their argument is N-byte aligned.
     // E.g. Sk4f::FromBytes() may assume it's reading from a 4-byte-aligned pointer.
     // Converts [0,255] bytes to [0.0, 255.0] floats.
-    static SkNf FromBytes(const uint8_t bytes[N]) {
-        return SkNf(SkNf<N/2>::FromBytes(bytes), SkNf<N/2>::FromBytes(bytes+N/2));
+    static SkNx FromBytes(const uint8_t bytes[N]) {
+        return SkNx(SkNx<N/2, float>::FromBytes(bytes), SkNx<N/2, float>::FromBytes(bytes+N/2));
     }
 
-    SkNf(float a, float b)                   : fLo(a),   fHi(b)   { REQUIRE(N==2); }
-    SkNf(float a, float b, float c, float d) : fLo(a,b), fHi(c,d) { REQUIRE(N==4); }
-    SkNf(float a, float b, float c, float d, float e, float f, float g, float h)
+    SkNx(float a, float b)                   : fLo(a),   fHi(b)   { REQUIRE(N==2); }
+    SkNx(float a, float b, float c, float d) : fLo(a,b), fHi(c,d) { REQUIRE(N==4); }
+    SkNx(float a, float b, float c, float d, float e, float f, float g, float h)
         : fLo(a,b,c,d)
         , fHi(e,f,g,h) { REQUIRE(N==8); }
 
@@ -113,34 +113,34 @@
         fHi.toBytes(bytes+N/2);
     }
 
-    SkNf operator + (const SkNf& o) const { return SkNf(fLo + o.fLo, fHi + o.fHi); }
-    SkNf operator - (const SkNf& o) const { return SkNf(fLo - o.fLo, fHi - o.fHi); }
-    SkNf operator * (const SkNf& o) const { return SkNf(fLo * o.fLo, fHi * o.fHi); }
-    SkNf operator / (const SkNf& o) const { return SkNf(fLo / o.fLo, fHi / o.fHi); }
+    SkNx operator + (const SkNx& o) const { return SkNx(fLo + o.fLo, fHi + o.fHi); }
+    SkNx operator - (const SkNx& o) const { return SkNx(fLo - o.fLo, fHi - o.fHi); }
+    SkNx operator * (const SkNx& o) const { return SkNx(fLo * o.fLo, fHi * o.fHi); }
+    SkNx operator / (const SkNx& o) const { return SkNx(fLo / o.fLo, fHi / o.fHi); }
 
-    SkNf operator == (const SkNf& o) const { return SkNf(fLo == o.fLo, fHi == o.fHi); }
-    SkNf operator != (const SkNf& o) const { return SkNf(fLo != o.fLo, fHi != o.fHi); }
-    SkNf operator  < (const SkNf& o) const { return SkNf(fLo  < o.fLo, fHi  < o.fHi); }
-    SkNf operator  > (const SkNf& o) const { return SkNf(fLo  > o.fLo, fHi  > o.fHi); }
-    SkNf operator <= (const SkNf& o) const { return SkNf(fLo <= o.fLo, fHi <= o.fHi); }
-    SkNf operator >= (const SkNf& o) const { return SkNf(fLo >= o.fLo, fHi >= o.fHi); }
+    SkNx operator == (const SkNx& o) const { return SkNx(fLo == o.fLo, fHi == o.fHi); }
+    SkNx operator != (const SkNx& o) const { return SkNx(fLo != o.fLo, fHi != o.fHi); }
+    SkNx operator  < (const SkNx& o) const { return SkNx(fLo  < o.fLo, fHi  < o.fHi); }
+    SkNx operator  > (const SkNx& o) const { return SkNx(fLo  > o.fLo, fHi  > o.fHi); }
+    SkNx operator <= (const SkNx& o) const { return SkNx(fLo <= o.fLo, fHi <= o.fHi); }
+    SkNx operator >= (const SkNx& o) const { return SkNx(fLo >= o.fLo, fHi >= o.fHi); }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) {
-        return SkNf(SkNf<N/2>::Min(l.fLo, r.fLo), SkNf<N/2>::Min(l.fHi, r.fHi));
+    static SkNx Min(const SkNx& l, const SkNx& r) {
+        return SkNx(SkNx<N/2, float>::Min(l.fLo, r.fLo), SkNx<N/2, float>::Min(l.fHi, r.fHi));
     }
-    static SkNf Max(const SkNf& l, const SkNf& r) {
-        return SkNf(SkNf<N/2>::Max(l.fLo, r.fLo), SkNf<N/2>::Max(l.fHi, r.fHi));
+    static SkNx Max(const SkNx& l, const SkNx& r) {
+        return SkNx(SkNx<N/2, float>::Max(l.fLo, r.fLo), SkNx<N/2, float>::Max(l.fHi, r.fHi));
     }
 
-    SkNf  sqrt() const { return SkNf(fLo. sqrt(), fHi. sqrt()); }
+    SkNx  sqrt() const { return SkNx(fLo. sqrt(), fHi. sqrt()); }
 
     // Generally, increasing precision, increasing cost.
-    SkNf rsqrt0() const { return SkNf(fLo.rsqrt0(), fHi.rsqrt0()); }
-    SkNf rsqrt1() const { return SkNf(fLo.rsqrt1(), fHi.rsqrt1()); }
-    SkNf rsqrt2() const { return SkNf(fLo.rsqrt2(), fHi.rsqrt2()); }
+    SkNx rsqrt0() const { return SkNx(fLo.rsqrt0(), fHi.rsqrt0()); }
+    SkNx rsqrt1() const { return SkNx(fLo.rsqrt1(), fHi.rsqrt1()); }
+    SkNx rsqrt2() const { return SkNx(fLo.rsqrt2(), fHi.rsqrt2()); }
 
-    SkNf       invert() const { return SkNf(fLo.      invert(), fHi.      invert()); }
-    SkNf approxInvert() const { return SkNf(fLo.approxInvert(), fHi.approxInvert()); }
+    SkNx       invert() const { return SkNx(fLo.      invert(), fHi.      invert()); }
+    SkNx approxInvert() const { return SkNx(fLo.approxInvert(), fHi.approxInvert()); }
 
     template <int k> float kth() const {
         SkASSERT(0 <= k && k < N);
@@ -149,44 +149,44 @@
 
     bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
     bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
-    SkNf thenElse(const SkNf& t, const SkNf& e) const {
-        return SkNf(fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi));
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
+        return SkNx(fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi));
     }
 
 protected:
     REQUIRE(0 == (N & (N-1)));
-    SkNf(const SkNf<N/2>& lo, const SkNf<N/2>& hi) : fLo(lo), fHi(hi) {}
+    SkNx(const SkNx<N/2, float>& lo, const SkNx<N/2, float>& hi) : fLo(lo), fHi(hi) {}
 
-    SkNf<N/2> fLo, fHi;
+    SkNx<N/2, float> fLo, fHi;
 };
 
 
 // Bottom out the default implementations with scalars when nothing's been specialized.
 
 template <typename T>
-class SkNi<1,T> {
+class SkNx<1,T> {
 public:
-    SkNi() {}
-    SkNi(T val) : fVal(val) {}
-    static SkNi Load(const T vals[1]) { return SkNi(vals[0]); }
+    SkNx() {}
+    SkNx(T val) : fVal(val) {}
+    static SkNx Load(const T vals[1]) { return SkNx(vals[0]); }
 
     void store(T vals[1]) const { vals[0] = fVal; }
 
-    SkNi saturatedAdd(const SkNi& o) const {
+    SkNx saturatedAdd(const SkNx& o) const {
         SkASSERT((T)(~0) > 0); // TODO: support signed T
         T sum = fVal + o.fVal;
-        return SkNi(sum < fVal ? (T)(~0) : sum);
+        return SkNx(sum < fVal ? (T)(~0) : sum);
     }
 
-    SkNi operator + (const SkNi& o) const { return SkNi(fVal + o.fVal); }
-    SkNi operator - (const SkNi& o) const { return SkNi(fVal - o.fVal); }
-    SkNi operator * (const SkNi& o) const { return SkNi(fVal * o.fVal); }
+    SkNx operator + (const SkNx& o) const { return SkNx(fVal + o.fVal); }
+    SkNx operator - (const SkNx& o) const { return SkNx(fVal - o.fVal); }
+    SkNx operator * (const SkNx& o) const { return SkNx(fVal * o.fVal); }
 
-    SkNi operator << (int bits) const { return SkNi(fVal << bits); }
-    SkNi operator >> (int bits) const { return SkNi(fVal >> bits); }
+    SkNx operator << (int bits) const { return SkNx(fVal << bits); }
+    SkNx operator >> (int bits) const { return SkNx(fVal >> bits); }
 
-    static SkNi Min(const SkNi& a, const SkNi& b) { return SkNi(SkTMin(a.fVal, b.fVal)); }
-    SkNi operator <(const SkNi& o) const { return SkNi(fVal < o.fVal); }
+    static SkNx Min(const SkNx& a, const SkNx& b) { return SkNx(SkTMin(a.fVal, b.fVal)); }
+    SkNx operator <(const SkNx& o) const { return SkNx(fVal < o.fVal); }
 
     template <int k> T kth() const {
         SkASSERT(0 == k);
@@ -195,45 +195,45 @@
 
     bool allTrue() const { return fVal; }
     bool anyTrue() const { return fVal; }
-    SkNi thenElse(const SkNi& t, const SkNi& e) const { return fVal ? t : e; }
+    SkNx thenElse(const SkNx& t, const SkNx& e) const { return fVal ? t : e; }
 
 protected:
     T fVal;
 };
 
 template <>
-class SkNf<1> {
+class SkNx<1,float> {
 public:
-    SkNf() {}
-    SkNf(float val) : fVal(val) {}
-    static SkNf Load(const float vals[1]) { return SkNf(vals[0]); }
-    static SkNf FromBytes(const uint8_t bytes[1]) { return SkNf((float)bytes[0]); }
+    SkNx() {}
+    SkNx(float val) : fVal(val) {}
+    static SkNx Load(const float vals[1]) { return SkNx(vals[0]); }
+    static SkNx FromBytes(const uint8_t bytes[1]) { return SkNx((float)bytes[0]); }
 
     void store(float vals[1]) const { vals[0] = fVal; }
     void toBytes(uint8_t bytes[1]) const { bytes[0] = (uint8_t)(SkTMin(fVal, 255.0f)); }
 
-    SkNf operator + (const SkNf& o) const { return SkNf(fVal + o.fVal); }
-    SkNf operator - (const SkNf& o) const { return SkNf(fVal - o.fVal); }
-    SkNf operator * (const SkNf& o) const { return SkNf(fVal * o.fVal); }
-    SkNf operator / (const SkNf& o) const { return SkNf(fVal / o.fVal); }
+    SkNx operator + (const SkNx& o) const { return SkNx(fVal + o.fVal); }
+    SkNx operator - (const SkNx& o) const { return SkNx(fVal - o.fVal); }
+    SkNx operator * (const SkNx& o) const { return SkNx(fVal * o.fVal); }
+    SkNx operator / (const SkNx& o) const { return SkNx(fVal / o.fVal); }
 
-    SkNf operator == (const SkNf& o) const { return SkNf(fVal == o.fVal); }
-    SkNf operator != (const SkNf& o) const { return SkNf(fVal != o.fVal); }
-    SkNf operator  < (const SkNf& o) const { return SkNf(fVal  < o.fVal); }
-    SkNf operator  > (const SkNf& o) const { return SkNf(fVal  > o.fVal); }
-    SkNf operator <= (const SkNf& o) const { return SkNf(fVal <= o.fVal); }
-    SkNf operator >= (const SkNf& o) const { return SkNf(fVal >= o.fVal); }
+    SkNx operator == (const SkNx& o) const { return SkNx(fVal == o.fVal); }
+    SkNx operator != (const SkNx& o) const { return SkNx(fVal != o.fVal); }
+    SkNx operator  < (const SkNx& o) const { return SkNx(fVal  < o.fVal); }
+    SkNx operator  > (const SkNx& o) const { return SkNx(fVal  > o.fVal); }
+    SkNx operator <= (const SkNx& o) const { return SkNx(fVal <= o.fVal); }
+    SkNx operator >= (const SkNx& o) const { return SkNx(fVal >= o.fVal); }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) { return SkNf(SkTMin(l.fVal, r.fVal)); }
-    static SkNf Max(const SkNf& l, const SkNf& r) { return SkNf(SkTMax(l.fVal, r.fVal)); }
+    static SkNx Min(const SkNx& l, const SkNx& r) { return SkNx(SkTMin(l.fVal, r.fVal)); }
+    static SkNx Max(const SkNx& l, const SkNx& r) { return SkNx(SkTMax(l.fVal, r.fVal)); }
 
-    SkNf  sqrt() const { return SkNf(sqrtf(fVal));        }
-    SkNf rsqrt0() const { return SkNf(1.0f / sqrtf(fVal)); }
-    SkNf rsqrt1() const { return this->rsqrt0(); }
-    SkNf rsqrt2() const { return this->rsqrt1(); }
+    SkNx  sqrt() const { return SkNx(sqrtf(fVal));        }
+    SkNx rsqrt0() const { return SkNx(1.0f / sqrtf(fVal)); }
+    SkNx rsqrt1() const { return this->rsqrt0(); }
+    SkNx rsqrt2() const { return this->rsqrt1(); }
 
-    SkNf       invert() const { return SkNf(1.0f / fVal); }
-    SkNf approxInvert() const { return this->invert();    }
+    SkNx       invert() const { return SkNx(1.0f / fVal); }
+    SkNx approxInvert() const { return this->invert();    }
 
     template <int k> float kth() const {
         SkASSERT(k == 0);
@@ -242,7 +242,7 @@
 
     bool allTrue() const { return this->pun() != 0; }
     bool anyTrue() const { return this->pun() != 0; }
-    SkNf thenElse(const SkNf& t, const SkNf& e) const { return this->pun() ? t : e; }
+    SkNx thenElse(const SkNx& t, const SkNx& e) const { return this->pun() ? t : e; }
 
 protected:
     uint32_t pun() const {
@@ -255,19 +255,45 @@
 
 // This default implementation can be specialized by ../opts/SkNx_foo.h
 // if there's a better platform-specific shuffle strategy.
-template <typename SkNx, int... Ix>
-inline SkNx SkNx_shuffle_impl(const SkNx& src) { return SkNx( src.template kth<Ix>()... ); }
+template <typename Nx, int... Ix>
+inline Nx SkNx_shuffle_impl(const Nx& src) { return Nx( src.template kth<Ix>()... ); }
 
-// This generic shuffle can be called on either SkNi or SkNf with 1 or N indices:
+// This generic shuffle can be called with 1 or N indices:
 //     Sk4f f(a,b,c,d);
 //     SkNx_shuffle<3>(f);        // ~~~> Sk4f(d,d,d,d)
 //     SkNx_shuffle<2,1,0,3>(f);  // ~~~> Sk4f(c,b,a,d)
-template <int... Ix, typename SkNx>
-inline SkNx SkNx_shuffle(const SkNx& src) { return SkNx_shuffle_impl<SkNx, Ix...>(src); }
+template <int... Ix, typename Nx>
+inline Nx SkNx_shuffle(const Nx& src) { return SkNx_shuffle_impl<Nx, Ix...>(src); }
 
 // A reminder alias that shuffles can be used to duplicate a single index across a vector.
-template <int Ix, typename SkNx>
-inline SkNx SkNx_dup(const SkNx& src) { return SkNx_shuffle<Ix>(src); }
+template <int Ix, typename Nx>
+inline Nx SkNx_dup(const Nx& src) { return SkNx_shuffle<Ix>(src); }
+
+// This is a poor-man's std::make_index_sequence from C++14.
+// I'd implement it fully, but it hurts my head.
+template <int...> struct SkIntSequence {};
+template <int N> struct MakeSkIntSequence;
+template <> struct MakeSkIntSequence< 1> : SkIntSequence<0                                    >{};
+template <> struct MakeSkIntSequence< 2> : SkIntSequence<0,1                                  >{};
+template <> struct MakeSkIntSequence< 4> : SkIntSequence<0,1,2,3                              >{};
+template <> struct MakeSkIntSequence< 8> : SkIntSequence<0,1,2,3,4,5,6,7                      >{};
+template <> struct MakeSkIntSequence<16> : SkIntSequence<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>{};
+
+// This is the default/fallback implementation for SkNx_cast.  Best to specialize SkNx_cast!
+template <typename D, typename S, int N, int... Ix>
+SkNx<N,D> SkNx_cast_fallback(const SkNx<N,S>& src, SkIntSequence<Ix...>) {
+    return SkNx<N,D>( (D)src.template kth<Ix>()... );
+}
+
+// This is a generic cast between two SkNx with the same number of elements N.  E.g.
+//   Sk4b bs = ...;                    // Load 4 bytes.
+//   Sk4f fs = SkNx_cast<float>(bs);   // (This will replace SkNf::FromBytes() one day.)
+//   Sk4i is = SkNx_cast<int>(fs);     // Cast each float to int.
+// This can be specialized in ../opts/SkNx_foo.h if there's a better platform-specific cast.
+template <typename D, typename S, int N>
+SkNx<N,D> SkNx_cast(const SkNx<N,S>& src) {
+    return SkNx_cast_fallback<D,S,N>(src, MakeSkIntSequence<N>());
+}
 
 }  // namespace
 
@@ -285,15 +311,17 @@
 
 #undef REQUIRE
 
-typedef SkNf<2> Sk2f;
-typedef SkNf<2> Sk2s;
-typedef SkNf<4> Sk4f;
-typedef SkNf<4> Sk4s;
-typedef SkNf<8> Sk8f;
-typedef SkNf<8> Sk8s;
+typedef SkNx<2, float> Sk2f;
+typedef SkNx<2, float> Sk2s;
+typedef SkNx<4, float> Sk4f;
+typedef SkNx<4, float> Sk4s;
+typedef SkNx<8, float> Sk8f;
+typedef SkNx<8, float> Sk8s;
 
-typedef SkNi<8,  uint16_t> Sk8h;
-typedef SkNi<16, uint16_t> Sk16h;
-typedef SkNi<16, uint8_t>  Sk16b;
+typedef SkNx<8,  uint16_t> Sk8h;
+typedef SkNx<16, uint16_t> Sk16h;
+typedef SkNx<16, uint8_t>  Sk16b;
+
+typedef SkNx<4, int> Sk4i;
 
 #endif//SkNx_DEFINED
diff --git a/src/opts/SkNx_avx.h b/src/opts/SkNx_avx.h
index 86caac2..9697303 100644
--- a/src/opts/SkNx_avx.h
+++ b/src/opts/SkNx_avx.h
@@ -16,15 +16,15 @@
 namespace {  // See SkNx.h
 
 template <>
-class SkNf<8> {
+class SkNx<8, float> {
 public:
-    SkNf(const __m256& vec) : fVec(vec) {}
+    SkNx(const __m256& vec) : fVec(vec) {}
 
-    SkNf() {}
-    SkNf(float val) : fVec(_mm256_set1_ps(val)) {}
-    static SkNf Load(const float vals[8]) { return _mm256_loadu_ps(vals); }
+    SkNx() {}
+    SkNx(float val) : fVec(_mm256_set1_ps(val)) {}
+    static SkNx Load(const float vals[8]) { return _mm256_loadu_ps(vals); }
 
-    static SkNf FromBytes(const uint8_t bytes[8]) {
+    static SkNx FromBytes(const uint8_t bytes[8]) {
         __m128i fix8  = _mm_loadl_epi64((const __m128i*)bytes),
                 fix16 = _mm_unpacklo_epi8 (fix8 , _mm_setzero_si128()),
                  lo32 = _mm_unpacklo_epi16(fix16, _mm_setzero_si128()),
@@ -33,7 +33,7 @@
         return _mm256_cvtepi32_ps(fix32);
     }
 
-    SkNf(float a, float b, float c, float d,
+    SkNx(float a, float b, float c, float d,
          float e, float f, float g, float h) : fVec(_mm256_setr_ps(a,b,c,d,e,f,g,h)) {}
 
     void store(float vals[8]) const { _mm256_storeu_ps(vals, fVec); }
@@ -46,28 +46,28 @@
         _mm_storel_epi64((__m128i*)bytes, fix8);
     }
 
-    SkNf operator + (const SkNf& o) const { return _mm256_add_ps(fVec, o.fVec); }
-    SkNf operator - (const SkNf& o) const { return _mm256_sub_ps(fVec, o.fVec); }
-    SkNf operator * (const SkNf& o) const { return _mm256_mul_ps(fVec, o.fVec); }
-    SkNf operator / (const SkNf& o) const { return _mm256_div_ps(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return _mm256_add_ps(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm256_sub_ps(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return _mm256_mul_ps(fVec, o.fVec); }
+    SkNx operator / (const SkNx& o) const { return _mm256_div_ps(fVec, o.fVec); }
 
-    SkNf operator == (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_EQ_OQ); }
-    SkNf operator != (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_NEQ_OQ); }
-    SkNf operator  < (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LT_OQ); }
-    SkNf operator  > (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GT_OQ); }
-    SkNf operator <= (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LE_OQ); }
-    SkNf operator >= (const SkNf& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GE_OQ); }
+    SkNx operator == (const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_EQ_OQ); }
+    SkNx operator != (const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_NEQ_OQ); }
+    SkNx operator  < (const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LT_OQ); }
+    SkNx operator  > (const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GT_OQ); }
+    SkNx operator <= (const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_LE_OQ); }
+    SkNx operator >= (const SkNx& o) const { return _mm256_cmp_ps(fVec, o.fVec, _CMP_GE_OQ); }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) { return _mm256_min_ps(l.fVec, r.fVec); }
-    static SkNf Max(const SkNf& l, const SkNf& r) { return _mm256_max_ps(l.fVec, r.fVec); }
+    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm256_min_ps(l.fVec, r.fVec); }
+    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm256_max_ps(l.fVec, r.fVec); }
 
-    SkNf  sqrt() const { return _mm256_sqrt_ps (fVec);  }
-    SkNf rsqrt0() const { return _mm256_rsqrt_ps(fVec); }
-    SkNf rsqrt1() const { return this->rsqrt0(); }
-    SkNf rsqrt2() const { return this->rsqrt1(); }
+    SkNx  sqrt() const { return _mm256_sqrt_ps (fVec);  }
+    SkNx rsqrt0() const { return _mm256_rsqrt_ps(fVec); }
+    SkNx rsqrt1() const { return this->rsqrt0(); }
+    SkNx rsqrt2() const { return this->rsqrt1(); }
 
-    SkNf       invert() const { return SkNf(1) / *this; }
-    SkNf approxInvert() const { return _mm256_rcp_ps(fVec); }
+    SkNx       invert() const { return SkNx(1) / *this; }
+    SkNx approxInvert() const { return _mm256_rcp_ps(fVec); }
 
     template <int k> float kth() const {
         SkASSERT(0 <= k && k < 8);
@@ -78,7 +78,7 @@
     bool allTrue() const { return 0xff == _mm256_movemask_ps(fVec); }
     bool anyTrue() const { return 0x00 != _mm256_movemask_ps(fVec); }
 
-    SkNf thenElse(const SkNf& t, const SkNf& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm256_blendv_ps(e.fVec, t.fVec, fVec);
     }
 
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 43358a2..a03f0be 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -33,32 +33,32 @@
     case 31: return op(v, 31); } return fVec
 
 template <>
-class SkNf<2> {
+class SkNx<2, float> {
 public:
-    SkNf(float32x2_t vec) : fVec(vec) {}
+    SkNx(float32x2_t vec) : fVec(vec) {}
 
-    SkNf() {}
-    SkNf(float val)           : fVec(vdup_n_f32(val)) {}
-    static SkNf Load(const float vals[2]) { return vld1_f32(vals); }
-    SkNf(float a, float b) { fVec = (float32x2_t) { a, b }; }
+    SkNx() {}
+    SkNx(float val)           : fVec(vdup_n_f32(val)) {}
+    static SkNx Load(const float vals[2]) { return vld1_f32(vals); }
+    SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }
 
     void store(float vals[2]) const { vst1_f32(vals, fVec); }
 
-    SkNf approxInvert() const {
+    SkNx approxInvert() const {
         float32x2_t est0 = vrecpe_f32(fVec),
                     est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
         return est1;
     }
-    SkNf invert() const {
+    SkNx invert() const {
         float32x2_t est1 = this->approxInvert().fVec,
                     est2 = vmul_f32(vrecps_f32(est1, fVec), est1);
         return est2;
     }
 
-    SkNf operator + (const SkNf& o) const { return vadd_f32(fVec, o.fVec); }
-    SkNf operator - (const SkNf& o) const { return vsub_f32(fVec, o.fVec); }
-    SkNf operator * (const SkNf& o) const { return vmul_f32(fVec, o.fVec); }
-    SkNf operator / (const SkNf& o) const {
+    SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
+    SkNx operator / (const SkNx& o) const {
     #if defined(SK_CPU_ARM64)
         return vdiv_f32(fVec, o.fVec);
     #else
@@ -66,29 +66,29 @@
     #endif
     }
 
-    SkNf operator == (const SkNf& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
-    SkNf operator  < (const SkNf& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
-    SkNf operator  > (const SkNf& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
-    SkNf operator <= (const SkNf& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
-    SkNf operator >= (const SkNf& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
-    SkNf operator != (const SkNf& o) const {
+    SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
+    SkNx operator  < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
+    SkNx operator  > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
+    SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
+    SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
+    SkNx operator != (const SkNx& o) const {
         return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
     }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) { return vmin_f32(l.fVec, r.fVec); }
-    static SkNf Max(const SkNf& l, const SkNf& r) { return vmax_f32(l.fVec, r.fVec); }
+    static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
+    static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }
 
-    SkNf rsqrt0() const { return vrsqrte_f32(fVec); }
-    SkNf rsqrt1() const {
+    SkNx rsqrt0() const { return vrsqrte_f32(fVec); }
+    SkNx rsqrt1() const {
         float32x2_t est0 = this->rsqrt0().fVec;
         return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
     }
-    SkNf rsqrt2() const {
+    SkNx rsqrt2() const {
         float32x2_t est1 = this->rsqrt1().fVec;
         return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
     }
 
-    SkNf sqrt() const {
+    SkNx sqrt() const {
     #if defined(SK_CPU_ARM64)
         return vsqrt_f32(fVec);
     #else
@@ -114,23 +114,23 @@
 };
 
 template <>
-class SkNi<4, int> {
+class SkNx<4, int> {
 public:
-    SkNi(const int32x4_t& vec) : fVec(vec) {}
+    SkNx(const int32x4_t& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(int val) : fVec(vdupq_n_s32(val)) {}
-    static SkNi Load(const int vals[4]) { return vld1q_s32(vals); }
-    SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }
+    SkNx() {}
+    SkNx(int val) : fVec(vdupq_n_s32(val)) {}
+    static SkNx Load(const int vals[4]) { return vld1q_s32(vals); }
+    SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }
 
     void store(int vals[4]) const { vst1q_s32(vals, fVec); }
 
-    SkNi operator + (const SkNi& o) const { return vaddq_s32(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return vsubq_s32(fVec, o.fVec); }
-    SkNi operator * (const SkNi& o) const { return vmulq_s32(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }
 
-    SkNi operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
-    SkNi operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
+    SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
+    SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
 
     template <int k> int kth() const {
         SkASSERT(0 <= k && k < 4);
@@ -141,21 +141,21 @@
 };
 
 template <>
-class SkNf<4> {
+class SkNx<4, float> {
 public:
-    SkNf(float32x4_t vec) : fVec(vec) {}
+    SkNx(float32x4_t vec) : fVec(vec) {}
 
-    SkNf() {}
-    SkNf(float val)           : fVec(vdupq_n_f32(val)) {}
-    static SkNf Load(const float vals[4]) { return vld1q_f32(vals); }
-    static SkNf FromBytes(const uint8_t vals[4]) {
+    SkNx() {}
+    SkNx(float val)           : fVec(vdupq_n_f32(val)) {}
+    static SkNx Load(const float vals[4]) { return vld1q_f32(vals); }
+    static SkNx FromBytes(const uint8_t vals[4]) {
         uint8x8_t   fix8    = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals);
         uint16x8_t  fix8_16 = vmovl_u8(fix8);
         uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-        return SkNf(vcvtq_f32_u32(fix8_32));
+        return SkNx(vcvtq_f32_u32(fix8_32));
     }
 
-    SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
+    SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
 
     void store(float vals[4]) const { vst1q_f32(vals, fVec); }
     void toBytes(uint8_t bytes[4]) const {
@@ -165,21 +165,21 @@
         vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0);
     }
 
-    SkNf approxInvert() const {
+    SkNx approxInvert() const {
         float32x4_t est0 = vrecpeq_f32(fVec),
                     est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
         return est1;
     }
-    SkNf invert() const {
+    SkNx invert() const {
         float32x4_t est1 = this->approxInvert().fVec,
                     est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1);
         return est2;
     }
 
-    SkNf operator + (const SkNf& o) const { return vaddq_f32(fVec, o.fVec); }
-    SkNf operator - (const SkNf& o) const { return vsubq_f32(fVec, o.fVec); }
-    SkNf operator * (const SkNf& o) const { return vmulq_f32(fVec, o.fVec); }
-    SkNf operator / (const SkNf& o) const {
+    SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
+    SkNx operator / (const SkNx& o) const {
     #if defined(SK_CPU_ARM64)
         return vdivq_f32(fVec, o.fVec);
     #else
@@ -187,29 +187,29 @@
     #endif
     }
 
-    SkNf operator==(const SkNf& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); }
-    SkNf operator <(const SkNf& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); }
-    SkNf operator >(const SkNf& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); }
-    SkNf operator<=(const SkNf& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); }
-    SkNf operator>=(const SkNf& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); }
-    SkNf operator!=(const SkNf& o) const {
+    SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); }
+    SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); }
+    SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); }
+    SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); }
+    SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); }
+    SkNx operator!=(const SkNx& o) const {
         return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
     }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) { return vminq_f32(l.fVec, r.fVec); }
-    static SkNf Max(const SkNf& l, const SkNf& r) { return vmaxq_f32(l.fVec, r.fVec); }
+    static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
+    static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }
 
-    SkNf rsqrt0() const { return vrsqrteq_f32(fVec); }
-    SkNf rsqrt1() const {
+    SkNx rsqrt0() const { return vrsqrteq_f32(fVec); }
+    SkNx rsqrt1() const {
         float32x4_t est0 = this->rsqrt0().fVec;
         return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
     }
-    SkNf rsqrt2() const {
+    SkNx rsqrt2() const {
         float32x4_t est1 = this->rsqrt1().fVec;
         return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
     }
 
-    SkNf sqrt() const {
+    SkNx sqrt() const {
     #if defined(SK_CPU_ARM64)
         return vsqrtq_f32(fVec);
     #else
@@ -233,7 +233,7 @@
             || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
     }
 
-    SkNf thenElse(const SkNf& t, const SkNf& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
     }
 
@@ -241,36 +241,36 @@
 };
 
 template <>
-class SkNi<8, uint16_t> {
+class SkNx<8, uint16_t> {
 public:
-    SkNi(const uint16x8_t& vec) : fVec(vec) {}
+    SkNx(const uint16x8_t& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(uint16_t val) : fVec(vdupq_n_u16(val)) {}
-    static SkNi Load(const uint16_t vals[8]) { return vld1q_u16(vals); }
+    SkNx() {}
+    SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
+    static SkNx Load(const uint16_t vals[8]) { return vld1q_u16(vals); }
 
-    SkNi(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
          uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
         fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
     }
 
     void store(uint16_t vals[8]) const { vst1q_u16(vals, fVec); }
 
-    SkNi operator + (const SkNi& o) const { return vaddq_u16(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return vsubq_u16(fVec, o.fVec); }
-    SkNi operator * (const SkNi& o) const { return vmulq_u16(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
 
-    SkNi operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); }
-    SkNi operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); }
+    SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); }
+    SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); }
 
-    static SkNi Min(const SkNi& a, const SkNi& b) { return vminq_u16(a.fVec, b.fVec); }
+    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }
 
     template <int k> uint16_t kth() const {
         SkASSERT(0 <= k && k < 8);
         return vgetq_lane_u16(fVec, k&7);
     }
 
-    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return vbslq_u16(fVec, t.fVec, e.fVec);
     }
 
@@ -278,15 +278,15 @@
 };
 
 template <>
-class SkNi<16, uint8_t> {
+class SkNx<16, uint8_t> {
 public:
-    SkNi(const uint8x16_t& vec) : fVec(vec) {}
+    SkNx(const uint8x16_t& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(uint8_t val) : fVec(vdupq_n_u8(val)) {}
-    static SkNi Load(const uint8_t vals[16]) { return vld1q_u8(vals); }
+    SkNx() {}
+    SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
+    static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); }
 
-    SkNi(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
+    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
          uint8_t e, uint8_t f, uint8_t g, uint8_t h,
          uint8_t i, uint8_t j, uint8_t k, uint8_t l,
          uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
@@ -295,20 +295,20 @@
 
     void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); }
 
-    SkNi saturatedAdd(const SkNi& o) const { return vqaddq_u8(fVec, o.fVec); }
+    SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }
 
-    SkNi operator + (const SkNi& o) const { return vaddq_u8(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return vsubq_u8(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
 
-    static SkNi Min(const SkNi& a, const SkNi& b) { return vminq_u8(a.fVec, b.fVec); }
-    SkNi operator < (const SkNi& o) const { return vcltq_u8(fVec, o.fVec); }
+    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
+    SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }
 
     template <int k> uint8_t kth() const {
         SkASSERT(0 <= k && k < 16);
         return vgetq_lane_u8(fVec, k&15);
     }
 
-    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return vbslq_u8(fVec, t.fVec, e.fVec);
     }
 
@@ -319,6 +319,11 @@
 #undef SHIFT16
 #undef SHIFT8
 
+template<>
+inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) {
+    return vcvtq_s32_f32(src.fVec);
+}
+
 }  // namespace
 
 #endif//SkNx_neon_DEFINED
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 32e94a6..f0ccd3f 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -14,41 +14,41 @@
 
 
 template <>
-class SkNf<2> {
+class SkNx<2, float> {
 public:
-    SkNf(const __m128& vec) : fVec(vec) {}
+    SkNx(const __m128& vec) : fVec(vec) {}
 
-    SkNf() {}
-    SkNf(float val) : fVec(_mm_set1_ps(val)) {}
-    static SkNf Load(const float vals[2]) {
+    SkNx() {}
+    SkNx(float val) : fVec(_mm_set1_ps(val)) {}
+    static SkNx Load(const float vals[2]) {
         return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)vals));
     }
-    SkNf(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}
+    SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}
 
     void store(float vals[2]) const { _mm_storel_pi((__m64*)vals, fVec); }
 
-    SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); }
-    SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); }
-    SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
-    SkNf operator / (const SkNf& o) const { return _mm_div_ps(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
+    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }
 
-    SkNf operator == (const SkNf& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
-    SkNf operator != (const SkNf& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
-    SkNf operator  < (const SkNf& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
-    SkNf operator  > (const SkNf& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
-    SkNf operator <= (const SkNf& o) const { return _mm_cmple_ps (fVec, o.fVec); }
-    SkNf operator >= (const SkNf& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
+    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
+    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
+    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
+    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
+    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
+    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_ps(l.fVec, r.fVec); }
-    static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }
+    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
+    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }
 
-    SkNf  sqrt() const { return _mm_sqrt_ps (fVec);  }
-    SkNf rsqrt0() const { return _mm_rsqrt_ps(fVec); }
-    SkNf rsqrt1() const { return this->rsqrt0(); }
-    SkNf rsqrt2() const { return this->rsqrt1(); }
+    SkNx  sqrt() const { return _mm_sqrt_ps (fVec);  }
+    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
+    SkNx rsqrt1() const { return this->rsqrt0(); }
+    SkNx rsqrt2() const { return this->rsqrt1(); }
 
-    SkNf       invert() const { return SkNf(1) / *this; }
-    SkNf approxInvert() const { return _mm_rcp_ps(fVec); }
+    SkNx       invert() const { return SkNx(1) / *this; }
+    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }
 
     template <int k> float kth() const {
         SkASSERT(0 <= k && k < 2);
@@ -63,28 +63,28 @@
 };
 
 template <>
-class SkNi<4, int> {
+class SkNx<4, int> {
 public:
-    SkNi(const __m128i& vec) : fVec(vec) {}
+    SkNx(const __m128i& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(int val) : fVec(_mm_set1_epi32(val)) {}
-    static SkNi Load(const int vals[4]) { return _mm_loadu_si128((const __m128i*)vals); }
-    SkNi(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
+    SkNx() {}
+    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
+    static SkNx Load(const int vals[4]) { return _mm_loadu_si128((const __m128i*)vals); }
+    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
 
     void store(int vals[4]) const { _mm_storeu_si128((__m128i*)vals, fVec); }
 
-    SkNi operator + (const SkNi& o) const { return _mm_add_epi32(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return _mm_sub_epi32(fVec, o.fVec); }
-    SkNi operator * (const SkNi& o) const {
+    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const {
         __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
                 mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
         return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                                   _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
     }
 
-    SkNi operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
-    SkNi operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
+    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
+    SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
 
     template <int k> int kth() const {
         SkASSERT(0 <= k && k < 4);
@@ -101,15 +101,15 @@
 };
 
 template <>
-class SkNf<4> {
+class SkNx<4, float> {
 public:
-    SkNf(const __m128& vec) : fVec(vec) {}
+    SkNx(const __m128& vec) : fVec(vec) {}
 
-    SkNf() {}
-    SkNf(float val)           : fVec( _mm_set1_ps(val) ) {}
-    static SkNf Load(const float vals[4]) { return _mm_loadu_ps(vals); }
+    SkNx() {}
+    SkNx(float val)           : fVec( _mm_set1_ps(val) ) {}
+    static SkNx Load(const float vals[4]) { return _mm_loadu_ps(vals); }
 
-    static SkNf FromBytes(const uint8_t bytes[4]) {
+    static SkNx FromBytes(const uint8_t bytes[4]) {
         __m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes);
     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
         const char _ = ~0;  // Zero these bytes.
@@ -118,11 +118,11 @@
         __m128i fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
                 fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
     #endif
-        return SkNf(_mm_cvtepi32_ps(fix8_32));
+        return SkNx(_mm_cvtepi32_ps(fix8_32));
         // TODO: use _mm_cvtepu8_epi32 w/SSE4.1?
     }
 
-    SkNf(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
+    SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}
 
     void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); }
     void toBytes(uint8_t bytes[4]) const {
@@ -132,28 +132,28 @@
         *(int*)bytes = _mm_cvtsi128_si32(fix8);
     }
 
-    SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); }
-    SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); }
-    SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
-    SkNf operator / (const SkNf& o) const { return _mm_div_ps(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
+    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }
 
-    SkNf operator == (const SkNf& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
-    SkNf operator != (const SkNf& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
-    SkNf operator  < (const SkNf& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
-    SkNf operator  > (const SkNf& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
-    SkNf operator <= (const SkNf& o) const { return _mm_cmple_ps (fVec, o.fVec); }
-    SkNf operator >= (const SkNf& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
+    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
+    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
+    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
+    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
+    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
+    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
 
-    static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_ps(l.fVec, r.fVec); }
-    static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }
+    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
+    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }
 
-    SkNf  sqrt() const { return _mm_sqrt_ps (fVec);  }
-    SkNf rsqrt0() const { return _mm_rsqrt_ps(fVec); }
-    SkNf rsqrt1() const { return this->rsqrt0(); }
-    SkNf rsqrt2() const { return this->rsqrt1(); }
+    SkNx  sqrt() const { return _mm_sqrt_ps (fVec);  }
+    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
+    SkNx rsqrt1() const { return this->rsqrt0(); }
+    SkNx rsqrt2() const { return this->rsqrt1(); }
 
-    SkNf       invert() const { return SkNf(1) / *this; }
-    SkNf approxInvert() const { return _mm_rcp_ps(fVec); }
+    SkNx       invert() const { return SkNx(1) / *this; }
+    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }
 
     template <int k> float kth() const {
         SkASSERT(0 <= k && k < 4);
@@ -164,7 +164,7 @@
     bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
     bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }
 
-    SkNf thenElse(const SkNf& t, const SkNf& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                          _mm_andnot_ps(fVec, e.fVec));
     }
@@ -173,23 +173,23 @@
 };
 
 template <>
-class SkNi<4, uint16_t> {
+class SkNx<4, uint16_t> {
 public:
-    SkNi(const __m128i& vec) : fVec(vec) {}
+    SkNx(const __m128i& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
-    static SkNi Load(const uint16_t vals[4]) { return _mm_loadl_epi64((const __m128i*)vals); }
-    SkNi(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}
+    SkNx() {}
+    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
+    static SkNx Load(const uint16_t vals[4]) { return _mm_loadl_epi64((const __m128i*)vals); }
+    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}
 
     void store(uint16_t vals[4]) const { _mm_storel_epi64((__m128i*)vals, fVec); }
 
-    SkNi operator + (const SkNi& o) const { return _mm_add_epi16(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return _mm_sub_epi16(fVec, o.fVec); }
-    SkNi operator * (const SkNi& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
 
-    SkNi operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
-    SkNi operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }
+    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
+    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }
 
     template <int k> uint16_t kth() const {
         SkASSERT(0 <= k && k < 4);
@@ -200,26 +200,26 @@
 };
 
 template <>
-class SkNi<8, uint16_t> {
+class SkNx<8, uint16_t> {
 public:
-    SkNi(const __m128i& vec) : fVec(vec) {}
+    SkNx(const __m128i& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
-    static SkNi Load(const uint16_t vals[8]) { return _mm_loadu_si128((const __m128i*)vals); }
-    SkNi(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+    SkNx() {}
+    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
+    static SkNx Load(const uint16_t vals[8]) { return _mm_loadu_si128((const __m128i*)vals); }
+    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
          uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}
 
     void store(uint16_t vals[8]) const { _mm_storeu_si128((__m128i*)vals, fVec); }
 
-    SkNi operator + (const SkNi& o) const { return _mm_add_epi16(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return _mm_sub_epi16(fVec, o.fVec); }
-    SkNi operator * (const SkNi& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }
 
-    SkNi operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
-    SkNi operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }
+    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
+    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }
 
-    static SkNi Min(const SkNi& a, const SkNi& b) {
+    static SkNx Min(const SkNx& a, const SkNx& b) {
         // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
         // signed version, _mm_min_epi16, then shift back.
         const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine.
@@ -228,7 +228,7 @@
                                                   _mm_sub_epi8(b.fVec, top_8x)));
     }
 
-    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                             _mm_andnot_si128(fVec, e.fVec));
     }
@@ -242,14 +242,14 @@
 };
 
 template <>
-class SkNi<16, uint8_t> {
+class SkNx<16, uint8_t> {
 public:
-    SkNi(const __m128i& vec) : fVec(vec) {}
+    SkNx(const __m128i& vec) : fVec(vec) {}
 
-    SkNi() {}
-    SkNi(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
-    static SkNi Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m128i*)vals); }
-    SkNi(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
+    SkNx() {}
+    SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
+    static SkNx Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m128i*)vals); }
+    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
          uint8_t e, uint8_t f, uint8_t g, uint8_t h,
          uint8_t i, uint8_t j, uint8_t k, uint8_t l,
          uint8_t m, uint8_t n, uint8_t o, uint8_t p)
@@ -257,13 +257,13 @@
 
     void store(uint8_t vals[16]) const { _mm_storeu_si128((__m128i*)vals, fVec); }
 
-    SkNi saturatedAdd(const SkNi& o) const { return _mm_adds_epu8(fVec, o.fVec); }
+    SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }
 
-    SkNi operator + (const SkNi& o) const { return _mm_add_epi8(fVec, o.fVec); }
-    SkNi operator - (const SkNi& o) const { return _mm_sub_epi8(fVec, o.fVec); }
+    SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }
 
-    static SkNi Min(const SkNi& a, const SkNi& b) { return _mm_min_epu8(a.fVec, b.fVec); }
-    SkNi operator < (const SkNi& o) const {
+    static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
+    SkNx operator < (const SkNx& o) const {
         // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
         auto flip = _mm_set1_epi8(char(0x80));
         return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
@@ -276,7 +276,7 @@
         return k % 2 == 0 ? pair : (pair >> 8);
     }
 
-    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
         return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                             _mm_andnot_si128(fVec, e.fVec));
     }
@@ -284,6 +284,12 @@
     __m128i fVec;
 };
 
+
+template<>
+inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) {
+    return _mm_cvttps_epi32(src.fVec);
+}
+
 }  // namespace
 
 #endif//SkNx_sse_DEFINED
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index 31baebc..b3e03d0 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -13,7 +13,8 @@
 template <int N>
 static void test_Nf(skiatest::Reporter* r) {
 
-    auto assert_nearly_eq = [&](float eps, const SkNf<N>& v, float a, float b, float c, float d) {
+    auto assert_nearly_eq = [&](float eps, const SkNx<N, float>& v,
+                                float a, float b, float c, float d) {
         auto close = [=](float a, float b) { return fabsf(a-b) <= eps; };
         float vals[4];
         v.store(vals);
@@ -26,15 +27,15 @@
             REPORTER_ASSERT(r, ok);
         }
     };
-    auto assert_eq = [&](const SkNf<N>& v, float a, float b, float c, float d) {
+    auto assert_eq = [&](const SkNx<N, float>& v, float a, float b, float c, float d) {
         return assert_nearly_eq(0, v, a,b,c,d);
     };
 
     float vals[] = {3, 4, 5, 6};
-    SkNf<N> a = SkNf<N>::Load(vals),
-            b(a),
-            c = a;
-    SkNf<N> d;
+    SkNx<N,float> a = SkNx<N,float>::Load(vals),
+                  b(a),
+                  c = a;
+    SkNx<N,float> d;
     d = a;
 
     assert_eq(a, 3, 4, 5, 6);
@@ -47,9 +48,9 @@
     assert_eq(a*b-b, 6, 12, 20, 30);
     assert_eq((a*b).sqrt(), 3, 4, 5, 6);
     assert_eq(a/b, 1, 1, 1, 1);
-    assert_eq(SkNf<N>(0)-a, -3, -4, -5, -6);
+    assert_eq(SkNx<N,float>(0)-a, -3, -4, -5, -6);
 
-    SkNf<N> fours(4);
+    SkNx<N,float> fours(4);
 
     assert_eq(fours.sqrt(), 2,2,2,2);
     assert_nearly_eq(0.001f, fours.rsqrt0(), 0.5, 0.5, 0.5, 0.5);
@@ -59,8 +60,8 @@
     assert_eq(               fours.      invert(), 0.25, 0.25, 0.25, 0.25);
     assert_nearly_eq(0.001f, fours.approxInvert(), 0.25, 0.25, 0.25, 0.25);
 
-    assert_eq(SkNf<N>::Min(a, fours), 3, 4, 4, 4);
-    assert_eq(SkNf<N>::Max(a, fours), 4, 4, 5, 6);
+    assert_eq(SkNx<N,float>::Min(a, fours), 3, 4, 4, 4);
+    assert_eq(SkNx<N,float>::Max(a, fours), 4, 4, 5, 6);
 
     // Test some comparisons.  This is not exhaustive.
     REPORTER_ASSERT(r, (a == b).allTrue());
@@ -81,7 +82,7 @@
 
 template <int N, typename T>
 void test_Ni(skiatest::Reporter* r) {
-    auto assert_eq = [&](const SkNi<N,T>& v, T a, T b, T c, T d, T e, T f, T g, T h) {
+    auto assert_eq = [&](const SkNx<N,T>& v, T a, T b, T c, T d, T e, T f, T g, T h) {
         T vals[8];
         v.store(vals);
 
@@ -99,10 +100,10 @@
     };
 
     T vals[] = { 1,2,3,4,5,6,7,8 };
-    SkNi<N,T> a = SkNi<N,T>::Load(vals),
+    SkNx<N,T> a = SkNx<N,T>::Load(vals),
               b(a),
               c = a;
-    SkNi<N,T> d;
+    SkNx<N,T> d;
     d = a;
 
     assert_eq(a, 1,2,3,4,5,6,7,8);
@@ -120,7 +121,7 @@
     REPORTER_ASSERT(r, a.template kth<1>() == 2);
 }
 
-DEF_TEST(SkNi, r) {
+DEF_TEST(SkNx, r) {
     test_Ni<2, uint16_t>(r);
     test_Ni<4, uint16_t>(r);
     test_Ni<8, uint16_t>(r);
@@ -220,3 +221,13 @@
     REPORTER_ASSERT(r, bytes[2] == 255);
     REPORTER_ASSERT(r, bytes[3] == 255);
 }
+
+DEF_TEST(SkNx_cast, r) {
+    Sk4f fs(-1.7f, -1.4f, 0.5f, 1.9f);
+    Sk4i is = SkNx_cast<int>(fs);
+
+    REPORTER_ASSERT(r, is.kth<0>() == -1);
+    REPORTER_ASSERT(r, is.kth<1>() == -1);
+    REPORTER_ASSERT(r, is.kth<2>() ==  0);
+    REPORTER_ASSERT(r, is.kth<3>() ==  1);
+}