Revert of Start to vectorize SkTileGrid. (patchset #45 id:1430002 of https://codereview.chromium.org/634543004/)

Reason for revert:
Many GCC bots are missing __builtin_shuffle, e.g. Test-Ubuntu12-ShuttleA-GTX660-x86-Debug-Trybot.

Original issue's description:
> Start to vectorize SkTileGrid.
>
> This adds Sk4x.h to help.
>
> BUG=skia:
>
> Committed: https://skia.googlesource.com/skia/+/90c7992bfc6330f070f7704d63372a0ec8410170

TBR=reed@google.com,mtklein@chromium.org
NOTREECHECKS=true
NOTRY=true
BUG=skia:

Review URL: https://codereview.chromium.org/663663002
diff --git a/bench/GeometryBench.cpp b/bench/GeometryBench.cpp
index 65398fa..a5c4643 100644
--- a/bench/GeometryBench.cpp
+++ b/bench/GeometryBench.cpp
@@ -6,7 +6,6 @@
  */
 
 #include "Benchmark.h"
-#include "Sk4x.h"
 #include "SkGeometry.h"
 #include "SkRandom.h"
 #include "SkRect.h"
@@ -45,9 +44,6 @@
     GeoRectBench(const char suffix[]) : GeometryBench(suffix) {}
 
 protected:
-    // void* vptr;
-    size_t align_fRects_to_16Bytes[sizeof(void*) == 8 ? 1 : 3];
-
     SkRect fRects[2048];
 
     virtual void onPreDraw() {
@@ -101,7 +97,7 @@
 class GeoRectBench_Intersects : public GeoRectBench {
 public:
     GeoRectBench_Intersects() : GeoRectBench("rect_Intersects") {}
-
+    
 protected:
     virtual void onDraw(const int loops, SkCanvas* canvas) SK_OVERRIDE {
         for (int outer = 0; outer < loops; ++outer) {
@@ -117,7 +113,7 @@
 class GeoRectBench_sort : public GeoRectBench {
 public:
     GeoRectBench_sort() : GeoRectBench("rect_sort") {}
-
+    
 protected:
     virtual void onDraw(const int loops, SkCanvas* canvas) SK_OVERRIDE {
         for (int outer = 0; outer < loops; ++outer) {
@@ -133,59 +129,3 @@
 DEF_BENCH( return new GeoRectBench_Intersects; )
 
 DEF_BENCH( return new GeoRectBench_sort; )
-
-class GeoRectBench_sort_4f : public GeoRectBench {
-public:
-    GeoRectBench_sort_4f() : GeoRectBench("rect_sort_4f") { }
-
-protected:
-    static SkRect Sort(const SkRect& rect) {
-        // To sort:
-        //   left, right = minmax(left, right)
-        //   top, bottom = minmax(top, bottom)
-        Sk4f ltrb(&rect.fLeft),
-             rblt = ltrb.zwxy(),
-             ltlt = Sk4f::Min(ltrb, rblt),  // Holds (2 copies of) new left and top.
-             rbrb = Sk4f::Max(ltrb, rblt),  // Holds (2 copies of) new right and bottom.
-             sort = Sk4f::XYAB(ltlt, rbrb);
-
-        SkRect sorted;
-        sort.store(&sorted.fLeft);
-        return sorted;
-    }
-
-    virtual void onDraw(const int loops, SkCanvas* canvas) SK_OVERRIDE {
-        for (int outer = 0; outer < loops; ++outer) {
-            for (size_t i = 0; i < SK_ARRAY_COUNT(fRects); ++i) {
-                fRects[i] = Sort(fRects[i]);
-            }
-        }
-    }
-};
-DEF_BENCH( return new GeoRectBench_sort_4f; )
-
-class GeoRectBench_Intersects_4f : public GeoRectBench {
-public:
-    GeoRectBench_Intersects_4f() : GeoRectBench("rect_Intersects_4f") {}
-
-protected:
-    static bool Intersects(const SkRect& a, const SkRect& b) {
-        Sk4f r1(&a.fLeft),
-             r2(&b.fLeft),
-             lt = Sk4f::XYAB(r1, r2),  // a.L a.T b.L b.T <
-             rb = Sk4f::ZWCD(r2, r1);  // b.R b.B a.R a.B ?
-        return lt.lessThan(rb).allTrue();
-    }
-
-    virtual void onDraw(const int loops, SkCanvas* canvas) SK_OVERRIDE {
-        for (int outer = 0; outer < loops; ++outer) {
-            int count = 0;
-            for (size_t i = 0; i < SK_ARRAY_COUNT(fRects); ++i) {
-                count += Intersects(fRects[0], fRects[i]);
-            }
-            this->virtualCallToFoilOptimizers(count);
-        }
-    }
-};
-DEF_BENCH( return new GeoRectBench_Intersects_4f; )
-
diff --git a/gyp/common_conditions.gypi b/gyp/common_conditions.gypi
index d213e17..d9c0487 100644
--- a/gyp/common_conditions.gypi
+++ b/gyp/common_conditions.gypi
@@ -46,7 +46,6 @@
             4275,  # An exported class was derived from a class that was not exported
             4345,  # This is an FYI about a behavior change from long ago. Chrome stifles it too.
             4355,  # 'this' used in base member initializer list. Off by default in newer compilers.
-            4800,  # forcing value to bool 'true' or 'false'
         ],
         'msvs_cygwin_shell': 0,
         'msvs_settings': {
diff --git a/include/core/SkTypes.h b/include/core/SkTypes.h
index 2625b73..0e9e230 100644
--- a/include/core/SkTypes.h
+++ b/include/core/SkTypes.h
@@ -300,9 +300,6 @@
 #define SkAlign8(x)     (((x) + 7) >> 3 << 3)
 #define SkIsAlign8(x)   (0 == ((x) & 7))
 
-#define SkAlign16(x)    (((x) + 15) >> 4 << 4)
-#define SkIsAlign16(x)  (0 == ((x) & 15))
-
 #define SkAlignPtr(x)   (sizeof(void*) == 8 ?   SkAlign8(x) :   SkAlign4(x))
 #define SkIsAlignPtr(x) (sizeof(void*) == 8 ? SkIsAlign8(x) : SkIsAlign4(x))
 
diff --git a/src/core/Sk4x.h b/src/core/Sk4x.h
deleted file mode 100644
index 9af12c4..0000000
--- a/src/core/Sk4x.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef Sk4x_DEFINED
-#define Sk4x_DEFINED
-
-#include "SkTypes.h"
-
-// First we'll let Clang or GCC try their best with whatever instructions are available.
-// Otherwise fall back on portable code.  This really should be a last resort.
-
-#define SK4X_PREAMBLE 1
-    #if defined(__clang__)
-        #include "Sk4x_clang.h"
-    #elif defined(__GNUC__)
-        #include "Sk4x_gcc.h"
-    #else
-        #include "Sk4x_portable.h"
-    #endif
-#undef SK4X_PREAMBLE
-
-template <typename T> class Sk4x;
-typedef Sk4x<int>   Sk4i;
-typedef Sk4x<float> Sk4f;
-
-template <typename T> class Sk4x {
-public:
-    Sk4x();  // Uninitialized; use Sk4x(0,0,0,0) for zero.
-    Sk4x(T, T, T, T);
-    explicit Sk4x(const T[4]);
-
-    Sk4x(const Sk4x&);
-    Sk4x& operator=(const Sk4x&);
-
-    void set(T, T, T, T);
-
-    void store(T[4]) const;
-
-    template <typename Dst> Dst reinterpret() const;
-    template <typename Dst> Dst cast() const;
-
-    bool allTrue() const;
-    bool anyTrue() const;
-
-    Sk4x bitNot() const;
-    Sk4x bitAnd(const Sk4x&) const;
-    Sk4x bitOr (const Sk4x&) const;
-
-    Sk4i            equal(const Sk4x&) const;
-    Sk4i         notEqual(const Sk4x&) const;
-    Sk4i         lessThan(const Sk4x&) const;
-    Sk4i      greaterThan(const Sk4x&) const;
-    Sk4i    lessThanEqual(const Sk4x&) const;
-    Sk4i greaterThanEqual(const Sk4x&) const;
-
-    Sk4x      add(const Sk4x&) const;
-    Sk4x subtract(const Sk4x&) const;
-    Sk4x multiply(const Sk4x&) const;
-    Sk4x   divide(const Sk4x&) const;
-
-    static Sk4x Min(const Sk4x& a, const Sk4x& b);
-    static Sk4x Max(const Sk4x& a, const Sk4x& b);
-
-    // Swizzles follow OpenCL xyzw convention.
-    Sk4x zwxy() const;
-
-    // When there's a second argument, it's abcd.
-    static Sk4x XYAB(const Sk4x& xyzw, const Sk4x& abcd);
-    static Sk4x ZWCD(const Sk4x& xyzw, const Sk4x& abcd);
-
-private:
-    // It's handy to have Sk4f and Sk4i be mutual friends.
-    template <typename S> friend class Sk4x;
-
-#define SK4X_PRIVATE 1
-    #if defined(__clang__)
-        #include "Sk4x_clang.h"
-    #elif defined(__GNUC__)
-        #include "Sk4x_gcc.h"
-    #else
-        #include "Sk4x_portable.h"
-    #endif
-#undef SK4X_PRIVATE
-};
-
-#if defined(__clang__)
-    #include "Sk4x_clang.h"
-#elif defined(__GNUC__)
-    #include "Sk4x_gcc.h"
-#else
-    #include "Sk4x_portable.h"
-#endif
-
-// TODO ideas for enterprising coders:
-//   1) Code generated for Max() isn't as good in Sk4x_gcc.h as it is in _clang.  Why?
-//   2) Sk4x_sse.h would be good for Windows, and could possibly beat _clang / _gcc
-//      (e.g. they can't generate _mm_movemask_ps for allTrue/anyTrue).
-//   3) Sk4x_neon.h might be a good idea if _clang / _gcc aren't good enough on ARM.
-
-
-#endif//Sk4x_DEFINED
diff --git a/src/core/Sk4x_clang.h b/src/core/Sk4x_clang.h
deleted file mode 100644
index ae976ba..0000000
--- a/src/core/Sk4x_clang.h
+++ /dev/null
@@ -1,125 +0,0 @@
-// It is important _not_ to put header guards here.
-// This file will be intentionally included three times.
-
-// Useful reading:
-//   http://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors
-
-#if defined(SK4X_PREAMBLE)
-
-#elif defined(SK4X_PRIVATE)
-    typedef T Vector __attribute__((ext_vector_type(4)));
-
-    /*implicit*/ Sk4x(Vector vec) : fVec(vec) {}
-
-    template <int m, int a, int s, int k>
-    static Sk4x Shuffle(const Sk4x&, const Sk4x&);
-
-    Vector fVec;
-
-#else  // defined(SK4X_PRIVATE)
-
-template <typename T>
-Sk4x<T>::Sk4x() { }
-
-template <typename T>
-Sk4x<T>::Sk4x(T a, T b, T c, T d) { this->set(a,b,c,d); }
-
-template <typename T>
-Sk4x<T>::Sk4x(const T vals[4]) { this->set(vals[0], vals[1], vals[2], vals[3]); }
-
-template <typename T>
-Sk4x<T>::Sk4x(const Sk4x<T>& other) { *this = other; }
-
-template <typename T>
-Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) { fVec = other.fVec; return *this; }
-
-template <typename T>
-void Sk4x<T>::set(T a, T b, T c, T d) {
-    Vector v = { a, b, c, d };
-    fVec = v;
-}
-
-template <typename T>
-void Sk4x<T>::store(T vals[4]) const {
-    SkASSERT(SkIsAlign16((uintptr_t)vals));
-    *reinterpret_cast<Vector*>(vals) = fVec;
-}
-
-template <typename T>
-template <typename Dst> Dst Sk4x<T>::reinterpret() const {
-    return Dst((typename Dst::Vector)fVec);
-}
-
-template <typename T>
-template <typename Dst> Dst Sk4x<T>::cast() const {
-    #if __has_builtin(__builtin_convertvector)
-        return Dst(__builtin_convertvector(fVec, typename Dst::Vector));
-    #else
-        return Dst(fVec[0], fVec[1], fVec[2], fVec[3]);
-    #endif
-}
-
-template <typename T>
-bool Sk4x<T>::allTrue() const { return fVec[0] & fVec[1] & fVec[2] & fVec[3]; }
-template <typename T>
-bool Sk4x<T>::anyTrue() const { return fVec[0] | fVec[1] | fVec[2] | fVec[3]; }
-
-template <typename T> Sk4x<T> Sk4x<T>::bitNot() const { return ~fVec; }
-
-template <typename T> Sk4x<T> Sk4x<T>::bitAnd(const Sk4x& other) const { return fVec & other.fVec; }
-template <typename T> Sk4x<T> Sk4x<T>::bitOr (const Sk4x& other) const { return fVec | other.fVec; }
-
-template <typename T>
-Sk4i Sk4x<T>::           equal(const Sk4x<T>& other) const { return fVec == other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::        notEqual(const Sk4x<T>& other) const { return fVec != other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::        lessThan(const Sk4x<T>& other) const { return fVec  < other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::     greaterThan(const Sk4x<T>& other) const { return fVec  > other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::   lessThanEqual(const Sk4x<T>& other) const { return fVec <= other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::greaterThanEqual(const Sk4x<T>& other) const { return fVec >= other.fVec; }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::     add(const Sk4x<T>& other) const { return fVec + other.fVec; }
-template <typename T>
-Sk4x<T> Sk4x<T>::subtract(const Sk4x<T>& other) const { return fVec - other.fVec; }
-template <typename T>
-Sk4x<T> Sk4x<T>::multiply(const Sk4x<T>& other) const { return fVec * other.fVec; }
-template <typename T>
-Sk4x<T> Sk4x<T>::  divide(const Sk4x<T>& other) const { return fVec / other.fVec; }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::Min(const Sk4x<T>& a, const Sk4x<T>& b) {
-    Sk4i less(a.lessThan(b));
-    Sk4i val = a.reinterpret<Sk4i>().bitAnd(less).bitOr(
-               b.reinterpret<Sk4i>().bitAnd(less.bitNot()));
-    return val.reinterpret<Sk4x>();
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::Max(const Sk4x<T>& a, const Sk4x<T>& b) {
-    Sk4i less(a.lessThan(b));
-    Sk4i val = b.reinterpret<Sk4i>().bitAnd(less).bitOr(
-               a.reinterpret<Sk4i>().bitAnd(less.bitNot()));
-    return val.reinterpret<Sk4x>();
-}
-
-template <typename T>
-template <int m, int a, int s, int k>
-Sk4x<T> Sk4x<T>::Shuffle(const Sk4x<T>& x, const Sk4x<T>& y) {
-    return __builtin_shufflevector(x.fVec, y.fVec, m,a,s,k);
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::zwxy() const { return fVec.zwxy; }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::XYAB(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<0,1,4,5>(xyzw, abcd); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::ZWCD(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<2,3,6,7>(xyzw, abcd); }
-
-#endif // defined(SK4X_PRIVATE)
diff --git a/src/core/Sk4x_gcc.h b/src/core/Sk4x_gcc.h
deleted file mode 100644
index dcef4c2..0000000
--- a/src/core/Sk4x_gcc.h
+++ /dev/null
@@ -1,135 +0,0 @@
-// It is important _not_ to put header guards here.
-// This file will be intentionally included three times.
-
-// Useful reading:
-//   https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
-
-#if defined(SK4X_PREAMBLE)
-
-#elif defined(SK4X_PRIVATE)
-    typedef T Vector __attribute__((vector_size(16)));
-
-    /*implicit*/ Sk4x(Vector vec) : fVec(vec) {}
-    static inline Vector ShuffleImpl(Vector a, Vector b, int __attribute__((vector_size(16))) mask);
-    template <int m, int a, int s, int k>
-    static Sk4x Shuffle(const Sk4x&, const Sk4x&);
-
-    Vector fVec;
-
-#else  // defined(SK4X_PRIVATE)
-
-template <typename T>
-Sk4x<T>::Sk4x() { }
-
-template <typename T>
-Sk4x<T>::Sk4x(T a, T b, T c, T d) { this->set(a,b,c,d); }
-
-template <typename T>
-Sk4x<T>::Sk4x(const T vals[4]) {
-    fVec = *reinterpret_cast<const Vector*>(vals);  // Should compile to moveaps or moveups.
-}
-
-template <typename T>
-Sk4x<T>::Sk4x(const Sk4x<T>& other) { *this = other; }
-
-template <typename T>
-Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) { fVec = other.fVec; return *this; }
-
-template <typename T>
-void Sk4x<T>::set(T a, T b, T c, T d) {
-    Vector v = { a, b, c, d };
-    fVec = v;
-}
-
-template <typename T>
-void Sk4x<T>::store(T vals[4]) const {
-    SkASSERT(SkIsAlign16((uintptr_t)vals));
-    *reinterpret_cast<Vector*>(vals) = fVec;
-}
-
-template <typename T>
-template <typename Dst> Dst Sk4x<T>::reinterpret() const {
-    return Dst((typename Dst::Vector)fVec);
-}
-
-template <typename T>
-template <typename Dst> Dst Sk4x<T>::cast() const {
-    return Dst(fVec[0], fVec[1], fVec[2], fVec[3]);
-}
-
-template <typename T>
-bool Sk4x<T>::allTrue() const { return fVec[0] & fVec[1] & fVec[2] & fVec[3]; }
-template <typename T>
-bool Sk4x<T>::anyTrue() const { return fVec[0] | fVec[1] | fVec[2] | fVec[3]; }
-
-template <typename T> Sk4x<T> Sk4x<T>::bitNot() const { return Sk4i(~fVec); }
-
-template <typename T> Sk4x<T> Sk4x<T>::bitAnd(const Sk4x& other) const { return fVec & other.fVec; }
-template <typename T> Sk4x<T> Sk4x<T>::bitOr (const Sk4x& other) const { return fVec | other.fVec; }
-
-template <typename T>
-Sk4i Sk4x<T>::           equal(const Sk4x<T>& other) const { return fVec == other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::        notEqual(const Sk4x<T>& other) const { return fVec != other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::        lessThan(const Sk4x<T>& other) const { return fVec  < other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::     greaterThan(const Sk4x<T>& other) const { return fVec  > other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::   lessThanEqual(const Sk4x<T>& other) const { return fVec <= other.fVec; }
-template <typename T>
-Sk4i Sk4x<T>::greaterThanEqual(const Sk4x<T>& other) const { return fVec >= other.fVec; }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::     add(const Sk4x<T>& other) const { return fVec + other.fVec; }
-template <typename T>
-Sk4x<T> Sk4x<T>::subtract(const Sk4x<T>& other) const { return fVec - other.fVec; }
-template <typename T>
-Sk4x<T> Sk4x<T>::multiply(const Sk4x<T>& other) const { return fVec * other.fVec; }
-template <typename T>
-Sk4x<T> Sk4x<T>::  divide(const Sk4x<T>& other) const { return fVec / other.fVec; }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::Min(const Sk4x<T>& a, const Sk4x<T>& b) {
-    return a.fVec < b.fVec ? a.fVec : b.fVec;  // This makes great SSE code (1 minps op)...
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::Max(const Sk4x<T>& a, const Sk4x<T>& b) {
-    return a.fVec < b.fVec ? b.fVec : a.fVec;  // ...but this doesn't look so good (7 ops?).
-}
-
-// GCC 4.8 has a bug that leads it to segfault when presented with the obvious code for Shuffle:
-//   Sk4i::Vector mask = { m,a,s,k };
-//   return __builtin_shuffle(x.fVec, y.fVec, mask);
-//
-// This roundabout implementation via ShuffleImpl works around that bug,
-//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57509
-
-template <>
-inline Sk4i::Vector Sk4i::ShuffleImpl(Sk4i::Vector x, Sk4i::Vector y, Sk4i::Vector mask) {
-    return __builtin_shuffle(x,y, mask);
-}
-
-template <>
-inline Sk4f::Vector Sk4f::ShuffleImpl(Sk4f::Vector x, Sk4f::Vector y, Sk4i::Vector mask) {
-    return __builtin_shuffle(x,y, mask);
-}
-
-template <typename T>
-template <int m, int a, int s, int k>
-Sk4x<T> Sk4x<T>::Shuffle(const Sk4x<T>& x, const Sk4x<T>& y) {
-    Sk4i::Vector mask = { m,a,s,k };
-    return ShuffleImpl(x.fVec, y.fVec, mask);
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::zwxy() const { return Shuffle<2,3,0,1>(*this, *this); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::XYAB(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<0,1,4,5>(xyzw, abcd); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::ZWCD(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<2,3,6,7>(xyzw, abcd); }
-
-#endif // defined(SK4X_PRIVATE)
diff --git a/src/core/Sk4x_portable.h b/src/core/Sk4x_portable.h
deleted file mode 100644
index 0515a9b..0000000
--- a/src/core/Sk4x_portable.h
+++ /dev/null
@@ -1,134 +0,0 @@
-// It is important _not_ to put header guards here.
-// This file will be intentionally included three times.
-
-#if defined(SK4X_PREAMBLE)
-
-#elif defined(SK4X_PRIVATE)
-    typedef T Vector[4];
-
-    Vector fVec;
-
-    template <int m, int a, int s, int k>
-    static Sk4x Shuffle(const Sk4x&, const Sk4x&);
-
-#else  // defined(SK4X_PRIVATE)
-
-template <typename T>
-Sk4x<T>::Sk4x() { }
-
-template <typename T>
-Sk4x<T>::Sk4x(T a, T b, T c, T d) { this->set(a,b,c,d); }
-
-template <typename T>
-Sk4x<T>::Sk4x(const T vals[4]) { this->set(vals[0], vals[1], vals[2], vals[3]); }
-
-template <typename T>
-Sk4x<T>::Sk4x(const Sk4x<T>& other) { *this = other; }
-
-template <typename T>
-Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {
-    this->set(other.fVec[0], other.fVec[1], other.fVec[2], other.fVec[3]);
-    return *this;
-}
-
-template <typename T>
-void Sk4x<T>::set(T a, T b, T c, T d) {
-    fVec[0] = a;
-    fVec[1] = b;
-    fVec[2] = c;
-    fVec[3] = d;
-}
-
-template <typename T>
-void Sk4x<T>::store(T vals[4]) const {
-    vals[0] = fVec[0];
-    vals[1] = fVec[1];
-    vals[2] = fVec[2];
-    vals[3] = fVec[3];
-}
-
-template <typename T>
-template <typename Dst> Dst Sk4x<T>::reinterpret() const {
-    return Dst(reinterpret_cast<const typename Dst::Vector*>(fVec));
-}
-
-template <typename T>
-template <typename Dst> Dst Sk4x<T>::cast() const {
-    return Dst(fVec[0], fVec[1], fVec[2], fVec[3]);
-}
-
-template <typename T>
-bool Sk4x<T>::allTrue() const { return fVec[0] & fVec[1] & fVec[2] & fVec[3]; }
-template <typename T>
-bool Sk4x<T>::anyTrue() const { return fVec[0] | fVec[1] | fVec[2] | fVec[3]; }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::bitNot() const { return Sk4x(~fVec[0], ~fVec[1], ~fVec[2], ~fVec[3]); }
-
-#define BINOP(op) fVec[0] op other.fVec[0], \
-                  fVec[1] op other.fVec[1], \
-                  fVec[2] op other.fVec[2], \
-                  fVec[3] op other.fVec[3]
-
-template <typename T> Sk4x<T> Sk4x<T>::bitAnd(const Sk4x& other) const { return Sk4x(BINOP(&)); }
-template <typename T> Sk4x<T> Sk4x<T>::bitOr (const Sk4x& other) const { return Sk4x(BINOP(|)); }
-
-template <typename T>
-Sk4i Sk4x<T>::           equal(const Sk4x<T>& other) const { return Sk4i(BINOP(==)); }
-template <typename T>
-Sk4i Sk4x<T>::        notEqual(const Sk4x<T>& other) const { return Sk4i(BINOP(!=)); }
-template <typename T>
-Sk4i Sk4x<T>::        lessThan(const Sk4x<T>& other) const { return Sk4i(BINOP( <)); }
-template <typename T>
-Sk4i Sk4x<T>::     greaterThan(const Sk4x<T>& other) const { return Sk4i(BINOP( >)); }
-template <typename T>
-Sk4i Sk4x<T>::   lessThanEqual(const Sk4x<T>& other) const { return Sk4i(BINOP(<=)); }
-template <typename T>
-Sk4i Sk4x<T>::greaterThanEqual(const Sk4x<T>& other) const { return Sk4i(BINOP(>=)); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::     add(const Sk4x<T>& other) const { return Sk4x(BINOP(+)); }
-template <typename T>
-Sk4x<T> Sk4x<T>::subtract(const Sk4x<T>& other) const { return Sk4x(BINOP(-)); }
-template <typename T>
-Sk4x<T> Sk4x<T>::multiply(const Sk4x<T>& other) const { return Sk4x(BINOP(*)); }
-template <typename T>
-Sk4x<T> Sk4x<T>::  divide(const Sk4x<T>& other) const { return Sk4x(BINOP(/)); }
-
-#undef BINOP
-
-template <typename T>
-Sk4x<T> Sk4x<T>::Min(const Sk4x<T>& a, const Sk4x<T>& b) {
-    return Sk4x(SkTMin(a.fVec[0], b.fVec[0]),
-                SkTMin(a.fVec[1], b.fVec[1]),
-                SkTMin(a.fVec[2], b.fVec[2]),
-                SkTMin(a.fVec[3], b.fVec[3]));
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::Max(const Sk4x<T>& a, const Sk4x<T>& b) {
-    return Sk4x(SkTMax(a.fVec[0], b.fVec[0]),
-                SkTMax(a.fVec[1], b.fVec[1]),
-                SkTMax(a.fVec[2], b.fVec[2]),
-                SkTMax(a.fVec[3], b.fVec[3]));
-}
-
-template <typename T>
-template <int m, int a, int s, int k>
-Sk4x<T> Sk4x<T>::Shuffle(const Sk4x<T>& x, const Sk4x<T>& y) {
-    return Sk4x(m < 4 ? x.fVec[m] : y.fVec[m-4],
-                a < 4 ? x.fVec[a] : y.fVec[a-4],
-                s < 4 ? x.fVec[s] : y.fVec[s-4],
-                k < 4 ? x.fVec[k] : y.fVec[k-4]);
-}
-
-template <typename T>
-Sk4x<T> Sk4x<T>::zwxy() const { return Shuffle<2,3,0,1>(*this, *this); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::XYAB(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<0,1,4,5>(xyzw, abcd); }
-
-template <typename T>
-Sk4x<T> Sk4x<T>::ZWCD(const Sk4x& xyzw, const Sk4x& abcd) { return Shuffle<2,3,6,7>(xyzw, abcd); }
-
-#endif // defined(SK4X_PRIVATE)
diff --git a/src/core/SkTileGrid.cpp b/src/core/SkTileGrid.cpp
index 03b30f2..10782c4 100644
--- a/src/core/SkTileGrid.cpp
+++ b/src/core/SkTileGrid.cpp
@@ -6,34 +6,25 @@
  */
 
 #include "SkTileGrid.h"
-#include "Sk4x.h"
 
 SkTileGrid::SkTileGrid(int xTiles, int yTiles, const SkTileGridFactory::TileGridInfo& info)
     : fXTiles(xTiles)
-    , fNumTiles(xTiles * yTiles)
+    , fYTiles(yTiles)
+    , fInvWidth( SkScalarInvert(info.fTileInterval.width()))
+    , fInvHeight(SkScalarInvert(info.fTileInterval.height()))
+    , fMarginWidth (info.fMargin.fWidth +1)  // Margin is offset by 1 as a provision for AA and
+    , fMarginHeight(info.fMargin.fHeight+1)  // to cancel the outset applied by getClipDeviceBounds.
+    , fOffset(SkPoint::Make(info.fOffset.fX, info.fOffset.fY))
     , fGridBounds(SkRect::MakeWH(xTiles * info.fTileInterval.width(),
                                  yTiles * info.fTileInterval.height()))
-    , fMargin(-info.fMargin.fWidth  - 1,  // Outset margin by 1 as a provision for AA and to
-              -info.fMargin.fHeight - 1,  // cancel the outset applied by getClipDeviceBounds().
-              +info.fMargin.fWidth  + 1,
-              +info.fMargin.fHeight + 1)
-    , fOffset(info.fOffset.fX,
-              info.fOffset.fY,
-              info.fOffset.fX - SK_ScalarNearlyZero,  // We scrunch user-provided bounds in a little
-              info.fOffset.fY - SK_ScalarNearlyZero)  // to make right and bottom edges exclusive.
-    , fUserToGrid(SkScalarInvert(info.fTileInterval.width()),
-                  SkScalarInvert(info.fTileInterval.height()),
-                  SkScalarInvert(info.fTileInterval.width()),
-                  SkScalarInvert(info.fTileInterval.height()))
-    , fGridHigh(fXTiles - 1, yTiles - 1, fXTiles - 1, yTiles - 1)
-    , fTiles(SkNEW_ARRAY(SkTDArray<unsigned>, fNumTiles)) {}
+    , fTiles(SkNEW_ARRAY(SkTDArray<unsigned>, xTiles * yTiles)) {}
 
 SkTileGrid::~SkTileGrid() {
     SkDELETE_ARRAY(fTiles);
 }
 
 void SkTileGrid::reserve(unsigned opCount) {
-    if (fNumTiles == 0) {
+    if (fXTiles * fYTiles == 0) {
         return;  // A tileless tile grid is nonsensical, but happens in at least cc_unittests.
     }
 
@@ -43,9 +34,9 @@
 
     // If we take those observations and further assume the ops are distributed evenly
     // across the picture, we get this guess for number of ops per tile:
-    const int opsPerTileGuess = (2 * opCount) / fNumTiles;
+    const int opsPerTileGuess = (2 * opCount) / (fXTiles * fYTiles);
 
-    for (SkTDArray<unsigned>* tile = fTiles; tile != fTiles + fNumTiles; tile++) {
+    for (SkTDArray<unsigned>* tile = fTiles; tile != fTiles + (fXTiles * fYTiles); tile++) {
         tile->setReserve(opsPerTileGuess);
     }
 
@@ -54,51 +45,39 @@
 }
 
 void SkTileGrid::flushDeferredInserts() {
-    for (SkTDArray<unsigned>* tile = fTiles; tile != fTiles + fNumTiles; tile++) {
+    for (SkTDArray<unsigned>* tile = fTiles; tile != fTiles + (fXTiles * fYTiles); tile++) {
         tile->shrinkToFit();
     }
 }
 
-// Convert user-space bounds to grid tiles they cover (LT+RB both inclusive).
-// Out of bounds queries are clamped to the single nearest tile.
-void SkTileGrid::userToGrid(const Sk4f& user, SkIRect* out) const {
-    // Map from user coordinates to grid tile coordinates.
-    Sk4f grid = user.multiply(fUserToGrid);
+// Adjustments to user-provided bounds common to both insert() and search().
+// Call this after making insert- or search- specific adjustments.
+void SkTileGrid::commonAdjust(SkRect* rect) const {
+    // Apply our offset.
+    rect->offset(fOffset);
 
-    // Now that we're in grid coordinates, clamp to the grid bounds.
-    grid = Sk4f::Max(grid, Sk4f(0,0,0,0));
-    grid = Sk4f::Min(grid, fGridHigh);
-
-    // Truncate to integers.
-    grid.cast<Sk4i>().store(&out->fLeft);
+    // Scrunch the bounds in just a little to make the right and bottom edges
+    // exclusive.  We want bounds of exactly one tile to hit exactly one tile.
+    rect->fRight  -= SK_ScalarNearlyZero;
+    rect->fBottom -= SK_ScalarNearlyZero;
 }
 
-// If the rect is inverted, sort it.
-static Sk4f sorted(const Sk4f& ltrb) {
-    // To sort:
-    //   left, right = minmax(left, right)
-    //   top, bottom = minmax(top, bottom)
-    Sk4f rblt = ltrb.zwxy(),
-         ltlt = Sk4f::Min(ltrb, rblt),  // Holds (2 copies of) new left and top.
-         rbrb = Sk4f::Max(ltrb, rblt),  // Holds (2 copies of) new right and bottom.
-         sort = Sk4f::XYAB(ltlt, rbrb);
-    return sort;
-}
-
-// Does this rect intersect the grid?
-bool SkTileGrid::intersectsGrid(const Sk4f& ltrb) const {
-    SkRect bounds;
-    ltrb.store(&bounds.fLeft);
-    return SkRect::Intersects(bounds, fGridBounds);
-    // TODO: If we can get it fast enough, write intersect using Sk4f.
+// Convert user-space bounds to grid tiles they cover (LT and RB both inclusive).
+void SkTileGrid::userToGrid(const SkRect& user, SkIRect* grid) const {
+    grid->fLeft   = SkPin32(user.left()   * fInvWidth , 0, fXTiles - 1);
+    grid->fTop    = SkPin32(user.top()    * fInvHeight, 0, fYTiles - 1);
+    grid->fRight  = SkPin32(user.right()  * fInvWidth , 0, fXTiles - 1);
+    grid->fBottom = SkPin32(user.bottom() * fInvHeight, 0, fYTiles - 1);
 }
 
 void SkTileGrid::insert(unsigned opIndex, const SkRect& originalBounds, bool) {
-    Sk4f bounds = Sk4f(&originalBounds.fLeft).add(fMargin).add(fOffset);
-    SkASSERT(sorted(bounds).equal(bounds).allTrue());
+    SkRect bounds = originalBounds;
+    bounds.outset(fMarginWidth, fMarginHeight);
+    this->commonAdjust(&bounds);
 
-    // TODO(mtklein): skip this check and just let out-of-bounds rects insert into nearest tile?
-    if (!this->intersectsGrid(bounds)) {
+    // TODO(mtklein): can we assert this instead to save an intersection in Release mode,
+    // or just allow out-of-bound insertions to insert anyway (clamped to nearest tile)?
+    if (!SkRect::Intersects(bounds, fGridBounds)) {
         return;
     }
 
@@ -124,11 +103,20 @@
 static const int kStackAllocationTileCount = 1024;
 
 void SkTileGrid::search(const SkRect& originalQuery, SkTDArray<unsigned>* results) const {
-    // The .subtract(fMargin) counteracts the .add(fMargin) applied in insert(),
-    // which optimizes for lookups of size tileInterval + 2 * margin (aligned with the tile grid).
-    // That .subtract(fMargin) may have inverted the rect, so we sort it.
-    Sk4f query = sorted(Sk4f(&originalQuery.fLeft).subtract(fMargin).add(fOffset));
+    // The inset counteracts the outset that applied in 'insert', which optimizes
+    // for lookups of size 'tileInterval + 2 * margin' (aligned with the tile grid).
+    SkRect query = originalQuery;
+    query.inset(fMarginWidth, fMarginHeight);
+    this->commonAdjust(&query);
 
+    // The inset may have inverted the rectangle, so sort().
+    // TODO(mtklein): It looks like we only end up with inverted bounds in unit tests
+    // that make explicitly inverted queries, not from insetting.  If we can drop support for
+    // unsorted bounds (i.e. we don't see them outside unit tests), I think we can drop this.
+    query.sort();
+
+    // No intersection check.  We optimize for queries that are in bounds.
+    // We're safe anyway: userToGrid() will clamp out-of-bounds queries to nearest tile.
     SkIRect grid;
     this->userToGrid(query, &grid);
 
diff --git a/src/core/SkTileGrid.h b/src/core/SkTileGrid.h
index d556f80..fd7584f 100644
--- a/src/core/SkTileGrid.h
+++ b/src/core/SkTileGrid.h
@@ -8,7 +8,6 @@
 #ifndef SkTileGrid_DEFINED
 #define SkTileGrid_DEFINED
 
-#include "Sk4x.h"
 #include "SkBBHFactory.h"
 #include "SkBBoxHierarchy.h"
 
@@ -44,16 +43,16 @@
     virtual void flushDeferredInserts() SK_OVERRIDE;
 
 private:
-    void userToGrid(const Sk4f&, SkIRect*) const;
-    bool intersectsGrid(const Sk4f&) const;
+    void commonAdjust(SkRect*) const;
+    void userToGrid(const SkRect&, SkIRect* grid) const;
 
-    const int fXTiles,    // Number of tiles in a single row.
-              fNumTiles;  // Total number of tiles.
+    const int fXTiles, fYTiles;
+    const SkScalar fInvWidth, fInvHeight;
+    const SkScalar fMarginWidth, fMarginHeight;
+    const SkPoint fOffset;
+    const SkRect  fGridBounds;
 
-    const SkRect fGridBounds;  // Only used for intersectsGrid().  Remove if that's removed.
-    const Sk4f fMargin, fOffset, fUserToGrid, fGridHigh;
-
-    // fNumTiles SkTDArrays, each listing ops overlapping that tile in order.
+    // (fXTiles * fYTiles) SkTDArrays, each listing ops overlapping that tile in order.
     SkTDArray<unsigned>* fTiles;
 
     typedef SkBBoxHierarchy INHERITED;