Generic 4f gradient T sampler fallback

Add a generic T sampler fallback implementation which consumes the T series
produced by subclasses' mapTs() overrides.  The fallback path uses the same
interval structures as the current optimized linear4f impl, but the intervals
are always sorted in stop order (never inverted to match dx/increasing x order).

Enable the new mechanism for 4f linear w/ perspective.

Other boring changes:

  * relocate the interval builder (back) to the base class
  * add a private header for shared templates

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1783823002

Review URL: https://codereview.chromium.org/1783823002
diff --git a/gyp/effects.gypi b/gyp/effects.gypi
index 1c6b29e..cfbae45 100644
--- a/gyp/effects.gypi
+++ b/gyp/effects.gypi
@@ -68,6 +68,7 @@
 
     '<(skia_src_path)/effects/gradients/Sk4fGradientBase.cpp',
     '<(skia_src_path)/effects/gradients/Sk4fGradientBase.h',
+    '<(skia_src_path)/effects/gradients/Sk4fGradientPriv.h',
     '<(skia_src_path)/effects/gradients/Sk4fLinearGradient.cpp',
     '<(skia_src_path)/effects/gradients/Sk4fLinearGradient.h',
     '<(skia_src_path)/effects/gradients/SkClampRange.cpp',
diff --git a/src/effects/gradients/Sk4fGradientBase.cpp b/src/effects/gradients/Sk4fGradientBase.cpp
index 76fb178..a729812 100644
--- a/src/effects/gradients/Sk4fGradientBase.cpp
+++ b/src/effects/gradients/Sk4fGradientBase.cpp
@@ -6,17 +6,115 @@
  */
 
 #include "Sk4fGradientBase.h"
+#include "Sk4fGradientPriv.h"
+
+#include <functional>
 
 namespace {
 
-// true when x is in [k1,k2)
-bool in_range(SkScalar x, SkScalar k1, SkScalar k2) {
-    SkASSERT(k1 != k2);
-    return (k1 < k2)
-        ? (x >= k1 && x < k2)
-        : (x >= k2 && x < k1);
+SkPMColor pack_color(SkColor c, bool premul) {
+    return premul
+        ? SkPreMultiplyColor(c)
+        : SkPackARGB32NoCheck(SkColorGetA(c), SkColorGetR(c), SkColorGetG(c), SkColorGetB(c));
 }
 
+template<SkShader::TileMode>
+SkScalar tileProc(SkScalar t);
+
+template<>
+SkScalar tileProc<SkShader::kClamp_TileMode>(SkScalar t) {
+    // synthetic clamp-mode edge intervals allow for a free-floating t:
+    //   [-inf..0)[0..1)[1..+inf)
+    return t;
+}
+
+template<>
+SkScalar tileProc<SkShader::kRepeat_TileMode>(SkScalar t) {
+    // t % 1  (intervals range: [0..1))
+    return t - SkScalarFloorToScalar(t);
+}
+
+template<>
+SkScalar tileProc<SkShader::kMirror_TileMode>(SkScalar t) {
+    // t % 2  (synthetic mirror intervals expand the range to [0..2))
+    return t - SkScalarFloorToScalar(t / 2) * 2;
+}
+
+class IntervalIterator {
+public:
+    IntervalIterator(const SkColor* colors, const SkScalar* pos, int count, bool reverse)
+        : fColors(colors)
+        , fPos(pos)
+        , fCount(count)
+        , fFirstPos(reverse ? SK_Scalar1 : 0)
+        , fBegin(reverse ? count - 1 : 0)
+        , fAdvance(reverse ? -1 : 1) {
+        SkASSERT(colors);
+        SkASSERT(count > 0);
+    }
+
+    void iterate(std::function<void(SkColor, SkColor, SkScalar, SkScalar)> func) const {
+        if (!fPos) {
+            this->iterateImplicitPos(func);
+            return;
+        }
+
+        const int end = fBegin + fAdvance * (fCount - 1);
+        const SkScalar lastPos = 1 - fFirstPos;
+        int prev = fBegin;
+        SkScalar prevPos = fFirstPos;
+
+        do {
+            const int curr = prev + fAdvance;
+            SkASSERT(curr >= 0 && curr < fCount);
+
+            // TODO: this sanitization should be done in SkGradientShaderBase
+            const SkScalar currPos = (fAdvance > 0)
+                ? SkTPin(fPos[curr], prevPos, lastPos)
+                : SkTPin(fPos[curr], lastPos, prevPos);
+
+            if (currPos != prevPos) {
+                SkASSERT((currPos - prevPos > 0) == (fAdvance > 0));
+                func(fColors[prev], fColors[curr], prevPos, currPos);
+            }
+
+            prev = curr;
+            prevPos = currPos;
+        } while (prev != end);
+    }
+
+private:
+    void iterateImplicitPos(std::function<void(SkColor, SkColor, SkScalar, SkScalar)> func) const {
+        // When clients don't provide explicit color stop positions (fPos == nullptr),
+        // the color stops are distributed evenly across the unit interval
+        // (implicit positioning).
+        const SkScalar dt = fAdvance * SK_Scalar1 / (fCount - 1);
+        const int end = fBegin + fAdvance * (fCount - 2);
+        int prev = fBegin;
+        SkScalar prevPos = fFirstPos;
+
+        while (prev != end) {
+            const int curr = prev + fAdvance;
+            SkASSERT(curr >= 0 && curr < fCount);
+
+            const SkScalar currPos = prevPos + dt;
+            func(fColors[prev], fColors[curr], prevPos, currPos);
+            prev = curr;
+            prevPos = currPos;
+        }
+
+        // emit the last interval with a pinned end position, to avoid precision issues
+        func(fColors[prev], fColors[prev + fAdvance], prevPos, 1 - fFirstPos);
+    }
+
+    const SkColor*  fColors;
+    const SkScalar* fPos;
+    const int       fCount;
+    const SkScalar  fFirstPos;
+    const int       fBegin;
+    const int       fAdvance;
+};
+
 } // anonymous namespace
 
 SkGradientShaderBase::GradientShaderBase4fContext::
@@ -36,11 +134,6 @@
     dc4f.store(&fDc.fVec);
 }
 
-bool SkGradientShaderBase::GradientShaderBase4fContext::
-Interval::contains(SkScalar fx) const {
-    return in_range(fx, fP0, fP1);
-}
-
 SkGradientShaderBase::
 GradientShaderBase4fContext::GradientShaderBase4fContext(const SkGradientShaderBase& shader,
                                                          const ContextRec& rec)
@@ -65,3 +158,280 @@
         (shader.fGradFlags & SkGradientShader::kInterpolateColorsInPremul_Flag)
         || shader.fColorsAreOpaque;
 }
+
+void SkGradientShaderBase::
+GradientShaderBase4fContext::buildIntervals(const SkGradientShaderBase& shader,
+                                            const ContextRec& rec, bool reverse) {
+    // The main job here is to build a specialized interval list: a different
+    // representation of the color stops data, optimized for efficient scan line
+    // access during shading.
+    //
+    //   [{P0,C0} , {P1,C1}) [{P1,C2} , {P2,C3}) ... [{Pn,C2n} , {Pn+1,C2n+1})
+    //
+    // The list may be inverted when requested (such that e.g. points are sorted
+    // in increasing x order when dx < 0).
+    //
+    // Note: the current representation duplicates pos data; we could refactor to
+    //       avoid this if interval storage size becomes a concern.
+    //
+    // Aside from reordering, we also perform two more pre-processing steps at
+    // this stage:
+    //
+    //   1) scale the color components depending on paint alpha and the requested
+    //      interpolation space (note: the interval color storage is SkPM4f, but
+    //      that doesn't necessarily mean the colors are premultiplied; that
+    //      property is tracked in fColorsArePremul)
+    //
+    //   2) inject synthetic intervals to support tiling.
+    //
+    //      * for kRepeat, no extra intervals are needed - the iterator just
+    //        wraps around at the end:
+    //
+    //          ->[P0,P1)->..[Pn-1,Pn)->
+    //
+    //      * for kClamp, we add two "infinite" intervals before/after:
+    //
+    //          [-/+inf , P0)->[P0 , P1)->..[Pn-1 , Pn)->[Pn , +/-inf)
+    //
+    //        (the iterator should never run off the end in this mode)
+    //
+    //      * for kMirror, we extend the range to [0..2] and add a flipped
+    //        interval series - then the iterator operates just as in the
+    //        kRepeat case:
+    //
+    //          ->[P0,P1)->..[Pn-1,Pn)->[2 - Pn,2 - Pn-1)->..[2 - P1,2 - P0)->
+    //
+    // TODO: investigate collapsing intervals << 1px.
+
+    SkASSERT(shader.fColorCount > 0);
+    SkASSERT(shader.fOrigColors);
+
+    const float paintAlpha = rec.fPaint->getAlpha() * (1.0f / 255);
+    const Sk4f componentScale = fColorsArePremul
+        ? Sk4f(paintAlpha)
+        : Sk4f(1.0f, 1.0f, 1.0f, paintAlpha);
+    const int first_index = reverse ? shader.fColorCount - 1 : 0;
+    const int last_index = shader.fColorCount - 1 - first_index;
+    const SkScalar first_pos = reverse ? SK_Scalar1 : 0;
+    const SkScalar last_pos = SK_Scalar1 - first_pos;
+
+    if (shader.fTileMode == SkShader::kClamp_TileMode) {
+        // synthetic edge interval: -/+inf .. P0
+        const SkPMColor clamp_color = pack_color(shader.fOrigColors[first_index],
+                                                 fColorsArePremul);
+        const SkScalar clamp_pos = reverse ? SK_ScalarMax : SK_ScalarMin;
+        fIntervals.emplace_back(clamp_color, clamp_pos,
+                                clamp_color, first_pos,
+                                componentScale);
+    } else if (shader.fTileMode == SkShader::kMirror_TileMode && reverse) {
+        // synthetic mirror intervals injected before main intervals: (2 .. 1]
+        addMirrorIntervals(shader, componentScale, false);
+    }
+
+    const IntervalIterator iter(shader.fOrigColors,
+                                shader.fOrigPos,
+                                shader.fColorCount,
+                                reverse);
+    iter.iterate([this, &componentScale] (SkColor c0, SkColor c1, SkScalar p0, SkScalar p1) {
+        SkASSERT(fIntervals.empty() || fIntervals.back().fP1 == p0);
+
+        fIntervals.emplace_back(pack_color(c0, fColorsArePremul),
+                                p0,
+                                pack_color(c1, fColorsArePremul),
+                                p1,
+                                componentScale);
+    });
+
+    if (shader.fTileMode == SkShader::kClamp_TileMode) {
+        // synthetic edge interval: Pn .. +/-inf
+        const SkPMColor clamp_color =
+            pack_color(shader.fOrigColors[last_index], fColorsArePremul);
+        const SkScalar clamp_pos = reverse ? SK_ScalarMin : SK_ScalarMax;
+        fIntervals.emplace_back(clamp_color, last_pos,
+                                clamp_color, clamp_pos,
+                                componentScale);
+    } else if (shader.fTileMode == SkShader::kMirror_TileMode && !reverse) {
+        // synthetic mirror intervals injected after main intervals: [1 .. 2)
+        addMirrorIntervals(shader, componentScale, true);
+    }
+}
+
+void SkGradientShaderBase::
+GradientShaderBase4fContext::addMirrorIntervals(const SkGradientShaderBase& shader,
+                                            const Sk4f& componentScale, bool reverse) {
+    const IntervalIterator iter(shader.fOrigColors,
+                                shader.fOrigPos,
+                                shader.fColorCount,
+                                reverse);
+    iter.iterate([this, &componentScale] (SkColor c0, SkColor c1, SkScalar p0, SkScalar p1) {
+        SkASSERT(fIntervals.empty() || fIntervals.back().fP1 == 2 - p0);
+
+        fIntervals.emplace_back(pack_color(c0, fColorsArePremul),
+                                2 - p0,
+                                pack_color(c1, fColorsArePremul),
+                                2 - p1,
+                                componentScale);
+    });
+}
+
+void SkGradientShaderBase::
+GradientShaderBase4fContext::shadeSpan(int x, int y, SkPMColor dst[], int count) {
+    if (fColorsArePremul) {
+        this->shadePremulSpan<SkPMColor, false>(x, y, dst, count);
+    } else {
+        this->shadePremulSpan<SkPMColor, true>(x, y, dst, count);
+    }
+}
+
+void SkGradientShaderBase::
+GradientShaderBase4fContext::shadeSpan4f(int x, int y, SkPM4f dst[], int count) {
+    if (fColorsArePremul) {
+        this->shadePremulSpan<SkPM4f, false>(x, y, dst, count);
+    } else {
+        this->shadePremulSpan<SkPM4f, true>(x, y, dst, count);
+    }
+}
+
+template<typename DstType, bool do_premul>
+void SkGradientShaderBase::
+GradientShaderBase4fContext::shadePremulSpan(int x, int y,
+                                             DstType dst[],
+                                             int count) const {
+    const SkGradientShaderBase& shader =
+        static_cast<const SkGradientShaderBase&>(fShader);
+
+    switch (shader.fTileMode) {
+    case kClamp_TileMode:
+        this->shadeSpanInternal<DstType,
+                                do_premul,
+                                kClamp_TileMode>(x, y, dst, count);
+        break;
+    case kRepeat_TileMode:
+        this->shadeSpanInternal<DstType,
+                                do_premul,
+                                kRepeat_TileMode>(x, y, dst, count);
+        break;
+    case kMirror_TileMode:
+        this->shadeSpanInternal<DstType,
+                                do_premul,
+                                kMirror_TileMode>(x, y, dst, count);
+        break;
+    }
+}
+
+template<typename DstType, bool do_premul, SkShader::TileMode tileMode>
+void SkGradientShaderBase::
+GradientShaderBase4fContext::shadeSpanInternal(int x, int y,
+                                               DstType dst[],
+                                               int count) const {
+    static const int kBufSize = 128;
+    SkScalar ts[kBufSize];
+    TSampler<DstType, tileMode> sampler(*this);
+
+    SkASSERT(count > 0);
+    do {
+        const int n = SkTMin(kBufSize, count);
+        this->mapTs(x, y, ts, n);
+        for (int i = 0; i < n; ++i) {
+            const Sk4f c = sampler.sample(ts[i]);
+            store<DstType, do_premul>(c, dst++);
+        }
+        x += n;
+        count -= n;
+    } while (count > 0);
+}
+
+template<typename DstType, SkShader::TileMode tileMode>
+class SkGradientShaderBase::GradientShaderBase4fContext::TSampler {
+public:
+    TSampler(const GradientShaderBase4fContext& ctx)
+        : fFirstInterval(ctx.fIntervals.begin())
+        , fLastInterval(ctx.fIntervals.end() - 1)
+        , fInterval(nullptr) {
+        SkASSERT(fLastInterval >= fFirstInterval);
+    }
+
+    Sk4f sample(SkScalar t) {
+        const SkScalar tiled_t = tileProc<tileMode>(t);
+
+        if (!fInterval) {
+            // Very first sample => locate the initial interval.
+            // TODO: maybe do this in ctor to remove a branch?
+            fInterval = this->findFirstInterval(tiled_t);
+            this->loadIntervalData(fInterval);
+        } else if (tiled_t < fInterval->fP0 || tiled_t >= fInterval->fP1) {
+            fInterval = this->findNextInterval(t, tiled_t);
+            this->loadIntervalData(fInterval);
+        }
+
+        fPrevT = t;
+        return lerp(tiled_t);
+    }
+
+private:
+    Sk4f lerp(SkScalar t) {
+        SkASSERT(t >= fInterval->fP0 && t < fInterval->fP1);
+        return fCc + fDc * (t - fInterval->fP0);
+    }
+
+    const Interval* findFirstInterval(SkScalar t) const {
+        // Binary search.
+        const Interval* i0 = fFirstInterval;
+        const Interval* i1 = fLastInterval;
+
+        while (i0 != i1) {
+            SkASSERT(i0 < i1);
+            SkASSERT(t >= i0->fP0 && t < i1->fP1);
+
+            const Interval* i = i0 + ((i1 - i0) >> 1);
+
+            if (t >= i->fP1) {
+                i0 = i + 1;
+            } else {
+                i1 = i;
+            }
+        }
+
+        SkASSERT(t >= i0->fP0 && t <= i0->fP1);
+        return i0;
+    }
+
+    const Interval* findNextInterval(SkScalar t, SkScalar tiled_t) const {
+        SkASSERT(tiled_t < fInterval->fP0 || tiled_t >= fInterval->fP1);
+        SkASSERT(tiled_t >= fFirstInterval->fP0 && tiled_t < fLastInterval->fP1);
+
+        const Interval* i = fInterval;
+
+        // Use the t vs. fPrevT signal to figure out which direction we should search for
+        // the next interval, then perform a linear search.
+        if (t >= fPrevT) {
+            do {
+                i += 1;
+                if (i > fLastInterval) {
+                    i = fFirstInterval;
+                }
+            } while (tiled_t < i->fP0 || tiled_t >= i->fP1);
+        } else {
+            do {
+                i -= 1;
+                if (i < fFirstInterval) {
+                    i = fLastInterval;
+                }
+            } while (tiled_t < i->fP0 || tiled_t >= i->fP1);
+        }
+
+        return i;
+    }
+
+    void loadIntervalData(const Interval* i) {
+        fCc = dst_swizzle<DstType>(i->fC0) * dst_component_scale<DstType>();
+        fDc = dst_swizzle<DstType>(i->fDc) * dst_component_scale<DstType>();
+    }
+
+    const Interval* fFirstInterval;
+    const Interval* fLastInterval;
+    const Interval* fInterval;
+    SkScalar        fPrevT;
+    Sk4f            fCc;
+    Sk4f            fDc;
+};
diff --git a/src/effects/gradients/Sk4fGradientBase.h b/src/effects/gradients/Sk4fGradientBase.h
index 1fece72..f745f8f 100644
--- a/src/effects/gradients/Sk4fGradientBase.h
+++ b/src/effects/gradients/Sk4fGradientBase.h
@@ -24,6 +24,9 @@
 
     uint32_t getFlags() const override { return fFlags; }
 
+    void shadeSpan(int x, int y, SkPMColor dst[], int count) override;
+    void shadeSpan4f(int x, int y, SkPM4f dst[], int count) override;
+
 protected:
     struct Interval {
         Interval(SkPMColor c0, SkScalar p0,
@@ -32,14 +35,15 @@
 
         bool isZeroRamp() const { return fZeroRamp; }
 
-        // true when fx is in [p0,p1)
-        bool contains(SkScalar fx) const;
-
         SkPM4f   fC0, fDc;
         SkScalar fP0, fP1;
         bool     fZeroRamp;
     };
 
+    virtual void mapTs(int x, int y, SkScalar ts[], int count) const = 0;
+
+    void buildIntervals(const SkGradientShaderBase&, const ContextRec&, bool reverse);
+
     SkSTArray<8, Interval, true> fIntervals;
     SkMatrix                     fDstToPos;
     SkMatrix::MapXYProc          fDstToPosProc;
@@ -50,6 +54,18 @@
 
 private:
     using INHERITED = SkShader::Context;
+
+    void addMirrorIntervals(const SkGradientShaderBase&,
+                            const Sk4f& componentScale, bool reverse);
+
+    template<typename DstType, SkShader::TileMode tileMode>
+    class TSampler;
+
+    template <typename DstType, bool do_premul>
+    void shadePremulSpan(int x, int y, DstType[], int count) const;
+
+    template <typename DstType, bool do_premul, SkShader::TileMode tileMode>
+    void shadeSpanInternal(int x, int y, DstType[], int count) const;
 };
 
 #endif // Sk4fGradientBase_DEFINED
diff --git a/src/effects/gradients/Sk4fGradientPriv.h b/src/effects/gradients/Sk4fGradientPriv.h
new file mode 100644
index 0000000..cf7a99b
--- /dev/null
+++ b/src/effects/gradients/Sk4fGradientPriv.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef Sk4fGradientPriv_DEFINED
+#define Sk4fGradientPriv_DEFINED
+
+#include "SkColor.h"
+#include "SkNx.h"
+#include "SkPM4f.h"
+
+// Templates shared by various 4f gradient flavors.
+
+namespace {
+
+inline Sk4f premul_4f(const Sk4f& c) {
+    const float alpha = c[SkPM4f::A];
+    // FIXME: portable swizzle?
+    return c * Sk4f(alpha, alpha, alpha, 1);
+}
+
+template <bool do_premul>
+inline SkPMColor trunc_from_255(const Sk4f& c) {
+    SkPMColor pmc;
+    SkNx_cast<uint8_t>(c).store(&pmc);
+    if (do_premul) {
+        pmc = SkPreMultiplyARGB(SkGetPackedA32(pmc), SkGetPackedR32(pmc),
+                                SkGetPackedG32(pmc), SkGetPackedB32(pmc));
+    }
+    return pmc;
+}
+
+template<typename DstType, bool do_premul>
+void store(const Sk4f& color, DstType* dst);
+
+template<>
+inline void store<SkPM4f, false>(const Sk4f& c, SkPM4f* dst) {
+    c.store(dst);
+}
+
+template<>
+inline void store<SkPM4f, true>(const Sk4f& c, SkPM4f* dst) {
+    store<SkPM4f, false>(premul_4f(c), dst);
+}
+
+template<>
+inline void store<SkPMColor, false>(const Sk4f& c, SkPMColor* dst) {
+    *dst = trunc_from_255<false>(c);
+}
+
+template<>
+inline void store<SkPMColor, true>(const Sk4f& c, SkPMColor* dst) {
+    *dst = trunc_from_255<true>(c);
+}
+
+template<typename DstType, bool do_premul>
+inline void store4x(const Sk4f& c0,
+                    const Sk4f& c1,
+                    const Sk4f& c2,
+                    const Sk4f& c3,
+                    DstType* dst) {
+    store<DstType, do_premul>(c0, dst++);
+    store<DstType, do_premul>(c1, dst++);
+    store<DstType, do_premul>(c2, dst++);
+    store<DstType, do_premul>(c3, dst++);
+}
+
+template<>
+inline void store4x<SkPMColor, false>(const Sk4f& c0,
+                                      const Sk4f& c1,
+                                      const Sk4f& c2,
+                                      const Sk4f& c3,
+                                      SkPMColor* dst) {
+    Sk4f_ToBytes((uint8_t*)dst, c0, c1, c2, c3);
+}
+
+template<typename DstType>
+float dst_component_scale();
+
+template<>
+inline float dst_component_scale<SkPM4f>() {
+    return 1;
+}
+
+template<>
+inline float dst_component_scale<SkPMColor>() {
+    return 255;
+}
+
+template<typename DstType>
+Sk4f dst_swizzle(const SkPM4f&);
+
+template<>
+inline Sk4f dst_swizzle<SkPM4f>(const SkPM4f& c) {
+    return c.to4f();
+}
+
+template<>
+inline Sk4f dst_swizzle<SkPMColor>(const SkPM4f& c) {
+    return c.to4f_pmorder();
+}
+
+}
+
+#endif // Sk4fGradientPriv_DEFINED
diff --git a/src/effects/gradients/Sk4fLinearGradient.cpp b/src/effects/gradients/Sk4fLinearGradient.cpp
index 4a20a28..9bc84c4 100644
--- a/src/effects/gradients/Sk4fLinearGradient.cpp
+++ b/src/effects/gradients/Sk4fLinearGradient.cpp
@@ -5,27 +5,11 @@
  * found in the LICENSE file.
  */
 
+#include "Sk4fGradientPriv.h"
 #include "Sk4fLinearGradient.h"
 
 namespace {
 
-Sk4f premul_4f(const Sk4f& c) {
-    const float alpha = c[SkPM4f::A];
-    // FIXME: portable swizzle?
-    return c * Sk4f(alpha, alpha, alpha, 1);
-}
-
-template <bool do_premul>
-SkPMColor trunc_from_255(const Sk4f& c) {
-    SkPMColor pmc;
-    SkNx_cast<uint8_t>(c).store(&pmc);
-    if (do_premul) {
-        pmc = SkPreMultiplyARGB(SkGetPackedA32(pmc), SkGetPackedR32(pmc),
-                                SkGetPackedG32(pmc), SkGetPackedB32(pmc));
-    }
-    return pmc;
-}
-
 template<typename DstType, bool do_premul>
 void fill(const Sk4f& c, DstType* dst, int n);
 
@@ -53,50 +37,6 @@
 }
 
 template<typename DstType, bool do_premul>
-void store(const Sk4f& color, DstType* dst);
-
-template<>
-void store<SkPM4f, false>(const Sk4f& c, SkPM4f* dst) {
-    c.store(dst);
-}
-
-template<>
-void store<SkPM4f, true>(const Sk4f& c, SkPM4f* dst) {
-    store<SkPM4f, false>(premul_4f(c), dst);
-}
-
-template<>
-void store<SkPMColor, false>(const Sk4f& c, SkPMColor* dst) {
-    *dst = trunc_from_255<false>(c);
-}
-
-template<>
-void store<SkPMColor, true>(const Sk4f& c, SkPMColor* dst) {
-    *dst = trunc_from_255<true>(c);
-}
-
-template<typename DstType, bool do_premul>
-void store4x(const Sk4f& c0,
-             const Sk4f& c1,
-             const Sk4f& c2,
-             const Sk4f& c3,
-             DstType* dst) {
-    store<DstType, do_premul>(c0, dst++);
-    store<DstType, do_premul>(c1, dst++);
-    store<DstType, do_premul>(c2, dst++);
-    store<DstType, do_premul>(c3, dst++);
-}
-
-template<>
-void store4x<SkPMColor, false>(const Sk4f& c0,
-                               const Sk4f& c1,
-                               const Sk4f& c2,
-                               const Sk4f& c3,
-                               SkPMColor* dst) {
-    Sk4f_ToBytes((uint8_t*)dst, c0, c1, c2, c3);
-}
-
-template<typename DstType, bool do_premul>
 void ramp(const Sk4f& c, const Sk4f& dc, DstType* dst, int n) {
     SkASSERT(n > 0);
 
@@ -148,38 +88,6 @@
     return f < 0 ? f + 2 : f;
 }
 
-template<typename DstType>
-float dst_component_scale();
-
-template<>
-float dst_component_scale<SkPM4f>() {
-    return 1;
-}
-
-template<>
-float dst_component_scale<SkPMColor>() {
-    return 255;
-}
-
-template<typename DstType>
-Sk4f dst_swizzle(const SkPM4f&);
-
-template<>
-Sk4f dst_swizzle<SkPM4f>(const SkPM4f& c) {
-    return c.to4f();
-}
-
-template<>
-Sk4f dst_swizzle<SkPMColor>(const SkPM4f& c) {
-    return c.to4f_pmorder();
-}
-
-SkPMColor pack_color(SkColor c, bool premul) {
-    return premul
-        ? SkPreMultiplyColor(c)
-        : SkPackARGB32NoCheck(SkColorGetA(c), SkColorGetR(c), SkColorGetG(c), SkColorGetB(c));
-}
-
 // true when x is in [k1,k2)
 bool in_range(SkScalar x, SkScalar k1, SkScalar k2) {
     SkASSERT(k1 != k2);
@@ -188,208 +96,21 @@
         : (x >= k2 && x < k1);
 }
 
-class IntervalBuilder {
-public:
-    IntervalBuilder(const SkColor* colors, const SkScalar* pos, int count, bool reverse)
-        : fColors(colors)
-        , fPos(pos)
-        , fCount(count)
-        , fFirstPos(reverse ? SK_Scalar1 : 0)
-        , fBegin(reverse ? count - 1 : 0)
-        , fAdvance(reverse ? -1 : 1) {
-        SkASSERT(colors);
-        SkASSERT(count > 1);
-    }
-
-    template<typename F>
-    void build(F func) const {
-        if (!fPos) {
-            this->buildImplicitPos(func);
-            return;
-        }
-
-        const int end = fBegin + fAdvance * (fCount - 1);
-        const SkScalar lastPos = 1 - fFirstPos;
-        int prev = fBegin;
-        SkScalar prevPos = fFirstPos;
-
-        do {
-            const int curr = prev + fAdvance;
-            SkASSERT(curr >= 0 && curr < fCount);
-
-            // TODO: this sanitization should be done in SkGradientShaderBase
-            const SkScalar currPos = (fAdvance > 0)
-                ? SkTPin(fPos[curr], prevPos, lastPos)
-                : SkTPin(fPos[curr], lastPos, prevPos);
-
-            if (currPos != prevPos) {
-                SkASSERT((currPos - prevPos > 0) == (fAdvance > 0));
-                func(fColors[prev], fColors[curr], prevPos, currPos);
-            }
-
-            prev = curr;
-            prevPos = currPos;
-        } while (prev != end);
-    }
-
-private:
-    template<typename F>
-    void buildImplicitPos(F func) const {
-        // When clients don't provide explicit color stop positions (fPos == nullptr),
-        // the color stops are distributed evenly across the unit interval
-        // (implicit positioning).
-        const SkScalar dt = fAdvance * SK_Scalar1 / (fCount - 1);
-        const int end = fBegin + fAdvance * (fCount - 2);
-        int prev = fBegin;
-        SkScalar prevPos = fFirstPos;
-
-        while (prev != end) {
-            const int curr = prev + fAdvance;
-            SkASSERT(curr >= 0 && curr < fCount);
-
-            const SkScalar currPos = prevPos + dt;
-            func(fColors[prev], fColors[curr], prevPos, currPos);
-            prev = curr;
-            prevPos = currPos;
-        }
-
-        // emit the last interval with a pinned end position, to avoid precision issues
-        func(fColors[prev], fColors[prev + fAdvance], prevPos, 1 - fFirstPos);
-    }
-
-    const SkColor*  fColors;
-    const SkScalar* fPos;
-    const int       fCount;
-    const SkScalar  fFirstPos;
-    const int       fBegin;
-    const int       fAdvance;
-};
-
 } // anonymous namespace
 
 SkLinearGradient::
 LinearGradient4fContext::LinearGradient4fContext(const SkLinearGradient& shader,
                                                  const ContextRec& rec)
     : INHERITED(shader, rec) {
-    // The main job here is to build a specialized interval list: a different
-    // representation of the color stops data, optimized for efficient scan line
-    // access during shading.
-    //
-    //   [{P0,C0} , {P1,C1}) [{P1,C2} , {P2,c3}) ... [{Pn,C2n} , {Pn+1,C2n+1})
-    //
-    // The list is sorted in increasing dst order, i.e. X(Pk) < X(Pk+1).  This
-    // allows us to always traverse left->right when iterating over a scan line.
-    // It also means that the interval order matches the color stops when dx >= 0,
-    // and is the inverse (pos, colors, order are flipped) when dx < 0.
-    //
-    // Note: the current representation duplicates pos data; we could refactor to
-    //       avoid this if interval storage size becomes a concern.
-    //
-    // Aside from reordering, we also perform two more pre-processing steps at
-    // this stage:
-    //
-    //   1) scale the color components depending on paint alpha and the requested
-    //      interpolation space (note: the interval color storage is SkPM4f, but
-    //      that doesn't necessarily mean the colors are premultiplied; that
-    //      property is tracked in fColorsArePremul)
-    //
-    //   2) inject synthetic intervals to support tiling.
-    //
-    //      * for kRepeat, no extra intervals are needed - the iterator just
-    //        wraps around at the end:
-    //
-    //          ->[P0,P1)->..[Pn-1,Pn)->
-    //
-    //      * for kClamp, we add two "infinite" intervals before/after:
-    //
-    //          [-/+inf , P0)->[P0 , P1)->..[Pn-1 , Pn)->[Pn , +/-inf)
-    //
-    //        (the iterator should never run off the end in this mode)
-    //
-    //      * for kMirror, we extend the range to [0..2] and add a flipped
-    //        interval series - then the iterator operates just as in the
-    //        kRepeat case:
-    //
-    //          ->[P0,P1)->..[Pn-1,Pn)->[2 - Pn,2 - Pn-1)->..[2 - P1,2 - P0)->
-    //
-    // TODO: investigate collapsing intervals << 1px.
 
-    SkASSERT(shader.fColorCount > 1);
-    SkASSERT(shader.fOrigColors);
-
-    const float paintAlpha = rec.fPaint->getAlpha() * (1.0f / 255);
-    const Sk4f componentScale = fColorsArePremul
-        ? Sk4f(paintAlpha)
-        : Sk4f(1.0f, 1.0f, 1.0f, paintAlpha);
-    const bool dx_is_pos = fDstToPos.getScaleX() >= 0;
-    const int first_index = dx_is_pos ? 0 : shader.fColorCount - 1;
-    const int last_index = shader.fColorCount - 1 - first_index;
-    const SkScalar first_pos = dx_is_pos ? 0 : SK_Scalar1;
-    const SkScalar last_pos = 1 - first_pos;
-
-    if (shader.fTileMode == SkShader::kClamp_TileMode) {
-        // synthetic edge interval: -/+inf .. P0
-        const SkPMColor clamp_color = pack_color(shader.fOrigColors[first_index],
-                                                 fColorsArePremul);
-        const SkScalar clamp_pos = dx_is_pos ? SK_ScalarMin : SK_ScalarMax;
-        fIntervals.emplace_back(clamp_color, clamp_pos,
-                                clamp_color, first_pos,
-                                componentScale);
-    } else if (shader.fTileMode == SkShader::kMirror_TileMode && !dx_is_pos) {
-        // synthetic mirror intervals injected before main intervals: (2 .. 1]
-        addMirrorIntervals(shader, componentScale, dx_is_pos);
-    }
-
-    const IntervalBuilder builder(shader.fOrigColors,
-                                  shader.fOrigPos,
-                                  shader.fColorCount,
-                                  !dx_is_pos);
-    builder.build([this, &componentScale] (SkColor c0, SkColor c1, SkScalar p0, SkScalar p1) {
-        SkASSERT(fIntervals.empty() || fIntervals.back().fP1 == p0);
-
-        fIntervals.emplace_back(pack_color(c0, fColorsArePremul),
-                                p0,
-                                pack_color(c1, fColorsArePremul),
-                                p1,
-                                componentScale);
-    });
-
-    if (shader.fTileMode == SkShader::kClamp_TileMode) {
-        // synthetic edge interval: Pn .. +/-inf
-        const SkPMColor clamp_color =
-            pack_color(shader.fOrigColors[last_index], fColorsArePremul);
-        const SkScalar clamp_pos = dx_is_pos ? SK_ScalarMax : SK_ScalarMin;
-        fIntervals.emplace_back(clamp_color, last_pos,
-                                clamp_color, clamp_pos,
-                                componentScale);
-    } else if (shader.fTileMode == SkShader::kMirror_TileMode && dx_is_pos) {
-        // synthetic mirror intervals injected after main intervals: [1 .. 2)
-        addMirrorIntervals(shader, componentScale, dx_is_pos);
-    }
+    // Our fast path expects interval points to be monotonically increasing in x.
+    const bool reverseIntervals = this->isFast() && fDstToPos.getScaleX() < 0;
+    this->buildIntervals(shader, rec, reverseIntervals);
 
     SkASSERT(fIntervals.count() > 0);
     fCachedInterval = fIntervals.begin();
 }
 
-void SkLinearGradient::
-LinearGradient4fContext::addMirrorIntervals(const SkLinearGradient& shader,
-                                            const Sk4f& componentScale, bool dx_is_pos) {
-    // Iterates in reverse order (vs main interval builder) and adds intervals reflected in 2.
-    const IntervalBuilder builder(shader.fOrigColors,
-                                  shader.fOrigPos,
-                                  shader.fColorCount,
-                                  dx_is_pos);
-    builder.build([this, &componentScale] (SkColor c0, SkColor c1, SkScalar p0, SkScalar p1) {
-        SkASSERT(fIntervals.empty() || fIntervals.back().fP1 == 2 - p0);
-
-        fIntervals.emplace_back(pack_color(c0, fColorsArePremul),
-                                2 - p0,
-                                pack_color(c1, fColorsArePremul),
-                                2 - p1,
-                                componentScale);
-    });
-}
-
 const SkGradientShaderBase::GradientShaderBase4fContext::Interval*
 SkLinearGradient::LinearGradient4fContext::findInterval(SkScalar fx) const {
     SkASSERT(in_range(fx, fIntervals.front().fP0, fIntervals.back().fP1));
@@ -434,6 +155,11 @@
 
 void SkLinearGradient::
 LinearGradient4fContext::shadeSpan(int x, int y, SkPMColor dst[], int count) {
+    if (!this->isFast()) {
+        this->INHERITED::shadeSpan(x, y, dst, count);
+        return;
+    }
+
     // TODO: plumb dithering
     SkASSERT(count > 0);
     if (fColorsArePremul) {
@@ -445,6 +171,11 @@
 
 void SkLinearGradient::
 LinearGradient4fContext::shadeSpan4f(int x, int y, SkPM4f dst[], int count) {
+    if (!this->isFast()) {
+        this->INHERITED::shadeSpan4f(x, y, dst, count);
+        return;
+    }
+
     // TODO: plumb dithering
     SkASSERT(count > 0);
     if (fColorsArePremul) {
@@ -545,7 +276,7 @@
         , fIsVertical(is_vertical)
     {
         SkASSERT(firstInterval <= lastInterval);
-        SkASSERT(i->contains(fx));
+        SkASSERT(in_range(fx, i->fP0, i->fP1));
         this->compute_interval_props(fx - i->fP0);
     }
 
@@ -626,3 +357,48 @@
     const SkScalar  fDx;        // 'dx' for consistency with other impls; actually dt/dx
     const bool      fIsVertical;
 };
+
+void SkLinearGradient::
+LinearGradient4fContext::mapTs(int x, int y, SkScalar ts[], int count) const {
+    SkASSERT(count > 0);
+    SkASSERT(fDstToPosClass != kLinear_MatrixClass);
+
+    SkScalar sx = x + SK_ScalarHalf;
+    const SkScalar sy = y + SK_ScalarHalf;
+    SkPoint pt;
+
+    if (fDstToPosClass != kPerspective_MatrixClass) {
+        // kLinear_MatrixClass, kFixedStepInX_MatrixClass => fixed dt per scanline
+        const SkScalar dtdx = fDstToPos.fixedStepInX(sy).x();
+        fDstToPosProc(fDstToPos, sx, sy, &pt);
+
+        const Sk4f dtdx4 = Sk4f(4 * dtdx);
+        Sk4f t4 = Sk4f(pt.x() + 0 * dtdx,
+                       pt.x() + 1 * dtdx,
+                       pt.x() + 2 * dtdx,
+                       pt.x() + 3 * dtdx);
+
+        while (count >= 4) {
+            t4.store(ts);
+            t4 = t4 + dtdx4;
+            ts += 4;
+            count -= 4;
+        }
+
+        if (count & 2) {
+            *ts++ = t4[0];
+            *ts++ = t4[1];
+            t4 = SkNx_shuffle<2, 0, 1, 3>(t4);
+        }
+
+        if (count & 1) {
+            *ts++ = t4[0];
+        }
+    } else {
+        for (int i = 0; i < count; ++i) {
+            fDstToPosProc(fDstToPos, sx, sy, &pt);
+            ts[i] = pt.x();
+            sx += SK_Scalar1;
+        }
+    }
+}
diff --git a/src/effects/gradients/Sk4fLinearGradient.h b/src/effects/gradients/Sk4fLinearGradient.h
index a0a09ca..67449fc 100644
--- a/src/effects/gradients/Sk4fLinearGradient.h
+++ b/src/effects/gradients/Sk4fLinearGradient.h
@@ -12,19 +12,19 @@
 #include "SkLinearGradient.h"
 
 class SkLinearGradient::
-LinearGradient4fContext : public GradientShaderBase4fContext {
+LinearGradient4fContext final : public GradientShaderBase4fContext {
 public:
     LinearGradient4fContext(const SkLinearGradient&, const ContextRec&);
 
     void shadeSpan(int x, int y, SkPMColor dst[], int count) override;
     void shadeSpan4f(int x, int y, SkPM4f dst[], int count) override;
 
+protected:
+    void mapTs(int x, int y, SkScalar ts[], int count) const override;
+
 private:
     using INHERITED = GradientShaderBase4fContext;
 
-    void addMirrorIntervals(const SkLinearGradient&, const Sk4f& componentScale,
-                            bool dx_is_pos);
-
     template<typename DstType, TileMode>
     class LinearIntervalProcessor;
 
@@ -32,13 +32,12 @@
     void shadePremulSpan(int x, int y, DstType[], int count) const;
 
     template <typename DstType, bool premul, SkShader::TileMode tileMode>
-    void shadePremulTileSpan(int x, int y, DstType[], int count) const;
-
-    template <typename DstType, bool premul, SkShader::TileMode tileMode>
     void shadeSpanInternal(int x, int y, DstType[], int count) const;
 
     const Interval* findInterval(SkScalar fx) const;
 
+    bool isFast() const { return fDstToPosClass == kLinear_MatrixClass; }
+
     mutable const Interval*      fCachedInterval;
 };
 
diff --git a/src/effects/gradients/SkLinearGradient.cpp b/src/effects/gradients/SkLinearGradient.cpp
index 0a2b158..f521bd7 100644
--- a/src/effects/gradients/SkLinearGradient.cpp
+++ b/src/effects/gradients/SkLinearGradient.cpp
@@ -41,12 +41,6 @@
 #ifdef FORCE_4F_CONTEXT
     return true;
 #else
-    // Perspective not supported in 4f yet.
-    if (rec.fMatrix->hasPerspective()
-        || (rec.fLocalMatrix && rec.fLocalMatrix->hasPerspective())) {
-        return false;
-    }
-
     return rec.fPreferredDstType == SkShader::ContextRec::kPM4f_DstType
         || SkToBool(flags & SkLinearGradient::kForce4fContext_PrivateFlag);
 #endif