Initial linear gradient 4f impl

GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1688543002

Review URL: https://codereview.chromium.org/1688543002
diff --git a/gm/gradients.cpp b/gm/gradients.cpp
index c1847d4..5fb3465 100644
--- a/gm/gradients.cpp
+++ b/gm/gradients.cpp
@@ -7,6 +7,7 @@
 
 #include "gm.h"
 #include "SkGradientShader.h"
+#include "SkLinearGradient.h"
 
 namespace skiagm {
 
@@ -593,9 +594,16 @@
 DEF_GM( return new LinearGradientGM(false); )
 
 class LinearGradientTinyGM : public GM {
+public:
+    LinearGradientTinyGM(uint32_t flags, const char* suffix = nullptr)
+    : fName("linear_gradient_tiny")
+    , fFlags(flags) {
+        fName.append(suffix);
+    }
+
 protected:
     SkString onShortName() override {
-        return SkString("linear_gradient_tiny");
+        return fName;
     }
 
     SkISize onISize() override {
@@ -631,7 +639,7 @@
             SkAutoCanvasRestore acr(canvas, true);
             SkAutoTUnref<SkShader> gradient(
                 SkGradientShader::CreateLinear(configs[i].pts, colors, configs[i].pos, kStopCount,
-                                               SkShader::kClamp_TileMode));
+                                               SkShader::kClamp_TileMode, fFlags, nullptr));
             canvas->translate(kRectSize * ((i % 4) * 1.5f + 0.25f),
                               kRectSize * ((i / 4) * 1.5f + 0.25f));
 
@@ -642,9 +650,12 @@
 
 private:
     typedef GM INHERITED;
-};
-DEF_GM( return new LinearGradientTinyGM(); )
 
+    SkString fName;
+    uint32_t fFlags;
+};
+DEF_GM( return new LinearGradientTinyGM(0); )
+DEF_GM( return new LinearGradientTinyGM(SkLinearGradient::kForce4fContext_PrivateFlag, "_4f"); )
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -736,3 +747,35 @@
         }
     }
 }
+
+static void draw_many_stops(SkCanvas* canvas, uint32_t flags) {  // flags: passed to CreateLinear
+    const unsigned kStopCount = 200;  // number of color stops in the gradient
+    const SkPoint pts[] = { {50, 50}, {450, 465}};  // the two gradient end points
+
+    SkColor colors[kStopCount];
+    for (unsigned i = 0; i < kStopCount; i++) {
+        switch (i % 5) {  // repeating cycle: red, green, green, blue, red
+        case 0: colors[i] = SK_ColorRED; break;
+        case 1: colors[i] = SK_ColorGREEN; break;
+        case 2: colors[i] = SK_ColorGREEN; break;
+        case 3: colors[i] = SK_ColorBLUE; break;
+        case 4: colors[i] = SK_ColorRED; break;
+        }
+    }
+
+    SkAutoTUnref<SkShader> shader(SkGradientShader::CreateLinear(  // no explicit stop positions
+        pts, colors, nullptr, SK_ARRAY_COUNT(colors), SkShader::kClamp_TileMode, flags, nullptr));
+
+    SkPaint p;
+    p.setShader(shader);
+
+    canvas->drawRect(SkRect::MakeXYWH(0, 0, 500, 500), p);  // 500x500 rect shaded by the gradient
+}
+
+DEF_SIMPLE_GM(gradient_many_stops, canvas, 500, 500) {  // many-stop gradient, default context
+    draw_many_stops(canvas, 0);  // no gradient flags
+}
+
+DEF_SIMPLE_GM(gradient_many_stops_4f, canvas, 500, 500) {  // same drawing, 4f context forced
+    draw_many_stops(canvas, SkLinearGradient::kForce4fContext_PrivateFlag);
+}
diff --git a/gyp/effects.gypi b/gyp/effects.gypi
index bbda38a..347d140 100644
--- a/gyp/effects.gypi
+++ b/gyp/effects.gypi
@@ -64,6 +64,10 @@
     '<(skia_src_path)/effects/SkTileImageFilter.cpp',
     '<(skia_src_path)/effects/SkXfermodeImageFilter.cpp',
 
+    '<(skia_src_path)/effects/gradients/Sk4fGradientBase.cpp',
+    '<(skia_src_path)/effects/gradients/Sk4fGradientBase.h',
+    '<(skia_src_path)/effects/gradients/Sk4fLinearGradient.cpp',
+    '<(skia_src_path)/effects/gradients/Sk4fLinearGradient.h',
     '<(skia_src_path)/effects/gradients/SkClampRange.cpp',
     '<(skia_src_path)/effects/gradients/SkClampRange.h',
     '<(skia_src_path)/effects/gradients/SkGradientBitmapCache.cpp',
diff --git a/gyp/gmslides.gypi b/gyp/gmslides.gypi
index 6ad0cca..43d354d 100644
--- a/gyp/gmslides.gypi
+++ b/gyp/gmslides.gypi
@@ -9,6 +9,7 @@
     # include dirs needed by particular GMs
     '../include/client/android',
     '../tools/debugger',
+    '../src/effects/gradients',
     '../src/images',
     '../src/lazy',
   ],
diff --git a/src/effects/gradients/Sk4fGradientBase.cpp b/src/effects/gradients/Sk4fGradientBase.cpp
new file mode 100644
index 0000000..0840f60
--- /dev/null
+++ b/src/effects/gradients/Sk4fGradientBase.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Sk4fGradientBase.h"
+
+namespace {
+
+const float kInv255Float = 1.0f / 255;
+
+SkPMColor pack_color(SkColor c, bool premul) {  // packs c's A,R,G,B into an SkPMColor
+    return premul
+        ? SkPreMultiplyColor(c)  // premultiplied variant
+        : SkPackARGB32NoCheck(SkColorGetA(c), SkColorGetR(c), SkColorGetG(c), SkColorGetB(c));
+}
+
+// true when x is in [min(k1,k2), max(k1,k2)); the two bounds may be given in either order
+bool in_range(SkScalar x, SkScalar k1, SkScalar k2) {
+    SkASSERT(k1 != k2);  // an empty range is not a valid input
+    return (k1 < k2)
+        ? (x >= k1 && x < k2)
+        : (x >= k2 && x < k1);  // reversed bounds: still closed at the low end, open at the high
+}
+
+} // anonymous namespace
+
+SkGradientShaderBase::GradientShaderBase4fContext::
+Interval::Interval(SkPMColor c0, SkScalar p0,   // packed color and position at the start
+                   SkPMColor c1, SkScalar p1,   // packed color and position at the end
+                   const Sk4f& componentScale)  // per-component factor applied to both colors
+    : fP0(p0)
+    , fP1(p1)
+    , fZeroRamp(c0 == c1) {  // identical end colors: nothing to interpolate
+    SkASSERT(p0 != p1);  // p1 - p0 is the divisor below
+
+    const Sk4f c4f0 = SkNx_cast<float>(Sk4b::Load(&c0)) * componentScale;
+    const Sk4f c4f1 = SkNx_cast<float>(Sk4b::Load(&c1)) * componentScale;
+    const Sk4f dc4f = (c4f1 - c4f0) / (p1 - p0);  // color change per unit of position
+
+    c4f0.store(fC0.fVec);  // pass the float array itself, matching the Sk4f-based constructor
+    dc4f.store(fDc.fVec);
+}
+
+SkGradientShaderBase::GradientShaderBase4fContext::
+Interval::Interval(const Sk4f& c0, const Sk4f& dc,  // start color and gradient, stored as given
+                   SkScalar p0, SkScalar p1)
+    : fP0(p0)
+    , fP1(p1)
+    , fZeroRamp((dc == 0).allTrue()) {  // zero ramp only when every gradient lane is 0
+    c0.store(fC0.fVec);
+    dc.store(fDc.fVec);
+}
+
+bool SkGradientShaderBase::GradientShaderBase4fContext::
+Interval::contains(SkScalar fx) const {
+    return in_range(fx, fP0, fP1);  // in_range accepts fP0/fP1 in either order
+}
+
+SkGradientShaderBase::
+GradientShaderBase4fContext::GradientShaderBase4fContext(const SkGradientShaderBase& shader,
+                                                         const ContextRec& rec)
+    : INHERITED(shader, rec)
+    , fFlags(this->INHERITED::getFlags() | kSupports4f_Flag)  // always advertise 4f support
+#ifdef SK_SUPPORT_LEGACY_GRADIENT_DITHERING
+    , fDither(true)
+#else
+    , fDither(rec.fPaint->isDither())
+#endif
+{
+    // The main job here is to build an interval list.  Intervals are a different
+    // representation of the color stops data, optimized for efficient scan line
+    // access during shading.
+    //
+    //   [{P0,C0} , {P1,C1}) [{P1,C2} , {P2,C3}) ... [{Pn,C2n} , {Pn+1,C2n+1})
+    //
+    // The list is sorted in increasing dst order, i.e. X(Pk) < X(Pk+1).  This
+    // allows us to always traverse left->right when iterating over a scan line.
+    // It also means that the interval order matches the color stops when dx >= 0,
+    // and is the inverse (pos, colors, order are flipped) when dx < 0.
+    //
+    // Note: the current representation duplicates pos data; we could refactor to
+    //       avoid this if interval storage size becomes a concern.
+    //
+    // Aside from reordering, we also perform two more pre-processing steps at
+    // this stage:
+    //
+    //   1) scale the color components depending on paint alpha and the requested
+    //      interpolation space (note: the interval color storage is SkPM4f, but
+    //      that doesn't necessarily mean the colors are premultiplied; that
+    //      property is tracked in fColorsArePremul)
+    //
+    //   2) inject synthetic intervals to support tiling.
+    //
+    //      * for kRepeat, no extra intervals are needed - the iterator just
+    //        wraps around at the end:
+    //
+    //          ->[P0,P1)->..[Pn-1,Pn)->
+    //
+    //      * for kClamp, we add two "infinite" intervals before/after:
+    //
+    //          [-/+inf , P0)->[P0 , P1)->..[Pn-1 , Pn)->[Pn , +/-inf)
+    //
+    //        (the iterator should never run off the end in this mode)
+    //
+    //      * for kMirror, we extend the range to [0..2] and add a flipped
+    //        interval series - then the iterator operates just as in the
+    //        kRepeat case:
+    //
+    //          ->[P0,P1)->..[Pn-1,Pn)->[2 - Pn,2 - Pn-1)->..[2 - P1,2 - P0)->
+    //
+    // TODO: investigate collapsing intervals << 1px.
+
+    const SkMatrix& inverse = this->getTotalInverse();
+    fDstToPos.setConcat(shader.fPtsToUnit, inverse);  // maps dst coordinates to gradient pos
+    fDstToPosProc = fDstToPos.getMapXYProc();
+    fDstToPosClass = static_cast<uint8_t>(INHERITED::ComputeMatrixClass(fDstToPos));
+
+    if (shader.fColorsAreOpaque && this->getPaintAlpha() == SK_AlphaOPAQUE) {
+        fFlags |= kOpaqueAlpha_Flag;  // opaque stops and opaque paint
+    }
+
+    fColorsArePremul =  // opaque colors are treated as premul too
+        (shader.fGradFlags & SkGradientShader::kInterpolateColorsInPremul_Flag)
+        || shader.fColorsAreOpaque;
+
+    const float paintAlpha = rec.fPaint->getAlpha() * kInv255Float;  // paint alpha * 1/255
+    const Sk4f componentScale = fColorsArePremul  // 1/255 byte scale with paint alpha folded in
+        ? Sk4f(paintAlpha * kInv255Float)  // premul: every lane carries the paint alpha
+        : Sk4f(kInv255Float, kInv255Float, kInv255Float, paintAlpha * kInv255Float);
+
+    SkASSERT(shader.fColorCount > 1);
+    SkASSERT(shader.fOrigColors);
+
+    int direction = 1;  // step through the color stops: +1 forward, -1 backward
+    int first_index = 0;
+    int last_index = shader.fColorCount - 1;
+    SkScalar first_pos = 0;
+    SkScalar last_pos = 1;
+    const bool dx_is_pos = fDstToPos.getScaleX() >= 0;
+    if (!dx_is_pos) {  // negative dx: walk the stops (and positions) in reverse
+        direction = -direction;
+        SkTSwap(first_index, last_index);
+        SkTSwap(first_pos, last_pos);
+    }
+
+    if (shader.fTileMode == SkShader::kClamp_TileMode) {
+        // synthetic edge interval: [-/+inf .. P0)
+        const SkPMColor clamp_color = pack_color(shader.fOrigColors[first_index],
+                                                 fColorsArePremul);
+        const SkScalar clamp_pos = dx_is_pos ? SK_ScalarMin : SK_ScalarMax;
+        fIntervals.emplace_back(clamp_color, clamp_pos,  // same color at both ends
+                                clamp_color, first_pos,
+                                componentScale);
+    }
+
+    int prev = first_index;
+    int curr = prev + direction;
+    SkScalar prev_pos = first_pos;  // the first interval always starts at first_pos
+    if (shader.fOrigPos) {
+        // explicit positions
+        do {
+            // TODO: this sanitization should be done in SkGradientShaderBase
+            const SkScalar curr_pos = (dx_is_pos)  // pin between the previous and the last pos
+                ? SkTPin(shader.fOrigPos[curr], prev_pos, last_pos)
+                : SkTPin(shader.fOrigPos[curr], last_pos, prev_pos);
+            if (curr_pos != prev_pos) {  // no interval is emitted for coincident positions
+                fIntervals.emplace_back(
+                    pack_color(shader.fOrigColors[prev], fColorsArePremul),
+                    prev_pos,
+                    pack_color(shader.fOrigColors[curr], fColorsArePremul),
+                    curr_pos,
+                    componentScale);
+            }
+            prev = curr;
+            prev_pos = curr_pos;
+            curr += direction;
+        } while (prev != last_index);
+    } else {
+        // implicit positions
+        const SkScalar dt = direction * SK_Scalar1 / (shader.fColorCount - 1);  // signed spacing
+        do {
+            const SkScalar curr_pos = prev_pos + dt;
+            fIntervals.emplace_back(
+                pack_color(shader.fOrigColors[prev], fColorsArePremul),
+                prev_pos,
+                pack_color(shader.fOrigColors[curr], fColorsArePremul),
+                curr_pos,
+                componentScale);
+
+            prev = curr;
+            prev_pos = curr_pos;
+            curr += direction;
+        } while (prev != last_index);
+        // pin the last pos to maintain accurate [0,1] pos coverage.
+        fIntervals.back().fP1 = last_pos;
+    }
+
+    if (shader.fTileMode == SkShader::kClamp_TileMode) {
+        // synthetic edge interval: [Pn .. +/-inf)
+        const SkPMColor clamp_color =
+            pack_color(shader.fOrigColors[last_index], fColorsArePremul);
+        const SkScalar clamp_pos = dx_is_pos ? SK_ScalarMax : SK_ScalarMin;
+        fIntervals.emplace_back(clamp_color, last_pos,  // same color at both ends
+                                clamp_color, clamp_pos,
+                                componentScale);
+    } else if (shader.fTileMode == SkShader::kMirror_TileMode) {
+        // synthetic flipped intervals in [1 .. 2)
+        for (int i = fIntervals.count() - 1; i >= 0; --i) {  // existing intervals, last to first
+            const Interval& interval = fIntervals[i];
+            const SkScalar p0 = interval.fP0;  // values are copied out before the append below
+            const SkScalar p1 = interval.fP1;
+            Sk4f dc = Sk4f::Load(interval.fDc.fVec);
+            Sk4f c = Sk4f::Load(interval.fC0.fVec) + dc * Sk4f(p1 - p0);  // color at p1
+            fIntervals.emplace_back(c, dc * Sk4f(-1), 2 - p1, 2 - p0);  // reflected about pos 1
+        }
+    }
+
+    SkASSERT(fIntervals.count() > 0);
+    fCachedInterval = fIntervals.begin();  // starting point for findInterval's linear search
+}
+
+const SkGradientShaderBase::GradientShaderBase4fContext::Interval*
+SkGradientShaderBase::
+GradientShaderBase4fContext::findInterval(SkScalar fx) const {  // interval that contains fx
+    SkASSERT(in_range(fx, fIntervals.front().fP0, fIntervals.back().fP1));
+
+    if (1) {  // always taken; the binary search in the else branch is never executed
+        // Linear search, using the last scanline interval as a starting point.
+        SkASSERT(fCachedInterval >= fIntervals.begin());
+        SkASSERT(fCachedInterval < fIntervals.end());
+        const int search_dir = fDstToPos.getScaleX() >= 0 ? 1 : -1;
+        while (!in_range(fx, fCachedInterval->fP0, fCachedInterval->fP1)) {
+            fCachedInterval += search_dir;
+            if (fCachedInterval >= fIntervals.end()) {  // stepped past the last: wrap to first
+                fCachedInterval = fIntervals.begin();
+            } else if (fCachedInterval < fIntervals.begin()) {  // stepped before the first
+                fCachedInterval = fIntervals.end() - 1;
+            }
+        }
+        return fCachedInterval;  // also remembered for the next call
+    } else {
+        // Binary search.  Seems less effective than linear + caching.
+        const Interval* i0 = fIntervals.begin();
+        const Interval* i1 = fIntervals.end() - 1;
+
+        while (i0 != i1) {
+            SkASSERT(i0 < i1);
+            SkASSERT(in_range(fx, i0->fP0, i1->fP1));
+
+            const Interval* i = i0 + ((i1 - i0) >> 1);  // midpoint, rounded down
+
+            if (in_range(fx, i0->fP0, i->fP1)) {
+                i1 = i;
+            } else {
+                SkASSERT(in_range(fx, i->fP1, i1->fP1));
+                i0 = i + 1;
+            }
+        }
+
+        SkASSERT(in_range(fx, i0->fP0, i0->fP1));
+        return i0;
+    }
+}
diff --git a/src/effects/gradients/Sk4fGradientBase.h b/src/effects/gradients/Sk4fGradientBase.h
new file mode 100644
index 0000000..7870bc3
--- /dev/null
+++ b/src/effects/gradients/Sk4fGradientBase.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef Sk4fGradientBase_DEFINED
+#define Sk4fGradientBase_DEFINED
+
+#include "SkColor.h"
+#include "SkGradientShaderPriv.h"
+#include "SkMatrix.h"
+#include "SkNx.h"
+#include "SkPM4f.h"
+#include "SkShader.h"
+#include "SkTArray.h"
+
+class SkGradientShaderBase::
+GradientShaderBase4fContext : public SkShader::Context {
+public:
+    GradientShaderBase4fContext(const SkGradientShaderBase&,
+                                const ContextRec&);
+
+    uint32_t getFlags() const override { return fFlags; }
+
+protected:
+    struct Interval {  // one color ramp between two gradient positions
+        Interval(SkPMColor c0, SkScalar p0,
+                 SkPMColor c1, SkScalar p1,
+                 const Sk4f& componentScale);
+        Interval(const Sk4f& c0, const Sk4f& dc,
+                 SkScalar p0, SkScalar p1);
+
+        bool isZeroRamp() const { return fZeroRamp; }
+
+        // true when fx is in [min(fP0,fP1), max(fP0,fP1))
+        bool contains(SkScalar fx) const;
+
+        SkPM4f   fC0, fDc;   // color at fP0, and color gradient per unit of position
+        SkScalar fP0, fP1;   // interval end positions
+        bool     fZeroRamp;  // set by the constructors when the ramp has no color change
+    };
+
+    const Interval* findInterval(SkScalar fx) const;  // interval containing fx
+
+    SkSTArray<8, Interval, true> fIntervals;        // interval list built by the constructor
+    SkMatrix                     fDstToPos;         // shader.fPtsToUnit concat total inverse
+    SkMatrix::MapXYProc          fDstToPosProc;     // fDstToPos.getMapXYProc()
+    uint8_t                      fDstToPosClass;    // ComputeMatrixClass(fDstToPos)
+    uint8_t                      fFlags;            // value reported by getFlags()
+    bool                         fDither;           // paint dither setting (or legacy: true)
+    bool                         fColorsArePremul;  // whether interval colors are premultiplied
+
+private:
+    using INHERITED = SkShader::Context;
+
+    mutable const Interval*      fCachedInterval;   // last result of findInterval
+};
+
+#endif // Sk4fGradientBase_DEFINED
diff --git a/src/effects/gradients/Sk4fLinearGradient.cpp b/src/effects/gradients/Sk4fLinearGradient.cpp
new file mode 100644
index 0000000..57c81f6
--- /dev/null
+++ b/src/effects/gradients/Sk4fLinearGradient.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Sk4fLinearGradient.h"
+
+namespace {
+
+Sk4f premul_4f(const Sk4f& c) {  // returns c with its color lanes multiplied by its alpha
+    const float alpha = c[SkPM4f::A];
+    // FIXME: portable swizzle?
+    return c * Sk4f(alpha, alpha, alpha, 1);  // lanes 0-2 * alpha, lane 3 * 1 (assumes A is 3)
+}
+
+template <bool do_premul>
+SkPMColor trunc_from_255(const Sk4f& c) {  // presumably c is already scaled to 0..255 - confirm
+    SkPMColor pmc;
+    SkNx_cast<uint8_t>(c).store(&pmc);  // four float lanes -> the four bytes of pmc
+    if (do_premul) {  // compile-time choice: premultiply the packed color afterwards
+        pmc = SkPreMultiplyARGB(SkGetPackedA32(pmc), SkGetPackedR32(pmc),
+                                SkGetPackedG32(pmc), SkGetPackedB32(pmc));
+    }
+    return pmc;
+}
+
+template<typename DstType, bool do_premul>
+void fill(const Sk4f& c, DstType* dst, int n);  // writes n copies of color c starting at dst
+
+template<>
+void fill<SkPM4f, false>(const Sk4f& c, SkPM4f* dst, int n) {
+    while (n > 0) {  // one float-color store per pixel
+        c.store(dst++);
+        n--;
+    }
+}
+
+template<>
+void fill<SkPM4f, true>(const Sk4f& c, SkPM4f* dst, int n) {
+    fill<SkPM4f, false>(premul_4f(c), dst, n);  // premultiply once, then plain fill
+}
+
+template<>
+void fill<SkPMColor, false>(const Sk4f& c, SkPMColor* dst, int n) {
+    sk_memset32(dst, trunc_from_255<false>(c), n);  // convert once, then sk_memset32
+}
+
+template<>
+void fill<SkPMColor, true>(const Sk4f& c, SkPMColor* dst, int n) {
+    sk_memset32(dst, trunc_from_255<true>(c), n);  // same, premultiplying during conversion
+}
+
+template<typename DstType, bool do_premul>
+void store(const Sk4f& color, DstType* dst);  // writes a single color to *dst
+
+template<>
+void store<SkPM4f, false>(const Sk4f& c, SkPM4f* dst) {
+    c.store(dst);  // float destination: stored as-is
+}
+
+template<>
+void store<SkPM4f, true>(const Sk4f& c, SkPM4f* dst) {
+    store<SkPM4f, false>(premul_4f(c), dst);  // premultiply, then plain store
+}
+
+template<>
+void store<SkPMColor, false>(const Sk4f& c, SkPMColor* dst) {
+    *dst = trunc_from_255<false>(c);  // packed destination: convert to bytes
+}
+
+template<>
+void store<SkPMColor, true>(const Sk4f& c, SkPMColor* dst) {
+    *dst = trunc_from_255<true>(c);  // convert to bytes, premultiplying
+}
+
+template<typename DstType, bool do_premul>
+void store4x(const Sk4f& c0,  // writes four consecutive colors to dst[0..3]
+             const Sk4f& c1,
+             const Sk4f& c2,
+             const Sk4f& c3,
+             DstType* dst) {
+    store<DstType, do_premul>(c0, dst++);  // generic version: four single stores
+    store<DstType, do_premul>(c1, dst++);
+    store<DstType, do_premul>(c2, dst++);
+    store<DstType, do_premul>(c3, dst++);
+}
+
+template<>
+void store4x<SkPMColor, false>(const Sk4f& c0,  // specialization: packed dst, no premul step
+                               const Sk4f& c1,
+                               const Sk4f& c2,
+                               const Sk4f& c3,
+                               SkPMColor* dst) {
+    Sk4f_ToBytes((uint8_t*)dst, c0, c1, c2, c3);  // all four colors in a single call
+}
+
+template<typename DstType, bool do_premul>
+void ramp(const Sk4f& c, const Sk4f& dc, DstType* dst, int n) {  // dst[i] gets c + i*dc, i < n
+    SkASSERT(n > 0);
+
+    const Sk4f dc2 = dc + dc;    // 2 * dc
+    const Sk4f dc4 = dc2 + dc2;  // 4 * dc: the step between iterations of the 4-wide loop
+
+    Sk4f c0 = c ;
+    Sk4f c1 = c + dc;
+    Sk4f c2 = c0 + dc2;
+    Sk4f c3 = c1 + dc2;
+
+    while (n >= 4) {  // four pixels per iteration
+        store4x<DstType, do_premul>(c0, c1, c2, c3, dst);
+        dst += 4;
+
+        c0 = c0 + dc4;
+        c1 = c1 + dc4;
+        c2 = c2 + dc4;
+        c3 = c3 + dc4;
+        n -= 4;
+    }
+    if (n & 2) {  // two leftover pixels
+        store<DstType, do_premul>(c0, dst++);
+        store<DstType, do_premul>(c1, dst++);
+        c0 = c0 + dc2;
+    }
+    if (n & 1) {  // one leftover pixel
+        store<DstType, do_premul>(c0, dst);
+    }
+}
+
+template<SkShader::TileMode>
+SkScalar pinFx(SkScalar);  // adjusts a gradient position according to the tile mode
+
+template<>
+SkScalar pinFx<SkShader::kClamp_TileMode>(SkScalar fx) {
+    return fx;  // clamp: position is used unchanged
+}
+
+template<>
+SkScalar pinFx<SkShader::kRepeat_TileMode>(SkScalar fx) {
+    const SkScalar f = SkScalarFraction(fx);
+    return f < 0 ? f + 1 : f;  // shift a negative result up by one period (1)
+}
+
+template<>
+SkScalar pinFx<SkShader::kMirror_TileMode>(SkScalar fx) {
+    const SkScalar f = SkScalarMod(fx, 2.0f);
+    return f < 0 ? f + 2 : f;  // shift a negative result up by one period (2)
+}
+
+template<typename DstType>
+float dst_component_scale();  // factor applied to interval colors for the destination type
+
+template<>
+float dst_component_scale<SkPM4f>() {
+    return 1;  // float destination: colors are used unscaled
+}
+
+template<>
+float dst_component_scale<SkPMColor>() {
+    return 255;  // packed destination: colors are scaled by 255
+}
+
+} // anonymous namespace
+
+SkLinearGradient::
+LinearGradient4fContext::LinearGradient4fContext(const SkLinearGradient& shader,
+                                                 const ContextRec& rec)
+    : INHERITED(shader, rec) {}  // no state of its own; the base context does all the setup
+
+void SkLinearGradient::
+LinearGradient4fContext::shadeSpan(int x, int y, SkPMColor dst[], int count) {  // packed output
+    // TODO: plumb dithering
+    SkASSERT(count > 0);
+    if (fColorsArePremul) {
+        this->shadePremulSpan<SkPMColor, false>(x, y, dst, count);  // no premul step on store
+    } else {
+        this->shadePremulSpan<SkPMColor, true>(x, y, dst, count);   // premultiply on store
+    }
+}
+
+void SkLinearGradient::
+LinearGradient4fContext::shadeSpan4f(int x, int y, SkPM4f dst[], int count) {  // float output
+    // TONOTDO: plumb dithering
+    SkASSERT(count > 0);
+    if (fColorsArePremul) {
+        this->shadePremulSpan<SkPM4f, false>(x, y, dst, count);  // no premul step on store
+    } else {
+        this->shadePremulSpan<SkPM4f, true>(x, y, dst, count);   // premultiply on store
+    }
+}
+
+template<typename DstType, bool do_premul>
+void SkLinearGradient::
+LinearGradient4fContext::shadePremulSpan(int x, int y,
+                                         DstType dst[],
+                                         int count) const {
+    const SkLinearGradient& shader =
+        static_cast<const SkLinearGradient&>(fShader);
+    switch (shader.fTileMode) {  // turn the runtime tile mode into a template argument
+    case kClamp_TileMode:
+        this->shadeSpanInternal<DstType,
+                                do_premul,
+                                kClamp_TileMode>(x, y, dst, count);
+        break;
+    case kRepeat_TileMode:
+        this->shadeSpanInternal<DstType,
+                                do_premul,
+                                kRepeat_TileMode>(x, y, dst, count);
+        break;
+    case kMirror_TileMode:
+        this->shadeSpanInternal<DstType,
+                                do_premul,
+                                kMirror_TileMode>(x, y, dst, count);
+        break;
+    }
+}
+
+template<typename DstType, bool do_premul, SkShader::TileMode tileMode>
+void SkLinearGradient::
+LinearGradient4fContext::shadeSpanInternal(int x, int y,
+                                           DstType dst[],
+                                           int count) const {
+    SkPoint pt;
+    fDstToPosProc(fDstToPos,  // map the center of the first pixel to a gradient position
+                  x + SK_ScalarHalf,
+                  y + SK_ScalarHalf,
+                  &pt);
+    const SkScalar fx = pinFx<tileMode>(pt.x());  // tile-mode adjusted start position
+    const SkScalar dx = fDstToPos.getScaleX();    // position step per dst pixel
+    LinearIntervalProcessor<DstType, tileMode> proc(fIntervals.begin(),
+                                                    fIntervals.end() - 1,  // last (inclusive)
+                                                    this->findInterval(fx),
+                                                    fx,
+                                                    dx,
+                                                    SkScalarNearlyZero(dx * count));
+    while (count > 0) {
+        // What we really want here is SkTPin(advance, 1, count)
+        // but that's a significant perf hit for >> stops; investigate.
+        const int n = SkScalarTruncToInt(  // pixels to emit from the current interval
+            SkTMin<SkScalar>(proc.currentAdvance() + 1, SkIntToScalar(count)));
+
+        // The current interval advance can be +inf (e.g. when reaching
+        // the clamp mode end intervals) - when that happens, we expect to
+        //   a) consume all remaining count in one swoop
+        //   b) return a zero color gradient
+        SkASSERT(SkScalarIsFinite(proc.currentAdvance())
+            || (n == count && proc.currentRampIsZero()));
+
+        if (proc.currentRampIsZero()) {  // constant color: plain fill
+            fill<DstType, do_premul>(proc.currentColor(),
+                                     dst, n);
+        } else {  // otherwise step the color linearly across the n pixels
+            ramp<DstType, do_premul>(proc.currentColor(),
+                                     proc.currentColorGrad(),
+                                     dst, n);
+        }
+
+        proc.advance(SkIntToScalar(n));  // may move on to following intervals
+        count -= n;
+        dst   += n;
+    }
+}
+
+template<typename DstType, SkShader::TileMode tileMode>
+class SkLinearGradient::
+LinearGradient4fContext::LinearIntervalProcessor {
+public:
+    LinearIntervalProcessor(const Interval* firstInterval,  // first entry of the list
+                            const Interval* lastInterval,   // last entry (inclusive)
+                            const Interval* i,              // interval containing fx
+                            SkScalar fx,                    // starting position
+                            SkScalar dx,                    // position step per dst pixel
+                            bool is_vertical)               // treat every ramp as zero
+        : fDstComponentScale(dst_component_scale<DstType>())
+        , fAdvX((i->fP1 - fx) / dx)  // dst distance from fx to the end of interval i
+        , fFirstInterval(firstInterval)
+        , fLastInterval(lastInterval)
+        , fInterval(i)
+        , fDx(dx)
+        , fIsVertical(is_vertical)
+    {
+        SkASSERT(firstInterval <= lastInterval);
+        SkASSERT(i->contains(fx));
+        this->compute_interval_props(fx - i->fP0);  // start color at offset fx - fP0
+    }
+
+    SkScalar currentAdvance() const {  // dst distance left in the current interval
+        SkASSERT(fAdvX >= 0);
+        SkASSERT(fAdvX <= (fInterval->fP1 - fInterval->fP0) / fDx);
+        return fAdvX;
+    }
+
+    bool currentRampIsZero() const { return fZeroRamp; }
+    const Sk4f& currentColor() const { return fCc; }
+    const Sk4f& currentColorGrad() const { return fDcDx; }
+
+    void advance(SkScalar advX) {  // moves the current position forward by advX in dst
+        SkASSERT(advX > 0);
+        SkASSERT(fAdvX >= 0);
+
+        if (advX >= fAdvX) {  // the step reaches or passes the end of the current interval
+            advX = this->advance_interval(advX);
+        }
+        SkASSERT(advX < fAdvX);
+
+        fCc = fCc + fDcDx * Sk4f(advX);  // step the color within the (possibly new) interval
+        fAdvX -= advX;
+    }
+
+private:
+    void compute_interval_props(SkScalar t) {  // t: position offset from fInterval->fP0
+        fDc   = Sk4f::Load(fInterval->fDc.fVec);
+        fCc   = Sk4f::Load(fInterval->fC0.fVec);
+        fCc   = fCc + fDc * Sk4f(t);  // interval color at offset t
+        fCc   = fCc * fDstComponentScale;
+        fDcDx = fDc * fDstComponentScale * Sk4f(fDx);  // color change per dst pixel
+        fZeroRamp = fIsVertical || fInterval->isZeroRamp();
+    }
+
+    const Interval* next_interval(const Interval* i) const {  // successor of i in the list
+        SkASSERT(i >= fFirstInterval);
+        SkASSERT(i <= fLastInterval);
+        i++;
+
+        if (tileMode == kClamp_TileMode) {  // clamp mode is expected never to run off the end
+            SkASSERT(i <= fLastInterval);
+            return i;
+        }
+
+        return (i <= fLastInterval) ? i : fFirstInterval;  // other modes wrap to the first
+    }
+
+    SkScalar advance_interval(SkScalar advX) {  // returns the advance left in the new interval
+        SkASSERT(advX >= fAdvX);
+
+        do {  // skip intervals until the remaining advance ends inside one
+            advX -= fAdvX;
+            fInterval = this->next_interval(fInterval);
+            fAdvX = (fInterval->fP1 - fInterval->fP0) / fDx;  // full dst length of the interval
+            SkASSERT(fAdvX > 0);
+        } while (advX >= fAdvX);
+
+        compute_interval_props(0);  // color state restarts at the new interval's fP0
+
+        SkASSERT(advX >= 0);
+        return advX;
+    }
+
+    const Sk4f      fDstComponentScale; // cached dst scale (PMC: 255, PM4f: 1)
+
+    // Current interval properties.
+    Sk4f            fDc;        // local color gradient (dc/dt)
+    Sk4f            fDcDx;      // dst color gradient (dc/dx)
+    Sk4f            fCc;        // current color, interpolated in dst
+    SkScalar        fAdvX;      // remaining interval advance in dst
+    bool            fZeroRamp;  // current interval color grad is 0
+
+    const Interval* fFirstInterval;
+    const Interval* fLastInterval;
+    const Interval* fInterval;  // current interval
+    const SkScalar  fDx;        // 'dx' for consistency with other impls; actually dt/dx
+    const bool      fIsVertical;
+};
diff --git a/src/effects/gradients/Sk4fLinearGradient.h b/src/effects/gradients/Sk4fLinearGradient.h
new file mode 100644
index 0000000..8633760
--- /dev/null
+++ b/src/effects/gradients/Sk4fLinearGradient.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef Sk4fLinearGradient_DEFINED
+#define Sk4fLinearGradient_DEFINED
+
+#include "Sk4fGradientBase.h"
+#include "SkLinearGradient.h"
+
+class SkLinearGradient::
+LinearGradient4fContext : public GradientShaderBase4fContext {
+public:
+    LinearGradient4fContext(const SkLinearGradient&, const ContextRec&);
+
+    void shadeSpan(int x, int y, SkPMColor dst[], int count) override;  // packed 32-bit output
+    void shadeSpan4f(int x, int y, SkPM4f dst[], int count) override;   // float output
+
+private:
+    using INHERITED = GradientShaderBase4fContext;
+
+    template<typename DstType, TileMode>
+    class LinearIntervalProcessor;  // walks the interval list along one span
+
+    template <typename DstType, bool premul>
+    void shadePremulSpan(int x, int y, DstType[], int count) const;  // dispatches on tile mode
+
+    template <typename DstType, bool premul, SkShader::TileMode tileMode>
+    void shadePremulTileSpan(int x, int y, DstType[], int count) const;  // NOTE(review): unused?
+
+    template <typename DstType, bool premul, SkShader::TileMode tileMode>
+    void shadeSpanInternal(int x, int y, DstType[], int count) const;  // shades a single span
+};
+
+#endif // Sk4fLinearGradient_DEFINED
diff --git a/src/effects/gradients/SkGradientShader.cpp b/src/effects/gradients/SkGradientShader.cpp
index 3a4d0e5..5444c1c 100644
--- a/src/effects/gradients/SkGradientShader.cpp
+++ b/src/effects/gradients/SkGradientShader.cpp
@@ -5,6 +5,7 @@
  * found in the LICENSE file.
  */
 
+#include "Sk4fLinearGradient.h"
 #include "SkGradientShaderPriv.h"
 #include "SkLinearGradient.h"
 #include "SkRadialGradient.h"
diff --git a/src/effects/gradients/SkGradientShaderPriv.h b/src/effects/gradients/SkGradientShaderPriv.h
index 5ab8c0b..5b1b09b 100644
--- a/src/effects/gradients/SkGradientShaderPriv.h
+++ b/src/effects/gradients/SkGradientShaderPriv.h
@@ -221,6 +221,8 @@
     uint32_t getGradFlags() const { return fGradFlags; }
 
 protected:
+    class GradientShaderBase4fContext;
+
     SkGradientShaderBase(SkReadBuffer& );
     void flatten(SkWriteBuffer&) const override;
     SK_TO_STRING_OVERRIDE()
diff --git a/src/effects/gradients/SkLinearGradient.cpp b/src/effects/gradients/SkLinearGradient.cpp
index a1abe54..9a350b8 100644
--- a/src/effects/gradients/SkLinearGradient.cpp
+++ b/src/effects/gradients/SkLinearGradient.cpp
@@ -5,8 +5,12 @@
  * found in the LICENSE file.
  */
 
+#include "Sk4fLinearGradient.h"
 #include "SkLinearGradient.h"
 
+// define to test the 4f gradient path
+// #define USE_4fGRADIENTS
+
 static const float kInv255Float = 1.0f / 255;
 
 static inline int repeat_8bits(int x) {
@@ -43,6 +47,14 @@
     return matrix;
 }
 
+static bool use_4f_context(uint32_t flags) {  // true when the 4f context should be used
+#ifdef USE_4fGRADIENTS
+    return true;  // compile-time override: always use the 4f context
+#else
+    return SkToBool(flags & SkLinearGradient::kForce4fContext_PrivateFlag);  // per-shader flag
+#endif
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 SkLinearGradient::SkLinearGradient(const SkPoint pts[2], const Descriptor& desc)
@@ -70,11 +82,15 @@
 }
 
 size_t SkLinearGradient::contextSize() const {
-    return sizeof(LinearGradientContext);
+    return use_4f_context(fGradFlags)
+        ? sizeof(LinearGradient4fContext)
+        : sizeof(LinearGradientContext);
 }
 
 SkShader::Context* SkLinearGradient::onCreateContext(const ContextRec& rec, void* storage) const {
-    return new (storage) LinearGradientContext(*this, rec);
+    return use_4f_context(fGradFlags)
+        ? static_cast<SkShader::Context*>(new (storage) LinearGradient4fContext(*this, rec))
+        : static_cast<SkShader::Context*>(new (storage) LinearGradientContext(*this, rec));
 }
 
 // This swizzles SkColor into the same component order as SkPMColor, but does not actually
diff --git a/src/effects/gradients/SkLinearGradient.h b/src/effects/gradients/SkLinearGradient.h
index c8b91a5..c81eea6 100644
--- a/src/effects/gradients/SkLinearGradient.h
+++ b/src/effects/gradients/SkLinearGradient.h
@@ -26,6 +26,11 @@
 
 class SkLinearGradient : public SkGradientShaderBase {
 public:
+    enum {
+        // Temp flag for testing the 4f impl: selects LinearGradient4fContext when set.
+        kForce4fContext_PrivateFlag     = 1 << 7,
+    };
+
     SkLinearGradient(const SkPoint pts[2], const Descriptor&);
 
     size_t contextSize() const override;
@@ -69,6 +74,8 @@
     Context* onCreateContext(const ContextRec&, void* storage) const override;
 
 private:
+    class LinearGradient4fContext;
+
     friend class SkGradientShader;
     typedef SkGradientShaderBase INHERITED;
     const SkPoint fStart;