WIP: experimental bilerp pipeline.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1775963002

Review URL: https://codereview.chromium.org/1775963002
diff --git a/bench/SkLinearBitmapPipelineBench.cpp b/bench/SkLinearBitmapPipelineBench.cpp
index 7d2d4a5..7ab29f6 100644
--- a/bench/SkLinearBitmapPipelineBench.cpp
+++ b/bench/SkLinearBitmapPipelineBench.cpp
@@ -145,7 +145,7 @@
         SkPixmap srcPixmap{fInfo, fBitmap.get(), static_cast<size_t>(4 * width)};
 
         SkLinearBitmapPipeline pipeline{
-            fInvert, filterQuality, fXTile, fYTile, srcPixmap};
+            fInvert, filterQuality, fXTile, fYTile, 1.0f, srcPixmap};
 
         int count = 100;
 
@@ -262,6 +262,31 @@
     srcSize, kLinear_SkColorProfileType, mS, true,
     SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
 
+// Repeat
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mS, false,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mS, false,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mS, false,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mS, true,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mS, true,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mS, true,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
 static SkMatrix rotate(SkScalar r) {
     SkMatrix m;
     m.setRotate(30);
@@ -293,3 +318,29 @@
     srcSize, kLinear_SkColorProfileType, mR, true,
     SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);)
 
+// Repeat
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mR, false,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mR, false,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mR, false,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kSRGB_SkColorProfileType, mR, true,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPGeneral(
+    srcSize, kLinear_SkColorProfileType, mR, true,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+DEF_BENCH(return new SkBitmapFPOrigShader(
+    srcSize, kLinear_SkColorProfileType, mR, true,
+    SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode);)
+
+
diff --git a/gm/SkLinearBitmapPipelineGM.cpp b/gm/SkLinearBitmapPipelineGM.cpp
index 51fd2cf..f5e01ca 100644
--- a/gm/SkLinearBitmapPipelineGM.cpp
+++ b/gm/SkLinearBitmapPipelineGM.cpp
@@ -60,12 +60,16 @@
     sk_sp<SkImage> image(SkImage::MakeRasterCopy(SkPixmap(info, pmsrc.addr32(), pmsrc.rowBytes())));
     SkPaint paint;
     int32_t storage[300];
-    paint.setShader(image->makeShader(SkShader::kClamp_TileMode, SkShader::kClamp_TileMode));
+
+    sk_sp<SkShader> shader = image->makeShader(SkShader::kRepeat_TileMode,
+                                               SkShader::kRepeat_TileMode);
+
     if (useBilerp) {
         paint.setFilterQuality(SkFilterQuality::kLow_SkFilterQuality);
     } else {
         paint.setFilterQuality(SkFilterQuality::kNone_SkFilterQuality);
     }
+    paint.setShader(std::move(shader));
     const SkShader::ContextRec rec(paint, *mat, nullptr,
                                    SkBlitter::PreferredShaderDest(pmsrc.info()));
     SkASSERT(paint.getShader()->contextSize(rec) <= sizeof(storage));
@@ -79,7 +83,6 @@
     canvas->drawBitmap(bmdst, r.left(), r.top(), nullptr);
 
     ctx->~Context();
-
 }
 
 static void draw_rect_fp(SkCanvas* canvas, const SkRect& r, SkColor c, const SkMatrix* mat, bool useBilerp) {
@@ -117,7 +120,7 @@
 
     SkLinearBitmapPipeline pipeline{
             inv, filterQuality,
-            SkShader::kClamp_TileMode, SkShader::kClamp_TileMode, pmsrc};
+            SkShader::kRepeat_TileMode, SkShader::kRepeat_TileMode, 1.0f, pmsrc};
 
     for (int y = 0; y < ir.height(); y++) {
         pipeline.shadeSpan4f(0, y, dstBits, ir.width());
@@ -161,9 +164,9 @@
     SkMatrix mt2;
     mt2.setTranslate(-18, -18);
     SkMatrix ms;
-    ms.setScale(2.7f, 2.7f);
+    ms.setScale(2.7f, 2.7f, -1.5f, 0);
     SkMatrix ms2;
-    ms2.setScale(-0.2f, 0.2f);
+    ms2.setScale(-0.4f, 0.4f);
     SkMatrix mr;
     mr.setRotate(10);
 
diff --git a/gyp/core.gypi b/gyp/core.gypi
index c3cd618..7c873c9 100644
--- a/gyp/core.gypi
+++ b/gyp/core.gypi
@@ -155,6 +155,10 @@
         '<(skia_src_path)/core/SkLightingShader.cpp',
         '<(skia_src_path)/core/SkLinearBitmapPipeline.cpp',
         '<(skia_src_path)/core/SkLinearBitmapPipeline.h',
+        '<(skia_src_path)/core/SkLinearBitmapPipeline_core.h',
+        '<(skia_src_path)/core/SkLinearBitmapPipeline_matrix.h',
+        '<(skia_src_path)/core/SkLinearBitmapPipeline_tile.h',
+        '<(skia_src_path)/core/SkLinearBitmapPipeline_sample.h',
         '<(skia_src_path)/core/SkLineClipper.cpp',
         '<(skia_src_path)/core/SkLocalMatrixImageFilter.cpp',
         '<(skia_src_path)/core/SkLocalMatrixImageFilter.h',
diff --git a/src/core/SkBitmapProcShader.cpp b/src/core/SkBitmapProcShader.cpp
index 4813e01..fb58f54 100644
--- a/src/core/SkBitmapProcShader.cpp
+++ b/src/core/SkBitmapProcShader.cpp
@@ -46,7 +46,7 @@
     ~BitmapProcInfoContext() override {
         fInfo->~SkBitmapProcInfo();
     }
-    
+
     uint32_t getFlags() const override { return fFlags; }
 
 private:
@@ -123,8 +123,10 @@
     {
         // Need to ensure that our pipeline is created at a 16byte aligned address
         fPipeline = (SkLinearBitmapPipeline*)SkAlign16((intptr_t)fStorage);
-        new (fPipeline) SkLinearBitmapPipeline(info->fInvMatrix, info->fFilterQuality,
+        float alpha = SkColorGetA(info->fPaintColor) / 255.0f;
+        new (fPipeline) SkLinearBitmapPipeline(info->fRealInvMatrix, info->fFilterQuality,
                                                info->fTileModeX, info->fTileModeY,
+                                               alpha,
                                                info->fPixmap);
 
         // To implement the old shadeSpan entry-point, we need to efficiently convert our native
@@ -175,7 +177,8 @@
     // These src attributes are not supported in the new 4f context (yet)
     //
     if (srcInfo.bytesPerPixel() < 4 ||
-        kRGBA_F16_SkColorType == srcInfo.colorType()) {
+        kRGBA_F16_SkColorType == srcInfo.colorType() ||
+        kIndex_8_SkColorType == srcInfo.colorType()) {
         return false;
     }
 
@@ -211,25 +214,13 @@
         return nullptr;
     }
 
-    // Decide if we can/want to use the new linear pipeine
+    // Decide if we can/want to use the new linear pipeline
     bool useLinearPipeline = choose_linear_pipeline(rec, provider.info());
 
-    // New code doesn't support Mirror (YET), so we detect that here.
-    //
-    if (SkShader::kMirror_TileMode == tmx || SkShader::kMirror_TileMode == tmy) {
-        useLinearPipeline = false;
-    }
-
-    // New code doesn't support Mirror (YET), so we detect that here.
-    //
-    if (totalInverse.hasPerspective()) {
-        useLinearPipeline = false;
-    }
-
     //
     // For now, only enable locally since we are hitting some crashers on the test bots
     //
-    useLinearPipeline = false;
+    //useLinearPipeline = false;
 
     if (useLinearPipeline) {
         void* infoStorage = (char*)storage + sizeof(LinearPipelineContext);
@@ -238,6 +229,10 @@
             info->~SkBitmapProcInfo();
             return nullptr;
         }
+        if (info->fPixmap.colorType() != kRGBA_8888_SkColorType
+            && info->fPixmap.colorType() != kBGRA_8888_SkColorType) {
+            return nullptr;
+        }
         return new (storage) LinearPipelineContext(shader, rec, info);
     } else {
         void* stateStorage = (char*)storage + sizeof(BitmapProcShaderContext);
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index c169288..ab23212 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -138,6 +138,7 @@
     }
     fPixmap = fBMState->pixmap();
     fInvMatrix = fBMState->invMatrix();
+    fRealInvMatrix = fBMState->invMatrix();
     fPaintColor = paint.getColor();
     fFilterQuality = fBMState->quality();
     SkASSERT(fPixmap.addr());
@@ -198,7 +199,7 @@
             fFilterQuality = kNone_SkFilterQuality;
         }
     }
-    
+
     return true;
 }
 
@@ -332,7 +333,7 @@
             S4444_alpha_D32_filter_DXDY,
             S4444_opaque_D32_filter_DX,
             S4444_alpha_D32_filter_DX,
-            
+
             // A8 treats alpha/opaque the same (equally efficient)
             SA8_alpha_D32_nofilter_DXDY,
             SA8_alpha_D32_nofilter_DXDY,
@@ -342,7 +343,7 @@
             SA8_alpha_D32_filter_DXDY,
             SA8_alpha_D32_filter_DX,
             SA8_alpha_D32_filter_DX,
-            
+
             // todo: possibly specialize on opaqueness
             SG8_alpha_D32_nofilter_DXDY,
             SG8_alpha_D32_nofilter_DXDY,
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index 26e8db8..40dc31a 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -35,7 +35,9 @@
     const SkBitmapProvider fProvider;
 
     SkPixmap            fPixmap;
-    SkMatrix            fInvMatrix;         // copy of what is in fBMState, can we remove the dup?
+    SkMatrix            fInvMatrix;         // This changes based on tile mode.
+    // TODO: combine fInvMatrix and fRealInvMatrix.
+    SkMatrix            fRealInvMatrix;     // The actual inverse matrix.
     SkColor             fPaintColor;
     SkShader::TileMode  fTileModeX;
     SkShader::TileMode  fTileModeY;
diff --git a/src/core/SkLinearBitmapPipeline.cpp b/src/core/SkLinearBitmapPipeline.cpp
index 4c21180..3a9a019 100644
--- a/src/core/SkLinearBitmapPipeline.cpp
+++ b/src/core/SkLinearBitmapPipeline.cpp
@@ -17,12 +17,20 @@
 #include "SkLinearBitmapPipeline_core.h"
 #include "SkLinearBitmapPipeline_matrix.h"
 #include "SkLinearBitmapPipeline_tile.h"
+#include "SkLinearBitmapPipeline_sample.h"
 
 class SkLinearBitmapPipeline::PointProcessorInterface {
 public:
     virtual ~PointProcessorInterface() { }
+    // Take the first n (where 0 < n && n < 4) items from xs and ys and sample those points. For
+    // nearest neighbor, that means just taking the floor of xs and ys. For bilerp, this means
+    // expanding the bilerp filter around the point and sampling using that filter.
     virtual void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) = 0;
+    // Same as pointListFew, but n = 4.
     virtual void VECTORCALL pointList4(Sk4s xs, Sk4s ys) = 0;
+    // A span is a compact form of sample points that are obtained by mapping points from
+    // destination space to source space. This is used for horizontal lines only, and is mainly
+    // used to take advantage of memory coherence for horizontal spans.
     virtual void pointSpan(Span span) = 0;
 };
 
@@ -41,8 +49,13 @@
     // +--------+--------+
     // These pixels coordinates are arranged in the following order in xs and ys:
     // px00  px10  px01  px11
-    virtual void VECTORCALL bilerpList(Sk4s xs, Sk4s ys) = 0;
-    virtual void bilerpSpan(BilerpSpan span) = 0;
+    virtual void VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) = 0;
+
+    // A span represents sample points that have been mapped from destination space to source
+    // space. Each sample point is then expanded to the four bilerp points by adding +/- 0.5.
+    // The resulting Y values may be off the tile. When y +/- 0.5 are more than 1 apart because
+    // of tiling, the second Y is used to denote the retiled Y value.
+    virtual void bilerpSpan(Span span, SkScalar y) = 0;
 };
 
 class SkLinearBitmapPipeline::PixelPlacerInterface {
@@ -54,6 +67,9 @@
 };
 
 namespace  {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Matrix Stage
 // PointProcessor uses a strategy to help complete the work of the different stages. The strategy
 // must implement the following methods:
 // * processPoints(xs, ys) - must mutate the xs and ys for the stage.
@@ -64,10 +80,10 @@
 //   maybeProcessSpan - returns false if it can not process the span and needs to fallback to
 //                      point lists for processing.
 template<typename Strategy, typename Next>
-class PointProcessor final : public SkLinearBitmapPipeline::PointProcessorInterface {
+class MatrixStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
 public:
     template <typename... Args>
-    PointProcessor(Next* next, Args&&... args)
+    MatrixStage(Next* next, Args&&... args)
         : fNext{next}
         , fStrategy{std::forward<Args>(args)...}{ }
 
@@ -94,66 +110,31 @@
     Strategy fStrategy;
 };
 
-// See PointProcessor for responsibilities of Strategy.
-template<typename Strategy, typename Next>
-class BilerpProcessor final : public SkLinearBitmapPipeline::BilerpProcessorInterface  {
-public:
-    template <typename... Args>
-    BilerpProcessor(Next* next, Args&&... args)
-        : fNext{next}
-        , fStrategy{std::forward<Args>(args)...}{ }
-
-    void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
-        fStrategy.processPoints(&xs, &ys);
-        fNext->pointListFew(n, xs, ys);
-    }
-
-    void VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
-        fStrategy.processPoints(&xs, &ys);
-        fNext->pointList4(xs, ys);
-    }
-
-    void VECTORCALL bilerpList(Sk4s xs, Sk4s ys) override {
-        fStrategy.processPoints(&xs, &ys);
-        fNext->bilerpList(xs, ys);
-    }
-
-    void pointSpan(Span span) override {
-        SkASSERT(!span.isEmpty());
-        if (!fStrategy.maybeProcessSpan(span, fNext)) {
-            span_fallback(span, this);
-        }
-    }
-
-    void bilerpSpan(BilerpSpan bSpan) override {
-        SkASSERT(!bSpan.isEmpty());
-        if (!fStrategy.maybeProcessBilerpSpan(bSpan, fNext)) {
-            bilerp_span_fallback(bSpan, this);
-        }
-    }
-
-private:
-    Next* const fNext;
-    Strategy fStrategy;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Matrix Stage
 template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
-using TranslateMatrix = PointProcessor<TranslateMatrixStrategy, Next>;
+using TranslateMatrix = MatrixStage<TranslateMatrixStrategy, Next>;
 
 template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
-using ScaleMatrix = PointProcessor<ScaleMatrixStrategy, Next>;
+using ScaleMatrix = MatrixStage<ScaleMatrixStrategy, Next>;
 
 template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
-using AffineMatrix = PointProcessor<AffineMatrixStrategy, Next>;
+using AffineMatrix = MatrixStage<AffineMatrixStrategy, Next>;
+
+template <typename Next = SkLinearBitmapPipeline::PointProcessorInterface>
+using PerspectiveMatrix = MatrixStage<PerspectiveMatrixStrategy, Next>;
+
 
 static SkLinearBitmapPipeline::PointProcessorInterface* choose_matrix(
     SkLinearBitmapPipeline::PointProcessorInterface* next,
     const SkMatrix& inverse,
     SkLinearBitmapPipeline::MatrixStage* matrixProc) {
     if (inverse.hasPerspective()) {
-        SkFAIL("Not implemented.");
+        matrixProc->Initialize<PerspectiveMatrix<>>(
+            next,
+            SkVector{inverse.getTranslateX(), inverse.getTranslateY()},
+            SkVector{inverse.getScaleX(), inverse.getScaleY()},
+            SkVector{inverse.getSkewX(), inverse.getSkewY()},
+            SkVector{inverse.getPerspX(), inverse.getPerspY()},
+            inverse.get(SkMatrix::kMPersp2));
     } else if (inverse.getSkewX() != 0.0f || inverse.getSkewY() != 0.0f) {
         matrixProc->Initialize<AffineMatrix<>>(
             next,
@@ -176,370 +157,305 @@
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// Bilerp Expansion Stage
-template <typename Next = SkLinearBitmapPipeline::BilerpProcessorInterface>
-class ExpandBilerp final : public SkLinearBitmapPipeline::PointProcessorInterface {
+// Tile Stage
+
+template<typename XStrategy, typename YStrategy, typename Next>
+class NearestTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
 public:
-    ExpandBilerp(Next* next) : fNext{next} { }
+    template <typename... Args>
+    NearestTileStage(Next* next, SkISize dimensions)
+        : fNext{next}
+        , fXStrategy{dimensions.width()}
+        , fYStrategy{dimensions.height()}{ }
 
     void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
-        SkASSERT(0 < n && n < 4);
-        //                    px00   px10   px01  px11
-        const Sk4s kXOffsets{-0.5f,  0.5f, -0.5f, 0.5f},
-                   kYOffsets{-0.5f, -0.5f,  0.5f, 0.5f};
-        if (n >= 1) fNext->bilerpList(Sk4s{xs[0]} + kXOffsets, Sk4s{ys[0]} + kYOffsets);
-        if (n >= 2) fNext->bilerpList(Sk4s{xs[1]} + kXOffsets, Sk4s{ys[1]} + kYOffsets);
-        if (n >= 3) fNext->bilerpList(Sk4s{xs[2]} + kXOffsets, Sk4s{ys[2]} + kYOffsets);
+        fXStrategy.tileXPoints(&xs);
+        fYStrategy.tileYPoints(&ys);
+        fNext->pointListFew(n, xs, ys);
     }
 
-    void VECTORCALL pointList4(Sk4f xs, Sk4f ys) override {
-        //                    px00   px10   px01  px11
-        const Sk4f kXOffsets{-0.5f,  0.5f, -0.5f, 0.5f},
-                   kYOffsets{-0.5f, -0.5f,  0.5f, 0.5f};
-        fNext->bilerpList(Sk4s{xs[0]} + kXOffsets, Sk4s{ys[0]} + kYOffsets);
-        fNext->bilerpList(Sk4s{xs[1]} + kXOffsets, Sk4s{ys[1]} + kYOffsets);
-        fNext->bilerpList(Sk4s{xs[2]} + kXOffsets, Sk4s{ys[2]} + kYOffsets);
-        fNext->bilerpList(Sk4s{xs[3]} + kXOffsets, Sk4s{ys[3]} + kYOffsets);
+    void VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        fXStrategy.tileXPoints(&xs);
+        fYStrategy.tileYPoints(&ys);
+        fNext->pointList4(xs, ys);
     }
 
+    // The span you pass must not be empty.
     void pointSpan(Span span) override {
         SkASSERT(!span.isEmpty());
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
-        // Adjust the span so that it is in the correct phase with the pixel.
-        BilerpSpan bSpan{X(start) - 0.5f, Y(start) - 0.5f, Y(start) + 0.5f, length, count};
-        fNext->bilerpSpan(bSpan);
+        SkScalar x = X(start);
+        SkScalar y = fYStrategy.tileY(Y(start));
+        Span yAdjustedSpan{{x, y}, length, count};
+        if (!fXStrategy.maybeProcessSpan(yAdjustedSpan, fNext)) {
+            span_fallback(span, this);
+        }
     }
 
 private:
     Next* const fNext;
+    XStrategy fXStrategy;
+    YStrategy fYStrategy;
 };
 
-static SkLinearBitmapPipeline::PointProcessorInterface* choose_filter(
-    SkLinearBitmapPipeline::BilerpProcessorInterface* next,
-    SkFilterQuality filterQuailty,
-    SkLinearBitmapPipeline::FilterStage* filterProc) {
-    if (SkFilterQuality::kNone_SkFilterQuality == filterQuailty) {
-        return next;
-    } else {
-        filterProc->Initialize<ExpandBilerp<>>(next);
-        return filterProc->get();
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Tile Stage
-template <typename Next = SkLinearBitmapPipeline::BilerpProcessorInterface>
-using Clamp = BilerpProcessor<ClampStrategy, Next>;
-
-template <typename Next = SkLinearBitmapPipeline::BilerpProcessorInterface>
-using Repeat = BilerpProcessor<RepeatStrategy, Next>;
-
-static SkLinearBitmapPipeline::BilerpProcessorInterface* choose_tiler(
-    SkLinearBitmapPipeline::BilerpProcessorInterface* next,
-    SkSize dimensions,
-    SkShader::TileMode xMode,
-    SkShader::TileMode yMode,
-    SkLinearBitmapPipeline::TileStage* tileProcXOrBoth,
-    SkLinearBitmapPipeline::TileStage* tileProcY) {
-    if (xMode == yMode) {
-        switch (xMode) {
-            case SkShader::kClamp_TileMode:
-                tileProcXOrBoth->Initialize<Clamp<>>(next, dimensions);
-                break;
-            case SkShader::kRepeat_TileMode:
-                tileProcXOrBoth->Initialize<Repeat<>>(next, dimensions);
-                break;
-            case SkShader::kMirror_TileMode:
-                SkFAIL("Not implemented.");
-                break;
-        }
-    } else {
-        switch (yMode) {
-            case SkShader::kClamp_TileMode:
-                tileProcY->Initialize<Clamp<>>(next, Y(dimensions));
-                break;
-            case SkShader::kRepeat_TileMode:
-                tileProcY->Initialize<Repeat<>>(next, Y(dimensions));
-                break;
-            case SkShader::kMirror_TileMode:
-                SkFAIL("Not implemented.");
-                break;
-        }
-        switch (xMode) {
-            case SkShader::kClamp_TileMode:
-                tileProcXOrBoth->Initialize<Clamp<>>(tileProcY->get(), X(dimensions));
-                break;
-            case SkShader::kRepeat_TileMode:
-                tileProcXOrBoth->Initialize<Repeat<>>(tileProcY->get(), X(dimensions));
-                break;
-            case SkShader::kMirror_TileMode:
-                SkFAIL("Not implemented.");
-                break;
-        }
-    }
-    return tileProcXOrBoth->get();
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Source Sampling Stage
-class sRGBFast {
-public:
-    static Sk4s VECTORCALL sRGBToLinear(Sk4s pixel) {
-        Sk4s l = pixel * pixel;
-        return Sk4s{l[0], l[1], l[2], pixel[3]};
-    }
-};
-
-enum class ColorOrder {
-    kRGBA = false,
-    kBGRA = true,
-};
-template <SkColorProfileType colorProfile, ColorOrder colorOrder>
-class Pixel8888 {
-public:
-    Pixel8888(int width, const uint32_t* src) : fSrc{src}, fWidth{width}{ }
-    Pixel8888(const SkPixmap& srcPixmap)
-        : fSrc{srcPixmap.addr32()}
-        , fWidth{static_cast<int>(srcPixmap.rowBytes() / 4)} { }
-
-    void VECTORCALL getFewPixels(int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) {
-        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
-        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
-        Sk4i bufferLoc = YIs * fWidth + XIs;
-        switch (n) {
-            case 3:
-                *px2 = this->getPixel(fSrc, bufferLoc[2]);
-            case 2:
-                *px1 = this->getPixel(fSrc, bufferLoc[1]);
-            case 1:
-                *px0 = this->getPixel(fSrc, bufferLoc[0]);
-            default:
-                break;
-        }
-    }
-
-    void VECTORCALL get4Pixels(Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
-        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
-        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
-        Sk4i bufferLoc = YIs * fWidth + XIs;
-        *px0 = this->getPixel(fSrc, bufferLoc[0]);
-        *px1 = this->getPixel(fSrc, bufferLoc[1]);
-        *px2 = this->getPixel(fSrc, bufferLoc[2]);
-        *px3 = this->getPixel(fSrc, bufferLoc[3]);
-    }
-
-    void get4Pixels(const void* vsrc, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
-        const uint32_t* src = static_cast<const uint32_t*>(vsrc);
-        *px0 = this->getPixel(src, index + 0);
-        *px1 = this->getPixel(src, index + 1);
-        *px2 = this->getPixel(src, index + 2);
-        *px3 = this->getPixel(src, index + 3);
-    }
-
-    Sk4f getPixel(const void* vsrc, int index) {
-        const uint32_t* src = static_cast<const uint32_t*>(vsrc);
-        Sk4b bytePixel = Sk4b::Load((uint8_t *)(&src[index]));
-        Sk4f pixel = SkNx_cast<float, uint8_t>(bytePixel);
-        if (colorOrder == ColorOrder::kBGRA) {
-            pixel = SkNx_shuffle<2, 1, 0, 3>(pixel);
-        }
-        pixel = pixel * Sk4f{1.0f/255.0f};
-        if (colorProfile == kSRGB_SkColorProfileType) {
-            pixel = sRGBFast::sRGBToLinear(pixel);
-        }
-        return pixel;
-    }
-
-    const uint32_t* row(int y) { return fSrc + y * fWidth[0]; }
-
-private:
-    const uint32_t* const fSrc;
-    const Sk4i fWidth;
-};
-
-// Explaination of the math:
-//              1 - x      x
-//           +--------+--------+
-//           |        |        |
-//  1 - y    |  px00  |  px10  |
-//           |        |        |
-//           +--------+--------+
-//           |        |        |
-//    y      |  px01  |  px11  |
-//           |        |        |
-//           +--------+--------+
-//
-//
-// Given a pixelxy each is multiplied by a different factor derived from the fractional part of x
-// and y:
-// * px00 -> (1 - x)(1 - y) = 1 - x - y + xy
-// * px10 -> x(1 - y) = x - xy
-// * px01 -> (1 - x)y = y - xy
-// * px11 -> xy
-// So x * y is calculated first and then used to calculate all the other factors.
-static Sk4s VECTORCALL bilerp4(Sk4s xs, Sk4s ys, Sk4f px00, Sk4f px10,
-                                                 Sk4f px01, Sk4f px11) {
-    // Calculate fractional xs and ys.
-    Sk4s fxs = xs - xs.floor();
-    Sk4s fys = ys - ys.floor();
-    Sk4s fxys{fxs * fys};
-    Sk4f sum =  px11 * fxys;
-    sum = sum + px01 * (fys - fxys);
-    sum = sum + px10 * (fxs - fxys);
-    sum = sum + px00 * (Sk4f{1.0f} - fxs - fys + fxys);
-    return sum;
-}
-
-template <typename SourceStrategy>
-class Sampler final : public SkLinearBitmapPipeline::BilerpProcessorInterface {
+template<typename XStrategy, typename YStrategy, typename Next>
+class BilerpTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
 public:
     template <typename... Args>
-    Sampler(SkLinearBitmapPipeline::PixelPlacerInterface* next, Args&&... args)
-        : fNext{next}
-        , fStrategy{std::forward<Args>(args)...} { }
+    BilerpTileStage(Next* next, SkISize dimensions)
+        : fXMax(dimensions.width())
+        , fYMax(dimensions.height())
+        , fNext{next}
+        , fXStrategy{dimensions.width()}
+        , fYStrategy{dimensions.height()}{ }
 
     void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
-        SkASSERT(0 < n && n < 4);
-        Sk4f px0, px1, px2;
-        fStrategy.getFewPixels(n, xs, ys, &px0, &px1, &px2);
-        if (n >= 1) fNext->placePixel(px0);
-        if (n >= 2) fNext->placePixel(px1);
-        if (n >= 3) fNext->placePixel(px2);
+        fXStrategy.tileXPoints(&xs);
+        fYStrategy.tileYPoints(&ys);
+        // TODO: check to see if xs and ys are in range then just call pointListFew on next.
+        if (n >= 1) this->bilerpPoint(xs[0], ys[0]);
+        if (n >= 2) this->bilerpPoint(xs[1], ys[1]);
+        if (n >= 3) this->bilerpPoint(xs[2], ys[2]);
     }
 
     void VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
-        Sk4f px0, px1, px2, px3;
-        fStrategy.get4Pixels(xs, ys, &px0, &px1, &px2, &px3);
-        fNext->place4Pixels(px0, px1, px2, px3);
+        fXStrategy.tileXPoints(&xs);
+        fYStrategy.tileYPoints(&ys);
+        // TODO: check to see if xs and ys are in range then just call pointList4 on next.
+        this->bilerpPoint(xs[0], ys[0]);
+        this->bilerpPoint(xs[1], ys[1]);
+        this->bilerpPoint(xs[2], ys[2]);
+        this->bilerpPoint(xs[3], ys[3]);
     }
 
-    void VECTORCALL bilerpList(Sk4s xs, Sk4s ys) override {
-        Sk4f px00, px10, px01, px11;
-        fStrategy.get4Pixels(xs, ys, &px00, &px10, &px01, &px11);
-        Sk4f pixel = bilerp4(xs, ys, px00, px10, px01, px11);
-        fNext->placePixel(pixel);
-    }
+    struct Wrapper {
+        void pointSpan(Span span) {
+            processor->breakIntoEdges(span);
+        }
 
+        BilerpTileStage* processor;
+    };
+
+    // The span you pass must not be empty.
     void pointSpan(Span span) override {
         SkASSERT(!span.isEmpty());
-        SkPoint start; SkScalar length; int count;
-        std::tie(start, length, count) = span;
-        if (length < (count - 1)) {
-            this->pointSpanSlowRate(span);
-        } else if (length == (count - 1)) {
-            this->pointSpanUnitRate(span);
-        } else {
-            this->pointSpanFastRate(span);
+
+        Wrapper wrapper = {this};
+        if (!fXStrategy.maybeProcessSpan(span, &wrapper)) {
+            span_fallback(span, this);
         }
     }
 
 private:
-    // When moving through source space more slowly than dst space (zoomed in),
-    // we'll be sampling from the same source pixel more than once.
-    void pointSpanSlowRate(Span span) {
+    void bilerpPoint(SkScalar x, SkScalar y) {
+        Sk4f txs = Sk4f{x} + Sk4f{-0.5f, 0.5f, -0.5f, 0.5f};
+        Sk4f tys = Sk4f{y} + Sk4f{-0.5f, -0.5f, 0.5f, 0.5f};
+        fXStrategy.tileXPoints(&txs);
+        fYStrategy.tileYPoints(&tys);
+        fNext->bilerpEdge(txs, tys);
+    }
+
+    void handleEdges(Span span, SkScalar dx) {
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
-        SkFixed fx = SkScalarToFixed(x);
-        SkScalar dx = length / (count - 1);
-        SkFixed fdx = SkScalarToFixed(dx);
+        SkScalar y = Y(start);
+        SkScalar tiledY = fYStrategy.tileY(y);
+        while (count > 0) {
+            this->bilerpPoint(x, tiledY);
+            x += dx;
+            count -= 1;
+        }
+    }
 
-        const void* row = fStrategy.row((int)std::floor(Y(start)));
-        SkLinearBitmapPipeline::PixelPlacerInterface* next = fNext;
+    void yProcessSpan(Span span) {
+        SkScalar tiledY = fYStrategy.tileY(span.startY());
+        if (0.5f <= tiledY && tiledY < fYMax - 0.5f ) {
+            Span tiledSpan{{span.startX(), tiledY}, span.length(), span.count()};
+            fNext->pointSpan(tiledSpan);
+        } else {
+            // Convert to the Y0 bilerp sample set by shifting by -0.5f. Then tile that new y
+            // value and shift it back resulting in the working Y0. Do the same thing with Y1 but
+            // in the opposite direction.
+            SkScalar y0 = fYStrategy.tileY(span.startY() - 0.5f) + 0.5f;
+            SkScalar y1 = fYStrategy.tileY(span.startY() + 0.5f) - 0.5f;
+            Span newSpan{{span.startX(), y0}, span.length(), span.count()};
+            fNext->bilerpSpan(newSpan, y1);
+        }
+    }
+    void breakIntoEdges(Span span) {
+        if (span.length() == 0) {
+            yProcessSpan(span);
+        } else {
+            SkScalar dx = span.length() / (span.count() - 1);
+            if (span.length() > 0) {
+                Span leftBorder = span.breakAt(0.5f, dx);
+                if (!leftBorder.isEmpty()) {
+                    this->handleEdges(leftBorder, dx);
+                }
+                Span center = span.breakAt(fXMax - 0.5f, dx);
+                if (!center.isEmpty()) {
+                    this->yProcessSpan(center);
+                }
 
-        int ix = SkFixedFloorToInt(fx);
-        int prevIX = ix;
-        Sk4f fpixel = fStrategy.getPixel(row, ix);
+                if (!span.isEmpty()) {
+                    this->handleEdges(span, dx);
+                }
+            } else {
+                Span center = span.breakAt(fXMax + 0.5f, dx);
+                if (!span.isEmpty()) {
+                    this->handleEdges(span, dx);
+                }
+                Span leftEdge = center.breakAt(0.5f, dx);
+                if (!center.isEmpty()) {
+                    this->yProcessSpan(center);
+                }
+                if (!leftEdge.isEmpty()) {
+                    this->handleEdges(leftEdge, dx);
+                }
 
-        // When dx is less than one, each pixel is used more than once. Using the fixed point fx
-        // allows the code to quickly check that the same pixel is being used. The code uses this
-        // same pixel check to do the sRGB and normalization only once.
-        auto getNextPixel = [&]() {
-            if (ix != prevIX) {
-                fpixel = fStrategy.getPixel(row, ix);
-                prevIX = ix;
             }
-            fx += fdx;
-            ix = SkFixedFloorToInt(fx);
-            return fpixel;
-        };
-
-        while (count >= 4) {
-            Sk4f px0 = getNextPixel();
-            Sk4f px1 = getNextPixel();
-            Sk4f px2 = getNextPixel();
-            Sk4f px3 = getNextPixel();
-            next->place4Pixels(px0, px1, px2, px3);
-            count -= 4;
-        }
-        while (count > 0) {
-            next->placePixel(getNextPixel());
-            count -= 1;
         }
     }
 
-    // We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
-    // We'll never re-use pixels, but we can at least load contiguous pixels.
-    void pointSpanUnitRate(Span span) {
-        SkPoint start; SkScalar length; int count;
-        std::tie(start, length, count) = span;
-        int ix = SkScalarFloorToInt(X(start));
-        const void* row = fStrategy.row((int)std::floor(Y(start)));
-        SkLinearBitmapPipeline::PixelPlacerInterface* next = fNext;
-        while (count >= 4) {
-            Sk4f px0, px1, px2, px3;
-            fStrategy.get4Pixels(row, ix, &px0, &px1, &px2, &px3);
-            next->place4Pixels(px0, px1, px2, px3);
-            ix += 4;
-            count -= 4;
-        }
+    SkScalar fXMax;
+    SkScalar fYMax;
+    Next* const fNext;
+    XStrategy fXStrategy;
+    YStrategy fYStrategy;
+};
 
-        while (count > 0) {
-            next->placePixel(fStrategy.getPixel(row, ix));
-            ix += 1;
-            count -= 1;
-        }
+template <typename XStrategy, typename YStrategy, typename Next>
+void make_tile_stage(
+    SkFilterQuality filterQuality, SkISize dimensions,
+    Next* next, SkLinearBitmapPipeline::TileStage* tileStage) {
+    if (filterQuality == kNone_SkFilterQuality) {
+        tileStage->Initialize<NearestTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
+    } else {
+        tileStage->Initialize<BilerpTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
+    }
+}
+template <typename XStrategy>
+void choose_tiler_ymode(
+    SkShader::TileMode yMode, SkFilterQuality filterQuality, SkISize dimensions,
+    SkLinearBitmapPipeline::BilerpProcessorInterface* next,
+    SkLinearBitmapPipeline::TileStage* tileStage) {
+    switch (yMode) {
+        case SkShader::kClamp_TileMode:
+            make_tile_stage<XStrategy, YClampStrategy>(filterQuality, dimensions, next, tileStage);
+            break;
+        case SkShader::kRepeat_TileMode:
+            make_tile_stage<XStrategy, YRepeatStrategy>(filterQuality, dimensions, next, tileStage);
+            break;
+        case SkShader::kMirror_TileMode:
+            make_tile_stage<XStrategy, YMirrorStrategy>(filterQuality, dimensions, next, tileStage);
+            break;
+    }
+};
+
+static SkLinearBitmapPipeline::PointProcessorInterface* choose_tiler(
+    SkLinearBitmapPipeline::BilerpProcessorInterface* next,
+    SkISize dimensions,
+    SkShader::TileMode xMode,
+    SkShader::TileMode yMode,
+    SkFilterQuality filterQuality,
+    SkLinearBitmapPipeline::TileStage* tileStage) {
+    switch (xMode) {
+        case SkShader::kClamp_TileMode:
+            choose_tiler_ymode<XClampStrategy>(yMode, filterQuality, dimensions, next, tileStage);
+            break;
+        case SkShader::kRepeat_TileMode:
+            choose_tiler_ymode<XRepeatStrategy>(yMode, filterQuality, dimensions, next, tileStage);
+            break;
+        case SkShader::kMirror_TileMode:
+            choose_tiler_ymode<XMirrorStrategy>(yMode, filterQuality, dimensions, next, tileStage);
+            break;
     }
 
-    // We're moving through source space faster than dst (zoomed out),
-    // so we'll never reuse a source pixel or be able to do contiguous loads.
-    void pointSpanFastRate(Span span) {
-        span_fallback(span, this);
+    return tileStage->get();
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Source Sampling Stage
+template <typename SourceStrategy, typename Next>
+class NearestNeighborSampler final : public SkLinearBitmapPipeline::BilerpProcessorInterface {
+public:
+    template <typename... Args>
+    NearestNeighborSampler(Next* next, Args&&... args)
+    : fSampler{next, std::forward<Args>(args)...} { }
+
+    void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        fSampler.nearestListFew(n, xs, ys);
+    }
+    void VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        fSampler.nearestList4(xs, ys);
+    }
+    void pointSpan(Span span) override {
+        fSampler.nearestSpan(span);
+    }
+    void VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override {
+        SkFAIL("Using nearest neighbor sampler, but calling a bilerpEdge.");
     }
 
-    void bilerpSpan(BilerpSpan span) override {
-        bilerp_span_fallback(span, this);
+    virtual void bilerpSpan(Span span, SkScalar y) override {
+        SkFAIL("Using nearest neighbor sampler, but calling a bilerpSpan.");
     }
 
 private:
-    SkLinearBitmapPipeline::PixelPlacerInterface* const fNext;
-    SourceStrategy fStrategy;
+    GeneralSampler<SourceStrategy, Next> fSampler;
 };
 
-using Pixel8888SRGB = Pixel8888<kSRGB_SkColorProfileType, ColorOrder::kRGBA>;
-using Pixel8888LRGB = Pixel8888<kLinear_SkColorProfileType, ColorOrder::kRGBA>;
-using Pixel8888SBGR = Pixel8888<kSRGB_SkColorProfileType, ColorOrder::kBGRA>;
-using Pixel8888LBGR = Pixel8888<kLinear_SkColorProfileType, ColorOrder::kBGRA>;
+template <typename SourceStrategy, typename Next>
+class BilerpSampler final : public SkLinearBitmapPipeline::BilerpProcessorInterface {
+public:
+    template <typename... Args>
+    BilerpSampler(Next* next, Args&&... args)
+        : fSampler{next, std::forward<Args>(args)...} { }
 
-static SkLinearBitmapPipeline::BilerpProcessorInterface* choose_pixel_sampler(
-    SkLinearBitmapPipeline::PixelPlacerInterface* next,
+    void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        fSampler.bilerpListFew(n, xs, ys);
+    }
+    void VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        fSampler.bilerpList4(xs, ys);
+    }
+    void pointSpan(Span span) override {
+        fSampler.bilerpSpan(span);
+    }
+    void VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override {
+        fSampler.bilerpEdge(xs, ys);
+    }
+
+    virtual void bilerpSpan(Span span, SkScalar y) override {
+        fSampler.bilerpSpanWithY(span, y);
+    }
+
+private:
+    GeneralSampler<SourceStrategy, Next> fSampler;
+};
+
+using Placer = SkLinearBitmapPipeline::PixelPlacerInterface;
+
+template<template <typename, typename> class Sampler>
+static SkLinearBitmapPipeline::BilerpProcessorInterface* choose_pixel_sampler_base(
+    Placer* next,
     const SkPixmap& srcPixmap,
     SkLinearBitmapPipeline::SampleStage* sampleStage) {
     const SkImageInfo& imageInfo = srcPixmap.info();
     switch (imageInfo.colorType()) {
         case kRGBA_8888_SkColorType:
             if (imageInfo.profileType() == kSRGB_SkColorProfileType) {
-                sampleStage->Initialize<Sampler<Pixel8888SRGB>>(next, srcPixmap);
+                sampleStage->Initialize<Sampler<Pixel8888SRGB, Placer>>(next, srcPixmap);
             } else {
-                sampleStage->Initialize<Sampler<Pixel8888LRGB>>(next, srcPixmap);
+                sampleStage->Initialize<Sampler<Pixel8888LRGB, Placer>>(next, srcPixmap);
             }
             break;
         case kBGRA_8888_SkColorType:
             if (imageInfo.profileType() == kSRGB_SkColorProfileType) {
-                sampleStage->Initialize<Sampler<Pixel8888SBGR>>(next, srcPixmap);
+                sampleStage->Initialize<Sampler<Pixel8888SBGR, Placer>>(next, srcPixmap);
             } else {
-                sampleStage->Initialize<Sampler<Pixel8888LBGR>>(next, srcPixmap);
+                sampleStage->Initialize<Sampler<Pixel8888LBGR, Placer>>(next, srcPixmap);
             }
             break;
         default:
@@ -549,11 +465,24 @@
     return sampleStage->get();
 }
 
+SkLinearBitmapPipeline::BilerpProcessorInterface* choose_pixel_sampler(
+    Placer* next,
+    SkFilterQuality filterQuality,
+    const SkPixmap& srcPixmap,
+    SkLinearBitmapPipeline::SampleStage* sampleStage) {
+    if (filterQuality == kNone_SkFilterQuality) {
+        return choose_pixel_sampler_base<NearestNeighborSampler>(next, srcPixmap, sampleStage);
+    } else {
+        return choose_pixel_sampler_base<BilerpSampler>(next, srcPixmap, sampleStage);
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Pixel Placement Stage
 template <SkAlphaType alphaType>
 class PlaceFPPixel final : public SkLinearBitmapPipeline::PixelPlacerInterface {
 public:
+    PlaceFPPixel(float postAlpha) : fPostAlpha{postAlpha} { }
     void VECTORCALL placePixel(Sk4f pixel) override {
         PlacePixel(fDst, pixel, 0);
         fDst += 1;
@@ -573,11 +502,12 @@
     }
 
 private:
-    static void VECTORCALL PlacePixel(SkPM4f* dst, Sk4f pixel, int index) {
+    void VECTORCALL PlacePixel(SkPM4f* dst, Sk4f pixel, int index) {
         Sk4f newPixel = pixel;
         if (alphaType == kUnpremul_SkAlphaType) {
             newPixel = Premultiply(pixel);
         }
+        newPixel = newPixel * fPostAlpha;
         newPixel.store(dst + index);
     }
     static Sk4f VECTORCALL Premultiply(Sk4f pixel) {
@@ -586,16 +516,18 @@
     }
 
     SkPM4f* fDst;
+    Sk4f fPostAlpha;
 };
 
 static SkLinearBitmapPipeline::PixelPlacerInterface* choose_pixel_placer(
     SkAlphaType alphaType,
+    float postAlpha,
     SkLinearBitmapPipeline::PixelStage* placerStage) {
     if (alphaType == kUnpremul_SkAlphaType) {
-        placerStage->Initialize<PlaceFPPixel<kUnpremul_SkAlphaType>>();
+        placerStage->Initialize<PlaceFPPixel<kUnpremul_SkAlphaType>>(postAlpha);
     } else {
         // kOpaque_SkAlphaType is treated the same as kPremul_SkAlphaType
-        placerStage->Initialize<PlaceFPPixel<kPremul_SkAlphaType>>();
+        placerStage->Initialize<PlaceFPPixel<kPremul_SkAlphaType>>(postAlpha);
     }
     return placerStage->get();
 }
@@ -608,18 +540,31 @@
     const SkMatrix& inverse,
     SkFilterQuality filterQuality,
     SkShader::TileMode xTile, SkShader::TileMode yTile,
+    float postAlpha,
     const SkPixmap& srcPixmap) {
-    SkSize size = SkSize::Make(srcPixmap.width(), srcPixmap.height());
+    SkISize dimensions = srcPixmap.info().dimensions();
     const SkImageInfo& srcImageInfo = srcPixmap.info();
 
+    SkMatrix adjustedInverse = inverse;
+    if (filterQuality == kNone_SkFilterQuality) {
+        if (inverse.getScaleX() >= 0.0f) {
+            adjustedInverse.setTranslateX(
+                nextafterf(inverse.getTranslateX(), std::floor(inverse.getTranslateX())));
+        }
+        if (inverse.getScaleY() >= 0.0f) {
+            adjustedInverse.setTranslateY(
+                nextafterf(inverse.getTranslateY(), std::floor(inverse.getTranslateY())));
+        }
+    }
+
     // As the stages are built, the chooser function may skip a stage. For example, with the
     // identity matrix, the matrix stage is skipped, and the tilerStage is the first stage.
-    auto placementStage = choose_pixel_placer(srcImageInfo.alphaType(), &fPixelStage);
-    auto samplerStage   = choose_pixel_sampler(placementStage, srcPixmap, &fSampleStage);
-    auto tilerStage     = choose_tiler(samplerStage, size, xTile, yTile, &fTileXOrBothStage,
-                                       &fTileYStage);
-    auto filterStage    = choose_filter(tilerStage, filterQuality, &fFilterStage);
-    fFirstStage         = choose_matrix(filterStage, inverse, &fMatrixStage);
+    auto placementStage = choose_pixel_placer(srcImageInfo.alphaType(), postAlpha, &fPixelStage);
+    auto samplerStage   = choose_pixel_sampler(placementStage,
+                                               filterQuality, srcPixmap, &fSampleStage);
+    auto tilerStage     = choose_tiler(samplerStage,
+                                       dimensions, xTile, yTile, filterQuality, &fTiler);
+    fFirstStage         = choose_matrix(tilerStage, adjustedInverse, &fMatrixStage);
 }
 
 void SkLinearBitmapPipeline::shadeSpan4f(int x, int y, SkPM4f* dst, int count) {
@@ -629,5 +574,6 @@
     // math correct through the different stages. Count is the number of pixel to produce.
     // Since the code samples at pixel centers, length is the distance from the center of the
     // first pixel to the center of the last pixel. This implies that length is count-1.
-    fFirstStage->pointSpan(Span{SkPoint{x + 0.5f, y + 0.5f}, count - 1.0f, count});
+    fFirstStage->pointSpan(Span{{x + 0.5f, y + 0.5f}, count - 1.0f, count});
 }
+
diff --git a/src/core/SkLinearBitmapPipeline.h b/src/core/SkLinearBitmapPipeline.h
index c65b753..7efdd1c 100644
--- a/src/core/SkLinearBitmapPipeline.h
+++ b/src/core/SkLinearBitmapPipeline.h
@@ -21,6 +21,7 @@
         const SkMatrix& inverse,
         SkFilterQuality filterQuality,
         SkShader::TileMode xTile, SkShader::TileMode yTile,
+        float postAlpha,
         const SkPixmap& srcPixmap);
     ~SkLinearBitmapPipeline();
 
@@ -33,7 +34,7 @@
 
         ~PolymorphicUnion() {
             if (fIsInitialized) {
-                get()->~Base();
+                this->get()->~Base();
             }
         }
 
@@ -47,8 +48,8 @@
         };
 
         Base* get() const { return reinterpret_cast<Base*>(&fSpace); }
-        Base* operator->() const { return get(); }
-        Base& operator*() const { return *get(); }
+        Base* operator->() const { return this->get(); }
+        Base& operator*() const { return *(this->get()); }
 
     private:
         struct SK_STRUCT_ALIGN(16) Space {
@@ -62,18 +63,16 @@
     class BilerpProcessorInterface;
     class PixelPlacerInterface;
 
-    using MatrixStage = PolymorphicUnion<PointProcessorInterface, 112>;
-    using FilterStage = PolymorphicUnion<PointProcessorInterface,   8>;
-    using TileStage   = PolymorphicUnion<BilerpProcessorInterface, 96>;
+    // These values were generated by the assert above in PolymorphicUnion.
+    using MatrixStage = PolymorphicUnion<PointProcessorInterface, 160>;
+    using TileStage   = PolymorphicUnion<PointProcessorInterface, 160>;
     using SampleStage = PolymorphicUnion<BilerpProcessorInterface, 80>;
     using PixelStage  = PolymorphicUnion<PixelPlacerInterface,     80>;
 
 private:
     PointProcessorInterface* fFirstStage;
     MatrixStage fMatrixStage;
-    FilterStage fFilterStage;
-    TileStage   fTileXOrBothStage;
-    TileStage   fTileYStage;
+    TileStage   fTiler;
     SampleStage fSampleStage;
     PixelStage  fPixelStage;
 };
diff --git a/src/core/SkLinearBitmapPipeline_core.h b/src/core/SkLinearBitmapPipeline_core.h
index 0541f3c..2759f0b 100644
--- a/src/core/SkLinearBitmapPipeline_core.h
+++ b/src/core/SkLinearBitmapPipeline_core.h
@@ -10,6 +10,16 @@
 
 #include <cmath>
 
+// New bilerp strategy:
+// Pass through on bilerpList4 and bilerpListFew (analogs to pointList), introduce bilerpEdge
+// which takes 4 points. If the sample spans an edge, then break it into a bilerpEdge. Bilerp
+// span then becomes a normal span except in special cases where an extra Y is given. The bilerp
+// samples need to stay single-point calculations until the tile layer.
+// TODO:
+//  - edge span predicate.
+//  - introduce new point API
+//  - Add tile for new api.
+
 // Tweak ABI of functions that pass Sk4f by value to pass them via registers.
 #if defined(_MSC_VER) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     #define VECTORCALL __vectorcall
@@ -65,12 +75,13 @@
     }
 
     bool isEmpty() const { return 0 == fCount; }
+    void clear() { fCount = 0; }
+    int count() const { return fCount; }
     SkScalar length() const { return fLength; }
     SkScalar startX() const { return X(fStart); }
-    SkScalar endX() const { return startX() + length(); }
-    void clear() {
-        fCount = 0;
-    }
+    SkScalar endX() const { return this->startX() + this->length(); }
+    SkScalar startY() const { return Y(fStart); }
+    Span emptySpan() { return Span{{0.0, 0.0}, 0.0f, 0}; }
 
     bool completelyWithin(SkScalar xMin, SkScalar xMax) const {
         SkScalar sMin, sMax;
@@ -88,17 +99,14 @@
         SkASSERT(dx != 0.0f);
 
         if (this->isEmpty()) {
-            return Span{{0.0, 0.0}, 0.0f, 0};
+            return this->emptySpan();
         }
 
         int dxSteps = SkScalarFloorToInt((breakX - this->startX()) / dx);
 
-        // Calculate the values for the span to cleave off.
-        SkScalar newLength = dxSteps * dx;
-
         if (dxSteps < 0) {
             // The span is wholly after breakX.
-            return Span{{0.0, 0.0}, 0.0f, 0};
+            return this->emptySpan();
         } else if (dxSteps >= fCount) {
             // The span is wholly before breakX.
             Span answer = *this;
@@ -106,6 +114,9 @@
             return answer;
         }
 
+        // Calculate the values for the span to cleave off.
+        SkScalar newLength = dxSteps * dx;
+
         // If the last (or first if count = 1) sample lands directly on the boundary. Include it
         // when dx < 0 and exclude it when dx > 0.
         // Reasoning:
@@ -113,15 +124,16 @@
         // pixel is after the boundary.
         //  dx < 0: The sample point on the boundary is part of the current span because the
         // entire pixel is before the boundary.
-        if (startX() + newLength == breakX && dx > 0) {
-            if (dxSteps != 0) {
+        if (this->startX() + newLength == breakX && dx > 0) {
+            if (dxSteps > 0) {
                 dxSteps -= 1;
                 newLength -= dx;
             } else {
-                return Span{{0.0, 0.0}, 0.0f, 0};
+                return this->emptySpan();
             }
         }
 
+        // Calculate new span parameters
         SkPoint newStart = fStart;
         int newCount = dxSteps + 1;
         SkASSERT(newCount > 0);
@@ -146,39 +158,6 @@
     int      fCount;
 };
 
-// BilerpSpans are similar to Spans, but they represent four source samples converting to single
-// destination pixel per count. The pixels for the four samples are collect along two horizontal
-// lines; one starting at {x, y0} and the other starting at {x, y1}. There are two distinct lines
-// to deal with the edge case of the tile mode. For example, y0 may be at the last y position in
-// a tile while y1 would be at the first.
-// The step of a Bilerp (dx) is still length / (count - 1) and the start to the next sample is
-// still dx * count, but the bounds are complicated by the sampling kernel so that the pixels
-// touched are from x to x + length + 1.
-class BilerpSpan {
-public:
-    BilerpSpan(SkScalar x, SkScalar y0, SkScalar y1, SkScalar length, int count)
-        : fX{x}, fY0{y0}, fY1{y1}, fLength{length}, fCount{count} {
-        SkASSERT(count >= 0);
-        SkASSERT(std::isfinite(length));
-        SkASSERT(std::isfinite(x));
-        SkASSERT(std::isfinite(y0));
-        SkASSERT(std::isfinite(y1));
-    }
-
-    operator std::tuple<SkScalar&, SkScalar&, SkScalar&, SkScalar&, int&>() {
-        return std::tie(fX, fY0, fY1, fLength, fCount);
-    }
-
-    bool isEmpty() const { return 0 == fCount; }
-
-private:
-    SkScalar fX;
-    SkScalar fY0;
-    SkScalar fY1;
-    SkScalar fLength;
-    int      fCount;
-};
-
 template<typename Stage>
 void span_fallback(Span span, Stage* stage) {
     SkPoint start;
@@ -206,26 +185,6 @@
         stage->pointListFew(count, xs, ys);
     }
 }
-
-template <typename Next>
-void bilerp_span_fallback(BilerpSpan span, Next* next) {
-    SkScalar x, y0, y1; SkScalar length; int count;
-    std::tie(x, y0, y1, length, count) = span;
-
-    SkASSERT(!span.isEmpty());
-    float dx = length / (count - 1);
-
-    Sk4f xs = Sk4f{x} + Sk4f{0.0f,  1.0f, 0.0f, 1.0f};
-    Sk4f ys = Sk4f{y0, y0,  y1, y1};
-
-    // If count == 1 then dx will be inf or NaN, but that is ok because the resulting addition is
-    // never used.
-    while (count > 0) {
-        next->bilerpList(xs, ys);
-        xs = xs + dx;
-        count -= 1;
-    }
-}
 }  // namespace
 
 #endif // SkLinearBitmapPipeline_core_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_matrix.h b/src/core/SkLinearBitmapPipeline_matrix.h
index b1bd81f..d194d07 100644
--- a/src/core/SkLinearBitmapPipeline_matrix.h
+++ b/src/core/SkLinearBitmapPipeline_matrix.h
@@ -85,6 +85,34 @@
     const Sk4s fXSkew,   fYSkew;
 };
 
+class PerspectiveMatrixStrategy {
+public:
+    PerspectiveMatrixStrategy(SkVector offset, SkVector scale, SkVector skew,
+                              SkVector zSkew, SkScalar zOffset)
+        : fXOffset{X(offset)}, fYOffset{Y(offset)}, fZOffset{zOffset}
+        , fXScale{X(scale)},   fYScale{Y(scale)}
+        , fXSkew{X(skew)},     fYSkew{Y(skew)}, fZXSkew{X(zSkew)}, fZYSkew{Y(zSkew)} { }
+    void processPoints(Sk4s* xs, Sk4s* ys) {
+        Sk4s newXs = fXScale * *xs +  fXSkew * *ys + fXOffset;
+        Sk4s newYs =  fYSkew * *xs + fYScale * *ys + fYOffset;
+        Sk4s newZs =  fZXSkew * *xs + fZYSkew * *ys + fZOffset;
+
+        *xs = newXs / newZs;
+        *ys = newYs / newZs;
+    }
+
+    template <typename Next>
+    bool maybeProcessSpan(Span span, Next* next) {
+        return false;
+    }
+
+private:
+    const Sk4s fXOffset, fYOffset, fZOffset;
+    const Sk4s fXScale,  fYScale;
+    const Sk4s fXSkew,   fYSkew, fZXSkew, fZYSkew;
+};
+
+
 }  // namespace
 
 #endif  // SkLinearBitmapPipeline_matrix_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_sample.h b/src/core/SkLinearBitmapPipeline_sample.h
new file mode 100644
index 0000000..2115379
--- /dev/null
+++ b/src/core/SkLinearBitmapPipeline_sample.h
@@ -0,0 +1,644 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkLinearBitmapPipeline_sampler_DEFINED
+#define SkLinearBitmapPipeline_sampler_DEFINED
+
+#include "SkLinearBitmapPipeline_core.h"
+#include <tuple>
+
+namespace {
+// Explanation of the math:
+//              1 - x      x
+//           +--------+--------+
+//           |        |        |
+//  1 - y    |  px00  |  px10  |
+//           |        |        |
+//           +--------+--------+
+//           |        |        |
+//    y      |  px01  |  px11  |
+//           |        |        |
+//           +--------+--------+
+//
+//
+// Each pixel pxXY is multiplied by a different factor derived from the fractional parts of x
+// and y:
+// * px00 -> (1 - x)(1 - y) = 1 - x - y + xy
+// * px10 -> x(1 - y) = x - xy
+// * px01 -> (1 - x)y = y - xy
+// * px11 -> xy
+// So x * y is calculated first and then used to calculate all the other factors.
+static Sk4s VECTORCALL bilerp4(Sk4s xs, Sk4s ys, Sk4f px00, Sk4f px10,
+                               Sk4f px01, Sk4f px11) {
+    // Calculate fractional xs and ys.
+    Sk4s fxs = xs - xs.floor();
+    Sk4s fys = ys - ys.floor();
+    Sk4s fxys{fxs * fys};
+    Sk4f sum = px11 * fxys;
+    sum = sum + px01 * (fys - fxys);
+    sum = sum + px10 * (fxs - fxys);
+    sum = sum + px00 * (Sk4f{1.0f} - fxs - fys + fxys);
+    return sum;
+}
+
+// The GeneralSampler class
+template<typename SourceStrategy, typename Next>
+class GeneralSampler {
+public:
+    template<typename... Args>
+    GeneralSampler(SkLinearBitmapPipeline::PixelPlacerInterface* next, Args&& ... args)
+        : fNext{next}, fStrategy{std::forward<Args>(args)...} { }
+
+    void VECTORCALL nearestListFew(int n, Sk4s xs, Sk4s ys) {
+        SkASSERT(0 < n && n < 4);
+        Sk4f px0, px1, px2;
+        fStrategy.getFewPixels(n, xs, ys, &px0, &px1, &px2);
+        if (n >= 1) fNext->placePixel(px0);
+        if (n >= 2) fNext->placePixel(px1);
+        if (n >= 3) fNext->placePixel(px2);
+    }
+
+    void VECTORCALL nearestList4(Sk4s xs, Sk4s ys) {
+        Sk4f px0, px1, px2, px3;
+        fStrategy.get4Pixels(xs, ys, &px0, &px1, &px2, &px3);
+        fNext->place4Pixels(px0, px1, px2, px3);
+    }
+
+    void nearestSpan(Span span) {
+        SkASSERT(!span.isEmpty());
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        SkScalar absLength = SkScalarAbs(length);
+        if (absLength < (count - 1)) {
+            this->nearestSpanSlowRate(span);
+        } else if (absLength == (count - 1)) {
+            this->nearestSpanUnitRate(span);
+        } else {
+            this->nearestSpanFastRate(span);
+        }
+    }
+
+    Sk4f bilerNonEdgePixel(SkScalar x, SkScalar y) {
+        Sk4f px00, px10, px01, px11;
+        Sk4f xs = Sk4f{x};
+        Sk4f ys = Sk4f{y};
+        Sk4f sampleXs = xs + Sk4f{-0.5f, 0.5f, -0.5f, 0.5f};
+        Sk4f sampleYs = ys + Sk4f{-0.5f, -0.5f, 0.5f, 0.5f};
+        fStrategy.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
+        return bilerp4(xs, ys, px00, px10, px01, px11);
+    }
+
+    void VECTORCALL bilerpListFew(int n, Sk4s xs, Sk4s ys) {
+        SkASSERT(0 < n && n < 4);
+        auto bilerpPixel = [&](int index) {
+            return this->bilerNonEdgePixel(xs[index], ys[index]);
+        };
+
+        if (n >= 1) fNext->placePixel(bilerpPixel(0));
+        if (n >= 2) fNext->placePixel(bilerpPixel(1));
+        if (n >= 3) fNext->placePixel(bilerpPixel(2));
+    }
+
+    void VECTORCALL bilerpList4(Sk4s xs, Sk4s ys) {
+        auto bilerpPixel = [&](int index) {
+            return this->bilerNonEdgePixel(xs[index], ys[index]);
+        };
+        fNext->place4Pixels(bilerpPixel(0), bilerpPixel(1), bilerpPixel(2), bilerpPixel(3));
+    }
+
+    void VECTORCALL bilerpEdge(Sk4s sampleXs, Sk4s sampleYs) {
+        Sk4f px00, px10, px01, px11;
+        Sk4f xs = Sk4f{sampleXs[0]};
+        Sk4f ys = Sk4f{sampleYs[0]};
+        fStrategy.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
+        Sk4f pixel = bilerp4(xs, ys, px00, px10, px01, px11);
+        fNext->placePixel(pixel);
+    }
+
+    void bilerpSpan(Span span) {
+        this->bilerpSpanWithY(span, span.startY());
+    }
+
+    void bilerpSpanWithY(Span span, SkScalar y) {
+        SkASSERT(!span.isEmpty());
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        SkScalar absLength = SkScalarAbs(length);
+        if (absLength == 0.0f) {
+            this->bilerpSpanZeroRate(span, y);
+        } else if (absLength < (count - 1)) {
+            this->bilerpSpanSlowRate(span, y);
+        } else if (absLength == (count - 1)) {
+            if (std::fmod(span.startX() - 0.5f, 1.0f) == 0.0f) {
+                if (std::fmod(span.startY() - 0.5f, 1.0f) == 0.0f) {
+                    this->nearestSpanUnitRate(span);
+                } else {
+                    this->bilerpSpanUnitRateAlignedX(span, y);
+                }
+            } else {
+                this->bilerpSpanUnitRate(span, y);
+            }
+        } else {
+            this->bilerpSpanFastRate(span, y);
+        }
+    }
+
+private:
+    // When moving through source space more slowly than dst space (zoomed in),
+    // we'll be sampling from the same source pixel more than once.
+    void nearestSpanSlowRate(Span span) {
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        SkScalar x = X(start);
+        SkFixed fx = SkScalarToFixed(x);
+        SkScalar dx = length / (count - 1);
+        SkFixed fdx = SkScalarToFixed(dx);
+
+        const void* row = fStrategy.row((int)std::floor(Y(start)));
+        Next* next = fNext;
+
+        int ix = SkFixedFloorToInt(fx);
+        int prevIX = ix;
+        Sk4f fpixel = fStrategy.getPixel(row, ix);
+
+        // When dx is less than one, each pixel is used more than once. Using the fixed point fx
+        // allows the code to quickly check that the same pixel is being used. The code uses this
+        // same pixel check to do the sRGB and normalization only once.
+        auto getNextPixel = [&]() {
+            if (ix != prevIX) {
+                fpixel = fStrategy.getPixel(row, ix);
+                prevIX = ix;
+            }
+            fx += fdx;
+            ix = SkFixedFloorToInt(fx);
+            return fpixel;
+        };
+
+        while (count >= 4) {
+            Sk4f px0 = getNextPixel();
+            Sk4f px1 = getNextPixel();
+            Sk4f px2 = getNextPixel();
+            Sk4f px3 = getNextPixel();
+            next->place4Pixels(px0, px1, px2, px3);
+            count -= 4;
+        }
+        while (count > 0) {
+            next->placePixel(getNextPixel());
+            count -= 1;
+        }
+    }
+
+    // We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
+    // We'll never re-use pixels, but we can at least load contiguous pixels.
+    void nearestSpanUnitRate(Span span) {
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        int ix = SkScalarFloorToInt(X(start));
+        const void* row = fStrategy.row((int)std::floor(Y(start)));
+        Next* next = fNext;
+        if (length > 0) {
+            while (count >= 4) {
+                Sk4f px0, px1, px2, px3;
+                fStrategy.get4Pixels(row, ix, &px0, &px1, &px2, &px3);
+                next->place4Pixels(px0, px1, px2, px3);
+                ix += 4;
+                count -= 4;
+            }
+
+            while (count > 0) {
+                next->placePixel(fStrategy.getPixel(row, ix));
+                ix += 1;
+                count -= 1;
+            }
+        } else {
+            while (count >= 4) {
+                Sk4f px0, px1, px2, px3;
+                fStrategy.get4Pixels(row, ix - 3, &px3, &px2, &px1, &px0);
+                next->place4Pixels(px0, px1, px2, px3);
+                ix -= 4;
+                count -= 4;
+            }
+
+            while (count > 0) {
+                next->placePixel(fStrategy.getPixel(row, ix));
+                ix -= 1;
+                count -= 1;
+            }
+        }
+    }
+
+    // We're moving through source space faster than dst (zoomed out),
+    // so we'll never reuse a source pixel or be able to do contiguous loads.
+    void nearestSpanFastRate(Span span) {
+        struct NearestWrapper {
+            void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) {
+                fSampler.nearestListFew(n, xs, ys);
+            }
+
+            void VECTORCALL pointList4(Sk4s xs, Sk4s ys) {
+                fSampler.nearestList4(xs, ys);
+            }
+
+            GeneralSampler& fSampler;
+        };
+        NearestWrapper wrapper{*this};
+        span_fallback(span, &wrapper);
+    }
+
+    void bilerpSpanZeroRate(Span span, SkScalar y1) {  // Emits one Y-filtered pixel count times.
+        SkScalar y0 = span.startY() - 0.5f;
+        y1 += 0.5f;
+        int iy0 = SkScalarFloorToInt(y0);
+        SkScalar filterY1 = y0 - iy0;  // Weight of the iy1 row: fractional part of y0.
+        SkScalar filterY0 = 1.0f - filterY1;  // Weight of the iy0 row.
+        int iy1 = SkScalarFloorToInt(y1);
+        int ix = SkScalarFloorToInt(span.startX());  // Single source column; no filtering in x.
+        Sk4f pixelY0 = fStrategy.getPixel(fStrategy.row(iy0), ix);
+        Sk4f pixelY1 = fStrategy.getPixel(fStrategy.row(iy1), ix);
+        Sk4f filterPixel = pixelY0 * filterY0 + pixelY1 * filterY1;
+        int count = span.count();
+        while (count >= 4) {  // The same pixel is sent to fNext for every destination pixel.
+            fNext->place4Pixels(filterPixel, filterPixel, filterPixel, filterPixel);
+            count -= 4;
+        }
+        while (count > 0) {
+            fNext->placePixel(filterPixel);
+            count -= 1;
+        }
+    }
+
+    // Bilerp for a span that moves through source space more slowly than dst space (zoomed in),
+    // so the same 2x2 block of source pixels is reused for several destination pixels.
+    void bilerpSpanSlowRate(Span span, SkScalar ry1) {
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        SkFixed fx = SkScalarToFixed(X(start)
+                                         -0.5f);
+
+        SkFixed fdx = SkScalarToFixed(length / (count - 1));  // NOTE(review): count == 1 divides by 0.
+        //start = start + SkPoint{-0.5f, -0.5f};
+
+        Sk4f xAdjust;  // Applied to the x fraction each time ix changes.
+        if (fdx >= 0) {
+            xAdjust = Sk4f{-1.0f};
+        } else {
+            xAdjust = Sk4f{1.0f};
+        }
+        int ix = SkFixedFloorToInt(fx);
+        int ioldx = ix;  // Column the cached fpixel values were loaded for.
+        Sk4f x{SkFixedToScalar(fx) - ix};  // Fractional x: weight of the ix + 1 column.
+        Sk4f dx{SkFixedToScalar(fdx)};
+        SkScalar ry0 = Y(start) - 0.5f;
+        ry1 += 0.5f;
+        SkScalar yFloor = std::floor(ry0);
+        Sk4f y1 = Sk4f{ry0 - yFloor};  // Weight of row1: fractional part of ry0.
+        Sk4f y0 = Sk4f{1.0f} - y1;  // Weight of row0.
+        const uint32_t* const row0 = fStrategy.row(SkScalarFloorToInt(ry0));
+        const uint32_t* const row1 = fStrategy.row(SkScalarFloorToInt(ry1));
+        Sk4f fpixel00 = y0 * fStrategy.getPixel(row0, ix);  // Cached pixels are pre-weighted in y.
+        Sk4f fpixel01 = y1 * fStrategy.getPixel(row1, ix);
+        Sk4f fpixel10 = y0 * fStrategy.getPixel(row0, ix + 1);
+        Sk4f fpixel11 = y1 * fStrategy.getPixel(row1, ix + 1);
+        auto getNextPixel = [&]() {  // Returns the next destination pixel and advances fx, ix, x.
+            if (ix != ioldx) {  // Entered a new source column: refresh the cached pixels.
+                fpixel00 = fpixel10;  // NOTE(review): slides toward +x; looks off for fdx < 0.
+                fpixel01 = fpixel11;
+                fpixel10 = y0 * fStrategy.getPixel(row0, ix + 1);
+                fpixel11 = y1 * fStrategy.getPixel(row1, ix + 1);
+                ioldx = ix;
+                x = x + xAdjust;
+            }
+
+            Sk4f x0, x1;
+            x0 = Sk4f{1.0f} - x;
+            x1 = x;
+            Sk4f fpixel = x0 * (fpixel00 + fpixel01) + x1 * (fpixel10 + fpixel11);
+            fx += fdx;
+            ix = SkFixedFloorToInt(fx);
+            x = x + dx;
+            return fpixel;
+        };
+
+        while (count >= 4) {
+            Sk4f fpixel0 = getNextPixel();
+            Sk4f fpixel1 = getNextPixel();
+            Sk4f fpixel2 = getNextPixel();
+            Sk4f fpixel3 = getNextPixel();
+
+            fNext->place4Pixels(fpixel0, fpixel1, fpixel2, fpixel3);
+            count -= 4;
+        }
+
+        while (count > 0) {
+            fNext->placePixel(getNextPixel());
+
+            count -= 1;
+        }
+    }
+
+    // Bilerp for a span that advances one source pixel per dst pixel: source pixels are loaded
+    // four at a time from each of the two rows, and each Y-filtered column feeds two x lerps.
+    void bilerpSpanUnitRate(Span span, SkScalar y1) {
+        y1 += 0.5f;
+        SkScalar y0 = span.startY() - 0.5f;
+        int iy0 = SkScalarFloorToInt(y0);
+        SkScalar filterY1 = y0 - iy0;  // Weight of rowY1: fractional part of y0.
+        SkScalar filterY0 = 1.0f - filterY1;  // Weight of rowY0.
+        int iy1 = SkScalarFloorToInt(y1);
+        const void* rowY0 = fStrategy.row(iy0);
+        const void* rowY1 = fStrategy.row(iy1);
+        SkScalar x0 = span.startX() - 0.5f;
+        int ix0 = SkScalarFloorToInt(x0);
+        SkScalar filterX1 = x0 - ix0;  // Weight of the second lerp operand: fraction of x0.
+        SkScalar filterX0 = 1.0f - filterX1;  // Weight of the first lerp operand.
+
+        auto getPixelY0 = [&]() {  // rowY0 pixel at the current ix0, weighted by filterY0.
+            Sk4f px = fStrategy.getPixel(rowY0, ix0);
+            return px * filterY0;
+        };
+
+        auto getPixelY1 = [&]() {  // rowY1 pixel at the current ix0, weighted by filterY1.
+            Sk4f px = fStrategy.getPixel(rowY1, ix0);
+            return px * filterY1;
+        };
+
+        auto get4PixelsY0 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+            fStrategy.get4Pixels(rowY0, ix, px0, px1, px2, px3);
+            *px0 = *px0 * filterY0;
+            *px1 = *px1 * filterY0;
+            *px2 = *px2 * filterY0;
+            *px3 = *px3 * filterY0;
+        };
+
+        auto get4PixelsY1 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+            fStrategy.get4Pixels(rowY1, ix, px0, px1, px2, px3);
+            *px0 = *px0 * filterY1;
+            *px1 = *px1 * filterY1;
+            *px2 = *px2 * filterY1;
+            *px3 = *px3 * filterY1;
+        };
+
+        auto lerp = [&](Sk4f& pixelX0, Sk4f& pixelX1) {  // Blend in x with filterX0 / filterX1.
+            return pixelX0 * filterX0 + pixelX1 * filterX1;
+        };
+
+        // WIP 4-wide unit rate. pxB holds the Y-filtered column carried into the next x lerp.
+        Sk4f pxB = getPixelY0() + getPixelY1();
+        if (span.length() > 0) {  // Positive length: walk columns upward from ix0.
+            int count = span.count();
+            while (count >= 4) {
+                Sk4f px00, px10, px20, px30;
+                get4PixelsY0(ix0, &px00, &px10, &px20, &px30);  // NOTE(review): starts at pxB's column.
+                Sk4f px01, px11, px21, px31;
+                get4PixelsY1(ix0, &px01, &px11, &px21, &px31);
+                Sk4f pxS0 = px00 + px01;  // pxSn: sum of the two Y-weighted rows for column n.
+                Sk4f px0 = lerp(pxB, pxS0);
+                Sk4f pxS1 = px10 + px11;
+                Sk4f px1 = lerp(pxS0, pxS1);
+                Sk4f pxS2 = px20 + px21;
+                Sk4f px2 = lerp(pxS1, pxS2);
+                Sk4f pxS3 = px30 + px31;
+                Sk4f px3 = lerp(pxS2, pxS3);
+                pxB = pxS3;  // Last column becomes the carried column for the next batch.
+                fNext->place4Pixels(
+                    px0,
+                    px1,
+                    px2,
+                    px3);
+                ix0 += 4;
+                count -= 4;
+            }
+            while (count > 0) {
+                Sk4f pixelY0 = fStrategy.getPixel(rowY0, ix0);
+                Sk4f pixelY1 = fStrategy.getPixel(rowY1, ix0);
+
+                fNext->placePixel(lerp(pixelY0, pixelY1));  // NOTE(review): x weights on two rows?
+                ix0 += 1;
+                count -= 1;
+            }
+        } else {  // Non-positive length: walk columns downward from ix0.
+            int count = span.count();
+            while (count >= 4) {
+                Sk4f px00, px10, px20, px30;
+                get4PixelsY0(ix0 - 3, &px00, &px10, &px20, &px30);  // Loads columns ix0-3 .. ix0.
+                Sk4f px01, px11, px21, px31;
+                get4PixelsY1(ix0 - 3, &px01, &px11, &px21, &px31);
+                Sk4f pxS3 = px30 + px31;  // Consumed from the highest column down.
+                Sk4f px0 = lerp(pxS3, pxB);
+                Sk4f pxS2 = px20 + px21;
+                Sk4f px1 = lerp(pxS2, pxS3);
+                Sk4f pxS1 = px10 + px11;
+                Sk4f px2 = lerp(pxS1, pxS2);
+                Sk4f pxS0 = px00 + px01;
+                Sk4f px3 = lerp(pxS0, pxS1);
+                pxB = pxS0;  // Lowest column becomes the carried column for the next batch.
+                fNext->place4Pixels(
+                    px0,
+                    px1,
+                    px2,
+                    px3);
+                ix0 -= 4;
+                count -= 4;
+            }
+            while (count > 0) {
+                Sk4f pixelY0 = fStrategy.getPixel(rowY0, ix0);
+                Sk4f pixelY1 = fStrategy.getPixel(rowY1, ix0);
+
+                fNext->placePixel(lerp(pixelY0, pixelY1));  // NOTE(review): x weights on two rows?
+                ix0 -= 1;
+                count -= 1;
+            }
+        }
+    }
+
+    void bilerpSpanUnitRateAlignedX(Span span, SkScalar y1) {  // Unit rate, filtering in y only.
+        SkScalar y0 = span.startY() - 0.5f;
+        y1 += 0.5f;
+        int iy0 = SkScalarFloorToInt(y0);
+        SkScalar filterY1 = y0 - iy0;  // Weight of rowY1: fractional part of y0.
+        SkScalar filterY0 = 1.0f - filterY1;  // Weight of rowY0.
+        int iy1 = SkScalarFloorToInt(y1);
+        int ix = SkScalarFloorToInt(span.startX());  // One source column per destination pixel.
+        const void* rowY0 = fStrategy.row(iy0);
+        const void* rowY1 = fStrategy.row(iy1);
+        auto lerp = [&](Sk4f* pixelY0, Sk4f* pixelY1) {  // Blend the two rows with the y weights.
+            return *pixelY0 * filterY0 + *pixelY1 * filterY1;
+        };
+
+        if (span.length() > 0) {  // Positive length: walk columns upward from ix.
+            int count = span.count();
+            while (count >= 4) {
+                Sk4f px00, px10, px20, px30;
+                fStrategy.get4Pixels(rowY0, ix, &px00, &px10, &px20, &px30);
+                Sk4f px01, px11, px21, px31;
+                fStrategy.get4Pixels(rowY1, ix, &px01, &px11, &px21, &px31);
+                fNext->place4Pixels(
+                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
+                ix += 4;
+                count -= 4;
+            }
+            while (count > 0) {
+                Sk4f pixelY0 = fStrategy.getPixel(rowY0, ix);
+                Sk4f pixelY1 = fStrategy.getPixel(rowY1, ix);
+
+                fNext->placePixel(lerp(&pixelY0, &pixelY1));
+                ix += 1;
+                count -= 1;
+            }
+        } else {  // Non-positive length: walk columns downward from ix.
+            int count = span.count();
+            while (count >= 4) {
+                Sk4f px00, px10, px20, px30;
+                fStrategy.get4Pixels(rowY0, ix - 3, &px30, &px20, &px10, &px00);  // Reversed outputs.
+                Sk4f px01, px11, px21, px31;
+                fStrategy.get4Pixels(rowY1, ix - 3, &px31, &px21, &px11, &px01);  // Reversed outputs.
+                fNext->place4Pixels(
+                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
+                ix -= 4;
+                count -= 4;
+            }
+            while (count > 0) {
+                Sk4f pixelY0 = fStrategy.getPixel(rowY0, ix);
+                Sk4f pixelY1 = fStrategy.getPixel(rowY1, ix);
+
+                fNext->placePixel(lerp(&pixelY0, &pixelY1));
+                ix -= 1;
+                count -= 1;
+            }
+        }
+    }
+
+    // Bilerp for a span that moves through source space faster than dst (zoomed out): no source
+    // pixel is reused and loads are not contiguous, so every destination pixel is sampled alone.
+    void bilerpSpanFastRate(Span span, SkScalar y1) {
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+        SkScalar x = X(start);
+        SkScalar y = Y(start);
+        if (false && y == y1) {  // Disabled: the constant false makes this branch unreachable.
+            struct BilerpWrapper {  // Adapter for span_fallback; forwards to this sampler.
+                void VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) {
+                    fSampler.bilerpListFew(n, xs, ys);
+                }
+
+                void VECTORCALL pointList4(Sk4s xs, Sk4s ys) {
+                    fSampler.bilerpList4(xs, ys);
+                }
+
+                GeneralSampler& fSampler;
+            };
+            BilerpWrapper wrapper{*this};
+            span_fallback(span, &wrapper);
+        } else {
+            SkScalar dx = length / (count - 1);  // NOTE(review): count == 1 divides by zero.
+            Sk4f ys = {y - 0.5f, y - 0.5f, y1 + 0.5f, y1 + 0.5f};  // Two points per sample row.
+            while (count > 0) {
+                Sk4f xs = Sk4f{-0.5f, 0.5f, -0.5f, 0.5f} + Sk4f{x};  // x -/+ 0.5 for each row.
+                this->bilerpEdge(xs, ys);
+                x += dx;
+                count -= 1;
+            }
+        }
+    }
+
+    Next* const fNext;
+    SourceStrategy fStrategy;
+};
+
+class sRGBFast {  // Fast sRGB-to-linear conversion helper.
+public:
+    static Sk4s VECTORCALL sRGBToLinear(Sk4s pixel) {  // Squares lanes 0-2; lane 3 is unchanged.
+        const Sk4s squared = pixel * pixel;
+        return Sk4s{squared[0], squared[1], squared[2], pixel[3]};
+    }
+};
+
+enum class ColorOrder {  // Channel order of the source pixels; a Pixel8888 template argument.
+    kRGBA = false,  // Lanes are used as loaded.
+    kBGRA = true,   // Pixel8888::getPixel swaps lanes 0 and 2 after loading.
+};
+template <SkColorProfileType colorProfile, ColorOrder colorOrder>
+class Pixel8888 {  // Reads 32-bit source pixels and returns them as Sk4f scaled by 1/255.
+public:
+    Pixel8888(int width, const uint32_t* src) : fSrc{src}, fWidth{width}{ }
+    Pixel8888(const SkPixmap& srcPixmap)
+        : fSrc{srcPixmap.addr32()}
+        , fWidth{static_cast<int>(srcPixmap.rowBytes() / 4)} { }  // Row stride in 4-byte pixels.
+
+    void VECTORCALL getFewPixels(int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) {
+        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
+        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
+        Sk4i bufferLoc = YIs * fWidth + XIs;  // Pixel index from fSrc for each (x, y) lane.
+        switch (n) {  // Loads the first n (1 to 3) pixels; other values load nothing.
+            case 3:
+                *px2 = this->getPixel(fSrc, bufferLoc[2]);  // Intentionally falls through.
+            case 2:
+                *px1 = this->getPixel(fSrc, bufferLoc[1]);  // Intentionally falls through.
+            case 1:
+                *px0 = this->getPixel(fSrc, bufferLoc[0]);  // Intentionally falls through.
+            default:
+                break;
+        }
+    }
+
+    void VECTORCALL get4Pixels(Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
+        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
+        Sk4i bufferLoc = YIs * fWidth + XIs;  // Pixel index from fSrc for each (x, y) lane.
+        *px0 = this->getPixel(fSrc, bufferLoc[0]);
+        *px1 = this->getPixel(fSrc, bufferLoc[1]);
+        *px2 = this->getPixel(fSrc, bufferLoc[2]);
+        *px3 = this->getPixel(fSrc, bufferLoc[3]);
+    }
+
+    void get4Pixels(const void* vsrc, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+        const uint32_t* src = static_cast<const uint32_t*>(vsrc);  // Four consecutive pixels.
+        *px0 = this->getPixel(src, index + 0);
+        *px1 = this->getPixel(src, index + 1);
+        *px2 = this->getPixel(src, index + 2);
+        *px3 = this->getPixel(src, index + 3);
+    }
+
+    Sk4f getPixel(const void* vsrc, int index) {  // Loads pixel index of vsrc as four floats.
+        const uint32_t* src = static_cast<const uint32_t*>(vsrc);
+        Sk4b bytePixel = Sk4b::Load(reinterpret_cast<const uint8_t*>(&src[index]));  // Keeps const.
+        Sk4f pixel = SkNx_cast<float, uint8_t>(bytePixel);
+        if (colorOrder == ColorOrder::kBGRA) {
+            pixel = SkNx_shuffle<2, 1, 0, 3>(pixel);  // Swap lanes 0 and 2.
+        }
+        pixel = pixel * Sk4f{1.0f/255.0f};  // Byte values 0..255 become 0..1.
+        if (colorProfile == kSRGB_SkColorProfileType) {
+            pixel = sRGBFast::sRGBToLinear(pixel);
+        }
+        return pixel;
+    }
+
+    const uint32_t* row(int y) { return fSrc + y * fWidth[0]; }  // Start of row y.
+
+private:
+    const uint32_t* const fSrc;
+    const Sk4i fWidth;  // Row stride in pixels, replicated in every lane.
+};
+using Pixel8888SRGB = Pixel8888<kSRGB_SkColorProfileType, ColorOrder::kRGBA>;
+using Pixel8888LRGB = Pixel8888<kLinear_SkColorProfileType, ColorOrder::kRGBA>;
+using Pixel8888SBGR = Pixel8888<kSRGB_SkColorProfileType, ColorOrder::kBGRA>;
+using Pixel8888LBGR = Pixel8888<kLinear_SkColorProfileType, ColorOrder::kBGRA>;
+}  // namespace
+
+#endif  // SkLinearBitmapPipeline_sampler_DEFINED
diff --git a/src/core/SkLinearBitmapPipeline_tile.h b/src/core/SkLinearBitmapPipeline_tile.h
index 761e3c5..60cc2a5 100644
--- a/src/core/SkLinearBitmapPipeline_tile.h
+++ b/src/core/SkLinearBitmapPipeline_tile.h
@@ -15,39 +15,30 @@
 #include <limits>
 
 namespace {
-class ClampStrategy {
+class XClampStrategy {
 public:
-    ClampStrategy(X max)
-        : fXMin{0.0f}, fXMax{max - 1.0f} { }
+    XClampStrategy(int32_t max)
+        : fXsMax{SkScalar(max - 0.5f)}
+        , fXMax{SkScalar(max)} { }
 
-    ClampStrategy(Y max)
-        : fYMin{0.0f}, fYMax{max - 1.0f} { }
-
-    ClampStrategy(SkSize max)
-        : fXMin{0.0f}, fYMin{0.0f}, fXMax{X(max) - 1.0f}, fYMax{Y(max) - 1.0f} { }
-
-    void processPoints(Sk4s* xs, Sk4s* ys) {
-        *xs = Sk4s::Min(Sk4s::Max(*xs, fXMin), fXMax);
-        *ys = Sk4s::Min(Sk4s::Max(*ys, fYMin), fYMax);
+    void tileXPoints(Sk4s* xs) {  // Clamps each x in place to [0, fXsMax] (fXsMax is max - 0.5).
+        *xs = Sk4s::Min(Sk4s::Max(*xs, 0.0f), fXsMax);
+        SkASSERT(0 <= (*xs)[0] && (*xs)[0] < fXMax);
+        SkASSERT(0 <= (*xs)[1] && (*xs)[1] < fXMax);
+        SkASSERT(0 <= (*xs)[2] && (*xs)[2] < fXMax);
+        SkASSERT(0 <= (*xs)[3] && (*xs)[3] < fXMax);
     }
 
     template<typename Next>
     bool maybeProcessSpan(Span originalSpan, Next* next) {
         SkASSERT(!originalSpan.isEmpty());
-        SkPoint start;
-        SkScalar length;
-        int count;
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = originalSpan;
-        SkScalar xMin = fXMin[0];
-        SkScalar xMax = fXMax[0] + 1.0f;
-        SkScalar yMin = fYMin[0];
-        SkScalar yMax = fYMax[0];
         SkScalar x = X(start);
-        SkScalar y = std::min(std::max<SkScalar>(yMin, Y(start)), yMax);
-
+        SkScalar y = Y(start);
         Span span{{x, y}, length, count};
 
-        if (span.completelyWithin(xMin, xMax)) {
+        if (span.completelyWithin(0.0f, fXMax)) {
             next->pointSpan(span);
             return true;
         }
@@ -85,84 +76,100 @@
         // * Over - for the portion of the span > xMax, take the color at pixel {xMax-1, y} and
         //   use it to fill in the rest of the destination pixels.
         if (dx >= 0) {
-            Span leftClamped = span.breakAt(xMin, dx);
+            Span leftClamped = span.breakAt(0.0f, dx);
             if (!leftClamped.isEmpty()) {
-                leftClamped.clampToSinglePixel({xMin, y});
+                leftClamped.clampToSinglePixel({0.0f, y});
                 next->pointSpan(leftClamped);
             }
-            Span middle = span.breakAt(xMax, dx);
-            if (!middle.isEmpty()) {
-                next->pointSpan(middle);
+            Span center = span.breakAt(fXMax, dx);
+            if (!center.isEmpty()) {
+                next->pointSpan(center);
             }
             if (!span.isEmpty()) {
-                span.clampToSinglePixel({xMax - 1, y});
+                span.clampToSinglePixel({fXMax - 1, y});
                 next->pointSpan(span);
             }
         } else {
-            Span rightClamped = span.breakAt(xMax, dx);
+            Span center = span.breakAt(fXMax, dx);
 
-            if (!rightClamped.isEmpty()) {
-                rightClamped.clampToSinglePixel({xMax - 1, y});
-                next->pointSpan(rightClamped);
-            }
-            Span middle = span.breakAt(xMin, dx);
-            if (!middle.isEmpty()) {
-                next->pointSpan(middle);
-            }
             if (!span.isEmpty()) {
-                span.clampToSinglePixel({xMin, y});
+                span.clampToSinglePixel({fXMax - 1, y});
                 next->pointSpan(span);
             }
+            Span leftEdge = center.breakAt(0.0f, dx);
+            if (!center.isEmpty()) {
+                next->pointSpan(center);
+            }
+            if (!leftEdge.isEmpty()) {
+                leftEdge.clampToSinglePixel({0.0f, y});
+                next->pointSpan(leftEdge);
+            }
         }
         return true;
     }
 
-    template <typename Next>
-    bool maybeProcessBilerpSpan(BilerpSpan bSpan, Next* next) {
-        return false;
+private:
+    const Sk4s     fXsMax;
+    const SkScalar fXMax;
+};
+
+class YClampStrategy {
+public:
+    YClampStrategy(int32_t max)
+        : fYMax{SkScalar(max) - 0.5f}
+        , fYsMax{SkScalar(max) - 0.5f} { }
+
+    void tileYPoints(Sk4s* ys) {  // Clamps each y in place to [0, fYsMax] (fYsMax is max - 0.5).
+        *ys = Sk4s::Min(Sk4s::Max(*ys, 0.0f), fYsMax);
+        SkASSERT(0 <= (*ys)[0] && (*ys)[0] <= fYMax);
+        SkASSERT(0 <= (*ys)[1] && (*ys)[1] <= fYMax);
+        SkASSERT(0 <= (*ys)[2] && (*ys)[2] <= fYMax);
+        SkASSERT(0 <= (*ys)[3] && (*ys)[3] <= fYMax);
+    }
+
+    SkScalar tileY(SkScalar y) {  // Scalar clamp of y to [0, fYMax] (fYMax is max - 0.5).
+        return std::min(std::max<SkScalar>(0.0f, y), fYMax);
     }
 
 private:
-    const Sk4s fXMin{SK_FloatNegativeInfinity};
-    const Sk4s fYMin{SK_FloatNegativeInfinity};
-    const Sk4s fXMax{SK_FloatInfinity};
-    const Sk4s fYMax{SK_FloatInfinity};
+    const SkScalar fYMax;
+    const Sk4s     fYsMax;
 };
 
-class RepeatStrategy {
+SkScalar tile_mod(SkScalar x, SkScalar base) {  // Floored modulo: x minus floor(x / base) * base.
+    return x - SkScalarFloorToScalar(x / base) * base;  // Rounding can make this equal base.
+}
+
+class XRepeatStrategy {
 public:
-    RepeatStrategy(X max) : fXMax{max}, fXInvMax{1.0f / max} { }
+    XRepeatStrategy(int32_t max)
+        : fXMax{SkScalar(max)}
+        , fXsMax{SkScalar(max)}
+        , fXsCap{SkScalar(nextafterf(SkScalar(max), 0.0f))}
+        , fXsInvMax{1.0f / SkScalar(max)} { }
 
-    RepeatStrategy(Y max) : fYMax{max}, fYInvMax{1.0f / max} { }
-
-    RepeatStrategy(SkSize max)
-        : fXMax{X(max)}, fXInvMax{1.0f / X(max)}, fYMax{Y(max)}, fYInvMax{1.0f / Y(max)} { }
-
-    void processPoints(Sk4s* xs, Sk4s* ys) {
-        Sk4s divX = (*xs * fXInvMax).floor();
-        Sk4s divY = (*ys * fYInvMax).floor();
-        Sk4s baseX = (divX * fXMax);
-        Sk4s baseY = (divY * fYMax);
-        *xs = *xs - baseX;
-        *ys = *ys - baseY;
+    void tileXPoints(Sk4s* xs) {  // Wraps each x in place by floored modulo fXsMax.
+        Sk4s divX = *xs * fXsInvMax;
+        Sk4s modX = *xs - divX.floor() * fXsMax;
+        *xs = Sk4s::Min(fXsCap, modX);  // fXsCap is the float just below max; keeps x < fXMax.
+        SkASSERT(0 <= (*xs)[0] && (*xs)[0] < fXMax);
+        SkASSERT(0 <= (*xs)[1] && (*xs)[1] < fXMax);
+        SkASSERT(0 <= (*xs)[2] && (*xs)[2] < fXMax);
+        SkASSERT(0 <= (*xs)[3] && (*xs)[3] < fXMax);
     }
 
     template<typename Next>
     bool maybeProcessSpan(Span originalSpan, Next* next) {
         SkASSERT(!originalSpan.isEmpty());
-        SkPoint start;
-        SkScalar length;
-        int count;
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = originalSpan;
         // Make x and y in range on the tile.
-        SkScalar x = TileMod(X(start), fXMax[0]);
-        SkScalar y = TileMod(Y(start), fYMax[0]);
-        SkScalar xMax = fXMax[0];
-        SkScalar xMin = 0.0f;
+        SkScalar x = tile_mod(X(start), fXMax);
+        SkScalar y = Y(start);
         SkScalar dx = length / (count - 1);
 
         // No need trying to go fast because the steps are larger than a tile or there is one point.
-        if (SkScalarAbs(dx) >= xMax || count <= 1) {
+        if (SkScalarAbs(dx) >= fXMax || count <= 1) {
             return false;
         }
 
@@ -199,16 +206,16 @@
 
         Span span({x, y}, length, count);
         if (dx > 0) {
-            while (!span.isEmpty() && span.endX() >= xMax) {
-                Span toDraw = span.breakAt(xMax, dx);
+            while (!span.isEmpty() && span.endX() >= fXMax) {
+                Span toDraw = span.breakAt(fXMax, dx);
                 next->pointSpan(toDraw);
-                span.offset(-xMax);
+                span.offset(-fXMax);
             }
         } else {
-            while (!span.isEmpty() && span.endX() < xMin) {
-                Span toDraw = span.breakAt(xMin, dx);
+            while (!span.isEmpty() && span.endX() < 0.0f) {
+                Span toDraw = span.breakAt(0.0f, dx);
                 next->pointSpan(toDraw);
-                span.offset(xMax);
+                span.offset(fXMax);
             }
         }
 
@@ -220,19 +227,106 @@
         return true;
     }
 
-    template <typename Next>
-    bool maybeProcessBilerpSpan(BilerpSpan bSpan, Next* next) {
-        return false;
+private:
+    const SkScalar fXMax;
+    const Sk4s     fXsMax;
+    const Sk4s     fXsCap;
+    const Sk4s     fXsInvMax;
+};
+
+class YRepeatStrategy {
+public:
+    YRepeatStrategy(int32_t max)
+        : fYMax{SkScalar(max)}
+        , fYsMax{SkScalar(max)}
+        , fYsInvMax{1.0f / SkScalar(max)} { }
+
+    void tileYPoints(Sk4s* ys) {  // Wraps each y in place into [0, fYMax) by floored modulo.
+        Sk4s divY = *ys * fYsInvMax;
+        Sk4s modY = *ys - divY.floor() * fYsMax;
+        *ys = Sk4s::Min(modY, Sk4s{nextafterf(fYMax, 0.0f)});  // Rounding can give exactly fYMax.
+        SkASSERT(0 <= (*ys)[0] && (*ys)[0] < fYMax);
+        SkASSERT(0 <= (*ys)[1] && (*ys)[1] < fYMax);
+        SkASSERT(0 <= (*ys)[2] && (*ys)[2] < fYMax);
+        SkASSERT(0 <= (*ys)[3] && (*ys)[3] < fYMax);
+    }
+
+    SkScalar tileY(SkScalar y) {  // Wraps a single y into [0, fYMax) by floored modulo.
+        SkScalar answer = std::min(tile_mod(y, fYMax), nextafterf(fYMax, 0.0f));  // Keep < fYMax.
+        SkASSERT(0 <= answer && answer < fYMax);
+        return answer;
     }
 
 private:
-    SkScalar TileMod(SkScalar x, SkScalar base) {
-        return x - std::floor(x / base) * base;
+    const SkScalar fYMax;
+    const Sk4s     fYsMax;
+    const Sk4s     fYsInvMax;
+};
+// Mirror tiling formula, written in Wolfram Language notation for the example max = 40:
+// mq2[x_] := Abs[(x - 40) - Floor[(x - 40)/80] * 80 - 40]
+class XMirrorStrategy {
+public:
+    XMirrorStrategy(int32_t max)
+        : fXsMax{SkScalar(max)}
+        , fXsCap{SkScalar(nextafterf(SkScalar(max), 0.0f))}
+        , fXsDoubleInvMax{1.0f / (2.0f * SkScalar(max))} { }
+
+    void tileXPoints(Sk4s* xs) {  // Reflects each x in place into [0, max) with period 2 * max.
+        Sk4f bias   = *xs - fXsMax;
+        Sk4f div    = bias * fXsDoubleInvMax;
+        Sk4f mod    = bias - div.floor() * 2.0f * fXsMax;  // Floored modulo by 2 * max.
+        Sk4f unbias = mod - fXsMax;
+        *xs = Sk4f::Min(unbias.abs(), fXsCap);  // fXsCap is the float just below max.
+        SkASSERT(0 <= (*xs)[0] && (*xs)[0] < fXsMax[0]);
+        SkASSERT(0 <= (*xs)[1] && (*xs)[1] < fXsMax[0]);
+        SkASSERT(0 <= (*xs)[2] && (*xs)[2] < fXsMax[0]);
+        SkASSERT(0 <= (*xs)[3] && (*xs)[3] < fXsMax[0]);
     }
-    const Sk4s fXMax{0.0f};
-    const Sk4s fXInvMax{0.0f};
-    const Sk4s fYMax{0.0f};
-    const Sk4s fYInvMax{0.0f};
+
+    template <typename Next>
+    bool maybeProcessSpan(Span originalSpan, Next* next) { return false; }
+
+private:
+    Sk4f     fXsMax;
+    Sk4f     fXsCap;
+    Sk4f     fXsDoubleInvMax;
+};
+
+class YMirrorStrategy {  // Reflects y coordinates into [0, max) with period 2 * max.
+public:
+    YMirrorStrategy(int32_t max)
+        : fYMax{SkScalar(max)}
+        , fYsMax{SkScalar(max)}
+        , fYsCap{nextafterf(SkScalar(max), 0.0f)}
+        , fYsDoubleInvMax{1.0f / (2.0f * SkScalar(max))} { }
+
+    void tileYPoints(Sk4s* ys) {  // Tiles four y values in place.
+        Sk4f bias   = *ys - fYsMax;
+        Sk4f div    = bias * fYsDoubleInvMax;
+        Sk4f mod    = bias - div.floor() * 2.0f * fYsMax;  // Floored modulo by 2 * max.
+        Sk4f unbias = mod - fYsMax;
+        *ys = Sk4f::Min(unbias.abs(), fYsCap);  // Cap keeps the result below max.
+        SkASSERT(0 <= (*ys)[0] && (*ys)[0] < fYMax);
+        SkASSERT(0 <= (*ys)[1] && (*ys)[1] < fYMax);
+        SkASSERT(0 <= (*ys)[2] && (*ys)[2] < fYMax);
+        SkASSERT(0 <= (*ys)[3] && (*ys)[3] < fYMax);
+    }
+
+    SkScalar tileY(SkScalar y) {  // Scalar form of tileYPoints; returns the tiled y.
+        SkScalar bias   = y - fYMax;
+        SkScalar div    = bias * fYsDoubleInvMax[0];
+        SkScalar mod    = bias - SkScalarFloorToScalar(div) * 2.0f * fYMax;
+        SkScalar unbias = mod - fYMax;
+        SkScalar answer = SkMinScalar(SkScalarAbs(unbias), fYsCap[0]);
+        SkASSERT(0 <= answer && answer < fYMax);
+        return answer;
+    }
+
+private:
+    SkScalar fYMax;            // max as a scalar.
+    Sk4f     fYsMax;           // max in every lane.
+    Sk4f     fYsCap;           // Float just below max, in every lane.
+    Sk4f     fYsDoubleInvMax;  // 1 / (2 * max), in every lane.
 };
 
 }  // namespace
diff --git a/tests/SkLinearBitmapPipelineTest.cpp b/tests/SkLinearBitmapPipelineTest.cpp
index e715b62..be52e29 100644
--- a/tests/SkLinearBitmapPipelineTest.cpp
+++ b/tests/SkLinearBitmapPipelineTest.cpp
@@ -18,6 +18,10 @@
 #include "SkLinearBitmapPipeline_tile.h"
 
 
+DEF_TEST(LBPBilerpEdge, reporter) {  // Placeholder: the body is empty, so nothing is checked.
+
+}
+
 static SkString dump(SkScalar cut, Span prefix, Span remainder) {
     SkPoint prefixStart; SkScalar prefixLen; int prefixCount;
     std::tie(prefixStart, prefixLen, prefixCount) = prefix;
@@ -100,8 +104,13 @@
     }
 }
 
-template <typename Tiler>
-static bool compare_tiler_case(Tiler& tiler, Span span, skiatest::Reporter* reporter) {
+DEF_TEST(LBPBilerpSpanOps, reporter) {  // Placeholder: the body is empty, so nothing is checked.
+
+}
+
+template <typename XTiler, typename YTiler>
+static bool compare_tiler_case(
+    XTiler& xTiler, YTiler& yTiler, Span span, skiatest::Reporter* reporter) {
     Span originalSpan = span;
     std::vector<SkPoint> listPoints;
     std::vector<SkPoint> spanPoints;
@@ -143,17 +152,24 @@
     while (count >= 4) {
         Sk4f txs = xs;
         Sk4f tys = ys;
-        tiler.processPoints(&txs, &tys);
+        xTiler.tileXPoints(&txs);
+        yTiler.tileYPoints(&tys);
         listSink.pointList4(txs, tys);
         xs = xs + 4.0f * dx;
         count -= 4;
     }
     if (count > 0) {
-        tiler.processPoints(&xs, &ys);
+        xTiler.tileXPoints(&xs);
+        yTiler.tileYPoints(&ys);
         listSink.pointListFew(count, xs, ys);
     }
 
-    bool handledSpan = tiler.maybeProcessSpan(span, &spanSink);
+    std::tie(start, length, count) = originalSpan;
+    SkScalar x = X(start);
+    SkScalar y = yTiler.tileY(Y(start));
+    Span yAdjustedSpan{{x, y}, length, count};
+
+    bool handledSpan = xTiler.maybeProcessSpan(yAdjustedSpan, &spanSink);
     if (handledSpan) {
         auto firstNotTheSame = std::mismatch(
             listPoints.begin(), listPoints.end(), spanPoints.begin());
@@ -184,9 +200,10 @@
     return true;
 }
 
-template <typename Tiler>
+template <typename XTiler, typename YTiler>
 static bool compare_tiler_spans(int width, int height, skiatest::Reporter* reporter) {
-    Tiler tiler{SkSize::Make((SkScalar)width, (SkScalar)height)};
+    XTiler xTiler{width};
+    YTiler yTiler{height};
     INFOF(reporter, "w: %d, h: %d \n", width, height);
     std::array<int, 8> interestingX {{-5, -1, 0, 1, width - 1, width, width + 1, width + 5}};
     std::array<int, 8> interestingY {{-5, -1, 0, 1, height - 1, height, height + 1, height + 5}};
@@ -198,7 +215,7 @@
                 for (auto y : interestingY) {
                     Span span{
                         SkPoint::Make((SkScalar)startX, (SkScalar)y), (count-1.0f) * scale, count};
-                    if (!compare_tiler_case(tiler, span, reporter)) {
+                    if (!compare_tiler_case(xTiler, yTiler, span, reporter)) {
                         return false;
                     }
                 }
@@ -208,23 +225,23 @@
     return true;
 }
 
-template <typename Tiler>
+template <typename XTiler, typename YTiler>
 static void test_tiler(skiatest::Reporter* reporter) {
     std::array<int, 6> interestingSize {{1, 2, 3, 4, 5, 10}};
     for (auto width : interestingSize) {
         for (auto height : interestingSize) {
-            if (!compare_tiler_spans<Tiler>(width, height, reporter)) { return; }
+            if (!compare_tiler_spans<XTiler, YTiler>(width, height, reporter)) { return; }
         }
     }
 }
-
+/*
 DEF_TEST(LBPStrategyClampTile, reporter) {
 #if 0
     ClampStrategy tiler{SkSize::Make(1, 1)};
     Span span{SkPoint::Make(0, -5), 1.0f, 2};
     compare_tiler_case<ClampStrategy>(tiler, span, reporter);
 #else
-    test_tiler<ClampStrategy>(reporter);
+    test_tiler<XClampStrategy, YClampStrategy>(reporter);
 #endif
 }
 
@@ -234,8 +251,7 @@
     Span span{SkPoint::Make(-5, -5), 20 * 2.1f, 100};
     compare_tiler_case<RepeatStrategy>(tiler, span, reporter);
 #else
-    test_tiler<RepeatStrategy>(reporter);
+    test_tiler<XRepeatStrategy, YRepeatStrategy>(reporter);
 #endif
 }
-
-
+*/