Revert of Redo Tiling (patchset #14 id:260001 of https://codereview.chromium.org/2134893002/ )

Reason for revert:
Crashing on Win with:

Caught exception 3221225477 EXCEPTION_ACCESS_VIOLATION, was running:
	unit test  GrShape
	srgb gm  shadertext2
	srgb gm  shallow_gradient_conical
	srgb gm  shallow_gradient_sweep
	srgb gm  shallow_gradient_linear_nodither
step returned non-zero exit code: -1073741819

https://status.skia.org/?commit_label=author&filter=search&search_value=Test-Win-MSVC-GCE-CPU-AVX2-x86-Release

Original issue's description:
> In the current code, tiling and bilerp sampling are strongly tied together. They can be separated by taking advantage of the observation that, when a sample point is translated into filter points in the bilerp stage, the filter points will be at most 0.5 outside the tile. This allows simplified repositioning for the various tiling modes: clamp and mirror use min and max, while repeat maps max -> 0 and 0 -> max. This allows bilerp to simply handle the filter points that fall off the tile. This allows tiling and bilerp sampling to be totally separate.
>
> This CL has several parts that are intertwined:
> * move pin/wrap functionality into BilerpSampler.
> * remove the nearest neighbor and bilerp tilers
> * create a simplified general tiler
> * remove the pipeline virtual calls bilerpEdge and bilerpSpan because everything works off of sample points now.
> * redo all the bilerp sampling to use the new local methods to wrap/pin.
> * introduce a new medium rate sample that handles spans with 1 < |dx| < 2.
>
> This change improves the performance as displayed below:
> Most of the top 25 desktop SKPs improve or stay the same. A few are worse, but close to the noise floor. In addition, this change has about 3% smaller code.
>
> old time     new time   new/old
> 13274693  8414645  0.633886  top25desk_google_com_search_q_c.skp_1
> 4946466   3258018  0.658656  top25desk_wordpress.skp_1
> 6977187   5737584  0.822335  top25desk_youtube_com.skp_1
> 3770021   3296831  0.874486  top25desk_google_com__hl_en_q_b.skp_1
> 8890813   8600143  0.967307  top25desk_answers_yahoo_com.skp_1
> 3178974   3094300  0.973364  top25desk_facebook.skp_1
> 8871835   8711260  0.981901  top25desk_twitter.skp_1
> 838509    829290   0.989005  top25desk_blogger.skp_1
> 2821870   2801111  0.992644  top25desk_plus_google_com_11003.skp_1
> 511978    509530   0.995219  top25desk_techcrunch_com.skp_1
> 2408588   2397435  0.995369  top25desk_ebay_com.skp_1
> 4446919   4448004  1.00024   top25desk_espn.skp_1
> 2863241   2875696  1.00435   top25desk_google_com_calendar_.skp_1
> 7170086   7208447  1.00535   top25desk_booking_com.skp_1
> 7356109   7417776  1.00838   top25desk_pinterest.skp_1
> 5265591   5340392  1.01421   top25desk_weather_com.skp_1
> 5675244   5774144  1.01743   top25desk_sports_yahoo_com_.skp_1
> 1048531   1067663  1.01825   top25desk_games_yahoo_com.skp_1
> 2075501   2115131  1.01909   top25desk_amazon_com.skp_1
> 4262170   4370441  1.0254    top25desk_news_yahoo_com.skp_1
> 3789319   3897996  1.02868   top25desk_docs___1_open_documen.skp_1
> 919336    949979   1.03333   top25desk_wikipedia__1_tab_.skp_1
> 4274454   4489369  1.05028   top25desk_mail_google_com_mail_.skp_1
> 4149326   4376556  1.05476   top25desk_linkedin.skp_1
>
> BUG=skia:
> GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2134893002
> CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
>
> Committed: https://skia.googlesource.com/skia/+/8602ede5fdfa721dcad4dcb11db028c1c24265f1

TBR=mtklein@google.com,herb@google.com
# Skipping CQ checks because original CL landed less than 1 days ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=skia:

Review-Url: https://codereview.chromium.org/2174793002
diff --git a/src/core/SkBitmapProcShader.h b/src/core/SkBitmapProcShader.h
index 67b005a..a4591c7 100644
--- a/src/core/SkBitmapProcShader.h
+++ b/src/core/SkBitmapProcShader.h
@@ -56,7 +56,7 @@
     typedef SkShader INHERITED;
 };
 
-enum {kSkBlitterContextSize = 3332};
+enum {kSkBlitterContextSize = 3200};
 
 // Commonly used allocator. It currently is only used to allocate up to 3 objects. The total
 // bytes requested is calculated using one of our large shaders, its context size plus the size of
diff --git a/src/core/SkLinearBitmapPipeline.cpp b/src/core/SkLinearBitmapPipeline.cpp
index 0122765..088e829 100644
--- a/src/core/SkLinearBitmapPipeline.cpp
+++ b/src/core/SkLinearBitmapPipeline.cpp
@@ -165,14 +165,15 @@
 // Tile Stage
 
 template<typename XStrategy, typename YStrategy, typename Next>
-class CombinedTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
+class NearestTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
 public:
-    CombinedTileStage(Next* next, SkISize dimensions)
+    template <typename... Args>
+    NearestTileStage(Next* next, SkISize dimensions)
         : fNext{next}
         , fXStrategy{dimensions.width()}
         , fYStrategy{dimensions.height()}{ }
 
-    CombinedTileStage(Next* next, const CombinedTileStage& stage)
+    NearestTileStage(Next* next, const NearestTileStage& stage)
         : fNext{next}
         , fXStrategy{stage.fXStrategy}
         , fYStrategy{stage.fYStrategy} { }
@@ -194,16 +195,9 @@
         SkASSERT(!span.isEmpty());
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
-
-        if (span.count() == 1) {
-            this->pointListFew(1, span.startX(), span.startY());
-            return;
-        }
-
         SkScalar x = X(start);
         SkScalar y = fYStrategy.tileY(Y(start));
         Span yAdjustedSpan{{x, y}, length, count};
-
         if (!fXStrategy.maybeProcessSpan(yAdjustedSpan, fNext)) {
             span_fallback(span, this);
         }
@@ -215,27 +209,173 @@
     YStrategy fYStrategy;
 };
 
-template <typename XStrategy, typename Next>
+template<typename XStrategy, typename YStrategy, typename Next>
+class BilerpTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
+public:
+    template <typename... Args>
+    BilerpTileStage(Next* next, SkISize dimensions)
+        : fNext{next}
+        , fXMax(dimensions.width())
+        , fYMax(dimensions.height())
+        , fXStrategy{dimensions.width()}
+        , fYStrategy{dimensions.height()} { }
+
+    BilerpTileStage(Next* next, const BilerpTileStage& stage)
+        : fNext{next}
+        , fXMax{stage.fXMax}
+        , fYMax{stage.fYMax}
+        , fXStrategy{stage.fXStrategy}
+        , fYStrategy{stage.fYStrategy} { }
+
+    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
+        fXStrategy.tileXPoints(&xs);
+        fYStrategy.tileYPoints(&ys);
+        // TODO: check to see if xs and ys are in range then just call pointListFew on next.
+        if (n >= 1) this->bilerpPoint(xs[0], ys[0]);
+        if (n >= 2) this->bilerpPoint(xs[1], ys[1]);
+        if (n >= 3) this->bilerpPoint(xs[2], ys[2]);
+    }
+
+    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
+        fXStrategy.tileXPoints(&xs);
+        fYStrategy.tileYPoints(&ys);
+        // TODO: check to see if xs and ys are in range then just call pointList4 on next.
+        this->bilerpPoint(xs[0], ys[0]);
+        this->bilerpPoint(xs[1], ys[1]);
+        this->bilerpPoint(xs[2], ys[2]);
+        this->bilerpPoint(xs[3], ys[3]);
+    }
+
+    struct Wrapper {
+        void pointSpan(Span span) {
+            processor->breakIntoEdges(span);
+        }
+
+        void repeatSpan(Span span, int32_t repeatCount) {
+            while (repeatCount --> 0) {
+                processor->pointSpan(span);
+            }
+        }
+
+        BilerpTileStage* processor;
+    };
+
+    // The span you pass must not be empty.
+    void pointSpan(Span span) override {
+        SkASSERT(!span.isEmpty());
+
+        Wrapper wrapper = {this};
+        if (!fXStrategy.maybeProcessSpan(span, &wrapper)) {
+            span_fallback(span, this);
+        }
+    }
+
+private:
+    void bilerpPoint(SkScalar x, SkScalar y) {
+        Sk4f txs = Sk4f{x} + Sk4f{-0.5f, 0.5f, -0.5f, 0.5f};
+        Sk4f tys = Sk4f{y} + Sk4f{-0.5f, -0.5f, 0.5f, 0.5f};
+        fXStrategy.tileXPoints(&txs);
+        fYStrategy.tileYPoints(&tys);
+        fNext->bilerpEdge(txs, tys);
+    }
+
+    void handleEdges(Span span, SkScalar dx) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkScalar x = X(start);
+        SkScalar y = Y(start);
+        SkScalar tiledY = fYStrategy.tileY(y);
+        while (count > 0) {
+            this->bilerpPoint(x, tiledY);
+            x += dx;
+            count -= 1;
+        }
+    }
+
+    void yProcessSpan(Span span) {
+        SkScalar tiledY = fYStrategy.tileY(span.startY());
+        if (0.5f <= tiledY && tiledY < fYMax - 0.5f ) {
+            Span tiledSpan{{span.startX(), tiledY}, span.length(), span.count()};
+            fNext->pointSpan(tiledSpan);
+        } else {
+            // Convert to the Y0 bilerp sample set by shifting by -0.5f. Then tile that new y
+            // value and shift it back resulting in the working Y0. Do the same thing with Y1 but
+            // in the opposite direction.
+            SkScalar y0 = fYStrategy.tileY(span.startY() - 0.5f) + 0.5f;
+            SkScalar y1 = fYStrategy.tileY(span.startY() + 0.5f) - 0.5f;
+            Span newSpan{{span.startX(), y0}, span.length(), span.count()};
+            fNext->bilerpSpan(newSpan, y1);
+        }
+    }
+    void breakIntoEdges(Span span) {
+        if (span.count() == 1) {
+            this->bilerpPoint(span.startX(), span.startY());
+        } else if (span.length() == 0) {
+            yProcessSpan(span);
+        } else {
+            SkScalar dx = span.length() / (span.count() - 1);
+            if (span.length() > 0) {
+                Span leftBorder = span.breakAt(0.5f, dx);
+                if (!leftBorder.isEmpty()) {
+                    this->handleEdges(leftBorder, dx);
+                }
+                Span center = span.breakAt(fXMax - 0.5f, dx);
+                if (!center.isEmpty()) {
+                    this->yProcessSpan(center);
+                }
+
+                if (!span.isEmpty()) {
+                    this->handleEdges(span, dx);
+                }
+            } else {
+                Span center = span.breakAt(fXMax + 0.5f, dx);
+                if (!span.isEmpty()) {
+                    this->handleEdges(span, dx);
+                }
+                Span leftEdge = center.breakAt(0.5f, dx);
+                if (!center.isEmpty()) {
+                    this->yProcessSpan(center);
+                }
+                if (!leftEdge.isEmpty()) {
+                    this->handleEdges(leftEdge, dx);
+                }
+
+            }
+        }
+    }
+
+    Next* const fNext;
+    SkScalar fXMax;
+    SkScalar fYMax;
+    XStrategy fXStrategy;
+    YStrategy fYStrategy;
+};
+
+template <typename XStrategy, typename YStrategy, typename Next>
+void make_tile_stage(
+    SkFilterQuality filterQuality, SkISize dimensions,
+    Next* next, SkLinearBitmapPipeline::TileStage* tileStage) {
+    if (filterQuality == kNone_SkFilterQuality) {
+        tileStage->initStage<NearestTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
+    } else {
+        tileStage->initStage<BilerpTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
+    }
+}
+template <typename XStrategy>
 void choose_tiler_ymode(
     SkShader::TileMode yMode, SkFilterQuality filterQuality, SkISize dimensions,
-    Next* next,
+    SkLinearBitmapPipeline::SampleProcessorInterface* next,
     SkLinearBitmapPipeline::TileStage* tileStage) {
     switch (yMode) {
-        case SkShader::kClamp_TileMode: {
-            using Tiler = CombinedTileStage<XStrategy, YClampStrategy, Next>;
-            tileStage->initStage<Tiler>(next, dimensions);
+        case SkShader::kClamp_TileMode:
+            make_tile_stage<XStrategy, YClampStrategy>(filterQuality, dimensions, next, tileStage);
             break;
-        }
-        case SkShader::kRepeat_TileMode: {
-            using Tiler = CombinedTileStage<XStrategy, YRepeatStrategy, Next>;
-            tileStage->initStage<Tiler>(next, dimensions);
+        case SkShader::kRepeat_TileMode:
+            make_tile_stage<XStrategy, YRepeatStrategy>(filterQuality, dimensions, next, tileStage);
             break;
-        }
-        case SkShader::kMirror_TileMode: {
-            using Tiler = CombinedTileStage<XStrategy, YMirrorStrategy, Next>;
-            tileStage->initStage<Tiler>(next, dimensions);
+        case SkShader::kMirror_TileMode:
+            make_tile_stage<XStrategy, YMirrorStrategy>(filterQuality, dimensions, next, tileStage);
             break;
-        }
     }
 };
 
@@ -327,6 +467,10 @@
         fDest = dest;
     }
 
+    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override { SkFAIL("Not Implemented"); }
+
+    void bilerpSpan(Span span, SkScalar y) override { SkFAIL("Not Implemented"); }
+
     void setDestination(void* dst, int count) override  {
         fDest = static_cast<uint32_t*>(dst);
         fEnd = fDest + count;
@@ -394,6 +538,10 @@
         SkASSERT(fDest <= fEnd);
     }
 
+    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override { SkFAIL("Not Implemented"); }
+
+    void bilerpSpan(Span span, SkScalar y) override { SkFAIL("Not Implemented"); }
+
     void setDestination(void* dst, int count) override  {
         SkASSERT(count > 0);
         fDest = static_cast<uint32_t*>(dst);
@@ -434,9 +582,12 @@
     }
 }
 
-static SkLinearBitmapPipeline::PixelAccessorInterface* choose_pixel_accessor(
+template<template <typename, typename> class Sampler>
+static SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler_base(
+    Blender* next,
     const SkPixmap& srcPixmap,
     const SkColor A8TintColor,
+    SkLinearBitmapPipeline::SampleStage* sampleStage,
     SkLinearBitmapPipeline::Accessor* accessor)
 {
     const SkImageInfo& imageInfo = srcPixmap.info();
@@ -478,19 +629,19 @@
             break;
     }
 
-    return pixelAccessor;
+    using S = Sampler<PixelAccessorShim, Blender>;
+    sampleStage->initStage<S>(next, pixelAccessor);
+    return sampleStage->get();
 }
 
 SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler(
     Blender* next,
     SkFilterQuality filterQuality,
-    SkShader::TileMode xTile, SkShader::TileMode yTile,
     const SkPixmap& srcPixmap,
     const SkColor A8TintColor,
     SkLinearBitmapPipeline::SampleStage* sampleStage,
     SkLinearBitmapPipeline::Accessor* accessor) {
     const SkImageInfo& imageInfo = srcPixmap.info();
-    SkISize dimensions = imageInfo.dimensions();
 
     // Special case samplers with fully expanded templates
     if (imageInfo.gammaCloseToSRGB()) {
@@ -519,14 +670,14 @@
                     using S =
                     BilerpSampler<
                         PixelAccessor<kN32_SkColorType, kSRGB_SkGammaType>, Blender>;
-                    sampleStage->initStage<S>(next, dimensions, xTile, yTile, srcPixmap);
+                    sampleStage->initStage<S>(next, srcPixmap);
                     return sampleStage->get();
                 }
                 case kIndex_8_SkColorType: {
                     using S =
                     BilerpSampler<
                         PixelAccessor<kIndex_8_SkColorType, kSRGB_SkGammaType>, Blender>;
-                    sampleStage->initStage<S>(next, dimensions, xTile, yTile, srcPixmap);
+                    sampleStage->initStage<S>(next, srcPixmap);
                     return sampleStage->get();
                 }
                 default:
@@ -535,16 +686,14 @@
         }
     }
 
-    auto pixelAccessor = choose_pixel_accessor(srcPixmap, A8TintColor, accessor);
     // General cases.
     if (filterQuality == kNone_SkFilterQuality) {
-        using S = NearestNeighborSampler<PixelAccessorShim, Blender>;
-        sampleStage->initStage<S>(next, pixelAccessor);
+        return choose_pixel_sampler_base<NearestNeighborSampler>(
+            next, srcPixmap, A8TintColor, sampleStage, accessor);
     } else {
-        using S = BilerpSampler<PixelAccessorShim, Blender>;
-        sampleStage->initStage<S>(next, dimensions, xTile, yTile, pixelAccessor);
+        return choose_pixel_sampler_base<BilerpSampler>(
+            next, srcPixmap, A8TintColor, sampleStage, accessor);
     }
-    return sampleStage->get();
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -556,17 +705,17 @@
     SrcFPPixel(const SrcFPPixel& Blender) : fPostAlpha(Blender.fPostAlpha) {}
     void SK_VECTORCALL blendPixel(Sk4f pixel) override {
         SkASSERT(fDst + 1 <= fEnd );
-        this->srcPixel(fDst, pixel, 0);
+        SrcPixel(fDst, pixel, 0);
         fDst += 1;
     }
 
     void SK_VECTORCALL blend4Pixels(Sk4f p0, Sk4f p1, Sk4f p2, Sk4f p3) override {
         SkASSERT(fDst + 4 <= fEnd);
         SkPM4f* dst = fDst;
-        this->srcPixel(dst, p0, 0);
-        this->srcPixel(dst, p1, 1);
-        this->srcPixel(dst, p2, 2);
-        this->srcPixel(dst, p3, 3);
+        SrcPixel(dst, p0, 0);
+        SrcPixel(dst, p1, 1);
+        SrcPixel(dst, p2, 2);
+        SrcPixel(dst, p3, 3);
         fDst += 4;
     }
 
@@ -576,9 +725,7 @@
     }
 
 private:
-    void SK_VECTORCALL srcPixel(SkPM4f* dst, Sk4f pixel, int index) {
-        check_pixel(pixel);
-
+    void SK_VECTORCALL SrcPixel(SkPM4f* dst, Sk4f pixel, int index) {
         Sk4f newPixel = pixel;
         if (alphaType == kUnpremul_SkAlphaType) {
             newPixel = Premultiply(pixel);
@@ -650,8 +797,7 @@
     // identity matrix, the matrix stage is skipped, and the tilerStage is the first stage.
     auto blenderStage = choose_blender_for_shading(alphaType, postAlpha, &fBlenderStage);
     auto samplerStage = choose_pixel_sampler(
-        blenderStage, filterQuality, xTile, yTile,
-        srcPixmap, paintColor, &fSampleStage, &fAccessor);
+        blenderStage, filterQuality, srcPixmap, paintColor, &fSampleStage, &fAccessor);
     auto tilerStage   = choose_tiler(samplerStage, dimensions, xTile, yTile,
                                      filterQuality, dx, &fTileStage);
     fFirstStage       = choose_matrix(tilerStage, adjustedInverse, &fMatrixStage);
diff --git a/src/core/SkLinearBitmapPipeline.h b/src/core/SkLinearBitmapPipeline.h
index 91b573d..b0f7e9d 100644
--- a/src/core/SkLinearBitmapPipeline.h
+++ b/src/core/SkLinearBitmapPipeline.h
@@ -133,9 +133,9 @@
     // These values were generated by the assert above in Stage::init{Sink|Stage}.
     using MatrixStage  = Stage<PointProcessorInterface,    160, PointProcessorInterface>;
     using TileStage    = Stage<PointProcessorInterface,    160, SampleProcessorInterface>;
-    using SampleStage  = Stage<SampleProcessorInterface,   160, BlendProcessorInterface>;
+    using SampleStage  = Stage<SampleProcessorInterface,   100, BlendProcessorInterface>;
     using BlenderStage = Stage<BlendProcessorInterface,     40>;
-    using Accessor     = PolyMemory<PixelAccessorInterface, 64>;
+    using Accessor     = PolyMemory<PixelAccessorInterface, 48>;
 
 private:
     PointProcessorInterface* fFirstStage;
diff --git a/src/core/SkLinearBitmapPipeline_core.h b/src/core/SkLinearBitmapPipeline_core.h
index cf120ee..2c39a38 100644
--- a/src/core/SkLinearBitmapPipeline_core.h
+++ b/src/core/SkLinearBitmapPipeline_core.h
@@ -178,15 +178,6 @@
         stage->pointListFew(count, xs, ys);
     }
 }
-
-inline Sk4f check_pixel(Sk4f& pixel) {
-    SkASSERTF(0.0f <= pixel[0] && pixel[0] <= 1.0f, "pixel[0]: %f", pixel[0]);
-    SkASSERTF(0.0f <= pixel[1] && pixel[1] <= 1.0f, "pixel[1]: %f", pixel[1]);
-    SkASSERTF(0.0f <= pixel[2] && pixel[2] <= 1.0f, "pixel[2]: %f", pixel[2]);
-    SkASSERTF(0.0f <= pixel[3] && pixel[3] <= 1.0f, "pixel[3]: %f", pixel[3]);
-    return pixel;
-}
-
 }  // namespace
 
 class SkLinearBitmapPipeline::PointProcessorInterface {
@@ -210,6 +201,26 @@
     // Used for nearest neighbor when scale factor is 1.0. The span can just be repeated with no
     // edge pixel alignment problems. This is for handling a very common case.
     virtual void repeatSpan(Span span, int32_t repeatCount) = 0;
+
+    // The x's and y's are set up in the following order:
+    // +--------+--------+
+    // |        |        |
+    // |  px00  |  px10  |
+    // |    0   |    1   |
+    // +--------+--------+
+    // |        |        |
+    // |  px01  |  px11  |
+    // |    2   |    3   |
+    // +--------+--------+
+    // These pixel coordinates are arranged in the following order in xs and ys:
+    // px00  px10  px01  px11
+    virtual void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) = 0;
+
+    // A span represents sample points that have been mapped from destination space to source
+    // space. Each sample point is then expanded to the four bilerp points by adding +/- 0.5. The
+    // resulting Y values may be off the tile. When y +/- 0.5 are more than 1 apart because of
+    // tiling, the second Y is used to denote the retiled Y value.
+    virtual void bilerpSpan(Span span, SkScalar y) = 0;
 };
 
 class SkLinearBitmapPipeline::DestinationInterface {
@@ -232,10 +243,10 @@
 public:
     virtual ~PixelAccessorInterface() { }
     virtual void SK_VECTORCALL getFewPixels(
-        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
+        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
 
     virtual void SK_VECTORCALL get4Pixels(
-        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
+        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
 
     virtual void get4Pixels(
         const void* src, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
diff --git a/src/core/SkLinearBitmapPipeline_sample.h b/src/core/SkLinearBitmapPipeline_sample.h
index 5421758..759075b 100644
--- a/src/core/SkLinearBitmapPipeline_sample.h
+++ b/src/core/SkLinearBitmapPipeline_sample.h
@@ -40,7 +40,7 @@
 // * px11 -> xy
 // So x * y is calculated first and then used to calculate all the other factors.
 static Sk4s SK_VECTORCALL bilerp4(Sk4s xs, Sk4s ys, Sk4f px00, Sk4f px10,
-                                                    Sk4f px01, Sk4f px11) {
+                               Sk4f px01, Sk4f px11) {
     // Calculate fractional xs and ys.
     Sk4s fxs = xs - xs.floor();
     Sk4s fys = ys - ys.floor();
@@ -134,21 +134,20 @@
 class PixelConverter<kIndex_8_SkColorType, gammaType> {
 public:
     using Element = uint8_t;
-    PixelConverter(const SkPixmap& srcPixmap)
-    : fColorTableSize(srcPixmap.ctable()->count()){
+    PixelConverter(const SkPixmap& srcPixmap) {
         SkColorTable* skColorTable = srcPixmap.ctable();
         SkASSERT(skColorTable != nullptr);
 
         fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
-        for (int i = 0; i < fColorTableSize; i++) {
+        for (int i = 0; i < skColorTable->count(); i++) {
             fColorTable[i] = pmcolor_to_rgba<gammaType>((*skColorTable)[i]);
         }
     }
 
-    PixelConverter(const PixelConverter& strategy)
-    : fColorTableSize{strategy.fColorTableSize}{
+    PixelConverter(const PixelConverter& strategy) {
         fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
-        for (int i = 0; i < fColorTableSize; i++) {
+        // TODO: figure out the count.
+        for (int i = 0; i < 256; i++) {
             fColorTable[i] = strategy.fColorTable[i];
         }
     }
@@ -159,9 +158,9 @@
 
 private:
     static const size_t kColorTableSize = sizeof(Sk4f[256]) + 12;
-    const int           fColorTableSize;
-    SkAutoMalloc        fColorTableStorage{kColorTableSize};
-    Sk4f*               fColorTable;
+
+    SkAutoMalloc         fColorTableStorage{kColorTableSize};
+    Sk4f*                fColorTable;
 };
 
 template <SkGammaType gammaType>
@@ -195,12 +194,12 @@
         : fPixelAccessor(accessor) { }
 
     void SK_VECTORCALL getFewPixels(
-        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
+        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
         fPixelAccessor->getFewPixels(n, xs, ys, px0, px1, px2);
     }
 
     void SK_VECTORCALL get4Pixels(
-        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
+        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
         fPixelAccessor->get4Pixels(xs, ys, px0, px1, px2, px3);
     }
 
@@ -238,8 +237,10 @@
         , fConverter{srcPixmap, std::move<Args>(args)...} { }
 
     void SK_VECTORCALL getFewPixels (
-        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
-        Sk4i bufferLoc = ys * fWidth + xs;
+        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
+        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
+        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
+        Sk4i bufferLoc = YIs * fWidth + XIs;
         switch (n) {
             case 3:
                 *px2 = this->getPixelAt(bufferLoc[2]);
@@ -253,8 +254,10 @@
     }
 
     void SK_VECTORCALL get4Pixels(
-        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
-        Sk4i bufferLoc = ys * fWidth + xs;
+        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
+        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
+        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
+        Sk4i bufferLoc = YIs * fWidth + XIs;
         *px0 = this->getPixelAt(bufferLoc[0]);
         *px1 = this->getPixelAt(bufferLoc[1]);
         *px2 = this->getPixelAt(bufferLoc[2]);
@@ -327,7 +330,6 @@
     }
 }
 
-// -- NearestNeighborSampler -----------------------------------------------------------------------
 // NearestNeighborSampler - use nearest neighbor filtering to create runs of destination pixels.
 template<typename Accessor, typename Next>
 class NearestNeighborSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
@@ -343,7 +345,7 @@
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         SkASSERT(0 < n && n < 4);
         Sk4f px0, px1, px2;
-        fAccessor.getFewPixels(n, SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2);
+        fAccessor.getFewPixels(n, xs, ys, &px0, &px1, &px2);
         if (n >= 1) fNext->blendPixel(px0);
         if (n >= 2) fNext->blendPixel(px1);
         if (n >= 3) fNext->blendPixel(px2);
@@ -351,7 +353,7 @@
 
     void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
         Sk4f px0, px1, px2, px3;
-        fAccessor.get4Pixels(SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2, &px3);
+        fAccessor.get4Pixels(xs, ys, &px0, &px1, &px2, &px3);
         fNext->blend4Pixels(px0, px1, px2, px3);
     }
 
@@ -378,11 +380,21 @@
         }
     }
 
+    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override {
+        SkFAIL("Using nearest neighbor sampler, but calling a bilerpEdge.");
+    }
+
+    void bilerpSpan(Span span, SkScalar y) override {
+        SkFAIL("Using nearest neighbor sampler, but calling a bilerpSpan.");
+    }
+
 private:
     // When moving through source space more slowly than dst space (zoomed in),
     // we'll be sampling from the same source pixel more than once.
     void spanSlowRate(Span span) {
-        SkPoint start; SkScalar length; int count;
+        SkPoint start;
+        SkScalar length;
+        int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
         SkFixed fx = SkScalarToFixed(x);
@@ -439,82 +451,35 @@
     Accessor    fAccessor;
 };
 
-// From an edgeType, the integer value of a pixel vs, and the integer value of the extreme edge
-// vMax, take the point which might be off the tile by one pixel and either wrap it or pin it to
-// generate the right pixel. The value vs is on the interval [-1, vMax + 1]. It produces a value
-// on the interval [0, vMax].
-// Note: vMax is not width or height, but width-1 or height-1 because it is the largest valid pixel.
-static inline int adjust_edge(SkShader::TileMode edgeType, int vs, int vMax) {
-    SkASSERT(-1 <= vs && vs <= vMax + 1)
-    switch (edgeType) {
-        case SkShader::kClamp_TileMode:
-        case SkShader::kMirror_TileMode:
-            vs = std::max(vs, 0);
-            vs = std::min(vs, vMax);
-            break;
-        case SkShader::kRepeat_TileMode:
-            vs = (vs <= vMax) ? vs : 0;
-            vs =    (vs >= 0) ? vs : vMax;
-            break;
-    }
-    SkASSERT(0 <= vs && vs <= vMax);
-    return vs;
-}
-
-// From a sample point on the tile, return the top or left filter value.
-// The result r should be in the range (0, 1]. Since this represents the weight given to the top
-// left element, then if x == 0.5 the filter value should be 1.0.
-// The input sample point must be on the tile, therefore it must be >= 0.
-static SkScalar sample_to_filter(SkScalar x) {
-    SkASSERT(x >= 0.0f);
-    // The usual form of the top or left edge is x - .5, but since we are working on the unit
-    // square, then x + .5 works just as well. This also guarantees that v > 0.0 allowing the use
-    // of trunc.
-    SkScalar v = x + 0.5f;
-    // Produce the top or left offset a value on the range [0, 1).
-    SkScalar f = v - SkScalarTruncToScalar(v);
-    // Produce the filter value which is on the range (0, 1].
-    SkScalar r =  1.0f - f;
-    SkASSERT(0.0f < r && r <= 1.0f);
-    return r;
-}
-
 // -- BilerpSampler --------------------------------------------------------------------------------
 // BilerpSampler - use a bilerp filter to create runs of destination pixels.
-// Note: in the code below, there are two types of points
-//       * sample points - these are the points passed in by pointList* and Spans.
-//       * filter points - are created from a sample point to form the coordinates of the points
-//                         to use in the filter and to generate the filter values.
 template<typename Accessor, typename Next>
 class BilerpSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
 public:
     template<typename... Args>
-    BilerpSampler(
-        SkLinearBitmapPipeline::BlendProcessorInterface* next,
-        SkISize dimensions,
-        SkShader::TileMode xTile, SkShader::TileMode yTile,
-        Args&& ... args
-    )
-        : fNext{next}
-        , fXEdgeType{xTile}
-        , fXMax{dimensions.width() - 1}
-        , fYEdgeType{yTile}
-        , fYMax{dimensions.height() - 1}
-        , fAccessor{std::forward<Args>(args)...} { }
+    BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next, Args&& ... args)
+        : fNext{next}, fAccessor{std::forward<Args>(args)...} { }
 
     BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next,
                    const BilerpSampler& sampler)
-        : fNext{next}
-        , fXEdgeType{sampler.fXEdgeType}
-        , fXMax{sampler.fXMax}
-        , fYEdgeType{sampler.fYEdgeType}
-        , fYMax{sampler.fYMax}
-        , fAccessor{sampler.fAccessor} { }
+        : fNext{next}, fAccessor{sampler.fAccessor} { }
+
+    Sk4f bilerpNonEdgePixel(SkScalar x, SkScalar y) {
+        Sk4f px00, px10, px01, px11;
+
+        // bilerp4() expects xs, ys are the top-lefts of the 2x2 kernel.
+        Sk4f xs = Sk4f{x} - 0.5f;
+        Sk4f ys = Sk4f{y} - 0.5f;
+        Sk4f sampleXs = xs + Sk4f{0.0f, 1.0f, 0.0f, 1.0f};
+        Sk4f sampleYs = ys + Sk4f{0.0f, 0.0f, 1.0f, 1.0f};
+        fAccessor.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
+        return bilerp4(xs, ys, px00, px10, px01, px11);
+    }
 
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         SkASSERT(0 < n && n < 4);
         auto bilerpPixel = [&](int index) {
-            return this->bilerpSamplePoint(SkPoint{xs[index], ys[index]});
+            return this->bilerpNonEdgePixel(xs[index], ys[index]);
         };
 
         if (n >= 1) fNext->blendPixel(bilerpPixel(0));
@@ -524,56 +489,13 @@
 
     void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
         auto bilerpPixel = [&](int index) {
-            return this->bilerpSamplePoint(SkPoint{xs[index], ys[index]});
+            return this->bilerpNonEdgePixel(xs[index], ys[index]);
         };
         fNext->blend4Pixels(bilerpPixel(0), bilerpPixel(1), bilerpPixel(2), bilerpPixel(3));
     }
 
     void pointSpan(Span span) override {
-        SkASSERT(!span.isEmpty());
-        SkPoint start;
-        SkScalar length;
-        int count;
-        std::tie(start, length, count) = span;
-
-        // Nothing to do.
-        if (count == 0) {
-            return;
-        }
-
-        // Trivial case. No sample points are generated other than start.
-        if (count == 1) {
-            fNext->blendPixel(this->bilerpSamplePoint(start));
-            return;
-        }
-
-        // Note: the following code could be done in terms of dx = length / (count -1), but that
-        // would introduce a divide that is not needed for the most common dx == 1 cases.
-        SkScalar absLength = SkScalarAbs(length);
-        if (absLength == 0.0f) {
-            // |dx| == 0
-            // length is zero, so clamp an edge pixel.
-            this->spanZeroRate(span);
-        } else if (absLength < (count - 1)) {
-            // 0 < |dx| < 1.
-            this->spanSlowRate(span);
-        } else if (absLength == (count - 1)) {
-            // |dx| == 1.
-            if (sample_to_filter(span.startX()) == 1.0f
-                && sample_to_filter(span.startY()) == 1.0f) {
-                // All the pixels are aligned with the dest; go fast.
-                src_strategy_blend(span, fNext, &fAccessor);
-            } else {
-                // There is some sub-pixel offsets, so bilerp.
-                this->spanUnitRate(span);
-            }
-        } else if (absLength < 2.0f * (count - 1)) {
-            // 1 < |dx| < 2.
-            this->spanMediumRate(span);
-        } else {
-            // |dx| >= 2.
-            this->spanFastRate(span);
-        }
+        this->bilerpSpan(span, span.startY());
     }
 
     void repeatSpan(Span span, int32_t repeatCount) override {
@@ -583,425 +505,292 @@
         }
     }
 
-private:
-
-    // Convert a sample point to the points used by the filter.
-    void filterPoints(SkPoint sample, Sk4i* filterXs, Sk4i* filterYs) {
-        // May be less than zero. Be careful to use Floor.
-        int x0 = adjust_edge(fXEdgeType, SkScalarFloorToInt(X(sample) - 0.5), fXMax);
-        // Always greater than zero. Use the faster Trunc.
-        int x1 = adjust_edge(fXEdgeType, SkScalarTruncToInt(X(sample) + 0.5), fXMax);
-        int y0 = adjust_edge(fYEdgeType, SkScalarFloorToInt(Y(sample) - 0.5), fYMax);
-        int y1 = adjust_edge(fYEdgeType, SkScalarTruncToInt(Y(sample) + 0.5), fYMax);
-
-        *filterXs = Sk4i{x0, x1, x0, x1};
-        *filterYs = Sk4i{y0, y0, y1, y1};
-    }
-
-    // Given a sample point, generate a color by bilerping the four filter points.
-    Sk4f bilerpSamplePoint(SkPoint sample) {
-        Sk4i iXs, iYs;
-        filterPoints(sample, &iXs, &iYs);
+    void SK_VECTORCALL bilerpEdge(Sk4s sampleXs, Sk4s sampleYs) override {
         Sk4f px00, px10, px01, px11;
-        fAccessor.get4Pixels(iXs, iYs, &px00, &px10, &px01, &px11);
-        return bilerp4(Sk4f{X(sample) - 0.5f}, Sk4f{Y(sample) - 0.5f}, px00, px10, px01, px11);
+        Sk4f xs = Sk4f{sampleXs[0]};
+        Sk4f ys = Sk4f{sampleYs[0]};
+        fAccessor.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
+        Sk4f pixel = bilerp4(xs, ys, px00, px10, px01, px11);
+        fNext->blendPixel(pixel);
     }
 
-    // Get two pixels at x from row0 and row1.
-    void get2PixelColumn(const void* row0, const void* row1, int x, Sk4f* px0, Sk4f* px1) {
-        *px0 = fAccessor.getPixelFromRow(row0, x);
-        *px1 = fAccessor.getPixelFromRow(row1, x);
-    }
-
-    // |dx| == 0. This code assumes that length is zero.
-    void spanZeroRate(Span span) {
-        SkPoint start; SkScalar length; int count;
+    void bilerpSpan(Span span, SkScalar y) override {
+        SkASSERT(!span.isEmpty());
+        SkPoint start;
+        SkScalar length;
+        int count;
         std::tie(start, length, count) = span;
-        SkASSERT(length == 0.0f);
+        SkScalar absLength = SkScalarAbs(length);
+        if (absLength == 0.0f) {
+            this->spanZeroRate(span, y);
+        } else if (absLength < (count - 1)) {
+            this->spanSlowRate(span, y);
+        } else if (absLength == (count - 1)) {
+            if (std::fmod(span.startX() - 0.5f, 1.0f) == 0.0f) {
+                if (std::fmod(span.startY() - 0.5f, 1.0f) == 0.0f) {
+                    src_strategy_blend(span, fNext, &fAccessor);
+                } else {
+                    this->spanUnitRateAlignedX(span, y);
+                }
+            } else {
+                this->spanUnitRate(span, y);
+            }
+        } else {
+            this->spanFastRate(span, y);
+        }
+    }
 
-        // Filter for the blending of the top and bottom pixels.
-        SkScalar filterY = sample_to_filter(Y(start));
-
-        // Generate the four filter points from the sample point start. Generate the row* values.
-        Sk4i iXs, iYs;
-        this->filterPoints(start, &iXs, &iYs);
-        const void* const row0 = fAccessor.row(iYs[0]);
-        const void* const row1 = fAccessor.row(iYs[2]);
-
-        // Get the two pixels that make up the clamping pixel.
-        Sk4f pxTop, pxBottom;
-        this->get2PixelColumn(row0, row1, SkScalarFloorToInt(X(start)), &pxTop, &pxBottom);
-        Sk4f pixel = pxTop * filterY + (1.0f - filterY) * pxBottom;
-
+private:
+    void spanZeroRate(Span span, SkScalar y1) {
+        SkScalar y0 = span.startY() - 0.5f;
+        y1 += 0.5f;
+        int iy0 = SkScalarFloorToInt(y0);
+        SkScalar filterY1 = y0 - iy0;
+        SkScalar filterY0 = 1.0f - filterY1;
+        int iy1 = SkScalarFloorToInt(y1);
+        int ix = SkScalarFloorToInt(span.startX());
+        Sk4f pixelY0 = fAccessor.getPixelFromRow(fAccessor.row(iy0), ix);
+        Sk4f pixelY1 = fAccessor.getPixelFromRow(fAccessor.row(iy1), ix);
+        Sk4f filterPixel = pixelY0 * filterY0 + pixelY1 * filterY1;
+        int count = span.count();
         while (count >= 4) {
-            fNext->blend4Pixels(pixel, pixel, pixel, pixel);
+            fNext->blend4Pixels(filterPixel, filterPixel, filterPixel, filterPixel);
             count -= 4;
         }
         while (count > 0) {
-            fNext->blendPixel(pixel);
+            fNext->blendPixel(filterPixel);
             count -= 1;
         }
     }
 
-    // 0 < |dx| < 1. This code reuses the calculations from previous pixels to reduce
-    // computation. In particular, several destination pixels maybe generated from the same four
-    // source pixels.
-    // In the following code a "part" is a combination of two pixels from the same column of the
-    // filter.
-    void spanSlowRate(Span span) {
-        SkPoint start; SkScalar length; int count;
+    // When moving through source space more slowly than dst space (zoomed in),
+    // we'll be sampling from the same source pixel more than once.
+    void spanSlowRate(Span span, SkScalar ry1) {
+        SkPoint start;
+        SkScalar length;
+        int count;
         std::tie(start, length, count) = span;
+        SkFixed fx = SkScalarToFixed(X(start)-0.5f);
 
-        // Calculate the distance between each sample point.
-        const SkScalar dx = length / (count - 1);
-        SkASSERT(-1.0f < dx && dx < 1.0f && dx != 0.0f);
+        SkFixed fdx = SkScalarToFixed(length / (count - 1));
 
-        // Generate the filter values for the top-left corner.
-        // Note: these values are in filter space; this has implications about how to adjust
-        // these values at each step. For example, as the sample point increases, the filter
-        // value decreases, this is because the filter and position are related by
-        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
-        // direction of the sample point which is increasing by dx.
-        SkScalar filterX = sample_to_filter(X(start));
-        SkScalar filterY = sample_to_filter(Y(start));
-
-        // Generate the four filter points from the sample point start. Generate the row* values.
-        Sk4i iXs, iYs;
-        this->filterPoints(start, &iXs, &iYs);
-        const void* const row0 = fAccessor.row(iYs[0]);
-        const void* const row1 = fAccessor.row(iYs[2]);
-
-        // Generate part of the filter value at xColumn.
-        auto partAtColumn = [&](int xColumn) {
-            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
-            Sk4f pxTop, pxBottom;
-            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
-            return pxTop * filterY + (1.0f - filterY) * pxBottom;
-        };
-
-        // The leftPart is made up of two pixels from the left column of the filter, right part
-        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
-        // the top and bottom pixels using filterY. See the partAtColumn function above.
-        Sk4f leftPart  = partAtColumn(iXs[0]);
-        Sk4f rightPart = partAtColumn(iXs[1]);
-
-        // Create a destination color by blending together a left and right part using filterX.
-        auto bilerp = [&]() {
-            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
-            return check_pixel(pixel);
-        };
-
-        // Send the first pixel to the destination. This simplifies the loop structure so that no
-        // extra pixels are fetched for the last iteration of the loop.
-        fNext->blendPixel(bilerp());
-        count -= 1;
-
-        if (dx > 0.0f) {
-            // * positive direction - generate destination pixels by sliding the filter from left
-            //                        to right.
-            int rightPartCursor = iXs[1];
-
-            // Advance the filter from left to right. Remember that moving the top-left corner of
-            // the filter to the right actually makes the filter value smaller.
-            auto advanceFilter = [&]() {
-                filterX -= dx;
-                if (filterX <= 0.0f) {
-                    filterX += 1.0f;
-                    leftPart = rightPart;
-                    rightPartCursor += 1;
-                    rightPart = partAtColumn(rightPartCursor);
-                }
-                SkASSERT(0.0f < filterX && filterX <= 1.0f);
-
-                return bilerp();
-            };
-
-            while (count >= 4) {
-                Sk4f px0 = advanceFilter(),
-                     px1 = advanceFilter(),
-                     px2 = advanceFilter(),
-                     px3 = advanceFilter();
-                fNext->blend4Pixels(px0, px1, px2, px3);
-                count -= 4;
+        Sk4f xAdjust;
+        if (fdx >= 0) {
+            xAdjust = Sk4f{-1.0f};
+        } else {
+            xAdjust = Sk4f{1.0f};
+        }
+        int ix = SkFixedFloorToInt(fx);
+        int ioldx = ix;
+        Sk4f x{SkFixedToScalar(fx) - ix};
+        Sk4f dx{SkFixedToScalar(fdx)};
+        SkScalar ry0 = Y(start) - 0.5f;
+        ry1 += 0.5f;
+        SkScalar yFloor = std::floor(ry0);
+        Sk4f y1 = Sk4f{ry0 - yFloor};
+        Sk4f y0 = Sk4f{1.0f} - y1;
+        const void* const row0 = fAccessor.row(SkScalarFloorToInt(ry0));
+        const void* const row1 = fAccessor.row(SkScalarFloorToInt(ry1));
+        Sk4f fpixel00 = y0 * fAccessor.getPixelFromRow(row0, ix);
+        Sk4f fpixel01 = y1 * fAccessor.getPixelFromRow(row1, ix);
+        Sk4f fpixel10 = y0 * fAccessor.getPixelFromRow(row0, ix + 1);
+        Sk4f fpixel11 = y1 * fAccessor.getPixelFromRow(row1, ix + 1);
+        auto getNextPixel = [&]() {
+            if (ix != ioldx) {
+                fpixel00 = fpixel10;
+                fpixel01 = fpixel11;
+                fpixel10 = y0 * fAccessor.getPixelFromRow(row0, ix + 1);
+                fpixel11 = y1 * fAccessor.getPixelFromRow(row1, ix + 1);
+                ioldx = ix;
+                x = x + xAdjust;
             }
 
+            Sk4f x0, x1;
+            x0 = Sk4f{1.0f} - x;
+            x1 = x;
+            Sk4f fpixel = x0 * (fpixel00 + fpixel01) + x1 * (fpixel10 + fpixel11);
+            fx += fdx;
+            ix = SkFixedFloorToInt(fx);
+            x = x + dx;
+            return fpixel;
+        };
+
+        while (count >= 4) {
+            Sk4f fpixel0 = getNextPixel();
+            Sk4f fpixel1 = getNextPixel();
+            Sk4f fpixel2 = getNextPixel();
+            Sk4f fpixel3 = getNextPixel();
+
+            fNext->blend4Pixels(fpixel0, fpixel1, fpixel2, fpixel3);
+            count -= 4;
+        }
+
+        while (count > 0) {
+            fNext->blendPixel(getNextPixel());
+
+            count -= 1;
+        }
+    }
+
+    // We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
+    // We'll never re-use pixels, but we can at least load contiguous pixels.
+    void spanUnitRate(Span span, SkScalar y1) {
+        y1 += 0.5f;
+        SkScalar y0 = span.startY() - 0.5f;
+        int iy0 = SkScalarFloorToInt(y0);
+        SkScalar filterY1 = y0 - iy0;
+        SkScalar filterY0 = 1.0f - filterY1;
+        int iy1 = SkScalarFloorToInt(y1);
+        const void* rowY0 = fAccessor.row(iy0);
+        const void* rowY1 = fAccessor.row(iy1);
+        SkScalar x0 = span.startX() - 0.5f;
+        int ix0 = SkScalarFloorToInt(x0);
+        SkScalar filterX1 = x0 - ix0;
+        SkScalar filterX0 = 1.0f - filterX1;
+
+        auto getPixelY0 = [&]() {
+            Sk4f px = fAccessor.getPixelFromRow(rowY0, ix0);
+            return px * filterY0;
+        };
+
+        auto getPixelY1 = [&]() {
+            Sk4f px = fAccessor.getPixelFromRow(rowY1, ix0);
+            return px * filterY1;
+        };
+
+        auto get4PixelsY0 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+            fAccessor.get4Pixels(rowY0, ix, px0, px1, px2, px3);
+            *px0 = *px0 * filterY0;
+            *px1 = *px1 * filterY0;
+            *px2 = *px2 * filterY0;
+            *px3 = *px3 * filterY0;
+        };
+
+        auto get4PixelsY1 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
+            fAccessor.get4Pixels(rowY1, ix, px0, px1, px2, px3);
+            *px0 = *px0 * filterY1;
+            *px1 = *px1 * filterY1;
+            *px2 = *px2 * filterY1;
+            *px3 = *px3 * filterY1;
+        };
+
+        auto lerp = [&](Sk4f& pixelX0, Sk4f& pixelX1) {
+            return pixelX0 * filterX0 + pixelX1 * filterX1;
+        };
+
+        // Seed pxB with the y-filtered pixel at ix0; the 4-wide loops below carry it forward.
+        Sk4f pxB = getPixelY0() + getPixelY1();
+        if (span.length() > 0) {
+            int count = span.count();
+            while (count >= 4) {
+                Sk4f px00, px10, px20, px30;
+                get4PixelsY0(ix0, &px00, &px10, &px20, &px30);
+                Sk4f px01, px11, px21, px31;
+                get4PixelsY1(ix0, &px01, &px11, &px21, &px31);
+                Sk4f pxS0 = px00 + px01;
+                Sk4f px0 = lerp(pxB, pxS0);
+                Sk4f pxS1 = px10 + px11;
+                Sk4f px1 = lerp(pxS0, pxS1);
+                Sk4f pxS2 = px20 + px21;
+                Sk4f px2 = lerp(pxS1, pxS2);
+                Sk4f pxS3 = px30 + px31;
+                Sk4f px3 = lerp(pxS2, pxS3);
+                pxB = pxS3;
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                ix0 += 4;
+                count -= 4;
+            }
             while (count > 0) {
-                fNext->blendPixel(advanceFilter());
+                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix0);
+                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix0);
+
+                fNext->blendPixel(lerp(pixelY0, pixelY1));
+                ix0 += 1;
                 count -= 1;
             }
         } else {
-            // * negative direction - generate destination pixels by sliding the filter from
-            //                        right to left.
-            int leftPartCursor = iXs[0];
-
-            // Advance the filter from right to left. Remember that moving the top-left corner of
-            // the filter to the left actually makes the filter value larger.
-            auto advanceFilter = [&]() {
-                // Remember, dx < 0 therefore this adds |dx| to filterX.
-                filterX -= dx;
-                // At this point filterX may be > 1, and needs to be wrapped back on to the filter
-                // interval, and the next column in the filter is calculated.
-                if (filterX > 1.0f) {
-                    filterX -= 1.0f;
-                    rightPart = leftPart;
-                    leftPartCursor -= 1;
-                    leftPart = partAtColumn(leftPartCursor);
-                }
-                SkASSERT(0.0f < filterX && filterX <= 1.0f);
-
-                return bilerp();
-            };
-
+            int count = span.count();
             while (count >= 4) {
-                Sk4f px0 = advanceFilter(),
-                     px1 = advanceFilter(),
-                     px2 = advanceFilter(),
-                     px3 = advanceFilter();
+                Sk4f px00, px10, px20, px30;
+                get4PixelsY0(ix0 - 3, &px00, &px10, &px20, &px30);
+                Sk4f px01, px11, px21, px31;
+                get4PixelsY1(ix0 - 3, &px01, &px11, &px21, &px31);
+                Sk4f pxS3 = px30 + px31;
+                Sk4f px0 = lerp(pxS3, pxB);
+                Sk4f pxS2 = px20 + px21;
+                Sk4f px1 = lerp(pxS2, pxS3);
+                Sk4f pxS1 = px10 + px11;
+                Sk4f px2 = lerp(pxS1, pxS2);
+                Sk4f pxS0 = px00 + px01;
+                Sk4f px3 = lerp(pxS0, pxS1);
+                pxB = pxS0;
                 fNext->blend4Pixels(px0, px1, px2, px3);
+                ix0 -= 4;
                 count -= 4;
             }
-
             while (count > 0) {
-                fNext->blendPixel(advanceFilter());
+                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix0);
+                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix0);
+
+                fNext->blendPixel(lerp(pixelY0, pixelY1));
+                ix0 -= 1;
                 count -= 1;
             }
         }
     }
 
-    // |dx| == 1. Moving through source space at a rate of 1 source pixel per 1 dst pixel.
-    // Every filter part is used for two destination pixels, and the code can bulk load four
-    // pixels at a time.
-    void spanUnitRate(Span span) {
-        SkPoint start; SkScalar length; int count;
-        std::tie(start, length, count) = span;
-        SkASSERT(SkScalarAbs(length) == (count - 1));
-
-        // Calculate the four filter points of start, and use the two different Y values to
-        // generate the row pointers.
-        Sk4i iXs, iYs;
-        filterPoints(start, &iXs, &iYs);
-        const void* row0 = fAccessor.row(iYs[0]);
-        const void* row1 = fAccessor.row(iYs[2]);
-
-        // Calculate the filter values for the top-left filter element.
-        const SkScalar filterX = sample_to_filter(X(start));
-        const SkScalar filterY = sample_to_filter(Y(start));
-
-        // Generate part of the filter value at xColumn.
-        auto partAtColumn = [&](int xColumn) {
-            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
-            Sk4f pxTop, pxBottom;
-            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
-            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+    void spanUnitRateAlignedX(Span span, SkScalar y1) {
+        SkScalar y0 = span.startY() - 0.5f;
+        y1 += 0.5f;
+        int iy0 = SkScalarFloorToInt(y0);
+        SkScalar filterY1 = y0 - iy0;
+        SkScalar filterY0 = 1.0f - filterY1;
+        int iy1 = SkScalarFloorToInt(y1);
+        int ix = SkScalarFloorToInt(span.startX());
+        const void* rowY0 = fAccessor.row(iy0);
+        const void* rowY1 = fAccessor.row(iy1);
+        auto lerp = [&](Sk4f* pixelY0, Sk4f* pixelY1) {
+            return *pixelY0 * filterY0 + *pixelY1 * filterY1;
         };
 
-        auto get4Parts = [&](int ix, Sk4f* part0, Sk4f* part1, Sk4f* part2, Sk4f* part3) {
-            // Check if the pixels needed are near the edges. If not go fast using bulk pixels,
-            // otherwise be careful.
-            if (0 <= ix && ix <= fXMax - 3) {
-                Sk4f px00, px10, px20, px30,
-                     px01, px11, px21, px31;
-                fAccessor.get4Pixels(row0, ix, &px00, &px10, &px20, &px30);
-                fAccessor.get4Pixels(row1, ix, &px01, &px11, &px21, &px31);
-                *part0 = filterY * px00 + (1.0f - filterY) * px01;
-                *part1 = filterY * px10 + (1.0f - filterY) * px11;
-                *part2 = filterY * px20 + (1.0f - filterY) * px21;
-                *part3 = filterY * px30 + (1.0f - filterY) * px31;
-            } else {
-                *part0 = partAtColumn(ix + 0);
-                *part1 = partAtColumn(ix + 1);
-                *part2 = partAtColumn(ix + 2);
-                *part3 = partAtColumn(ix + 3);
-            }
-        };
-
-        auto bilerp = [&](Sk4f& part0, Sk4f& part1) {
-            return part0 * filterX + part1 * (1.0f - filterX);
-        };
-
-        if (length > 0) {
-            // * positive direction - generate destination pixels by sliding the filter from left
-            //                        to right.
-
-            // overlapPart is the filter part from the end of the previous four pixels used at
-            // the start of the next four pixels.
-            Sk4f overlapPart = partAtColumn(iXs[0]);
-            int rightColumnCursor = iXs[1];
+        if (span.length() > 0) {
+            int count = span.count();
             while (count >= 4) {
-                Sk4f part0, part1, part2, part3;
-                get4Parts(rightColumnCursor, &part0, &part1, &part2, &part3);
-                Sk4f px0 = bilerp(overlapPart, part0);
-                Sk4f px1 = bilerp(part0, part1);
-                Sk4f px2 = bilerp(part1, part2);
-                Sk4f px3 = bilerp(part2, part3);
-                overlapPart = part3;
-                fNext->blend4Pixels(px0, px1, px2, px3);
-                rightColumnCursor += 4;
+                Sk4f px00, px10, px20, px30;
+                fAccessor.get4Pixels(rowY0, ix, &px00, &px10, &px20, &px30);
+                Sk4f px01, px11, px21, px31;
+                fAccessor.get4Pixels(rowY1, ix, &px01, &px11, &px21, &px31);
+                fNext->blend4Pixels(
+                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
+                ix += 4;
                 count -= 4;
             }
-
             while (count > 0) {
-                Sk4f rightPart = partAtColumn(rightColumnCursor);
+                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix);
+                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix);
 
-                fNext->blendPixel(bilerp(overlapPart, rightPart));
-                overlapPart = rightPart;
-                rightColumnCursor += 1;
+                fNext->blendPixel(lerp(&pixelY0, &pixelY1));
+                ix += 1;
                 count -= 1;
             }
         } else {
-            // * negative direction - generate destination pixels by sliding the filter from
-            //                        right to left.
-            Sk4f overlapPart = partAtColumn(iXs[1]);
-            int leftColumnCursor = iXs[0];
-
+            int count = span.count();
             while (count >= 4) {
-                Sk4f part0, part1, part2, part3;
-                get4Parts(leftColumnCursor - 3, &part3, &part2, &part1, &part0);
-                Sk4f px0 = bilerp(part0, overlapPart);
-                Sk4f px1 = bilerp(part1, part0);
-                Sk4f px2 = bilerp(part2, part1);
-                Sk4f px3 = bilerp(part3, part2);
-                overlapPart = part3;
-                fNext->blend4Pixels(px0, px1, px2, px3);
-                leftColumnCursor -= 4;
+                Sk4f px00, px10, px20, px30;
+                fAccessor.get4Pixels(rowY0, ix - 3, &px30, &px20, &px10, &px00);
+                Sk4f px01, px11, px21, px31;
+                fAccessor.get4Pixels(rowY1, ix - 3, &px31, &px21, &px11, &px01);
+                fNext->blend4Pixels(
+                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
+                ix -= 4;
                 count -= 4;
             }
-
             while (count > 0) {
-                Sk4f leftPart = partAtColumn(leftColumnCursor);
+                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix);
+                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix);
 
-                fNext->blendPixel(bilerp(leftPart, overlapPart));
-                overlapPart = leftPart;
-                leftColumnCursor -= 1;
-                count -= 1;
-            }
-        }
-    }
-
-    // 1 < |dx| < 2. Going through the source pixels at a faster rate than the dest pixels, but
-    // still slow enough to take advantage of previous calculations.
-    void spanMediumRate(Span span) {
-        SkPoint start; SkScalar length; int count;
-        std::tie(start, length, count) = span;
-
-        // Calculate the distance between each sample point.
-        const SkScalar dx = length / (count - 1);
-        SkASSERT((-2.0f < dx && dx < -1.0f) || (1.0f < dx && dx < 2.0f));
-
-        // Generate the filter values for the top-left corner.
-        // Note: these values are in filter space; this has implications about how to adjust
-        // these values at each step. For example, as the sample point increases, the filter
-        // value decreases, this is because the filter and position are related by
-        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
-        // direction of the sample point which is increasing by dx.
-        SkScalar filterX = sample_to_filter(X(start));
-        SkScalar filterY = sample_to_filter(Y(start));
-
-        // Generate the four filter points from the sample point start. Generate the row* values.
-        Sk4i iXs, iYs;
-        this->filterPoints(start, &iXs, &iYs);
-        const void* const row0 = fAccessor.row(iYs[0]);
-        const void* const row1 = fAccessor.row(iYs[2]);
-
-        // Generate part of the filter value at xColumn.
-        auto partAtColumn = [&](int xColumn) {
-            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
-            Sk4f pxTop, pxBottom;
-            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
-            return pxTop * filterY + (1.0f - filterY) * pxBottom;
-        };
-
-        // The leftPart is made up of two pixels from the left column of the filter, right part
-        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
-        // the top and bottom pixels using filterY. See the nextPart function below.
-        Sk4f leftPart  = partAtColumn(iXs[0]);
-        Sk4f rightPart = partAtColumn(iXs[1]);
-
-        // Create a destination color by blending together a left and right part using filterX.
-        auto bilerp = [&]() {
-            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
-            return check_pixel(pixel);
-        };
-
-        // Send the first pixel to the destination. This simplifies the loop structure so that no
-        // extra pixels are fetched for the last iteration of the loop.
-        fNext->blendPixel(bilerp());
-        count -= 1;
-
-        if (dx > 0.0f) {
-            // * positive direction - generate destination pixels by sliding the filter from left
-            //                        to right.
-            int rightPartCursor = iXs[1];
-
-            // Advance the filter from left to right. Remember that moving the top-left corner of
-            // the filter to the right actually makes the filter value smaller.
-            auto advanceFilter = [&]() {
-                filterX -= dx;
-                // At this point filterX is less than zero, but might actually be less than -1.
-                if (filterX > -1.0f) {
-                    filterX += 1.0f;
-                    leftPart = rightPart;
-                    rightPartCursor += 1;
-                    rightPart = partAtColumn(rightPartCursor);
-                } else {
-                    filterX += 2.0f;
-                    rightPartCursor += 2;
-                    leftPart = partAtColumn(rightPartCursor - 1);
-                    rightPart = partAtColumn(rightPartCursor);
-                }
-                SkASSERT(0.0f < filterX && filterX <= 1.0f);
-
-                return bilerp();
-            };
-
-            while (count >= 4) {
-                Sk4f px0 = advanceFilter(),
-                     px1 = advanceFilter(),
-                     px2 = advanceFilter(),
-                     px3 = advanceFilter();
-                fNext->blend4Pixels(px0, px1, px2, px3);
-                count -= 4;
-            }
-
-            while (count > 0) {
-                fNext->blendPixel(advanceFilter());
-                count -= 1;
-            }
-        } else {
-            // * negative direction - generate destination pixels by sliding the filter from
-            //                        right to left.
-            int leftPartCursor = iXs[0];
-
-            auto advanceFilter = [&]() {
-                // Remember, dx < 0 therefore this adds |dx| to filterX.
-                filterX -= dx;
-                // At this point, filterX is greater than one, but may actually be greater than two.
-                if (filterX < 2.0f) {
-                    filterX -= 1.0f;
-                    rightPart = leftPart;
-                    leftPartCursor -= 1;
-                    leftPart = partAtColumn(leftPartCursor);
-                } else {
-                    filterX -= 2.0f;
-                    leftPartCursor -= 2;
-                    rightPart = partAtColumn(leftPartCursor - 1);
-                    leftPart = partAtColumn(leftPartCursor);
-                }
-                SkASSERT(0.0f < filterX && filterX <= 1.0f);
-                return bilerp();
-            };
-
-            while (count >= 4) {
-                Sk4f px0 = advanceFilter(),
-                     px1 = advanceFilter(),
-                     px2 = advanceFilter(),
-                     px3 = advanceFilter();
-                fNext->blend4Pixels(px0, px1, px2, px3);
-                count -= 4;
-            }
-
-            while (count > 0) {
-                fNext->blendPixel(advanceFilter());
+                fNext->blendPixel(lerp(&pixelY0, &pixelY1));
+                ix -= 1;
                 count -= 1;
             }
         }
@@ -1009,26 +798,34 @@
 
     // We're moving through source space faster than dst (zoomed out),
     // so we'll never reuse a source pixel or be able to do contiguous loads.
-    void spanFastRate(Span span) {
-        SkPoint start; SkScalar length; int count;
+    void spanFastRate(Span span, SkScalar y1) {
+        SkPoint start;
+        SkScalar length;
+        int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
         SkScalar y = Y(start);
 
-        SkScalar dx = length / (count - 1);
-        while (count > 0) {
-            fNext->blendPixel(this->bilerpSamplePoint(SkPoint{x, y}));
-            x += dx;
-            count -= 1;
+        // In this sampler, it is assumed that if span.startY() and y1 are equal, then both
+        // y-lines are on the same tile.
+        if (y == y1) {
+            // Both y-lines are on the same tile.
+            span_fallback(span, this);
+        } else {
+            // The y-lines are on different tiles.
+            SkScalar dx = length / (count - 1);
+            Sk4f ys = {y - 0.5f, y - 0.5f, y1 + 0.5f, y1 + 0.5f};
+            while (count > 0) {
+                Sk4f xs = Sk4f{-0.5f, 0.5f, -0.5f, 0.5f} + Sk4f{x};
+                this->bilerpEdge(xs, ys);
+                x += dx;
+                count -= 1;
+            }
         }
     }
 
-    Next* const              fNext;
-    const SkShader::TileMode fXEdgeType;
-    const int                fXMax;
-    const SkShader::TileMode fYEdgeType;
-    const int                fYMax;
-    Accessor                 fAccessor;
+    Next* const fNext;
+    Accessor    fAccessor;
 };
 
 }  // namespace