In the current code, tiling and bilerp sampling are strongly tied together. They can be separated by taking advantage of the observation that when the bilerp stage translates a sample point into filter points, those filter points will be at most 0.5 outside the tile. This allows simplified repositioning for the various tiling modes: clamp and mirror use min and max, while repeat maps max -> 0 and 0 -> max. Bilerp can therefore handle the filter points that fall off the tile by itself, which allows tiling and bilerp sampling to be totally separate.

This CL has several parts that are intertwined:
* move pin/wrap functionality into BilerpSampler.
* remove the nearest neighbor and bilerp tilers
* create a simplified general tiler
* remove the pipeline virtual calls bilerpEdge and bilerpSpan because everything works off sample points now.
* redo all the bilerp sampling to use the new local methods to wrap/pin.
* introduce a new medium rate sampler that handles spans with 1 < |dx| < 2.

This change improves the performance as displayed below:
Most of the top 25 desktop pages improve or stay the same. A few are worse, but close to the noise floor. In addition, this change produces about 3% smaller code.

old time     new time   new/old
13274693  8414645  0.633886  top25desk_google_com_search_q_c.skp_1
4946466   3258018  0.658656  top25desk_wordpress.skp_1
6977187   5737584  0.822335  top25desk_youtube_com.skp_1
3770021   3296831  0.874486  top25desk_google_com__hl_en_q_b.skp_1
8890813   8600143  0.967307  top25desk_answers_yahoo_com.skp_1
3178974   3094300  0.973364  top25desk_facebook.skp_1
8871835   8711260  0.981901  top25desk_twitter.skp_1
838509    829290   0.989005  top25desk_blogger.skp_1
2821870   2801111  0.992644  top25desk_plus_google_com_11003.skp_1
511978    509530   0.995219  top25desk_techcrunch_com.skp_1
2408588   2397435  0.995369  top25desk_ebay_com.skp_1
4446919   4448004  1.00024   top25desk_espn.skp_1
2863241   2875696  1.00435   top25desk_google_com_calendar_.skp_1
7170086   7208447  1.00535   top25desk_booking_com.skp_1
7356109   7417776  1.00838   top25desk_pinterest.skp_1
5265591   5340392  1.01421   top25desk_weather_com.skp_1
5675244   5774144  1.01743   top25desk_sports_yahoo_com_.skp_1
1048531   1067663  1.01825   top25desk_games_yahoo_com.skp_1
2075501   2115131  1.01909   top25desk_amazon_com.skp_1
4262170   4370441  1.0254    top25desk_news_yahoo_com.skp_1
3789319   3897996  1.02868   top25desk_docs___1_open_documen.skp_1
919336    949979   1.03333   top25desk_wikipedia__1_tab_.skp_1
4274454   4489369  1.05028   top25desk_mail_google_com_mail_.skp_1
4149326   4376556  1.05476   top25desk_linkedin.skp_1

BUG=skia:5566
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2134893002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Committed: https://skia.googlesource.com/skia/+/8602ede5fdfa721dcad4dcb11db028c1c24265f1
Review-Url: https://codereview.chromium.org/2134893002
diff --git a/src/core/SkBitmapProcShader.h b/src/core/SkBitmapProcShader.h
index f21e3d6..6240348 100644
--- a/src/core/SkBitmapProcShader.h
+++ b/src/core/SkBitmapProcShader.h
@@ -54,7 +54,7 @@
     typedef SkShader INHERITED;
 };
 
-enum {kSkBlitterContextSize = 3200};
+enum {kSkBlitterContextSize = 3332};
 
 // Commonly used allocator. It currently is only used to allocate up to 3 objects. The total
 // bytes requested is calculated using one of our large shaders, its context size plus the size of
diff --git a/src/core/SkLinearBitmapPipeline.cpp b/src/core/SkLinearBitmapPipeline.cpp
index 088e829..44a3d77 100644
--- a/src/core/SkLinearBitmapPipeline.cpp
+++ b/src/core/SkLinearBitmapPipeline.cpp
@@ -165,15 +165,14 @@
 // Tile Stage
 
 template<typename XStrategy, typename YStrategy, typename Next>
-class NearestTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
+class CombinedTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
 public:
-    template <typename... Args>
-    NearestTileStage(Next* next, SkISize dimensions)
+    CombinedTileStage(Next* next, SkISize dimensions)
         : fNext{next}
         , fXStrategy{dimensions.width()}
         , fYStrategy{dimensions.height()}{ }
 
-    NearestTileStage(Next* next, const NearestTileStage& stage)
+    CombinedTileStage(Next* next, const CombinedTileStage& stage)
         : fNext{next}
         , fXStrategy{stage.fXStrategy}
         , fYStrategy{stage.fYStrategy} { }
@@ -195,9 +194,20 @@
         SkASSERT(!span.isEmpty());
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
+
+        if (span.count() == 1) {
+            // DANGER:
+            // The explicit casts from float to Sk4f are not usually necessary, but are here to
+            // work around an MSVC 2015u2 c++ code generation bug. This is tracked using skia bug
+            // 5566.
+            this->pointListFew(1, Sk4f{span.startX()}, Sk4f{span.startY()});
+            return;
+        }
+
         SkScalar x = X(start);
         SkScalar y = fYStrategy.tileY(Y(start));
         Span yAdjustedSpan{{x, y}, length, count};
+
         if (!fXStrategy.maybeProcessSpan(yAdjustedSpan, fNext)) {
             span_fallback(span, this);
         }
@@ -209,173 +219,27 @@
     YStrategy fYStrategy;
 };
 
-template<typename XStrategy, typename YStrategy, typename Next>
-class BilerpTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
-public:
-    template <typename... Args>
-    BilerpTileStage(Next* next, SkISize dimensions)
-        : fNext{next}
-        , fXMax(dimensions.width())
-        , fYMax(dimensions.height())
-        , fXStrategy{dimensions.width()}
-        , fYStrategy{dimensions.height()} { }
-
-    BilerpTileStage(Next* next, const BilerpTileStage& stage)
-        : fNext{next}
-        , fXMax{stage.fXMax}
-        , fYMax{stage.fYMax}
-        , fXStrategy{stage.fXStrategy}
-        , fYStrategy{stage.fYStrategy} { }
-
-    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
-        fXStrategy.tileXPoints(&xs);
-        fYStrategy.tileYPoints(&ys);
-        // TODO: check to see if xs and ys are in range then just call pointListFew on next.
-        if (n >= 1) this->bilerpPoint(xs[0], ys[0]);
-        if (n >= 2) this->bilerpPoint(xs[1], ys[1]);
-        if (n >= 3) this->bilerpPoint(xs[2], ys[2]);
-    }
-
-    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
-        fXStrategy.tileXPoints(&xs);
-        fYStrategy.tileYPoints(&ys);
-        // TODO: check to see if xs and ys are in range then just call pointList4 on next.
-        this->bilerpPoint(xs[0], ys[0]);
-        this->bilerpPoint(xs[1], ys[1]);
-        this->bilerpPoint(xs[2], ys[2]);
-        this->bilerpPoint(xs[3], ys[3]);
-    }
-
-    struct Wrapper {
-        void pointSpan(Span span) {
-            processor->breakIntoEdges(span);
-        }
-
-        void repeatSpan(Span span, int32_t repeatCount) {
-            while (repeatCount --> 0) {
-                processor->pointSpan(span);
-            }
-        }
-
-        BilerpTileStage* processor;
-    };
-
-    // The span you pass must not be empty.
-    void pointSpan(Span span) override {
-        SkASSERT(!span.isEmpty());
-
-        Wrapper wrapper = {this};
-        if (!fXStrategy.maybeProcessSpan(span, &wrapper)) {
-            span_fallback(span, this);
-        }
-    }
-
-private:
-    void bilerpPoint(SkScalar x, SkScalar y) {
-        Sk4f txs = Sk4f{x} + Sk4f{-0.5f, 0.5f, -0.5f, 0.5f};
-        Sk4f tys = Sk4f{y} + Sk4f{-0.5f, -0.5f, 0.5f, 0.5f};
-        fXStrategy.tileXPoints(&txs);
-        fYStrategy.tileYPoints(&tys);
-        fNext->bilerpEdge(txs, tys);
-    }
-
-    void handleEdges(Span span, SkScalar dx) {
-        SkPoint start; SkScalar length; int count;
-        std::tie(start, length, count) = span;
-        SkScalar x = X(start);
-        SkScalar y = Y(start);
-        SkScalar tiledY = fYStrategy.tileY(y);
-        while (count > 0) {
-            this->bilerpPoint(x, tiledY);
-            x += dx;
-            count -= 1;
-        }
-    }
-
-    void yProcessSpan(Span span) {
-        SkScalar tiledY = fYStrategy.tileY(span.startY());
-        if (0.5f <= tiledY && tiledY < fYMax - 0.5f ) {
-            Span tiledSpan{{span.startX(), tiledY}, span.length(), span.count()};
-            fNext->pointSpan(tiledSpan);
-        } else {
-            // Convert to the Y0 bilerp sample set by shifting by -0.5f. Then tile that new y
-            // value and shift it back resulting in the working Y0. Do the same thing with Y1 but
-            // in the opposite direction.
-            SkScalar y0 = fYStrategy.tileY(span.startY() - 0.5f) + 0.5f;
-            SkScalar y1 = fYStrategy.tileY(span.startY() + 0.5f) - 0.5f;
-            Span newSpan{{span.startX(), y0}, span.length(), span.count()};
-            fNext->bilerpSpan(newSpan, y1);
-        }
-    }
-    void breakIntoEdges(Span span) {
-        if (span.count() == 1) {
-            this->bilerpPoint(span.startX(), span.startY());
-        } else if (span.length() == 0) {
-            yProcessSpan(span);
-        } else {
-            SkScalar dx = span.length() / (span.count() - 1);
-            if (span.length() > 0) {
-                Span leftBorder = span.breakAt(0.5f, dx);
-                if (!leftBorder.isEmpty()) {
-                    this->handleEdges(leftBorder, dx);
-                }
-                Span center = span.breakAt(fXMax - 0.5f, dx);
-                if (!center.isEmpty()) {
-                    this->yProcessSpan(center);
-                }
-
-                if (!span.isEmpty()) {
-                    this->handleEdges(span, dx);
-                }
-            } else {
-                Span center = span.breakAt(fXMax + 0.5f, dx);
-                if (!span.isEmpty()) {
-                    this->handleEdges(span, dx);
-                }
-                Span leftEdge = center.breakAt(0.5f, dx);
-                if (!center.isEmpty()) {
-                    this->yProcessSpan(center);
-                }
-                if (!leftEdge.isEmpty()) {
-                    this->handleEdges(leftEdge, dx);
-                }
-
-            }
-        }
-    }
-
-    Next* const fNext;
-    SkScalar fXMax;
-    SkScalar fYMax;
-    XStrategy fXStrategy;
-    YStrategy fYStrategy;
-};
-
-template <typename XStrategy, typename YStrategy, typename Next>
-void make_tile_stage(
-    SkFilterQuality filterQuality, SkISize dimensions,
-    Next* next, SkLinearBitmapPipeline::TileStage* tileStage) {
-    if (filterQuality == kNone_SkFilterQuality) {
-        tileStage->initStage<NearestTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
-    } else {
-        tileStage->initStage<BilerpTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
-    }
-}
-template <typename XStrategy>
+template <typename XStrategy, typename Next>
 void choose_tiler_ymode(
     SkShader::TileMode yMode, SkFilterQuality filterQuality, SkISize dimensions,
-    SkLinearBitmapPipeline::SampleProcessorInterface* next,
+    Next* next,
     SkLinearBitmapPipeline::TileStage* tileStage) {
     switch (yMode) {
-        case SkShader::kClamp_TileMode:
-            make_tile_stage<XStrategy, YClampStrategy>(filterQuality, dimensions, next, tileStage);
+        case SkShader::kClamp_TileMode: {
+            using Tiler = CombinedTileStage<XStrategy, YClampStrategy, Next>;
+            tileStage->initStage<Tiler>(next, dimensions);
             break;
-        case SkShader::kRepeat_TileMode:
-            make_tile_stage<XStrategy, YRepeatStrategy>(filterQuality, dimensions, next, tileStage);
+        }
+        case SkShader::kRepeat_TileMode: {
+            using Tiler = CombinedTileStage<XStrategy, YRepeatStrategy, Next>;
+            tileStage->initStage<Tiler>(next, dimensions);
             break;
-        case SkShader::kMirror_TileMode:
-            make_tile_stage<XStrategy, YMirrorStrategy>(filterQuality, dimensions, next, tileStage);
+        }
+        case SkShader::kMirror_TileMode: {
+            using Tiler = CombinedTileStage<XStrategy, YMirrorStrategy, Next>;
+            tileStage->initStage<Tiler>(next, dimensions);
             break;
+        }
     }
 };
 
@@ -467,10 +331,6 @@
         fDest = dest;
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override { SkFAIL("Not Implemented"); }
-
-    void bilerpSpan(Span span, SkScalar y) override { SkFAIL("Not Implemented"); }
-
     void setDestination(void* dst, int count) override  {
         fDest = static_cast<uint32_t*>(dst);
         fEnd = fDest + count;
@@ -538,10 +398,6 @@
         SkASSERT(fDest <= fEnd);
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override { SkFAIL("Not Implemented"); }
-
-    void bilerpSpan(Span span, SkScalar y) override { SkFAIL("Not Implemented"); }
-
     void setDestination(void* dst, int count) override  {
         SkASSERT(count > 0);
         fDest = static_cast<uint32_t*>(dst);
@@ -582,12 +438,9 @@
     }
 }
 
-template<template <typename, typename> class Sampler>
-static SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler_base(
-    Blender* next,
+static SkLinearBitmapPipeline::PixelAccessorInterface* choose_pixel_accessor(
     const SkPixmap& srcPixmap,
     const SkColor A8TintColor,
-    SkLinearBitmapPipeline::SampleStage* sampleStage,
     SkLinearBitmapPipeline::Accessor* accessor)
 {
     const SkImageInfo& imageInfo = srcPixmap.info();
@@ -629,19 +482,19 @@
             break;
     }
 
-    using S = Sampler<PixelAccessorShim, Blender>;
-    sampleStage->initStage<S>(next, pixelAccessor);
-    return sampleStage->get();
+    return pixelAccessor;
 }
 
 SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler(
     Blender* next,
     SkFilterQuality filterQuality,
+    SkShader::TileMode xTile, SkShader::TileMode yTile,
     const SkPixmap& srcPixmap,
     const SkColor A8TintColor,
     SkLinearBitmapPipeline::SampleStage* sampleStage,
     SkLinearBitmapPipeline::Accessor* accessor) {
     const SkImageInfo& imageInfo = srcPixmap.info();
+    SkISize dimensions = imageInfo.dimensions();
 
     // Special case samplers with fully expanded templates
     if (imageInfo.gammaCloseToSRGB()) {
@@ -670,14 +523,14 @@
                     using S =
                     BilerpSampler<
                         PixelAccessor<kN32_SkColorType, kSRGB_SkGammaType>, Blender>;
-                    sampleStage->initStage<S>(next, srcPixmap);
+                    sampleStage->initStage<S>(next, dimensions, xTile, yTile, srcPixmap);
                     return sampleStage->get();
                 }
                 case kIndex_8_SkColorType: {
                     using S =
                     BilerpSampler<
                         PixelAccessor<kIndex_8_SkColorType, kSRGB_SkGammaType>, Blender>;
-                    sampleStage->initStage<S>(next, srcPixmap);
+                    sampleStage->initStage<S>(next, dimensions, xTile, yTile, srcPixmap);
                     return sampleStage->get();
                 }
                 default:
@@ -686,14 +539,16 @@
         }
     }
 
+    auto pixelAccessor = choose_pixel_accessor(srcPixmap, A8TintColor, accessor);
     // General cases.
     if (filterQuality == kNone_SkFilterQuality) {
-        return choose_pixel_sampler_base<NearestNeighborSampler>(
-            next, srcPixmap, A8TintColor, sampleStage, accessor);
+        using S = NearestNeighborSampler<PixelAccessorShim, Blender>;
+        sampleStage->initStage<S>(next, pixelAccessor);
     } else {
-        return choose_pixel_sampler_base<BilerpSampler>(
-            next, srcPixmap, A8TintColor, sampleStage, accessor);
+        using S = BilerpSampler<PixelAccessorShim, Blender>;
+        sampleStage->initStage<S>(next, dimensions, xTile, yTile, pixelAccessor);
     }
+    return sampleStage->get();
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -705,17 +560,17 @@
     SrcFPPixel(const SrcFPPixel& Blender) : fPostAlpha(Blender.fPostAlpha) {}
     void SK_VECTORCALL blendPixel(Sk4f pixel) override {
         SkASSERT(fDst + 1 <= fEnd );
-        SrcPixel(fDst, pixel, 0);
+        this->srcPixel(fDst, pixel, 0);
         fDst += 1;
     }
 
     void SK_VECTORCALL blend4Pixels(Sk4f p0, Sk4f p1, Sk4f p2, Sk4f p3) override {
         SkASSERT(fDst + 4 <= fEnd);
         SkPM4f* dst = fDst;
-        SrcPixel(dst, p0, 0);
-        SrcPixel(dst, p1, 1);
-        SrcPixel(dst, p2, 2);
-        SrcPixel(dst, p3, 3);
+        this->srcPixel(dst, p0, 0);
+        this->srcPixel(dst, p1, 1);
+        this->srcPixel(dst, p2, 2);
+        this->srcPixel(dst, p3, 3);
         fDst += 4;
     }
 
@@ -725,7 +580,9 @@
     }
 
 private:
-    void SK_VECTORCALL SrcPixel(SkPM4f* dst, Sk4f pixel, int index) {
+    void SK_VECTORCALL srcPixel(SkPM4f* dst, Sk4f pixel, int index) {
+        check_pixel(pixel);
+
         Sk4f newPixel = pixel;
         if (alphaType == kUnpremul_SkAlphaType) {
             newPixel = Premultiply(pixel);
@@ -797,7 +654,8 @@
     // identity matrix, the matrix stage is skipped, and the tilerStage is the first stage.
     auto blenderStage = choose_blender_for_shading(alphaType, postAlpha, &fBlenderStage);
     auto samplerStage = choose_pixel_sampler(
-        blenderStage, filterQuality, srcPixmap, paintColor, &fSampleStage, &fAccessor);
+        blenderStage, filterQuality, xTile, yTile,
+        srcPixmap, paintColor, &fSampleStage, &fAccessor);
     auto tilerStage   = choose_tiler(samplerStage, dimensions, xTile, yTile,
                                      filterQuality, dx, &fTileStage);
     fFirstStage       = choose_matrix(tilerStage, adjustedInverse, &fMatrixStage);
diff --git a/src/core/SkLinearBitmapPipeline.h b/src/core/SkLinearBitmapPipeline.h
index b0f7e9d..91b573d 100644
--- a/src/core/SkLinearBitmapPipeline.h
+++ b/src/core/SkLinearBitmapPipeline.h
@@ -133,9 +133,9 @@
     // These values were generated by the assert above in Stage::init{Sink|Stage}.
     using MatrixStage  = Stage<PointProcessorInterface,    160, PointProcessorInterface>;
     using TileStage    = Stage<PointProcessorInterface,    160, SampleProcessorInterface>;
-    using SampleStage  = Stage<SampleProcessorInterface,   100, BlendProcessorInterface>;
+    using SampleStage  = Stage<SampleProcessorInterface,   160, BlendProcessorInterface>;
     using BlenderStage = Stage<BlendProcessorInterface,     40>;
-    using Accessor     = PolyMemory<PixelAccessorInterface, 48>;
+    using Accessor     = PolyMemory<PixelAccessorInterface, 64>;
 
 private:
     PointProcessorInterface* fFirstStage;
diff --git a/src/core/SkLinearBitmapPipeline_core.h b/src/core/SkLinearBitmapPipeline_core.h
index 2c39a38..5ef6fca 100644
--- a/src/core/SkLinearBitmapPipeline_core.h
+++ b/src/core/SkLinearBitmapPipeline_core.h
@@ -178,6 +178,15 @@
         stage->pointListFew(count, xs, ys);
     }
 }
+
+inline Sk4f SK_VECTORCALL check_pixel(const Sk4f& pixel) {
+    SkASSERTF(0.0f <= pixel[0] && pixel[0] <= 1.0f, "pixel[0]: %f", pixel[0]);
+    SkASSERTF(0.0f <= pixel[1] && pixel[1] <= 1.0f, "pixel[1]: %f", pixel[1]);
+    SkASSERTF(0.0f <= pixel[2] && pixel[2] <= 1.0f, "pixel[2]: %f", pixel[2]);
+    SkASSERTF(0.0f <= pixel[3] && pixel[3] <= 1.0f, "pixel[3]: %f", pixel[3]);
+    return pixel;
+}
+
 }  // namespace
 
 class SkLinearBitmapPipeline::PointProcessorInterface {
@@ -201,26 +210,6 @@
     // Used for nearest neighbor when scale factor is 1.0. The span can just be repeated with no
     // edge pixel alignment problems. This is for handling a very common case.
     virtual void repeatSpan(Span span, int32_t repeatCount) = 0;
-
-    // The x's and y's are setup in the following order:
-    // +--------+--------+
-    // |        |        |
-    // |  px00  |  px10  |
-    // |    0   |    1   |
-    // +--------+--------+
-    // |        |        |
-    // |  px01  |  px11  |
-    // |    2   |    3   |
-    // +--------+--------+
-    // These pixels coordinates are arranged in the following order in xs and ys:
-    // px00  px10  px01  px11
-    virtual void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) = 0;
-
-    // A span represents sample points that have been mapped from destination space to source
-    // space. Each sample point is then expanded to the four bilerp points by add +/- 0.5. The
-    // resulting Y values my be off the tile. When y +/- 0.5 are more than 1 apart because of
-    // tiling, the second Y is used to denote the retiled Y value.
-    virtual void bilerpSpan(Span span, SkScalar y) = 0;
 };
 
 class SkLinearBitmapPipeline::DestinationInterface {
@@ -243,10 +232,10 @@
 public:
     virtual ~PixelAccessorInterface() { }
     virtual void SK_VECTORCALL getFewPixels(
-        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
 
     virtual void SK_VECTORCALL get4Pixels(
-        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
 
     virtual void get4Pixels(
         const void* src, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
diff --git a/src/core/SkLinearBitmapPipeline_sample.h b/src/core/SkLinearBitmapPipeline_sample.h
index 759075b..20057cc 100644
--- a/src/core/SkLinearBitmapPipeline_sample.h
+++ b/src/core/SkLinearBitmapPipeline_sample.h
@@ -40,7 +40,7 @@
 // * px11 -> xy
 // So x * y is calculated first and then used to calculate all the other factors.
 static Sk4s SK_VECTORCALL bilerp4(Sk4s xs, Sk4s ys, Sk4f px00, Sk4f px10,
-                               Sk4f px01, Sk4f px11) {
+                                                    Sk4f px01, Sk4f px11) {
     // Calculate fractional xs and ys.
     Sk4s fxs = xs - xs.floor();
     Sk4s fys = ys - ys.floor();
@@ -134,20 +134,21 @@
 class PixelConverter<kIndex_8_SkColorType, gammaType> {
 public:
     using Element = uint8_t;
-    PixelConverter(const SkPixmap& srcPixmap) {
+    PixelConverter(const SkPixmap& srcPixmap)
+    : fColorTableSize(srcPixmap.ctable()->count()){
         SkColorTable* skColorTable = srcPixmap.ctable();
         SkASSERT(skColorTable != nullptr);
 
         fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
-        for (int i = 0; i < skColorTable->count(); i++) {
+        for (int i = 0; i < fColorTableSize; i++) {
             fColorTable[i] = pmcolor_to_rgba<gammaType>((*skColorTable)[i]);
         }
     }
 
-    PixelConverter(const PixelConverter& strategy) {
+    PixelConverter(const PixelConverter& strategy)
+    : fColorTableSize{strategy.fColorTableSize}{
         fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
-        // TODO: figure out the count.
-        for (int i = 0; i < 256; i++) {
+        for (int i = 0; i < fColorTableSize; i++) {
             fColorTable[i] = strategy.fColorTable[i];
         }
     }
@@ -158,9 +159,9 @@
 
 private:
     static const size_t kColorTableSize = sizeof(Sk4f[256]) + 12;
-
-    SkAutoMalloc         fColorTableStorage{kColorTableSize};
-    Sk4f*                fColorTable;
+    const int           fColorTableSize;
+    SkAutoMalloc        fColorTableStorage{kColorTableSize};
+    Sk4f*               fColorTable;
 };
 
 template <SkGammaType gammaType>
@@ -194,12 +195,12 @@
         : fPixelAccessor(accessor) { }
 
     void SK_VECTORCALL getFewPixels(
-        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
         fPixelAccessor->getFewPixels(n, xs, ys, px0, px1, px2);
     }
 
     void SK_VECTORCALL get4Pixels(
-        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
         fPixelAccessor->get4Pixels(xs, ys, px0, px1, px2, px3);
     }
 
@@ -237,10 +238,8 @@
         , fConverter{srcPixmap, std::move<Args>(args)...} { }
 
     void SK_VECTORCALL getFewPixels (
-        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
-        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
-        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
-        Sk4i bufferLoc = YIs * fWidth + XIs;
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
+        Sk4i bufferLoc = ys * fWidth + xs;
         switch (n) {
             case 3:
                 *px2 = this->getPixelAt(bufferLoc[2]);
@@ -254,10 +253,8 @@
     }
 
     void SK_VECTORCALL get4Pixels(
-        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
-        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
-        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
-        Sk4i bufferLoc = YIs * fWidth + XIs;
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
+        Sk4i bufferLoc = ys * fWidth + xs;
         *px0 = this->getPixelAt(bufferLoc[0]);
         *px1 = this->getPixelAt(bufferLoc[1]);
         *px2 = this->getPixelAt(bufferLoc[2]);
@@ -330,6 +327,7 @@
     }
 }
 
+// -- NearestNeighborSampler -----------------------------------------------------------------------
 // NearestNeighborSampler - use nearest neighbor filtering to create runs of destination pixels.
 template<typename Accessor, typename Next>
 class NearestNeighborSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
@@ -345,7 +343,7 @@
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         SkASSERT(0 < n && n < 4);
         Sk4f px0, px1, px2;
-        fAccessor.getFewPixels(n, xs, ys, &px0, &px1, &px2);
+        fAccessor.getFewPixels(n, SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2);
         if (n >= 1) fNext->blendPixel(px0);
         if (n >= 2) fNext->blendPixel(px1);
         if (n >= 3) fNext->blendPixel(px2);
@@ -353,7 +351,7 @@
 
     void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
         Sk4f px0, px1, px2, px3;
-        fAccessor.get4Pixels(xs, ys, &px0, &px1, &px2, &px3);
+        fAccessor.get4Pixels(SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2, &px3);
         fNext->blend4Pixels(px0, px1, px2, px3);
     }
 
@@ -380,21 +378,11 @@
         }
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override {
-        SkFAIL("Using nearest neighbor sampler, but calling a bilerpEdge.");
-    }
-
-    void bilerpSpan(Span span, SkScalar y) override {
-        SkFAIL("Using nearest neighbor sampler, but calling a bilerpSpan.");
-    }
-
 private:
     // When moving through source space more slowly than dst space (zoomed in),
     // we'll be sampling from the same source pixel more than once.
     void spanSlowRate(Span span) {
-        SkPoint start;
-        SkScalar length;
-        int count;
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
         SkFixed fx = SkScalarToFixed(x);
@@ -451,35 +439,82 @@
     Accessor    fAccessor;
 };
 
+// Map a filter-point coordinate that may be one pixel off the tile back onto the tile.
+// edgeType selects the tiling mode, vs is the integer pixel coordinate on [-1, vMax + 1], and
+// vMax is the largest valid coordinate. Clamp and mirror pin vs to the nearest edge; repeat wraps
+// vMax + 1 to 0 and -1 to vMax. The result is on the interval [0, vMax].
+// Note: vMax is not width or height, but width-1 or height-1 because it is the largest valid pixel.
+static inline int adjust_edge(SkShader::TileMode edgeType, int vs, int vMax) {
+    SkASSERT(-1 <= vs && vs <= vMax + 1);
+    switch (edgeType) {
+        case SkShader::kClamp_TileMode:
+        case SkShader::kMirror_TileMode:
+            vs = std::max(vs, 0);
+            vs = std::min(vs, vMax);
+            break;
+        case SkShader::kRepeat_TileMode:
+            vs = (vs <= vMax) ? vs : 0;
+            vs =    (vs >= 0) ? vs : vMax;
+            break;
+    }
+    SkASSERT(0 <= vs && vs <= vMax);
+    return vs;
+}
+
+// From a sample point on the tile, return the top or left filter value.
+// The result r should be in the range (0, 1]. Since this represents the weight given to the top
+// left element, then if x == 0.5 the filter value should be 1.0.
+// The input sample point must be on the tile, therefore it must be >= 0.
+static SkScalar sample_to_filter(SkScalar x) {
+    SkASSERT(x >= 0.0f);
+    // The usual form of the top or left edge is x - .5, but since we are working on the unit
+    // square, then x + .5 works just as well. This also guarantees that v > 0.0 allowing the use
+    // of trunc.
+    SkScalar v = x + 0.5f;
+    // Produce the top or left offset a value on the range [0, 1).
+    SkScalar f = v - SkScalarTruncToScalar(v);
+    // Produce the filter value which is on the range (0, 1].
+    SkScalar r =  1.0f - f;
+    SkASSERT(0.0f < r && r <= 1.0f);
+    return r;
+}
+
 // -- BilerpSampler --------------------------------------------------------------------------------
 // BilerpSampler - use a bilerp filter to create runs of destination pixels.
+// Note: in the code below, there are two types of points
+//       * sample points - these are the points passed in by pointList* and Spans.
+//       * filter points - are created from a sample point to form the coordinates of the points
+//                         to use in the filter and to generate the filter values.
 template<typename Accessor, typename Next>
 class BilerpSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
 public:
     template<typename... Args>
-    BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next, Args&& ... args)
-        : fNext{next}, fAccessor{std::forward<Args>(args)...} { }
+    BilerpSampler(
+        SkLinearBitmapPipeline::BlendProcessorInterface* next,
+        SkISize dimensions,
+        SkShader::TileMode xTile, SkShader::TileMode yTile,
+        Args&& ... args
+    )
+        : fNext{next}
+        , fXEdgeType{xTile}
+        , fXMax{dimensions.width() - 1}
+        , fYEdgeType{yTile}
+        , fYMax{dimensions.height() - 1}
+        , fAccessor{std::forward<Args>(args)...} { }
 
     BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next,
                    const BilerpSampler& sampler)
-        : fNext{next}, fAccessor{sampler.fAccessor} { }
-
-    Sk4f bilerpNonEdgePixel(SkScalar x, SkScalar y) {
-        Sk4f px00, px10, px01, px11;
-
-        // bilerp4() expects xs, ys are the top-lefts of the 2x2 kernel.
-        Sk4f xs = Sk4f{x} - 0.5f;
-        Sk4f ys = Sk4f{y} - 0.5f;
-        Sk4f sampleXs = xs + Sk4f{0.0f, 1.0f, 0.0f, 1.0f};
-        Sk4f sampleYs = ys + Sk4f{0.0f, 0.0f, 1.0f, 1.0f};
-        fAccessor.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
-        return bilerp4(xs, ys, px00, px10, px01, px11);
-    }
+        : fNext{next}
+        , fXEdgeType{sampler.fXEdgeType}
+        , fXMax{sampler.fXMax}
+        , fYEdgeType{sampler.fYEdgeType}
+        , fYMax{sampler.fYMax}
+        , fAccessor{sampler.fAccessor} { }
 
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         SkASSERT(0 < n && n < 4);
         auto bilerpPixel = [&](int index) {
-            return this->bilerpNonEdgePixel(xs[index], ys[index]);
+            return this->bilerpSamplePoint(SkPoint{xs[index], ys[index]});
         };
 
         if (n >= 1) fNext->blendPixel(bilerpPixel(0));
@@ -489,13 +524,56 @@
 
     void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
         auto bilerpPixel = [&](int index) {
-            return this->bilerpNonEdgePixel(xs[index], ys[index]);
+            return this->bilerpSamplePoint(SkPoint{xs[index], ys[index]});
         };
         fNext->blend4Pixels(bilerpPixel(0), bilerpPixel(1), bilerpPixel(2), bilerpPixel(3));
     }
 
     void pointSpan(Span span) override {
-        this->bilerpSpan(span, span.startY());
+        SkASSERT(!span.isEmpty());  // Dispatch below is on |dx| = |length| / (count - 1).
+        SkPoint start;
+        SkScalar length;
+        int count;
+        std::tie(start, length, count) = span;
+
+        // Nothing to do.
+        if (count == 0) {
+            return;
+        }
+
+        // Trivial case. No sample points are generated other than start.
+        if (count == 1) {
+            fNext->blendPixel(this->bilerpSamplePoint(start));
+            return;
+        }
+
+        // Note: the following code could be done in terms of dx = length / (count - 1), but that
+        // would introduce a divide that is not needed for the most common dx == 1 cases.
+        SkScalar absLength = SkScalarAbs(length);
+        if (absLength == 0.0f) {
+            // |dx| == 0
+            // length is zero: every pixel of the span gets the same color.
+            this->spanZeroRate(span);
+        } else if (absLength < (count - 1)) {
+            // 0 < |dx| < 1.
+            this->spanSlowRate(span);
+        } else if (absLength == (count - 1)) {
+            // |dx| == 1.
+            if (sample_to_filter(span.startX()) == 1.0f
+                && sample_to_filter(span.startY()) == 1.0f) {
+                // Both filter values are 1: the pixels are aligned with the dest; go fast.
+                src_strategy_blend(span, fNext, &fAccessor);
+            } else {
+                // There are sub-pixel offsets, so bilerp.
+                this->spanUnitRate(span);
+            }
+        } else if (absLength < 2.0f * (count - 1)) {
+            // 1 < |dx| < 2.
+            this->spanMediumRate(span);
+        } else {
+            // |dx| >= 2.
+            this->spanFastRate(span);
+        }
     }
 
     void repeatSpan(Span span, int32_t repeatCount) override {
@@ -505,292 +583,425 @@
         }
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s sampleXs, Sk4s sampleYs) override {
-        Sk4f px00, px10, px01, px11;
-        Sk4f xs = Sk4f{sampleXs[0]};
-        Sk4f ys = Sk4f{sampleYs[0]};
-        fAccessor.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
-        Sk4f pixel = bilerp4(xs, ys, px00, px10, px01, px11);
-        fNext->blendPixel(pixel);
-    }
-
-    void bilerpSpan(Span span, SkScalar y) override {
-        SkASSERT(!span.isEmpty());
-        SkPoint start;
-        SkScalar length;
-        int count;
-        std::tie(start, length, count) = span;
-        SkScalar absLength = SkScalarAbs(length);
-        if (absLength == 0.0f) {
-            this->spanZeroRate(span, y);
-        } else if (absLength < (count - 1)) {
-            this->spanSlowRate(span, y);
-        } else if (absLength == (count - 1)) {
-            if (std::fmod(span.startX() - 0.5f, 1.0f) == 0.0f) {
-                if (std::fmod(span.startY() - 0.5f, 1.0f) == 0.0f) {
-                    src_strategy_blend(span, fNext, &fAccessor);
-                } else {
-                    this->spanUnitRateAlignedX(span, y);
-                }
-            } else {
-                this->spanUnitRate(span, y);
-            }
-        } else {
-            this->spanFastRate(span, y);
-        }
-    }
-
 private:
-    void spanZeroRate(Span span, SkScalar y1) {
-        SkScalar y0 = span.startY() - 0.5f;
-        y1 += 0.5f;
-        int iy0 = SkScalarFloorToInt(y0);
-        SkScalar filterY1 = y0 - iy0;
-        SkScalar filterY0 = 1.0f - filterY1;
-        int iy1 = SkScalarFloorToInt(y1);
-        int ix = SkScalarFloorToInt(span.startX());
-        Sk4f pixelY0 = fAccessor.getPixelFromRow(fAccessor.row(iy0), ix);
-        Sk4f pixelY1 = fAccessor.getPixelFromRow(fAccessor.row(iy1), ix);
-        Sk4f filterPixel = pixelY0 * filterY0 + pixelY1 * filterY1;
-        int count = span.count();
-        while (count >= 4) {
-            fNext->blend4Pixels(filterPixel, filterPixel, filterPixel, filterPixel);
-            count -= 4;
-        }
-        while (count > 0) {
-            fNext->blendPixel(filterPixel);
-            count -= 1;
-        }
+
+    // Map a sample point to its 2x2 filter points: lanes are (x0,y0), (x1,y0), (x0,y1), (x1,y1).
+    void filterPoints(SkPoint sample, Sk4i* filterXs, Sk4i* filterYs) {
+        // sample - 0.5 may be less than zero, so Floor (not Trunc) is required here.
+        int x0 = adjust_edge(fXEdgeType, SkScalarFloorToInt(X(sample) - 0.5), fXMax);
+        // sample + 0.5 is always greater than zero, so the faster Trunc can be used.
+        int x1 = adjust_edge(fXEdgeType, SkScalarTruncToInt(X(sample) + 0.5), fXMax);
+        int y0 = adjust_edge(fYEdgeType, SkScalarFloorToInt(Y(sample) - 0.5), fYMax);
+        int y1 = adjust_edge(fYEdgeType, SkScalarTruncToInt(Y(sample) + 0.5), fYMax);
+
+        *filterXs = Sk4i{x0, x1, x0, x1};
+        *filterYs = Sk4i{y0, y0, y1, y1};
     }
 
-    // When moving through source space more slowly than dst space (zoomed in),
-    // we'll be sampling from the same source pixel more than once.
-    void spanSlowRate(Span span, SkScalar ry1) {
-        SkPoint start;
-        SkScalar length;
-        int count;
+    // Given a sample point, generate a color by bilerping its four edge-adjusted filter points.
+    Sk4f bilerpSamplePoint(SkPoint sample) {
+        Sk4i iXs, iYs;
+        filterPoints(sample, &iXs, &iYs);  // Integer coordinates after adjust_edge.
+        Sk4f px00, px10, px01, px11;
+        fAccessor.get4Pixels(iXs, iYs, &px00, &px10, &px01, &px11);  // Fetch the 2x2 block.
+        return bilerp4(Sk4f{X(sample) - 0.5f}, Sk4f{Y(sample) - 0.5f}, px00, px10, px01, px11);
+    }
+
+    // Read the pixel at column x from row0 into *px0 and from row1 into *px1; x is used as given.
+    void get2PixelColumn(const void* row0, const void* row1, int x, Sk4f* px0, Sk4f* px1) {
+        *px0 = fAccessor.getPixelFromRow(row0, x);
+        *px1 = fAccessor.getPixelFromRow(row1, x);
+    }
+
+    // |dx| == 0. One color is computed and sent count times. Assumes that length is zero.
+    void spanZeroRate(Span span) {
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
-        SkFixed fx = SkScalarToFixed(X(start)-0.5f);
+        SkASSERT(length == 0.0f);
 
-        SkFixed fdx = SkScalarToFixed(length / (count - 1));
+        // Weight of the top pixel when blending the top and bottom pixels.
+        SkScalar filterY = sample_to_filter(Y(start));
 
-        Sk4f xAdjust;
-        if (fdx >= 0) {
-            xAdjust = Sk4f{-1.0f};
-        } else {
-            xAdjust = Sk4f{1.0f};
-        }
-        int ix = SkFixedFloorToInt(fx);
-        int ioldx = ix;
-        Sk4f x{SkFixedToScalar(fx) - ix};
-        Sk4f dx{SkFixedToScalar(fdx)};
-        SkScalar ry0 = Y(start) - 0.5f;
-        ry1 += 0.5f;
-        SkScalar yFloor = std::floor(ry0);
-        Sk4f y1 = Sk4f{ry0 - yFloor};
-        Sk4f y0 = Sk4f{1.0f} - y1;
-        const void* const row0 = fAccessor.row(SkScalarFloorToInt(ry0));
-        const void* const row1 = fAccessor.row(SkScalarFloorToInt(ry1));
-        Sk4f fpixel00 = y0 * fAccessor.getPixelFromRow(row0, ix);
-        Sk4f fpixel01 = y1 * fAccessor.getPixelFromRow(row1, ix);
-        Sk4f fpixel10 = y0 * fAccessor.getPixelFromRow(row0, ix + 1);
-        Sk4f fpixel11 = y1 * fAccessor.getPixelFromRow(row1, ix + 1);
-        auto getNextPixel = [&]() {
-            if (ix != ioldx) {
-                fpixel00 = fpixel10;
-                fpixel01 = fpixel11;
-                fpixel10 = y0 * fAccessor.getPixelFromRow(row0, ix + 1);
-                fpixel11 = y1 * fAccessor.getPixelFromRow(row1, ix + 1);
-                ioldx = ix;
-                x = x + xAdjust;
-            }
+        // Generate the four filter points from start; only their Y values are used, for row0/row1.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);
+        const void* const row1 = fAccessor.row(iYs[2]);
 
-            Sk4f x0, x1;
-            x0 = Sk4f{1.0f} - x;
-            x1 = x;
-            Sk4f fpixel = x0 * (fpixel00 + fpixel01) + x1 * (fpixel10 + fpixel11);
-            fx += fdx;
-            ix = SkFixedFloorToInt(fx);
-            x = x + dx;
-            return fpixel;
-        };
+        // Get the two pixels, at column floor(X(start)), that make up the clamping pixel.
+        Sk4f pxTop, pxBottom;
+        this->get2PixelColumn(row0, row1, SkScalarFloorToInt(X(start)), &pxTop, &pxBottom);
+        Sk4f pixel = pxTop * filterY + (1.0f - filterY) * pxBottom;
 
         while (count >= 4) {
-            Sk4f fpixel0 = getNextPixel();
-            Sk4f fpixel1 = getNextPixel();
-            Sk4f fpixel2 = getNextPixel();
-            Sk4f fpixel3 = getNextPixel();
-
-            fNext->blend4Pixels(fpixel0, fpixel1, fpixel2, fpixel3);
+            fNext->blend4Pixels(pixel, pixel, pixel, pixel);
             count -= 4;
         }
-
         while (count > 0) {
-            fNext->blendPixel(getNextPixel());
-
+            fNext->blendPixel(pixel);
             count -= 1;
         }
     }
 
-    // We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
-    // We'll never re-use pixels, but we can at least load contiguous pixels.
-    void spanUnitRate(Span span, SkScalar y1) {
-        y1 += 0.5f;
-        SkScalar y0 = span.startY() - 0.5f;
-        int iy0 = SkScalarFloorToInt(y0);
-        SkScalar filterY1 = y0 - iy0;
-        SkScalar filterY0 = 1.0f - filterY1;
-        int iy1 = SkScalarFloorToInt(y1);
-        const void* rowY0 = fAccessor.row(iy0);
-        const void* rowY1 = fAccessor.row(iy1);
-        SkScalar x0 = span.startX() - 0.5f;
-        int ix0 = SkScalarFloorToInt(x0);
-        SkScalar filterX1 = x0 - ix0;
-        SkScalar filterX0 = 1.0f - filterX1;
+    // 0 < |dx| < 1. This code reuses the calculations from previous pixels to reduce
+    // computation. In particular, several destination pixels may be generated from the same four
+    // source pixels.
+    // In the following code a "part" is a combination of two pixels from the same column of the
+    // filter.
+    void spanSlowRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
 
-        auto getPixelY0 = [&]() {
-            Sk4f px = fAccessor.getPixelFromRow(rowY0, ix0);
-            return px * filterY0;
+        // Calculate the distance between each sample point.
+        const SkScalar dx = length / (count - 1);
+        SkASSERT(-1.0f < dx && dx < 1.0f && dx != 0.0f);
+
+        // Generate the filter values for the top-left corner.
+        // Note: these values are in filter space; this has implications about how to adjust
+        // these values at each step. For example, as the sample point increases, the filter
+        // value decreases, this is because the filter and position are related by
+        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
+        // direction of the sample point which is increasing by dx.
+        SkScalar filterX = sample_to_filter(X(start));
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);
+        const void* const row1 = fAccessor.row(iYs[2]);
+
+        // Generate the part at the edge-adjusted xColumn: top and bottom blended by filterY.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
         };
 
-        auto getPixelY1 = [&]() {
-            Sk4f px = fAccessor.getPixelFromRow(rowY1, ix0);
-            return px * filterY1;
+        // The leftPart is made up of two pixels from the left column of the filter, right part
+        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
+        // the top and bottom pixels using filterY. See the partAtColumn function above.
+        Sk4f leftPart  = partAtColumn(iXs[0]);
+        Sk4f rightPart = partAtColumn(iXs[1]);
+
+        // Create a destination color by blending together a left and right part using filterX.
+        auto bilerp = [&](const Sk4f& leftPart, const Sk4f& rightPart) {
+            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
+            return check_pixel(pixel);
         };
 
-        auto get4PixelsY0 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
-            fAccessor.get4Pixels(rowY0, ix, px0, px1, px2, px3);
-            *px0 = *px0 * filterY0;
-            *px1 = *px1 * filterY0;
-            *px2 = *px2 * filterY0;
-            *px3 = *px3 * filterY0;
-        };
+        // Send the first pixel to the destination. This simplifies the loop structure so that no
+        // extra pixels are fetched for the last iteration of the loop.
+        fNext->blendPixel(bilerp(leftPart, rightPart));
+        count -= 1;
 
-        auto get4PixelsY1 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
-            fAccessor.get4Pixels(rowY1, ix, px0, px1, px2, px3);
-            *px0 = *px0 * filterY1;
-            *px1 = *px1 * filterY1;
-            *px2 = *px2 * filterY1;
-            *px3 = *px3 * filterY1;
-        };
+        if (dx > 0.0f) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+            int rightPartCursor = iXs[1];
 
-        auto lerp = [&](Sk4f& pixelX0, Sk4f& pixelX1) {
-            return pixelX0 * filterX0 + pixelX1 * filterX1;
-        };
+            // Advance the filter from left to right. Remember that moving the top-left corner of
+            // the filter to the right actually makes the filter value smaller.
+            auto advanceFilter = [&]() {
+                filterX -= dx;
+                if (filterX <= 0.0f) {
+                    filterX += 1.0f;
+                    leftPart = rightPart;
+                    rightPartCursor += 1;
+                    rightPart = partAtColumn(rightPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
 
-        // Mid making 4 unit rate.
-        Sk4f pxB = getPixelY0() + getPixelY1();
-        if (span.length() > 0) {
-            int count = span.count();
+                return bilerp(leftPart, rightPart);
+            };
+
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                get4PixelsY0(ix0, &px00, &px10, &px20, &px30);
-                Sk4f px01, px11, px21, px31;
-                get4PixelsY1(ix0, &px01, &px11, &px21, &px31);
-                Sk4f pxS0 = px00 + px01;
-                Sk4f px0 = lerp(pxB, pxS0);
-                Sk4f pxS1 = px10 + px11;
-                Sk4f px1 = lerp(pxS0, pxS1);
-                Sk4f pxS2 = px20 + px21;
-                Sk4f px2 = lerp(pxS1, pxS2);
-                Sk4f pxS3 = px30 + px31;
-                Sk4f px3 = lerp(pxS2, pxS3);
-                pxB = pxS3;
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
                 fNext->blend4Pixels(px0, px1, px2, px3);
-                ix0 += 4;
                 count -= 4;
             }
-            while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix0);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix0);
 
-                fNext->blendPixel(lerp(pixelY0, pixelY1));
-                ix0 += 1;
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
                 count -= 1;
             }
         } else {
-            int count = span.count();
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            int leftPartCursor = iXs[0];
+
+            // Advance the filter from right to left. Remember that moving the top-left corner of
+            // the filter to the left actually makes the filter value larger.
+            auto advanceFilter = [&]() {
+                // Remember, dx < 0 therefore this adds |dx| to filterX.
+                filterX -= dx;
+                // At this point filterX may be > 1, and needs to be wrapped back on to the filter
+                // interval, and the next column in the filter is calculated.
+                if (filterX > 1.0f) {
+                    filterX -= 1.0f;
+                    rightPart = leftPart;
+                    leftPartCursor -= 1;
+                    leftPart = partAtColumn(leftPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp(leftPart, rightPart);
+            };
+
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                get4PixelsY0(ix0 - 3, &px00, &px10, &px20, &px30);
-                Sk4f px01, px11, px21, px31;
-                get4PixelsY1(ix0 - 3, &px01, &px11, &px21, &px31);
-                Sk4f pxS3 = px30 + px31;
-                Sk4f px0 = lerp(pxS3, pxB);
-                Sk4f pxS2 = px20 + px21;
-                Sk4f px1 = lerp(pxS2, pxS3);
-                Sk4f pxS1 = px10 + px11;
-                Sk4f px2 = lerp(pxS1, pxS2);
-                Sk4f pxS0 = px00 + px01;
-                Sk4f px3 = lerp(pxS0, pxS1);
-                pxB = pxS0;
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
                 fNext->blend4Pixels(px0, px1, px2, px3);
-                ix0 -= 4;
                 count -= 4;
             }
-            while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix0);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix0);
 
-                fNext->blendPixel(lerp(pixelY0, pixelY1));
-                ix0 -= 1;
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
                 count -= 1;
             }
         }
     }
 
-    void spanUnitRateAlignedX(Span span, SkScalar y1) {
-        SkScalar y0 = span.startY() - 0.5f;
-        y1 += 0.5f;
-        int iy0 = SkScalarFloorToInt(y0);
-        SkScalar filterY1 = y0 - iy0;
-        SkScalar filterY0 = 1.0f - filterY1;
-        int iy1 = SkScalarFloorToInt(y1);
-        int ix = SkScalarFloorToInt(span.startX());
-        const void* rowY0 = fAccessor.row(iy0);
-        const void* rowY1 = fAccessor.row(iy1);
-        auto lerp = [&](Sk4f* pixelY0, Sk4f* pixelY1) {
-            return *pixelY0 * filterY0 + *pixelY1 * filterY1;
+    // |dx| == 1. Moving through source space at a rate of 1 source pixel per 1 dst pixel.
+    // Every filter part is used for two destination pixels, and the code can bulk load four
+    // pixels at a time.
+    void spanUnitRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkASSERT(SkScalarAbs(length) == (count - 1));
+
+        // Calculate the four filter points of start, and use the two different Y values to
+        // generate the row pointers.
+        Sk4i iXs, iYs;
+        filterPoints(start, &iXs, &iYs);
+        const void* row0 = fAccessor.row(iYs[0]);
+        const void* row1 = fAccessor.row(iYs[2]);
+
+        // Calculate the filter values for the top-left filter element.
+        const SkScalar filterX = sample_to_filter(X(start));
+        const SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the part at the edge-adjusted xColumn: top and bottom blended by filterY.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
         };
 
-        if (span.length() > 0) {
-            int count = span.count();
+        auto get4Parts = [&](int ix, Sk4f* part0, Sk4f* part1, Sk4f* part2, Sk4f* part3) {
+            // If all four columns ix..ix+3 lie inside [0, fXMax], go fast using bulk loads;
+            // otherwise adjust each column individually.
+            if (0 <= ix && ix <= fXMax - 3) {
+                Sk4f px00, px10, px20, px30,
+                     px01, px11, px21, px31;
+                fAccessor.get4Pixels(row0, ix, &px00, &px10, &px20, &px30);
+                fAccessor.get4Pixels(row1, ix, &px01, &px11, &px21, &px31);
+                *part0 = filterY * px00 + (1.0f - filterY) * px01;
+                *part1 = filterY * px10 + (1.0f - filterY) * px11;
+                *part2 = filterY * px20 + (1.0f - filterY) * px21;
+                *part3 = filterY * px30 + (1.0f - filterY) * px31;
+            } else {
+                *part0 = partAtColumn(ix + 0);
+                *part1 = partAtColumn(ix + 1);
+                *part2 = partAtColumn(ix + 2);
+                *part3 = partAtColumn(ix + 3);
+            }
+        };
+
+        auto bilerp = [&](const Sk4f& part0, const Sk4f& part1) {
+            return part0 * filterX + part1 * (1.0f - filterX);
+        };
+
+        if (length > 0) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+
+            // overlapPart is the filter part from the end of the previous four pixels used at
+            // the start of the next four pixels.
+            Sk4f overlapPart = partAtColumn(iXs[0]);
+            int rightColumnCursor = iXs[1];
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                fAccessor.get4Pixels(rowY0, ix, &px00, &px10, &px20, &px30);
-                Sk4f px01, px11, px21, px31;
-                fAccessor.get4Pixels(rowY1, ix, &px01, &px11, &px21, &px31);
-                fNext->blend4Pixels(
-                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
-                ix += 4;
+                Sk4f part0, part1, part2, part3;
+                get4Parts(rightColumnCursor, &part0, &part1, &part2, &part3);
+                Sk4f px0 = bilerp(overlapPart, part0);
+                Sk4f px1 = bilerp(part0, part1);
+                Sk4f px2 = bilerp(part1, part2);
+                Sk4f px3 = bilerp(part2, part3);
+                overlapPart = part3;
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                rightColumnCursor += 4;
                 count -= 4;
             }
-            while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix);
 
-                fNext->blendPixel(lerp(&pixelY0, &pixelY1));
-                ix += 1;
+            while (count > 0) {
+                Sk4f rightPart = partAtColumn(rightColumnCursor);
+
+                fNext->blendPixel(bilerp(overlapPart, rightPart));
+                overlapPart = rightPart;
+                rightColumnCursor += 1;
                 count -= 1;
             }
         } else {
-            int count = span.count();
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left. Here overlapPart is the right-hand part.
+            Sk4f overlapPart = partAtColumn(iXs[1]);
+            int leftColumnCursor = iXs[0];
+
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                fAccessor.get4Pixels(rowY0, ix - 3, &px30, &px20, &px10, &px00);
-                Sk4f px01, px11, px21, px31;
-                fAccessor.get4Pixels(rowY1, ix - 3, &px31, &px21, &px11, &px01);
-                fNext->blend4Pixels(
-                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
-                ix -= 4;
+                Sk4f part0, part1, part2, part3;
+                get4Parts(leftColumnCursor - 3, &part3, &part2, &part1, &part0);
+                Sk4f px0 = bilerp(part0, overlapPart);
+                Sk4f px1 = bilerp(part1, part0);
+                Sk4f px2 = bilerp(part2, part1);
+                Sk4f px3 = bilerp(part3, part2);
+                overlapPart = part3;
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                leftColumnCursor -= 4;
                 count -= 4;
             }
-            while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix);
 
-                fNext->blendPixel(lerp(&pixelY0, &pixelY1));
-                ix -= 1;
+            while (count > 0) {
+                Sk4f leftPart = partAtColumn(leftColumnCursor);
+
+                fNext->blendPixel(bilerp(leftPart, overlapPart));
+                overlapPart = leftPart;
+                leftColumnCursor -= 1;
+                count -= 1;
+            }
+        }
+    }
+
+    // 1 < |dx| < 2. Going through the source pixels at a faster rate than the dest pixels, but
+    // still slow enough to take advantage of previous calculations.
+    void spanMediumRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+
+        // Calculate the distance between each sample point.
+        const SkScalar dx = length / (count - 1);
+        SkASSERT((-2.0f < dx && dx < -1.0f) || (1.0f < dx && dx < 2.0f));
+
+        // Generate the filter values for the top-left corner.
+        // Note: these values are in filter space; this has implications about how to adjust
+        // these values at each step. For example, as the sample point increases, the filter
+        // value decreases, this is because the filter and position are related by
+        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
+        // direction of the sample point which is increasing by dx.
+        SkScalar filterX = sample_to_filter(X(start));
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);
+        const void* const row1 = fAccessor.row(iYs[2]);
+
+        // Generate the part at the edge-adjusted xColumn: top and bottom blended by filterY.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+        };
+
+        // The leftPart is made up of two pixels from the left column of the filter, right part
+        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
+        // the top and bottom pixels using filterY. See the partAtColumn function above.
+        Sk4f leftPart  = partAtColumn(iXs[0]);
+        Sk4f rightPart = partAtColumn(iXs[1]);
+
+        // Create a destination color by blending together a left and right part using filterX.
+        auto bilerp = [&](const Sk4f& leftPart, const Sk4f& rightPart) {
+            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
+            return check_pixel(pixel);
+        };
+
+        // Send the first pixel to the destination. This simplifies the loop structure so that no
+        // extra pixels are fetched for the last iteration of the loop.
+        fNext->blendPixel(bilerp(leftPart, rightPart));
+        count -= 1;
+
+        if (dx > 0.0f) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+            int rightPartCursor = iXs[1];
+
+            // Advance the filter from left to right. Remember that moving the top-left corner of
+            // the filter to the right actually makes the filter value smaller.
+            auto advanceFilter = [&]() {
+                filterX -= dx;
+                // At this point filterX is less than zero, but might actually be less than -1.
+                if (filterX > -1.0f) {
+                    filterX += 1.0f;
+                    leftPart = rightPart;
+                    rightPartCursor += 1;
+                    rightPart = partAtColumn(rightPartCursor);
+                } else {
+                    filterX += 2.0f;
+                    rightPartCursor += 2;
+                    leftPart = partAtColumn(rightPartCursor - 1);
+                    rightPart = partAtColumn(rightPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp(leftPart, rightPart);
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
+        } else {
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            int leftPartCursor = iXs[0];
+
+            auto advanceFilter = [&]() {
+                // Remember, dx < 0 therefore this adds |dx| to filterX.
+                filterX -= dx;
+                // Now 1 < filterX < 3. Wrap it back onto (0, 1]; exactly 2 must wrap to 1, not 0.
+                if (filterX <= 2.0f) {
+                    filterX -= 1.0f;
+                    rightPart = leftPart;
+                    leftPartCursor -= 1;
+                    leftPart = partAtColumn(leftPartCursor);
+                } else {
+                    filterX -= 2.0f;
+                    leftPartCursor -= 2;
+                    rightPart = partAtColumn(leftPartCursor + 1);  // Right column is left + 1.
+                    leftPart = partAtColumn(leftPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+                return bilerp(leftPart, rightPart);
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
                 count -= 1;
             }
         }
@@ -798,34 +1009,26 @@
 
     // We're moving through source space faster than dst (zoomed out),
     // so we'll never reuse a source pixel or be able to do contiguous loads.
-    void spanFastRate(Span span, SkScalar y1) {
-        SkPoint start;
-        SkScalar length;
-        int count;
+    void spanFastRate(Span span) {
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
         SkScalar y = Y(start);
 
-        // In this sampler, it is assumed that if span.StartY() and y1 are the same then both
-        // y-lines are on the same tile.
-        if (y == y1) {
-            // Both y-lines are on the same tile.
-            span_fallback(span, this);
-        } else {
-            // The y-lines are on different tiles.
-            SkScalar dx = length / (count - 1);
-            Sk4f ys = {y - 0.5f, y - 0.5f, y1 + 0.5f, y1 + 0.5f};
-            while (count > 0) {
-                Sk4f xs = Sk4f{-0.5f, 0.5f, -0.5f, 0.5f} + Sk4f{x};
-                this->bilerpEdge(xs, ys);
-                x += dx;
-                count -= 1;
-            }
+        SkScalar dx = length / (count - 1);  // pointSpan never dispatches here with count < 2.
+        while (count > 0) {
+            fNext->blendPixel(this->bilerpSamplePoint(SkPoint{x, y}));  // Full bilerp per pixel.
+            x += dx;
+            count -= 1;
         }
     }
 
-    Next* const fNext;
-    Accessor    fAccessor;
+    Next* const              fNext;       // Stage that receives the blended pixels.
+    const SkShader::TileMode fXEdgeType;  // Tile mode passed to adjust_edge for x.
+    const int                fXMax;       // Limit passed to adjust_edge for x.
+    const SkShader::TileMode fYEdgeType;  // Tile mode passed to adjust_edge for y.
+    const int                fYMax;       // Limit passed to adjust_edge for y.
+    Accessor                 fAccessor;   // Pixel source: row, getPixelFromRow, get4Pixels.
 };
 
 }  // namespace