Revert "Revert "Direct evaluation of gaussian""

This reverts commit a53d999007f92ecd4244b078fa909b76fd0d9f3b.

Reason for revert: Bug in SkNx_sse fixed.

Original change's description:
> Revert "Direct evaluation of gaussian"
>
> This reverts commit 5e18cdea0a0a3f23d8e8b8140c82a4b54e121402.
>
> Reason for revert: ASAN
> Original change's description:
> > Direct evaluation of gaussian
> >
> > The SVG(CSS) standard allows the 3 pass algorithm for sigma >= 2. But
> > sigma < 2, the code must evaluate to the convolution. The old code used
> > an interpolation scheme between windowed filters. This code directly
> > evaluates the gaussian kernel for sigma < 2.
> >
> > This code produces cleaner results, is 25% faster, and does not use a
> > temporary memory buffer.
> >
> > Change-Id: Ibd0caa73cadd06b637f55ba7bd4fefcfe7ac73db
> > Reviewed-on: https://skia-review.googlesource.com/62540
> > Commit-Queue: Herb Derby <herb@google.com>
> > Reviewed-by: Mike Klein <mtklein@google.com>
>
> TBR=mtklein@google.com,herb@google.com
>
> Change-Id: I936077dfa659d71bc361339d98340c55545a1eb8
> No-Presubmit: true
> No-Tree-Checks: true
> No-Try: true
> Reviewed-on: https://skia-review.googlesource.com/72481
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: Brian Osman <brianosman@google.com>

TBR=mtklein@google.com,herb@google.com,brianosman@google.com

Change-Id: I4c30e3481308a8148d40223519e286885ec6f880
Reviewed-on: https://skia-review.googlesource.com/72900
Commit-Queue: Herb Derby <herb@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/bench/BlurBench.cpp b/bench/BlurBench.cpp
index 20afd4a..5b94d66 100644
--- a/bench/BlurBench.cpp
+++ b/bench/BlurBench.cpp
@@ -15,9 +15,11 @@
 
 #define MINI   0.01f
 #define SMALL   SkIntToScalar(2)
-#define REAL    1.5f
+#define REAL    0.5f
 #define BIG     SkIntToScalar(10)
 #define REALBIG 100.5f
+// The value that produces a sigma of just over 2.
+#define CUTOVER 2.6f
 
 static const char* gStyleName[] = {
     "normal",
@@ -111,5 +113,6 @@
 DEF_BENCH(return new BlurBench(REALBIG, kNormal_SkBlurStyle, SkBlurMaskFilter::kHighQuality_BlurFlag);)
 
 DEF_BENCH(return new BlurBench(REAL, kNormal_SkBlurStyle, SkBlurMaskFilter::kHighQuality_BlurFlag);)
+DEF_BENCH(return new BlurBench(CUTOVER, kNormal_SkBlurStyle, SkBlurMaskFilter::kHighQuality_BlurFlag);)
 
 DEF_BENCH(return new BlurBench(0, kNormal_SkBlurStyle);)
diff --git a/src/core/SkGaussFilter.cpp b/src/core/SkGaussFilter.cpp
index 548ff43..c0d612d 100644
--- a/src/core/SkGaussFilter.cpp
+++ b/src/core/SkGaussFilter.cpp
@@ -143,10 +143,9 @@
     }
 }
 
-int SkGaussFilter::filterDouble(double* values) const {
+int SkGaussFilter::filterDouble(double values[5]) const {
     for (int i = 0; i < fN; i++) {
         values[i] = fBasis[i];
     }
     return fN;
 }
-
diff --git a/src/core/SkGaussFilter.h b/src/core/SkGaussFilter.h
index 9af45c8..be00cf2 100644
--- a/src/core/SkGaussFilter.h
+++ b/src/core/SkGaussFilter.h
@@ -27,10 +27,16 @@
     int radius() const { return fN - 1; }
     int width() const { return 2 * this->radius() + 1; }
 
+    // TODO: remove filterDouble and use the ranged-for loop interface.
+
     // Take an array of values where the gaussian factors will be placed. Return the number of
     // values filled.
     int filterDouble(double values[5]) const;
 
+    // Allow a filter to be used in a C++ ranged-for loop.
+    const double* begin() const { return &fBasis[0];  }
+    const double* end()   const { return &fBasis[fN]; }
+
 private:
     double fBasis[5];
     int    fN;
diff --git a/src/core/SkMaskBlurFilter.cpp b/src/core/SkMaskBlurFilter.cpp
index fea98d6..9de68fc 100644
--- a/src/core/SkMaskBlurFilter.cpp
+++ b/src/core/SkMaskBlurFilter.cpp
@@ -11,6 +11,7 @@
 #include <climits>
 
 #include "SkArenaAlloc.h"
+#include "SkGaussFilter.h"
 #include "SkNx.h"
 #include "SkSafeMath.h"
 
@@ -568,8 +569,644 @@
     return dst;
 }
 
+#if !defined(SK_USE_LEGACY_INTERP_BLUR)
+static constexpr uint16_t _____ = 0u;
+static constexpr uint16_t kHalf = 0x80u;
+
+static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) {
+    uint8_t buffer[8];
+    if (width < 8) {
+        sk_bzero(buffer, sizeof(buffer));
+        for (int i = 0; i < width; i++) {
+            buffer[i] = from[i];
+        }
+        from = buffer;
+    }
+    auto v = SkNx_cast<uint16_t>(Sk8b::Load(from));
+    // Convert from 0-255 to 8.8 encoding.
+    return v << 8;
+};
+
+static SK_ALWAYS_INLINE void store(uint8_t* to, const Sk8h& v, int width) {
+    Sk8b b = SkNx_cast<uint8_t>(v >> 8);
+    if (width == 8) {
+        b.store(to);
+    } else {
+        uint8_t buffer[8];
+        b.store(buffer);
+        for (int i = 0; i < width; i++) {
+            to[i] = buffer[i];
+        }
+    }
+};
+
+// In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
+// in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
+// format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits
+// resulting in a 8.8 format.
+//
+// The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
+// system is setup to minimize the number of multiplies needed.
+//
+// Explanation:
+//    Blurring a specific mask value is given by the following equation where D_n is the resulting
+// mask value and S_n is the source value. The example below is for a filter with a radius of 1
+// and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
+// aligned. The filter is given by G_n where n is the symmetric filter value.
+//
+//   D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
+//
+// We can start the source index at an offset relative to the destination separated by the
+// radius. This results in a non-traditional restating of the above filter.
+//
+//  D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
+//
+// If we look at three specific consecutive destinations the following equations result:
+//
+//   D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
+//   D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
+//   D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
+//
+// In the above equations, notice that S[7] is used in all three. In particular, two values are
+// used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
+// D[7].
+//
+// From the point of view of a source value we end up with the following three equations.
+//
+// Given S[7]:
+//   D[5] += S[7]*G[1]
+//   D[6] += S[7]*G[0]
+//   D[7] += S[7]*G[1]
+//
+// In General:
+//   D[n]   += S[n]*G[1]
+//   D[n+1] += S[n]*G[0]
+//   D[n+2] += S[n]*G[1]
+//
+// Now these equations can be ganged using SIMD to form:
+//   D[n..n+7]   += S[n..n+7]*G[1]
+//   D[n+1..n+8] += S[n..n+7]*G[0]
+//   D[n+2..n+9] += S[n..n+7]*G[1]
+// The next set of values becomes.
+//   D[n+8..n+15]  += S[n+8..n+15]*G[1]
+//   D[n+9..n+16]  += S[n+8..n+15]*G[0]
+//   D[n+10..n+17] += S[n+8..n+15]*G[1]
+// You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
+// S[n..7] and S[n+8..n+15].
+//
+// Just one more transformation allows the code to maintain all working values in
+// registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
+// prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
+//
+//   D[n..n+7]  += S[n..n+7] * G[1]
+//   D[n..n+8]  += {0, S[n..n+7] * G[0]}
+//   D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
+//
+// Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
+// register d8. In addition, S[0..n+7] becomes s0.
+//
+// The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
+//
+// Sk8h v0 = s0*G[0]
+// Sk8h v1 = s0*G[1]
+// /* D[n..n+7]  += S[n..n+7] * G[1] */
+// d0 += v1;
+// /* D[n..n+8]  += {0, S[n..n+7] * G[0]} */
+// d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
+// d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
+// /* D[n..n+9]  += {0, 0, S[n..n+7] * G[1]} */
+// d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
+// d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
+// Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
+
+static SK_ALWAYS_INLINE void blur_x_radius_1(
+        const Sk8h& s0,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
+        Sk8h* d0, Sk8h* d8) {
+
+    auto v1 = s0.mulHi(g1);
+    auto v0 = s0.mulHi(g0);
+
+    // D[n..n+7]  += S[n..n+7] * G[1]
+    *d0 += v1;
+
+    //D[n..n+8]  += {0, S[n..n+7] * G[0]}
+    *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
+    *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
+    *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
+
+}
+
+static SK_ALWAYS_INLINE void blur_x_radius_2(
+        const Sk8h& s0,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
+        Sk8h* d0, Sk8h* d8) {
+    auto v0 = s0.mulHi(g0);
+    auto v1 = s0.mulHi(g1);
+    auto v2 = s0.mulHi(g2);
+
+    // D[n..n+7]  += S[n..n+7] * G[2]
+    *d0 += v2;
+
+    // D[n..n+8]  += {0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
+    *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+9]  += {0, 0, S[n..n+7] * G[0]}
+    *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
+    *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
+    *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
+
+    // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[2]}
+    *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
+    *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
+}
+
+static SK_ALWAYS_INLINE void blur_x_radius_3(
+        const Sk8h& s0,
+        const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
+        Sk8h* d0, Sk8h* d8) {
+    auto v0 = s0.mulHi(gauss0);
+    auto v1 = s0.mulHi(gauss1);
+    auto v2 = s0.mulHi(gauss2);
+    auto v3 = s0.mulHi(gauss3);
+
+    // D[n..n+7]  += S[n..n+7] * G[3]
+    *d0 += v3;
+
+    // D[n..n+8]  += {0, S[n..n+7] * G[2]}
+    *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
+    *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
+    *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[0]}
+    *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
+    *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
+
+    // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
+    *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
+
+    // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
+    *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
+    *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
+
+    // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
+    *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
+    *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
+}
+
+static SK_ALWAYS_INLINE void blur_x_radius_4(
+        const Sk8h& s0,
+        const Sk8h& gauss0,
+        const Sk8h& gauss1,
+        const Sk8h& gauss2,
+        const Sk8h& gauss3,
+        const Sk8h& gauss4,
+        Sk8h* d0, Sk8h* d8) {
+    auto v0 = s0.mulHi(gauss0);
+    auto v1 = s0.mulHi(gauss1);
+    auto v2 = s0.mulHi(gauss2);
+    auto v3 = s0.mulHi(gauss3);
+    auto v4 = s0.mulHi(gauss4);
+
+    // D[n..n+7]  += S[n..n+7] * G[4]
+    *d0 += v4;
+
+    // D[n..n+8]  += {0, S[n..n+7] * G[3]}
+    *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
+    *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+9]  += {0, 0, S[n..n+7] * G[2]}
+    *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
+    *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
+
+    // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
+    *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
+
+    // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[0]}
+    *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
+    *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
+
+    // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
+    *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
+    *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
+
+    // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
+    *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
+    *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
+
+    // D[n..n+14]  += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
+    *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
+    *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
+
+    // D[n..n+15]  += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
+    *d8 += v4;
+}
+
+using BlurX = decltype(blur_x_radius_1);
+
+// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
+static SK_ALWAYS_INLINE void blur_row(
+        BlurX blur,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
+        const uint8_t* src, int srcW,
+              uint8_t* dst, int dstW) {
+    // Clear the buffer to handle summing wider than source.
+    Sk8h d0{kHalf}, d8{kHalf};
+
+    // Go by multiples of 8 in src.
+    int x = 0;
+    for (; x <= srcW - 8; x += 8) {
+        blur(load(src, 8), g0, g1, g2, g3, g4, &d0, &d8);
+
+        store(dst, d0, 8);
+
+        d0 = d8;
+        d8 = Sk8h{kHalf};
+
+        src += 8;
+        dst += 8;
+    }
+
+    // There are src values left, but the remainder of src values is not a multiple of 8.
+    int srcTail = srcW - x;
+    if (srcTail > 0) {
+
+        blur(load(src, srcTail), g0, g1, g2, g3, g4, &d0, &d8);
+
+        int dstTail = std::min(8, dstW - x);
+        store(dst, d0, dstTail);
+
+        d0 = d8;
+        dst += dstTail;
+        x += dstTail;
+    }
+
+    // There are dst mask values to complete.
+    int dstTail = dstW - x;
+    if (dstTail > 0) {
+        store(dst, d0, dstTail);
+    }
+}
+
+// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
+static SK_ALWAYS_INLINE void blur_x_rect(
+        BlurX blur,
+        uint16_t* gauss,
+        const uint8_t* src, size_t srcStride, int srcW,
+              uint8_t* dst, size_t dstStride, int dstW, int dstH) {
+
+    Sk8h g0{gauss[0]},
+         g1{gauss[1]},
+         g2{gauss[2]},
+         g3{gauss[3]},
+         g4{gauss[4]};
+
+    // Blur *ALL* the rows.
+    for (int y = 0; y < dstH; y++) {
+        blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+SK_ATTRIBUTE(noinline) static void direct_blur_x(
+    int radius, uint16_t* gauss,
+    const uint8_t* src, size_t srcStride, int srcW,
+          uint8_t* dst, size_t dstStride, int dstW, int dstH) {
+
+    switch (radius) {
+        case 1:
+            blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
+            break;
+
+        case 2:
+            blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
+            break;
+
+        case 3:
+            blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
+            break;
+
+        case 4:
+            blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
+            break;
+
+        default:
+            SkASSERTF(false, "The radius %d is not handled\n", radius);
+    }
+}
+
+// The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
+// functions, but end up being simpler because there is no complicated shift of registers. We
+// start with the non-traditional form of the gaussian filter. In the following r is the value
+// when added generates the next value in the column.
+//
+//   D[n+0r] = S[n+0r]*G[1]
+//           + S[n+1r]*G[0]
+//           + S[n+2r]*G[1]
+//
+// Expanding out in a way similar to blur_x_radius_N for specific values of n.
+//
+//   D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
+//   D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
+//   D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
+//
+// We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
+// can look at the calculation form the point of view of a source value.
+//
+//   Given S[n+0r]:
+//   D[n+0r] += S[n+0r]*G[1];
+//   /* D[n+0r] is done and can be stored now. */
+//   D[n+1r] += S[n+0r]*G[0];
+//   D[n+2r]  = S[n+0r]*G[1];
+//
+// Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
+// S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
+// notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
+// how values flow in the following two iterations in source.
+//
+//   D[n+0r] += S[n+0r]*G[1]
+//   D[n+1r] += S[n+0r]*G[0]
+//   D[n+2r]  = S[n+0r]*G[1]
+//   /* ------- */
+//   D[n+1r] += S[n+1r]*G[1]
+//   D[n+2r] += S[n+1r]*G[0]
+//   D[n+3r]  = S[n+1r]*G[1]
+//
+// Instead of using memory we can introduce temporaries d01 and d12. The update step changes
+// to the following.
+//
+//   answer = d01 + S[n+0r]*G[1]
+//   d01    = d12 + S[n+0r]*G[0]
+//   d12    =       S[n+0r]*G[1]
+//   return answer
+//
+// Finally, this can be ganged into SIMD style.
+//   answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
+//   d01[0..7]    = d12[0..7] + S[n+0r..n+0r+7]*G[0]
+//   d12[0..7]    =             S[n+0r..n+0r+7]*G[1]
+//   return answer[0..7]
+static SK_ALWAYS_INLINE Sk8h blur_y_radius_1(
+        const Sk8h& s0,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
+        Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
+    auto v0 = s0.mulHi(g0);
+    auto v1 = s0.mulHi(g1);
+
+    Sk8h answer = *d01 + v1;
+           *d01 = *d12 + v0;
+           *d12 =        v1 + kHalf;
+
+    return answer;
+}
+
+static SK_ALWAYS_INLINE Sk8h blur_y_radius_2(
+        const Sk8h& s0,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
+        Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
+    auto v0 = s0.mulHi(g0);
+    auto v1 = s0.mulHi(g1);
+    auto v2 = s0.mulHi(g2);
+
+    Sk8h answer = *d01 + v2;
+           *d01 = *d12 + v1;
+           *d12 = *d23 + v0;
+           *d23 = *d34 + v1;
+           *d34 =        v2 + kHalf;
+
+    return answer;
+}
+
+static SK_ALWAYS_INLINE Sk8h blur_y_radius_3(
+        const Sk8h& s0,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
+        Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
+    auto v0 = s0.mulHi(g0);
+    auto v1 = s0.mulHi(g1);
+    auto v2 = s0.mulHi(g2);
+    auto v3 = s0.mulHi(g3);
+
+    Sk8h answer = *d01 + v3;
+           *d01 = *d12 + v2;
+           *d12 = *d23 + v1;
+           *d23 = *d34 + v0;
+           *d34 = *d45 + v1;
+           *d45 = *d56 + v2;
+           *d56 =        v3 + kHalf;
+
+    return answer;
+}
+
+static SK_ALWAYS_INLINE Sk8h blur_y_radius_4(
+    const Sk8h& s0,
+    const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
+    Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
+    auto v0 = s0.mulHi(g0);
+    auto v1 = s0.mulHi(g1);
+    auto v2 = s0.mulHi(g2);
+    auto v3 = s0.mulHi(g3);
+    auto v4 = s0.mulHi(g4);
+
+    Sk8h answer = *d01 + v4;
+           *d01 = *d12 + v3;
+           *d12 = *d23 + v2;
+           *d23 = *d34 + v1;
+           *d34 = *d45 + v0;
+           *d45 = *d56 + v1;
+           *d56 = *d67 + v2;
+           *d67 = *d78 + v3;
+           *d78 =        v4 + kHalf;
+
+    return answer;
+}
+
+using BlurY = decltype(blur_y_radius_1);
+
+// BlurY will be one of blur_y_radius_(1|2|3|4).
+static SK_ALWAYS_INLINE void blur_column(
+        BlurY blur, int radius, int width,
+        const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
+        const uint8_t* src, size_t srcStride, int srcH,
+        uint8_t* dst, size_t dstStride) {
+    Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
+         d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
+
+    auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
+        store(to, v0, width);
+        to += dstStride;
+        store(to, v1, width);
+        return to + dstStride;
+    };
+
+    for (int y = 0; y < srcH; y += 1) {
+        auto s = load(src, width);
+        auto b = blur(s,
+                      g0, g1, g2, g3, g4,
+                      &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
+        store(dst, b, width);
+        src += srcStride;
+        dst += dstStride;
+    }
+
+    if (radius >= 1) {
+        dst = flush(dst, d01, d12);
+    }
+    if (radius >= 2) {
+        dst = flush(dst, d23, d34);
+    }
+    if (radius >= 3) {
+        dst = flush(dst, d45, d56);
+    }
+    if (radius >= 4) {
+              flush(dst, d67, d78);
+    }
+}
+
+// BlurY will be one of blur_y_radius_(1|2|3|4).
+static SK_ALWAYS_INLINE void blur_y_rect(
+        BlurY blur, int radius, uint16_t *gauss,
+        const uint8_t *src, size_t srcStride, int srcW, int srcH,
+        uint8_t *dst, size_t dstStride) {
+
+    Sk8h g0{gauss[0]},
+         g1{gauss[1]},
+         g2{gauss[2]},
+         g3{gauss[3]},
+         g4{gauss[4]};
+
+    int x = 0;
+    for (; x <= srcW - 8; x += 8) {
+        blur_column(blur, radius, 8,
+                    g0, g1, g2, g3, g4,
+                    src, srcStride, srcH,
+                    dst, dstStride);
+        src += 8;
+        dst += 8;
+    }
+
+    int xTail = srcW - x;
+    if (xTail > 0) {
+        blur_column(blur, radius, xTail,
+                    g0, g1, g2, g3, g4,
+                    src, srcStride, srcH,
+                    dst, dstStride);
+    }
+}
+
+SK_ATTRIBUTE(noinline) static void direct_blur_y(
+        int radius, uint16_t* gauss,
+        const uint8_t* src, size_t srcStride, int srcW, int srcH,
+              uint8_t* dst, size_t dstStride) {
+
+    switch (radius) {
+        case 1:
+            blur_y_rect(blur_y_radius_1, 1, gauss,
+                        src, srcStride, srcW, srcH,
+                        dst, dstStride);
+            break;
+
+        case 2:
+            blur_y_rect(blur_y_radius_2, 2, gauss,
+                        src, srcStride, srcW, srcH,
+                        dst, dstStride);
+            break;
+
+        case 3:
+            blur_y_rect(blur_y_radius_3, 3, gauss,
+                        src, srcStride, srcW, srcH,
+                        dst, dstStride);
+            break;
+
+        case 4:
+            blur_y_rect(blur_y_radius_4, 4, gauss,
+                        src, srcStride, srcW, srcH,
+                        dst, dstStride);
+            break;
+
+        default:
+            SkASSERTF(false, "The radius %d is not handled\n", radius);
+    }
+}
+
+static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) {
+    SkASSERT(0 <= sigmaX && sigmaX < 2);
+    SkASSERT(0 <= sigmaY && sigmaY < 2);
+
+    SkGaussFilter filterX{sigmaX, SkGaussFilter::Type::Bessel},
+                  filterY{sigmaY, SkGaussFilter::Type::Bessel};
+
+    int radiusX = filterX.radius(),
+        radiusY = filterY.radius();
+
+    SkASSERT(radiusX <= 4 && radiusY <= 4);
+
+    auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
+        int i = 0;
+        for (double d : filter) {
+            factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
+        }
+    };
+
+    uint16_t gaussFactorsX[5],
+             gaussFactorsY[5];
+
+    prepareGauss(filterX, gaussFactorsX);
+    prepareGauss(filterY, gaussFactorsY);
+
+    *dst = prepare_destination(radiusX, radiusY, src);
+    if (src.fImage == nullptr) {
+        return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
+    }
+    if (dst->fImage == nullptr) {
+        dst->fBounds.setEmpty();
+        return {0, 0};
+    }
+
+    int srcW = src.fBounds.width(),
+        srcH = src.fBounds.height();
+
+    int dstW = dst->fBounds.width(),
+        dstH = dst->fBounds.height();
+
+    size_t srcStride = src.fRowBytes,
+           dstStride = dst->fRowBytes;
+
+    //TODO: handle bluring in only one direction.
+
+    // Blur vertically and copy to destination.
+    direct_blur_y(radiusY, gaussFactorsY,
+                  src.fImage,  srcStride, srcW, srcH,
+                  dst->fImage + radiusX, dstStride);
+
+    // Blur horizontally in place.
+    direct_blur_x(radiusX, gaussFactorsX,
+                  dst->fImage + radiusX,  dstStride, srcW,
+                  dst->fImage,            dstStride, dstW, dstH);
+
+    return {radiusX, radiusY};
+}
+#endif  // SK_USE_LEGACY_INTERP_BLUR
+
 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
 
+    #if !defined(SK_USE_LEGACY_INTERP_BLUR)
+        if (fSigmaW < 2.0 && fSigmaH < 2.0) {
+            return small_blur(fSigmaW, fSigmaH, src, dst);
+        }
+    #endif
+
     // 1024 is a place holder guess until more analysis can be done.
     SkSTArenaAlloc<1024> alloc;
 
@@ -623,7 +1260,6 @@
                         tmpStart, tmpW, tmpStart + tmpW * tmpH);
         }
 
-
         // Blur vertically (scan in memory order because of the transposition),
         // and transpose back to the original orientation.
         auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer);
@@ -665,7 +1301,7 @@
         for (size_t x = 0; x < srcW; x++) {
             auto srcStart = &src.fImage[x];
             auto dstStart = &dst->fImage[x];
-            scanH->blur(srcStart, src.fRowBytes, srcEnd,
+            scanH->blur(srcStart, src.fRowBytes,  srcEnd,
                         dstStart, dst->fRowBytes, dstEnd);
         }
     } else {
@@ -678,5 +1314,3 @@
 
     return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
 }
-
-