Adapt GrRectBlur for cases with rect W/H less than six sigma.

Add GM that generates reference blur rect images and compares against
actual.

Change-Id: If0ce291e211fefe96af8afdf0a60636b5f40ef47
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/239757
Commit-Queue: Brian Salomon <bsalomon@google.com>
Reviewed-by: Robert Phillips <robertphillips@google.com>
diff --git a/src/gpu/effects/GrRectBlurEffect.fp b/src/gpu/effects/GrRectBlurEffect.fp
index f357066..175fa65 100644
--- a/src/gpu/effects/GrRectBlurEffect.fp
+++ b/src/gpu/effects/GrRectBlurEffect.fp
@@ -6,6 +6,8 @@
  */
 
 @header {
+#include <cmath>
+#include "include/core/SkRect.h"
 #include "include/core/SkScalar.h"
 #include "src/core/SkBlurMask.h"
 #include "src/core/SkMathPriv.h"
@@ -21,53 +23,66 @@
 layout(when= highp) uniform float4 rectF;
 layout(when=!highp) uniform half4  rectH;
 
-in uniform sampler2D blurProfile;
-in uniform half invProfileWidth;
+// Texture that is a LUT for integral of normal distribution. The value at x (where x is a texture
+// coord between 0 and 1) is the integral from -inf to (3 * sigma * (-2 * x + 1)). I.e. x in [0, 1]
+// is mapped to the range 3*sigma to -3*sigma. The flip saves a reversal in the shader.
+in uniform sampler2D integral;
+// Used to produce normalized texture coords for lookups in 'integral'
+in uniform half invSixSigma;
+
+// There is a fast variant of the effect that does 2 texture lookups and a more general one for
+// wider blurs relative to rect sizes that does 4.
+layout(key) in bool isFast;
 
 @constructorParams {
     GrSamplerState samplerParams
 }
 
-@samplerParams(blurProfile) {
+@samplerParams(integral) {
     samplerParams
 }
 @class {
-static sk_sp<GrTextureProxy> CreateBlurProfileTexture(GrProxyProvider* proxyProvider,
-                                                      float sixSigma) {
-    // The "profile" we are calculating is the integral of a Gaussian with 'sigma' and a half
-    // plane. All such profiles are just scales of each other. So all we really care about is
-    // having enough resolution so that the linear interpolation done in texture lookup doesn't
-    // introduce noticeable artifacts. SkBlurMask::ComputeBlurProfile() produces profiles with
-    // ceil(6 * sigma) entries. We conservatively choose to have 2 texels for each dst pixel.
-    int minProfileWidth = 2 * sk_float_ceil2int(sixSigma);
-    // Bin by powers of 2 with a minimum so we get good profile reuse (remember we can just scale
-    // the texture coords to span the larger profile over a 6 sigma distance).
-    int profileWidth = SkTMax(SkNextPow2(minProfileWidth), 32);
+static sk_sp<GrTextureProxy> CreateIntegralTexture(GrProxyProvider* proxyProvider,
+                                                   float sixSigma) {
+    // The texture we're producing represents the integral of a normal distribution over a six-sigma
+    // range centered at zero. We want enough resolution so that the linear interpolation done in
+    // texture lookup doesn't introduce noticeable artifacts. We conservatively choose to have 2
+    // texels for each dst pixel.
+    int minWidth = 2 * sk_float_ceil2int(sixSigma);
+    // Bin by powers of 2 with a minimum so we get good profile reuse.
+    int width = SkTMax(SkNextPow2(minWidth), 32);
 
     static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
     GrUniqueKey key;
     GrUniqueKey::Builder builder(&key, kDomain, 1, "Rect Blur Mask");
-    builder[0] = profileWidth;
+    builder[0] = width;
     builder.finish();
 
-    sk_sp<GrTextureProxy> blurProfile(proxyProvider->findOrCreateProxyByUniqueKey(
+    sk_sp<GrTextureProxy> proxy(proxyProvider->findOrCreateProxyByUniqueKey(
             key, GrColorType::kAlpha_8, kTopLeft_GrSurfaceOrigin));
-    if (!blurProfile) {
+    if (!proxy) {
         SkBitmap bitmap;
-        if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
+        if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(width, 1))) {
             return nullptr;
         }
-        SkBlurMask::ComputeBlurProfile(bitmap.getAddr8(0, 0), profileWidth, profileWidth / 6.f);
+        *bitmap.getAddr8(0, 0) = 255;
+        const float invWidth = 1.f / width;
+        for (int i = 1; i < width - 1; ++i) {
+            float x = (i + 0.5f) * invWidth;
+            x = (-6 * x + 3) * SK_ScalarRoot2Over2;
+            float integral = 0.5f * (std::erf(x) + 1.f);
+            *bitmap.getAddr8(i, 0) = SkToU8(sk_float_round2int(255.f * integral));
+        }
+        *bitmap.getAddr8(width - 1, 0) = 0;
         bitmap.setImmutable();
-        blurProfile = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
-        if (!blurProfile) {
+        proxy = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
+        if (!proxy) {
             return nullptr;
         }
-        SkASSERT(blurProfile->origin() == kTopLeft_GrSurfaceOrigin);
-        proxyProvider->assignUniqueKeyToProxy(key, blurProfile.get());
+        SkASSERT(proxy->origin() == kTopLeft_GrSurfaceOrigin);
+        proxyProvider->assignUniqueKeyToProxy(key, proxy.get());
     }
-
-    return blurProfile;
+    return proxy;
 }
 }
 
@@ -75,6 +90,7 @@
      static std::unique_ptr<GrFragmentProcessor> Make(GrProxyProvider* proxyProvider,
                                                       const GrShaderCaps& caps,
                                                       const SkRect& rect, float sigma) {
+         SkASSERT(rect.isSorted());
          if (!caps.floatIs32Bits()) {
              // We promote the math that gets us into the Gaussian space to full float when the rect
              // coords are large. If we don't have full float then fail. We could probably clip the
@@ -85,55 +101,91 @@
              }
          }
 
-         // The profilee straddles the rect edges (half inside, half outside). Thus if the profile
-         // size is greater than the rect width/height then the area at the center of the rect is
-         // influenced by both edges. This is not handled by this effect.
-         float profileSize = 6 * sigma;
-         if (profileSize >= (float) rect.width() || profileSize >= (float) rect.height()) {
-             // if the blur sigma is too large so the gaussian overlaps the whole
-             // rect in either direction, fall back to CPU path for now.
+         const float sixSigma = 6 * sigma;
+         auto integral = CreateIntegralTexture(proxyProvider, sixSigma);
+         if (!integral) {
              return nullptr;
          }
 
-         auto profile = CreateBlurProfileTexture(proxyProvider, profileSize);
-         if (!profile) {
-             return nullptr;
-         }
-         // The profile is calculated such that the midpoint is at the rect's edge. To simplify
-         // calculating texture coords in the shader, we inset the rect such that the profile
-         // can be used with one end point aligned to the edges of the rect uniform. The texture
-         // coords should be scaled such that the profile is sampled over a 6 sigma range so inset
-         // by 3 sigma.
-         float halfWidth = profileSize / 2;
-         auto insetR = rect.makeInset(halfWidth, halfWidth);
-         // inverse of the width over which the profile texture should be interpolated outward from
-         // the inset rect.
-         float invWidth = 1.f / profileSize;
-         return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(
-                 insetR, std::move(profile), invWidth, GrSamplerState::ClampBilerp()));
+         // In the fast variant we think of the midpoint of the integral texture as aligning
+         // with the closest rect edge both in x and y. To simplify texture coord calculation we
+         // inset the rect so that the edge of the inset rect corresponds to t = 0 in the texture.
+         // It actually simplifies things a bit in the !isFast case, too.
+         float threeSigma = sixSigma / 2;
+         SkRect insetRect = {rect.fLeft   + threeSigma,
+                             rect.fTop    + threeSigma,
+                             rect.fRight  - threeSigma,
+                             rect.fBottom - threeSigma};
+
+         // In our fast variant we find the nearest horizontal and vertical edges, do a lookup
+         // in the integral texture for each, and multiply the results. When the rect is
+         // less than 6 sigma wide then things aren't so simple and we have to consider both the
+         // left and right edge of the rectangle (and similar in y).
+         bool isFast = insetRect.isSorted();
+         // 6 * sigma is the width of the domain of the integral texture. We use its inverse to
+         // produce normalized texture coords from frag coord distances.
+         float invSixSigma = 1.f / sixSigma;
+         return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(insetRect,
+                 std::move(integral), invSixSigma, isFast, GrSamplerState::ClampBilerp()));
      }
 }
 
 void main() {
-        // Get the smaller of the signed distance from the frag coord to the left and right edges
-        // and similar for y.
-        // The blur profile computed by SkMaskFilter::ComputeBlurProfile is actually 1 - integral.
-        // The integral is an S-looking shape that is symmetric about 0, so we just  compute x and
-        // "backwards" such that texture coord is 1 at the edge and goes to 0 as we move outward.
-        half x;
-        @if (highp) {
-            x = max(half(rectF.x - sk_FragCoord.x), half(sk_FragCoord.x - rectF.z));
+        half xCoverage, yCoverage;
+        @if (isFast) {
+            // Get the smaller of the signed distance from the frag coord to the left and right
+            // edges and similar for y.
+            // The integral texture goes "backwards" (from 3*sigma to -3*sigma), so the below
+            // computations align the left edge of the integral texture with the inset rect's edge
+            // extending outward 6 * sigma from the inset rect.
+            half x, y;
+            @if (highp) {
+                x = max(half(rectF.x - sk_FragCoord.x), half(sk_FragCoord.x - rectF.z));
+                y = max(half(rectF.y - sk_FragCoord.y), half(sk_FragCoord.y - rectF.w));
+           } else {
+                x = max(half(rectH.x - sk_FragCoord.x), half(sk_FragCoord.x - rectH.z));
+                y = max(half(rectH.y - sk_FragCoord.y), half(sk_FragCoord.y - rectH.w));
+            }
+            xCoverage = sample(integral, half2(x * invSixSigma, 0.5)).a;
+            yCoverage = sample(integral, half2(y * invSixSigma, 0.5)).a;
+            sk_OutColor = sk_InColor * xCoverage * yCoverage;
         } else {
-            x = max(half(rectH.x - sk_FragCoord.x), half(sk_FragCoord.x - rectH.z));
+            // We consider just the x direction here. In practice we compute x and y separately
+            // and multiply them together.
+            // We define our coord system so that the point at which we're evaluating a kernel
+            // defined by the normal distribution (K) is 0. In this coord system let L be the left
+            // edge and R be the right edge of the rectangle.
+            // We can calculate the coverage C by integrating K with the half infinite ranges
+            // outside the L to R range and subtracting from 1:
+            //   C = 1 - <integral of K from -inf to  L> - <integral of K from R to inf>
+            // K is symmetric about x=0 so:
+            //   C = 1 - <integral of K from -inf to  L> - <integral of K from -inf to -R>
+
+            // The integral texture goes "backwards" (from 3*sigma to -3*sigma) which is factored
+            // in to the below calculations.
+            // Also, our rect uniform was pre-inset by 3 sigma from the actual rect being blurred,
+            // also factored in.
+            half l, r, t, b;
+            @if (highp) {
+                l = half(sk_FragCoord.x - rectF.x);
+                r = half(rectF.z - sk_FragCoord.x);
+                t = half(sk_FragCoord.y - rectF.y);
+                b = half(rectF.w - sk_FragCoord.y);
+            } else {
+                l = half(sk_FragCoord.x - rectH.x);
+                r = half(rectH.z - sk_FragCoord.x);
+                t = half(sk_FragCoord.y - rectH.y);
+                b = half(rectH.w - sk_FragCoord.y);
+            }
+            half il = 1 + l * invSixSigma;
+            half ir = 1 + r * invSixSigma;
+            half it = 1 + t * invSixSigma;
+            half ib = 1 + b * invSixSigma;
+            xCoverage = 1 - sample(integral, half2(il, 0.5)).a
+                          - sample(integral, half2(ir, 0.5)).a;
+            yCoverage = 1 - sample(integral, half2(it, 0.5)).a
+                          - sample(integral, half2(ib, 0.5)).a;
         }
-        half y;
-        @if (highp) {
-            y = max(half(rectF.y - sk_FragCoord.y), half(sk_FragCoord.y - rectF.w));
-        } else {
-            y = max(half(rectH.y - sk_FragCoord.y), half(sk_FragCoord.y - rectH.w));
-        }
-        half xCoverage = sample(blurProfile, half2(x * invProfileWidth, 0.5)).a;
-        half yCoverage = sample(blurProfile, half2(y * invProfileWidth, 0.5)).a;
         sk_OutColor = sk_InColor * xCoverage * yCoverage;
 }
 
diff --git a/src/gpu/effects/generated/GrRectBlurEffect.cpp b/src/gpu/effects/generated/GrRectBlurEffect.cpp
index a717040..186dcfe 100644
--- a/src/gpu/effects/generated/GrRectBlurEffect.cpp
+++ b/src/gpu/effects/generated/GrRectBlurEffect.cpp
@@ -25,8 +25,10 @@
         (void)_outer;
         auto rect = _outer.rect;
         (void)rect;
-        auto invProfileWidth = _outer.invProfileWidth;
-        (void)invProfileWidth;
+        auto invSixSigma = _outer.invSixSigma;
+        (void)invSixSigma;
+        auto isFast = _outer.isFast;
+        (void)isFast;
         highp = ((abs(rect.left()) > 16000.0 || abs(rect.top()) > 16000.0) ||
                  abs(rect.right()) > 16000.0) ||
                 abs(rect.bottom()) > 16000.0;
@@ -38,33 +40,65 @@
             rectHVar = args.fUniformHandler->addUniform(kFragment_GrShaderFlag, kHalf4_GrSLType,
                                                         "rectH");
         }
-        invProfileWidthVar = args.fUniformHandler->addUniform(kFragment_GrShaderFlag,
-                                                              kHalf_GrSLType, "invProfileWidth");
+        invSixSigmaVar = args.fUniformHandler->addUniform(kFragment_GrShaderFlag, kHalf_GrSLType,
+                                                          "invSixSigma");
         fragBuilder->codeAppendf(
-                "/* key */ bool highp = %s;\nhalf x;\n@if (highp) {\n    x = max(half(%s.x - "
-                "sk_FragCoord.x), half(sk_FragCoord.x - %s.z));\n} else {\n    x = "
-                "max(half(float(%s.x) - sk_FragCoord.x), half(sk_FragCoord.x - "
-                "float(%s.z)));\n}\nhalf y;\n@if (highp) {\n    y = max(half(%s.y - "
-                "sk_FragCoord.y), half(sk_FragCoord.y - %s.w));\n} else {\n    y = "
-                "max(half(float(%s.y) - sk_FragCoord.y), half(sk_FragCoord.y - "
-                "float(%s.w)));\n}\nhalf xCoverage = sample(%s, float2(half2(x * %s, "
-                "0.5))).%s.w;\nhalf yCoverage = sample(%s, flo",
-                (highp ? "true" : "false"),
+                "/* key */ bool highp = %s;\nhalf xCoverage, yCoverage;\n@if (%s) {\n    half x, "
+                "y;\n    @if (highp) {\n        x = max(half(%s.x - sk_FragCoord.x), "
+                "half(sk_FragCoord.x - %s.z));\n        y = max(half(%s.y - sk_FragCoord.y), "
+                "half(sk_FragCoord.y - %s.w));\n    } else {\n        x = max(half(float(%s.x) - "
+                "sk_FragCoord.x), half(sk_FragCoord.x - float(%s.z)));\n        y = "
+                "max(half(float(%s.y) - sk_FragCoord.y), half(sk_FragCoord.y - float(%s.w)));\n    "
+                "}\n    xCoverage = sample(%s, float2(half2(x * %s, 0.5))).",
+                (highp ? "true" : "false"), (_outer.isFast ? "true" : "false"),
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
-                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
-                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
                 rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
                 fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
-                args.fUniformHandler->getUniformCStr(invProfileWidthVar),
-                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
-                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]));
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar));
         fragBuilder->codeAppendf(
-                "at2(half2(y * %s, 0.5))).%s.w;\n%s = (%s * xCoverage) * yCoverage;\n",
-                args.fUniformHandler->getUniformCStr(invProfileWidthVar),
+                "%s.w;\n    yCoverage = sample(%s, float2(half2(y * %s, 0.5))).%s.w;\n    %s = (%s "
+                "* xCoverage) * yCoverage;\n} else {\n    half l, r, t, b;\n    @if (highp) {\n    "
+                "    l = half(sk_FragCoord.x - %s.x);\n        r = half(%s.z - sk_FragCoord.x);\n  "
+                "      t = half(sk_FragCoord.y - %s.y);\n        b = half(%s.w - "
+                "sk_FragCoord.y);\n    } else {\n        l = half(sk_FragCoord.x - float(%s.x));\n "
+                "       r = half(float(%s.z) - sk_FragCoord.x);\n        t = half(sk_FragCoord.y - "
+                "float(%s.y));\n        b = half(float(",
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                args.fOutputColor, args.fInputColor,
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectFVar.isValid() ? args.fUniformHandler->getUniformCStr(rectFVar) : "float4(0)",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)");
+        fragBuilder->codeAppendf(
+                "%s.w) - sk_FragCoord.y);\n    }\n    half il = 1.0 + l * %s;\n    half ir = 1.0 + "
+                "r * %s;\n    half it = 1.0 + t * %s;\n    half ib = 1.0 + b * %s;\n    xCoverage "
+                "= (1.0 - sample(%s, float2(half2(il, 0.5))).%s.w) - sample(%s, float2(half2(ir, "
+                "0.5))).%s.w;\n    yCoverage = (1.0 - sample(%s, float2(half2(it, 0.5))).%s.w) - "
+                "sample(%s, float2(half2(ib, 0.5))).%s.w;\n}\n%s = (%s * xCoverage) * yCoverage;\n",
+                rectHVar.isValid() ? args.fUniformHandler->getUniformCStr(rectHVar) : "half4(0)",
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                args.fUniformHandler->getUniformCStr(invSixSigmaVar),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
+                fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
+                fragBuilder->getProgramBuilder()->samplerVariable(args.fTexSamplers[0]),
                 fragBuilder->getProgramBuilder()->samplerSwizzle(args.fTexSamplers[0]).c_str(),
                 args.fOutputColor, args.fInputColor);
     }
@@ -73,18 +107,20 @@
     void onSetData(const GrGLSLProgramDataManager& pdman,
                    const GrFragmentProcessor& _proc) override {
         const GrRectBlurEffect& _outer = _proc.cast<GrRectBlurEffect>();
-        { pdman.set1f(invProfileWidthVar, (_outer.invProfileWidth)); }
+        { pdman.set1f(invSixSigmaVar, (_outer.invSixSigma)); }
         auto rect = _outer.rect;
         (void)rect;
         UniformHandle& rectF = rectFVar;
         (void)rectF;
         UniformHandle& rectH = rectHVar;
         (void)rectH;
-        GrSurfaceProxy& blurProfileProxy = *_outer.textureSampler(0).proxy();
-        GrTexture& blurProfile = *blurProfileProxy.peekTexture();
-        (void)blurProfile;
-        UniformHandle& invProfileWidth = invProfileWidthVar;
-        (void)invProfileWidth;
+        GrSurfaceProxy& integralProxy = *_outer.textureSampler(0).proxy();
+        GrTexture& integral = *integralProxy.peekTexture();
+        (void)integral;
+        UniformHandle& invSixSigma = invSixSigmaVar;
+        (void)invSixSigma;
+        auto isFast = _outer.isFast;
+        (void)isFast;
 
         float r[]{rect.fLeft, rect.fTop, rect.fRight, rect.fBottom};
         pdman.set4fv(highp ? rectF : rectH, 1, r);
@@ -92,7 +128,7 @@
     bool highp = false;
     UniformHandle rectFVar;
     UniformHandle rectHVar;
-    UniformHandle invProfileWidthVar;
+    UniformHandle invSixSigmaVar;
 };
 GrGLSLFragmentProcessor* GrRectBlurEffect::onCreateGLSLInstance() const {
     return new GrGLSLRectBlurEffect();
@@ -103,27 +139,30 @@
                   abs(rect.right()) > 16000.0) ||
                  abs(rect.bottom()) > 16000.0;
     b->add32((int32_t)highp);
+    b->add32((int32_t)isFast);
 }
 bool GrRectBlurEffect::onIsEqual(const GrFragmentProcessor& other) const {
     const GrRectBlurEffect& that = other.cast<GrRectBlurEffect>();
     (void)that;
     if (rect != that.rect) return false;
-    if (blurProfile != that.blurProfile) return false;
-    if (invProfileWidth != that.invProfileWidth) return false;
+    if (integral != that.integral) return false;
+    if (invSixSigma != that.invSixSigma) return false;
+    if (isFast != that.isFast) return false;
     return true;
 }
 GrRectBlurEffect::GrRectBlurEffect(const GrRectBlurEffect& src)
         : INHERITED(kGrRectBlurEffect_ClassID, src.optimizationFlags())
         , rect(src.rect)
-        , blurProfile(src.blurProfile)
-        , invProfileWidth(src.invProfileWidth) {
+        , integral(src.integral)
+        , invSixSigma(src.invSixSigma)
+        , isFast(src.isFast) {
     this->setTextureSamplerCnt(1);
 }
 std::unique_ptr<GrFragmentProcessor> GrRectBlurEffect::clone() const {
     return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(*this));
 }
 const GrFragmentProcessor::TextureSampler& GrRectBlurEffect::onTextureSampler(int index) const {
-    return IthTextureSampler(index, blurProfile);
+    return IthTextureSampler(index, integral);
 }
 GR_DEFINE_FRAGMENT_PROCESSOR_TEST(GrRectBlurEffect);
 #if GR_TEST_UTILS
diff --git a/src/gpu/effects/generated/GrRectBlurEffect.h b/src/gpu/effects/generated/GrRectBlurEffect.h
index 5202d1a..b0c86bd 100644
--- a/src/gpu/effects/generated/GrRectBlurEffect.h
+++ b/src/gpu/effects/generated/GrRectBlurEffect.h
@@ -12,6 +12,8 @@
 #define GrRectBlurEffect_DEFINED
 #include "include/core/SkTypes.h"
 
+#include <cmath>
+#include "include/core/SkRect.h"
 #include "include/core/SkScalar.h"
 #include "src/core/SkBlurMask.h"
 #include "src/core/SkMathPriv.h"
@@ -22,47 +24,53 @@
 #include "src/gpu/GrFragmentProcessor.h"
 class GrRectBlurEffect : public GrFragmentProcessor {
 public:
-    static sk_sp<GrTextureProxy> CreateBlurProfileTexture(GrProxyProvider* proxyProvider,
-                                                          float sixSigma) {
-        // The "profile" we are calculating is the integral of a Gaussian with 'sigma' and a half
-        // plane. All such profiles are just scales of each other. So all we really care about is
-        // having enough resolution so that the linear interpolation done in texture lookup doesn't
-        // introduce noticeable artifacts. SkBlurMask::ComputeBlurProfile() produces profiles with
-        // ceil(6 * sigma) entries. We conservatively choose to have 2 texels for each dst pixel.
-        int minProfileWidth = 2 * sk_float_ceil2int(sixSigma);
-        // Bin by powers of 2 with a minimum so we get good profile reuse (remember we can just
-        // scale the texture coords to span the larger profile over a 6 sigma distance).
-        int profileWidth = SkTMax(SkNextPow2(minProfileWidth), 32);
+    static sk_sp<GrTextureProxy> CreateIntegralTexture(GrProxyProvider* proxyProvider,
+                                                       float sixSigma) {
+        // The texture we're producing represents the integral of a normal distribution over a
+        // six-sigma range centered at zero. We want enough resolution so that the linear
+        // interpolation done in texture lookup doesn't introduce noticeable artifacts. We
+        // conservatively choose to have 2 texels for each dst pixel.
+        int minWidth = 2 * sk_float_ceil2int(sixSigma);
+        // Bin by powers of 2 with a minimum so we get good profile reuse.
+        int width = SkTMax(SkNextPow2(minWidth), 32);
 
         static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
         GrUniqueKey key;
         GrUniqueKey::Builder builder(&key, kDomain, 1, "Rect Blur Mask");
-        builder[0] = profileWidth;
+        builder[0] = width;
         builder.finish();
 
-        sk_sp<GrTextureProxy> blurProfile(proxyProvider->findOrCreateProxyByUniqueKey(
+        sk_sp<GrTextureProxy> proxy(proxyProvider->findOrCreateProxyByUniqueKey(
                 key, GrColorType::kAlpha_8, kTopLeft_GrSurfaceOrigin));
-        if (!blurProfile) {
+        if (!proxy) {
             SkBitmap bitmap;
-            if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(profileWidth, 1))) {
+            if (!bitmap.tryAllocPixels(SkImageInfo::MakeA8(width, 1))) {
                 return nullptr;
             }
-            SkBlurMask::ComputeBlurProfile(bitmap.getAddr8(0, 0), profileWidth, profileWidth / 6.f);
+            *bitmap.getAddr8(0, 0) = 255;
+            const float invWidth = 1.f / width;
+            for (int i = 1; i < width - 1; ++i) {
+                float x = (i + 0.5f) * invWidth;
+                x = (-6 * x + 3) * SK_ScalarRoot2Over2;
+                float integral = 0.5f * (std::erf(x) + 1.f);
+                *bitmap.getAddr8(i, 0) = SkToU8(sk_float_round2int(255.f * integral));
+            }
+            *bitmap.getAddr8(width - 1, 0) = 0;
             bitmap.setImmutable();
-            blurProfile = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
-            if (!blurProfile) {
+            proxy = proxyProvider->createProxyFromBitmap(bitmap, GrMipMapped::kNo);
+            if (!proxy) {
                 return nullptr;
             }
-            SkASSERT(blurProfile->origin() == kTopLeft_GrSurfaceOrigin);
-            proxyProvider->assignUniqueKeyToProxy(key, blurProfile.get());
+            SkASSERT(proxy->origin() == kTopLeft_GrSurfaceOrigin);
+            proxyProvider->assignUniqueKeyToProxy(key, proxy.get());
         }
-
-        return blurProfile;
+        return proxy;
     }
 
     static std::unique_ptr<GrFragmentProcessor> Make(GrProxyProvider* proxyProvider,
                                                      const GrShaderCaps& caps, const SkRect& rect,
                                                      float sigma) {
+        SkASSERT(rect.isSorted());
         if (!caps.floatIs32Bits()) {
             // We promote the math that gets us into the Gaussian space to full float when the rect
             // coords are large. If we don't have full float then fail. We could probably clip the
@@ -73,48 +81,49 @@
             }
         }
 
-        // The profilee straddles the rect edges (half inside, half outside). Thus if the profile
-        // size is greater than the rect width/height then the area at the center of the rect is
-        // influenced by both edges. This is not handled by this effect.
-        float profileSize = 6 * sigma;
-        if (profileSize >= (float)rect.width() || profileSize >= (float)rect.height()) {
-            // if the blur sigma is too large so the gaussian overlaps the whole
-            // rect in either direction, fall back to CPU path for now.
+        const float sixSigma = 6 * sigma;
+        auto integral = CreateIntegralTexture(proxyProvider, sixSigma);
+        if (!integral) {
             return nullptr;
         }
 
-        auto profile = CreateBlurProfileTexture(proxyProvider, profileSize);
-        if (!profile) {
-            return nullptr;
-        }
-        // The profile is calculated such that the midpoint is at the rect's edge. To simplify
-        // calculating texture coords in the shader, we inset the rect such that the profile
-        // can be used with one end point aligned to the edges of the rect uniform. The texture
-        // coords should be scaled such that the profile is sampled over a 6 sigma range so inset
-        // by 3 sigma.
-        float halfWidth = profileSize / 2;
-        auto insetR = rect.makeInset(halfWidth, halfWidth);
-        // inverse of the width over which the profile texture should be interpolated outward from
-        // the inset rect.
-        float invWidth = 1.f / profileSize;
-        return std::unique_ptr<GrFragmentProcessor>(new GrRectBlurEffect(
-                insetR, std::move(profile), invWidth, GrSamplerState::ClampBilerp()));
+        // In the fast variant we think of the midpoint of the integral texture as aligning
+        // with the closest rect edge both in x and y. To simplify texture coord calculation we
+        // inset the rect so that the edge of the inset rect corresponds to t = 0 in the texture.
+        // It actually simplifies things a bit in the !isFast case, too.
+        float threeSigma = sixSigma / 2;
+        SkRect insetRect = {rect.fLeft + threeSigma, rect.fTop + threeSigma,
+                            rect.fRight - threeSigma, rect.fBottom - threeSigma};
+
+        // In our fast variant we find the nearest horizontal and vertical edges, do a lookup
+        // in the integral texture for each, and multiply the results. When the rect is
+        // less than 6 sigma wide then things aren't so simple and we have to consider both the
+        // left and right edge of the rectangle (and similar in y).
+        bool isFast = insetRect.isSorted();
+        // 6 * sigma is the width of the domain of the integral texture. We use its inverse to
+        // produce normalized texture coords from frag coord distances.
+        float invSixSigma = 1.f / sixSigma;
+        return std::unique_ptr<GrFragmentProcessor>(
+                new GrRectBlurEffect(insetRect, std::move(integral), invSixSigma, isFast,
+                                     GrSamplerState::ClampBilerp()));
     }
     GrRectBlurEffect(const GrRectBlurEffect& src);
     std::unique_ptr<GrFragmentProcessor> clone() const override;
     const char* name() const override { return "RectBlurEffect"; }
     SkRect rect;
-    TextureSampler blurProfile;
-    float invProfileWidth;
+    TextureSampler integral;
+    float invSixSigma;
+    bool isFast;
 
 private:
-    GrRectBlurEffect(SkRect rect, sk_sp<GrTextureProxy> blurProfile, float invProfileWidth,
+    GrRectBlurEffect(SkRect rect, sk_sp<GrTextureProxy> integral, float invSixSigma, bool isFast,
                      GrSamplerState samplerParams)
             : INHERITED(kGrRectBlurEffect_ClassID,
                         (OptimizationFlags)kCompatibleWithCoverageAsAlpha_OptimizationFlag)
             , rect(rect)
-            , blurProfile(std::move(blurProfile), samplerParams)
-            , invProfileWidth(invProfileWidth) {
+            , integral(std::move(integral), samplerParams)
+            , invSixSigma(invSixSigma)
+            , isFast(isFast) {
         this->setTextureSamplerCnt(1);
     }
     GrGLSLFragmentProcessor* onCreateGLSLInstance() const override;