Add MSAA and non-aa modes to GrFillRRect Op

Adds a non-aa mode and an MSAA mode that uses the sample mask. Also
adds a new cap to decide whether we prefer this new sample mask Op for
large round rects, or whether it's faster to just continue drawing
them as paths like before.

Bug: skia:
Change-Id: Ic344ace26e7889c312c3040ad345b4d9a717f96d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/204135
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Chris Dalton <csmartdalton@google.com>
diff --git a/gm/samplelocations.cpp b/gm/samplelocations.cpp
index 2efe192..d103483 100644
--- a/gm/samplelocations.cpp
+++ b/gm/samplelocations.cpp
@@ -138,7 +138,7 @@
                            f->sampleOffsets(), coord.fsIn());
         f->codeAppendf(    "if (all(lessThanEqual(abs(samplecoord), float2(1)))) {");
         f->maskOffMultisampleCoverage(
-                "~(1 << i)", GrGLSLFragmentShaderBuilder::Scope::kInsideLoopOrBranch);
+                "~(1 << i)", GrGLSLFPFragmentBuilder::ScopeFlags::kInsideLoop);
         f->codeAppendf(    "}");
         f->codeAppendf("}");
     }
diff --git a/src/gpu/GrCaps.cpp b/src/gpu/GrCaps.cpp
index 4011b37..7c4bcd7 100644
--- a/src/gpu/GrCaps.cpp
+++ b/src/gpu/GrCaps.cpp
@@ -74,6 +74,8 @@
 
     fPreferVRAMUseOverFlushes = true;
 
+    fPreferTrianglesOverSampleMask = false;
+
     // Default to true, allow older versions of OpenGL to disable explicitly
     fClampToBorderSupport = true;
 
@@ -210,6 +212,8 @@
     writer->appendBool("Blacklist Coverage Counting Path Renderer [workaround]",
                        fBlacklistCoverageCounting);
     writer->appendBool("Prefer VRAM Use over flushes [workaround]", fPreferVRAMUseOverFlushes);
+    writer->appendBool("Prefer more triangles over sample mask [MSAA only]",
+                       fPreferTrianglesOverSampleMask);
     writer->appendBool("Avoid stencil buffers [workaround]", fAvoidStencilBuffers);
 
     if (this->advancedBlendEquationSupport()) {
diff --git a/src/gpu/GrCaps.h b/src/gpu/GrCaps.h
index b11077a..74cdb7a 100644
--- a/src/gpu/GrCaps.h
+++ b/src/gpu/GrCaps.h
@@ -73,6 +73,8 @@
 
     bool preferVRAMUseOverFlushes() const { return fPreferVRAMUseOverFlushes; }
 
+    bool preferTrianglesOverSampleMask() const { return fPreferTrianglesOverSampleMask; }
+
     bool blacklistCoverageCounting() const { return fBlacklistCoverageCounting; }
 
     bool avoidStencilBuffers() const { return fAvoidStencilBuffers; }
@@ -358,6 +360,9 @@
     // ANGLE performance workaround
     bool fPreferVRAMUseOverFlushes                   : 1;
 
+    // On some platforms it's better to make more triangles than to use the sample mask (MSAA only).
+    bool fPreferTrianglesOverSampleMask              : 1;
+
     // TODO: this may need to be an enum to support different fence types
     bool fFenceSyncSupport                           : 1;
 
diff --git a/src/gpu/GrRenderTargetContext.cpp b/src/gpu/GrRenderTargetContext.cpp
index 19c7c42..ff3c7c6 100644
--- a/src/gpu/GrRenderTargetContext.cpp
+++ b/src/gpu/GrRenderTargetContext.cpp
@@ -1225,21 +1225,22 @@
     AutoCheckFlush acf(this->drawingManager());
 
     GrAAType aaType = this->chooseAAType(aa);
-    if (GrAAType::kCoverage == aaType) {
-        std::unique_ptr<GrDrawOp> op;
-        if (style.isSimpleFill()) {
-            op = GrFillRRectOp::Make(fContext, viewMatrix, rrect, *this->caps(), std::move(paint));
-        }
-        if (!op) {
-            assert_alive(paint);
-            op = GrOvalOpFactory::MakeRRectOp(fContext, std::move(paint), viewMatrix, rrect, stroke,
-                                              this->caps()->shaderCaps());
-        }
 
-        if (op) {
-            this->addDrawOp(*clip, std::move(op));
-            return;
-        }
+    std::unique_ptr<GrDrawOp> op;
+    if (style.isSimpleFill()) {
+        assert_alive(paint);
+        op = GrFillRRectOp::Make(
+                fContext, aaType, viewMatrix, rrect, *this->caps(), std::move(paint));
+    }
+    if (!op && GrAAType::kCoverage == aaType) {
+        assert_alive(paint);
+        op = GrOvalOpFactory::MakeRRectOp(
+                fContext, std::move(paint), viewMatrix, rrect, stroke, this->caps()->shaderCaps());
+
+    }
+    if (op) {
+        this->addDrawOp(*clip, std::move(op));
+        return;
     }
 
     assert_alive(paint);
@@ -1630,29 +1631,31 @@
     AutoCheckFlush acf(this->drawingManager());
 
     GrAAType aaType = this->chooseAAType(aa);
-    if (GrAAType::kCoverage == aaType) {
-        std::unique_ptr<GrDrawOp> op;
+
+    std::unique_ptr<GrDrawOp> op;
+    if (style.isSimpleFill()) {
         // GrFillRRectOp has special geometry and a fragment-shader branch to conditionally evaluate
         // the arc equation. This same special geometry and fragment branch also turn out to be a
         // substantial optimization for drawing ovals (namely, by not evaluating the arc equation
         // inside the oval's inner diamond). Given these optimizations, it's a clear win to draw
         // ovals the exact same way we do round rects.
         //
-        // However, we still don't draw true circles as round rects, because it can cause perf
-        // regressions on some platforms as compared to the dedicated circle Op.
-        if (style.isSimpleFill() && oval.height() != oval.width()) {
-            op = GrFillRRectOp::Make(
-                    fContext, viewMatrix, SkRRect::MakeOval(oval), *this->caps(), std::move(paint));
-        }
-        if (!op) {
+        // However, we still don't draw true circles as round rects in coverage mode, because it can
+        // cause perf regressions on some platforms as compared to the dedicated circle Op.
+        if (GrAAType::kCoverage != aaType || oval.height() != oval.width()) {
             assert_alive(paint);
-            op = GrOvalOpFactory::MakeOvalOp(fContext, std::move(paint), viewMatrix, oval, style,
-                                             this->caps()->shaderCaps());
+            op = GrFillRRectOp::Make(fContext, aaType, viewMatrix, SkRRect::MakeOval(oval),
+                                     *this->caps(), std::move(paint));
         }
-        if (op) {
-            this->addDrawOp(clip, std::move(op));
-            return;
-        }
+    }
+    if (!op && GrAAType::kCoverage == aaType) {
+        assert_alive(paint);
+        op = GrOvalOpFactory::MakeOvalOp(fContext, std::move(paint), viewMatrix, oval, style,
+                                         this->caps()->shaderCaps());
+    }
+    if (op) {
+        this->addDrawOp(clip, std::move(op));
+        return;
     }
 
     assert_alive(paint);
diff --git a/src/gpu/gl/GrGLCaps.cpp b/src/gpu/gl/GrGLCaps.cpp
index df8e60e..c1889dd 100644
--- a/src/gpu/gl/GrGLCaps.cpp
+++ b/src/gpu/gl/GrGLCaps.cpp
@@ -526,6 +526,12 @@
     fPreferVRAMUseOverFlushes = !isANGLE;
 #endif
 
+    if (kARM_GrGLVendor == ctxInfo.vendor()) {
+        // ARM seems to do better with larger quantities of fine triangles, as opposed to using the
+        // sample mask. (At least in our current round rect op.)
+        fPreferTrianglesOverSampleMask = true;
+    }
+
     if (kChromium_GrGLDriver == ctxInfo.driver()) {
         fMustClearUploadedBufferData = true;
     }
@@ -759,6 +765,13 @@
         }
     }
 
+    // FIXME: The sample mask round rect op draws nothing on several Adreno and Radeon bots.
+    // Temporarily disable while we investigate.
+    // http://skbug.com/8921
+    if (kQualcomm_GrGLVendor == ctxInfo.vendor() || kATI_GrGLVendor == ctxInfo.vendor()) {
+        shaderCaps->fSampleVariablesSupport = false;
+    }
+
     shaderCaps->fVersionDeclString = get_glsl_version_decl_string(standard,
                                                                   shaderCaps->fGLSLGeneration,
                                                                   fIsCoreProfile);
diff --git a/src/gpu/glsl/GrGLSLFragmentShaderBuilder.cpp b/src/gpu/glsl/GrGLSLFragmentShaderBuilder.cpp
index 891e973..5c4b1dc 100644
--- a/src/gpu/glsl/GrGLSLFragmentShaderBuilder.cpp
+++ b/src/gpu/glsl/GrGLSLFragmentShaderBuilder.cpp
@@ -91,7 +91,8 @@
     return "_sampleOffsets";
 }
 
-void GrGLSLFragmentShaderBuilder::maskOffMultisampleCoverage(const char* mask, Scope scope) {
+void GrGLSLFragmentShaderBuilder::maskOffMultisampleCoverage(
+        const char* mask, ScopeFlags scopeFlags) {
     const GrShaderCaps& shaderCaps = *fProgramBuilder->shaderCaps();
     if (!shaderCaps.sampleVariablesSupport()) {
         SkDEBUGFAIL("Attempted to mask sample coverage without support.");
@@ -101,18 +102,60 @@
         this->addFeature(1 << kSampleVariables_GLSLPrivateFeature, extension);
     }
 
-    if (!fHasInitializedSampleMask && Scope::kTopLevel == scope) {
-        this->codeAppendf("gl_SampleMask[0] = (%s);", mask);
-        fHasInitializedSampleMask = true;
-        return;
+    if (!fHasModifiedSampleMask) {
+        fHasModifiedSampleMask = true;
+        if (ScopeFlags::kTopLevel != scopeFlags) {
+            this->codePrependf("gl_SampleMask[0] = ~0;");
+        }
+        if (!(ScopeFlags::kInsideLoop & scopeFlags)) {
+            this->codeAppendf("gl_SampleMask[0] = (%s);", mask);
+            return;
+        }
     }
-    if (!fHasInitializedSampleMask) {
-        this->codePrependf("gl_SampleMask[0] = ~0;");
-        fHasInitializedSampleMask = true;
-    }
+
     this->codeAppendf("gl_SampleMask[0] &= (%s);", mask);
 }
 
+void GrGLSLFragmentShaderBuilder::applyFnToMultisampleMask(
+        const char* fn, const char* grad, ScopeFlags scopeFlags) {
+    SkASSERT(CustomFeatures::kSampleLocations & fProgramBuilder->header().processorFeatures());
+    SkDEBUGCODE(fUsedProcessorFeaturesThisStage_DebugOnly |= CustomFeatures::kSampleLocations);
+    SkDEBUGCODE(fUsedProcessorFeaturesAllStages_DebugOnly |= CustomFeatures::kSampleLocations);
+
+    int sampleCnt = fProgramBuilder->effectiveSampleCnt();
+    SkASSERT(sampleCnt > 1);
+
+    this->codeAppendf("{");  // Scope the shader locals emitted below (grad, fnwidth, mask).
+
+    if (!grad) {
+        SkASSERT(fProgramBuilder->shaderCaps()->shaderDerivativeSupport());
+        // In order to use HW derivatives, our neighbors within the same primitive must also be
+        // executing the same code. A per-pixel branch makes this pre-condition impossible to
+        // fulfill.
+        SkASSERT(!(ScopeFlags::kInsidePerPixelBranch & scopeFlags));
+        this->codeAppendf("float2 grad = float2(dFdx(%s), dFdy(%s));", fn, fn);
+        this->codeAppendf("float fnwidth = fwidth(%s);", fn);
+        grad = "grad";  // From here on, refer to the shader local emitted just above.
+    } else {
+        this->codeAppendf("float fnwidth = abs(%s.x) + abs(%s.y);", grad, grad);
+    }
+
+    this->codeAppendf("int mask = 0;");
+    this->codeAppendf("if (%s*2 < fnwidth) {", fn);  // Are ANY samples inside the implicit fn?
+    this->codeAppendf(    "if (%s*-2 >= fnwidth) {", fn);  // Are ALL samples inside the implicit?
+    this->codeAppendf(        "mask = ~0;");
+    this->codeAppendf(    "} else for (int i = 0; i < %i; ++i) {", sampleCnt);
+    this->codeAppendf(        "float fnsample = dot(%s, _sampleOffsets[i]) + %s;", grad, fn);
+    this->codeAppendf(        "if (fnsample < 0) {");
+    this->codeAppendf(            "mask |= (1 << i);");
+    this->codeAppendf(        "}");
+    this->codeAppendf(    "}");
+    this->codeAppendf("}");
+    this->maskOffMultisampleCoverage("mask", scopeFlags);
+
+    this->codeAppendf("}");
+}
+
 const char* GrGLSLFragmentShaderBuilder::dstColor() {
     SkDEBUGCODE(fHasReadDstColorThisStage_DebugOnly = true;)
 
@@ -217,10 +260,10 @@
                      == fUsedProcessorFeaturesAllStages_DebugOnly);
 
     if (CustomFeatures::kSampleLocations & fProgramBuilder->header().processorFeatures()) {
-        this->definitions().append("const float2 _sampleOffsets[] = float2[](");
         const GrPipeline& pipeline = fProgramBuilder->pipeline();
         const SkTArray<SkPoint>& sampleLocations =
                 fProgramBuilder->renderTarget()->renderTargetPriv().getSampleLocations(pipeline);
+        this->definitions().append("const float2 _sampleOffsets[] = float2[](");
         for (int i = 0; i < sampleLocations.count(); ++i) {
             SkPoint offset = sampleLocations[i] - SkPoint::Make(.5f, .5f);
             if (kBottomLeft_GrSurfaceOrigin == this->getSurfaceOrigin()) {
diff --git a/src/gpu/glsl/GrGLSLFragmentShaderBuilder.h b/src/gpu/glsl/GrGLSLFragmentShaderBuilder.h
index 94ba82c..3f1c2c5 100644
--- a/src/gpu/glsl/GrGLSLFragmentShaderBuilder.h
+++ b/src/gpu/glsl/GrGLSLFragmentShaderBuilder.h
@@ -54,9 +54,15 @@
      */
     virtual const char* sampleOffsets() = 0;
 
-    enum class Scope : bool {
-        kTopLevel,
-        kInsideLoopOrBranch
+    enum class ScopeFlags {
+        // Every fragment will always execute this code, and will do it exactly once.
+        kTopLevel = 0,
+        // Either all fragments in a given primitive, or none, will execute this code.
+        kInsidePerPrimitiveBranch = (1 << 0),
+        // Any given fragment may or may not execute this code.
+        kInsidePerPixelBranch = (1 << 1),
+        // This code will be executed more than once.
+        kInsideLoop = (1 << 2)
     };
 
     /**
@@ -68,7 +74,21 @@
      *
      * Requires MSAA and GLSL support for sample variables.
      */
-    virtual void maskOffMultisampleCoverage(const char* mask, Scope) = 0;
+    virtual void maskOffMultisampleCoverage(const char* mask, ScopeFlags) = 0;
+
+    /**
+     * Turns off coverage at each sample where the implicit function fn > 0.
+     *
+     * The provided "fn" value represents the implicit function at pixel center. We then approximate
+     * the implicit at each sample by riding the gradient, "grad", linearly from pixel center to
+     * each sample location.
+     *
+     * If "grad" is null, we approximate the gradient using HW derivatives.
+     *
+     * Requires MSAA and GLSL support for sample variables. Also requires HW derivatives if not
+     * providing a gradient.
+     */
+    virtual void applyFnToMultisampleMask(const char* fn, const char* grad, ScopeFlags) = 0;
 
     /**
      * Fragment procs with child procs should call these functions before/after calling emitCode
@@ -82,6 +102,8 @@
     virtual void forceHighPrecision() = 0;
 };
 
+GR_MAKE_BITFIELD_CLASS_OPS(GrGLSLFPFragmentBuilder::ScopeFlags);
+
 /*
  * This class is used by Xfer processors to build their fragment code.
  */
@@ -119,7 +141,8 @@
 
     // GrGLSLFPFragmentBuilder interface.
     const char* sampleOffsets() override;
-    void maskOffMultisampleCoverage(const char* mask, Scope) override;
+    void maskOffMultisampleCoverage(const char* mask, ScopeFlags) override;
+    void applyFnToMultisampleMask(const char* fn, const char* grad, ScopeFlags) override;
     const SkString& getMangleString() const override { return fMangleString; }
     void onBeforeChildProcEmitCode() override;
     void onAfterChildProcEmitCode() override;
@@ -187,7 +210,7 @@
     bool fHasCustomColorOutput = false;
     int fCustomColorOutputIndex = -1;
     bool fHasSecondaryOutput = false;
-    bool fHasInitializedSampleMask = false;
+    bool fHasModifiedSampleMask = false;
     bool fForceHighPrecision = false;
 
     friend class GrGLSLProgramBuilder;
diff --git a/src/gpu/ops/GrFillRRectOp.cpp b/src/gpu/ops/GrFillRRectOp.cpp
index ead28f6..12d4300 100644
--- a/src/gpu/ops/GrFillRRectOp.cpp
+++ b/src/gpu/ops/GrFillRRectOp.cpp
@@ -21,34 +21,41 @@
 
 // Hardware derivatives are not always accurate enough for highly elliptical corners. This method
 // checks to make sure the corners will still all look good if we use HW derivatives.
-static bool can_use_hw_derivatives(const GrShaderCaps&, const SkMatrix&, const SkRRect&);
+static bool can_use_hw_derivatives_with_coverage(
+        const GrShaderCaps&, const SkMatrix&, const SkRRect&);
 
 std::unique_ptr<GrFillRRectOp> GrFillRRectOp::Make(
-        GrRecordingContext* ctx, const SkMatrix& viewMatrix, const SkRRect& rrect,
+        GrRecordingContext* ctx, GrAAType aaType, const SkMatrix& viewMatrix, const SkRRect& rrect,
         const GrCaps& caps, GrPaint&& paint) {
     if (!caps.instanceAttribSupport()) {
         return nullptr;
     }
 
-    // TODO: Support perspective in a follow-on CL. This shouldn't be difficult, since we already
-    // use HW derivatives. The only trick will be adjusting the AA outset to account for
-    // perspective.  (i.e., outset = 0.5 * z.)
-    if (viewMatrix.hasPerspective()) {
-        return nullptr;
-    }
-
-    GrOpMemoryPool* pool = ctx->priv().opMemoryPool();
-    return pool->allocate<GrFillRRectOp>(*caps.shaderCaps(), viewMatrix, rrect, std::move(paint));
-}
-
-GrFillRRectOp::GrFillRRectOp(const GrShaderCaps& shaderCaps, const SkMatrix& viewMatrix,
-                             const SkRRect& rrect, GrPaint&& paint)
-        : GrDrawOp(ClassID())
-        , fOriginalColor(paint.getColor4f())
-        , fLocalRect(rrect.rect())
-        , fProcessors(std::move(paint)) {
-    if (can_use_hw_derivatives(shaderCaps, viewMatrix, rrect)) {
-        fFlags |= Flags::kUseHWDerivatives;
+    Flags flags = Flags::kNone;
+    if (GrAAType::kCoverage == aaType) {
+        // TODO: Support perspective in a follow-on CL. This shouldn't be difficult, since we
+        // already use HW derivatives. The only trick will be adjusting the AA outset to account for
+        // perspective. (i.e., outset = 0.5 * z.)
+        if (viewMatrix.hasPerspective()) {
+            return nullptr;
+        }
+        if (can_use_hw_derivatives_with_coverage(*caps.shaderCaps(), viewMatrix, rrect)) {
+            // HW derivatives (more specifically, fwidth()) are consistently faster on all platforms
+            // in coverage mode. We use them as long as the approximation will be accurate enough.
+            flags |= Flags::kUseHWDerivatives;
+        }
+    } else {
+        if (GrAAType::kMSAA == aaType) {
+            if (!caps.sampleLocationsSupport() || !caps.shaderCaps()->sampleVariablesSupport()) {
+                return nullptr;
+            }
+        }
+        if (viewMatrix.hasPerspective()) {
+            // HW derivatives are consistently slower on all platforms in sample mask mode. We
+            // therefore only use them when there is perspective, since then we can't interpolate
+            // the symbolic screen-space gradient.
+            flags |= Flags::kUseHWDerivatives | Flags::kHasPerspective;
+        }
     }
 
     // Produce a matrix that draws the round rect from normalized [-1, -1, +1, +1] space.
@@ -60,23 +67,61 @@
     // Map to device space.
     m.postConcat(viewMatrix);
 
-    // Since m is an affine matrix that maps the rect [-1, -1, +1, +1] into the shape's
-    // device-space quad, it's quite simple to find the bounding rectangle:
-    SkASSERT(!m.hasPerspective());
-    SkRect bounds = SkRect::MakeXYWH(m.getTranslateX(), m.getTranslateY(), 0, 0);
-    bounds.outset(SkScalarAbs(m.getScaleX()) + SkScalarAbs(m.getSkewX()),
-                  SkScalarAbs(m.getSkewY()) + SkScalarAbs(m.getScaleY()));
-    this->setBounds(bounds, GrOp::HasAABloat::kYes, GrOp::IsZeroArea::kNo);
+    SkRect devBounds;
+    if (!(flags & Flags::kHasPerspective)) {
+        // Since m is an affine matrix that maps the rect [-1, -1, +1, +1] into the shape's
+        // device-space quad, it's quite simple to find the bounding rectangle:
+        devBounds = SkRect::MakeXYWH(m.getTranslateX(), m.getTranslateY(), 0, 0);
+        devBounds.outset(SkScalarAbs(m.getScaleX()) + SkScalarAbs(m.getSkewX()),
+                         SkScalarAbs(m.getSkewY()) + SkScalarAbs(m.getScaleY()));
+    } else {
+        viewMatrix.mapRect(&devBounds, rrect.rect());
+    }
+
+    if (GrAAType::kMSAA == aaType && caps.preferTrianglesOverSampleMask()) {
+        // We are on a platform that prefers fine triangles instead of using the sample mask. See if
+        // the round rect is large enough that it will be faster for us to send it off to the
+        // default path renderer instead. The 200x200 threshold was arrived at using the
+        // "shapes_rrect" benchmark on an ARM Galaxy S9.
+        if (devBounds.height() * devBounds.width() > 200 * 200) {
+            return nullptr;
+        }
+    }
+
+    GrOpMemoryPool* pool = ctx->priv().opMemoryPool();
+    return pool->allocate<GrFillRRectOp>(aaType, rrect, flags, m, std::move(paint), devBounds);
+}
+
+GrFillRRectOp::GrFillRRectOp(
+        GrAAType aaType, const SkRRect& rrect, Flags flags,
+        const SkMatrix& totalShapeMatrix, GrPaint&& paint, const SkRect& devBounds)
+        : GrDrawOp(ClassID())
+        , fAAType(aaType)
+        , fOriginalColor(paint.getColor4f())
+        , fLocalRect(rrect.rect())
+        , fFlags(flags)
+        , fProcessors(std::move(paint)) {
+    SkASSERT((fFlags & Flags::kHasPerspective) == totalShapeMatrix.hasPerspective());
+    this->setBounds(devBounds, GrOp::HasAABloat::kYes, GrOp::IsZeroArea::kNo);
 
     // Write the matrix attribs.
-    this->writeInstanceData(m.getScaleX(), m.getSkewX(), m.getSkewY(), m.getScaleY());
-    this->writeInstanceData(m.getTranslateX(), m.getTranslateY());
+    const SkMatrix& m = totalShapeMatrix;
+    if (!(fFlags & Flags::kHasPerspective)) {
+        // Affine 2D transformation (float2x2 plus float2 translate).
+        SkASSERT(!m.hasPerspective());
+        this->writeInstanceData(m.getScaleX(), m.getSkewX(), m.getSkewY(), m.getScaleY());
+        this->writeInstanceData(m.getTranslateX(), m.getTranslateY());
+    } else {
+        // Perspective float3x3 transformation matrix.
+        SkASSERT(m.hasPerspective());
+        m.get9(this->appendInstanceData<float>(9));
+    }
 
     // Convert the radii to [-1, -1, +1, +1] space and write their attribs.
     Sk4f radiiX, radiiY;
     Sk4f::Load2(SkRRectPriv::GetRadiiArray(rrect), &radiiX, &radiiY);
-    (radiiX * (2/(r - l))).store(this->appendInstanceData<float>(4));
-    (radiiY * (2/(b - t))).store(this->appendInstanceData<float>(4));
+    (radiiX * (2/rrect.width())).store(this->appendInstanceData<float>(4));
+    (radiiY * (2/rrect.height())).store(this->appendInstanceData<float>(4));
 
     // We will write the color and local rect attribs during finalize().
 }
@@ -132,13 +177,73 @@
     }
 }
 
-namespace {
+class GrFillRRectOp::Processor : public GrGeometryProcessor {
+public:
+    Processor(GrAAType aaType, Flags flags)
+            : GrGeometryProcessor(kGrFillRRectOp_Processor_ClassID)
+            , fAAType(aaType)
+            , fFlags(flags) {
+        int numVertexAttribs = (GrAAType::kCoverage == fAAType) ? 3 : 2;
+        this->setVertexAttributes(kVertexAttribs, numVertexAttribs);
 
-// Our round rect geometry consists of an inset octagon with solid coverage, surrounded by linear
+        if (!(flags & Flags::kHasPerspective)) {
+            // Affine 2D transformation (float2x2 plus float2 translate).
+            fInstanceAttribs.emplace_back("skew", kFloat4_GrVertexAttribType, kFloat4_GrSLType);
+            fInstanceAttribs.emplace_back(
+                    "translate", kFloat2_GrVertexAttribType, kFloat2_GrSLType);
+        } else {
+            // Perspective float3x3 transformation matrix.
+            fInstanceAttribs.emplace_back("persp_x", kFloat3_GrVertexAttribType, kFloat3_GrSLType);
+            fInstanceAttribs.emplace_back("persp_y", kFloat3_GrVertexAttribType, kFloat3_GrSLType);
+            fInstanceAttribs.emplace_back("persp_z", kFloat3_GrVertexAttribType, kFloat3_GrSLType);
+        }
+        fInstanceAttribs.emplace_back("radii_x", kFloat4_GrVertexAttribType, kFloat4_GrSLType);
+        fInstanceAttribs.emplace_back("radii_y", kFloat4_GrVertexAttribType, kFloat4_GrSLType);
+        fColorAttrib = &fInstanceAttribs.push_back(
+                MakeColorAttribute("color", (flags & Flags::kWideColor)));
+        if (fFlags & Flags::kHasLocalCoords) {
+            fInstanceAttribs.emplace_back(
+                    "local_rect", kFloat4_GrVertexAttribType, kFloat4_GrSLType);
+        }
+        this->setInstanceAttributes(fInstanceAttribs.begin(), fInstanceAttribs.count());
+
+        if (GrAAType::kMSAA == fAAType) {
+            this->setWillUseCustomFeature(CustomFeatures::kSampleLocations);
+        }
+    }
+
+    const char* name() const override { return "GrFillRRectOp::Processor"; }
+
+    void getGLSLProcessorKey(const GrShaderCaps& caps, GrProcessorKeyBuilder* b) const override {
+        b->add32(((uint32_t)fFlags << 16) | (uint32_t)fAAType);
+    }
+
+    GrGLSLPrimitiveProcessor* createGLSLInstance(const GrShaderCaps&) const override;
+
+private:
+    static constexpr Attribute kVertexAttribs[] = {
+            {"radii_selector", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"corner_and_radius_outsets", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            // Coverage only.
+            {"aa_bloat_and_coverage", kFloat4_GrVertexAttribType, kFloat4_GrSLType}};
+
+    const GrAAType fAAType;
+    const Flags fFlags;
+
+    SkSTArray<6, Attribute> fInstanceAttribs;
+    const Attribute* fColorAttrib;
+
+    class CoverageImpl;
+    class MSAAImpl;
+};
+
+constexpr GrPrimitiveProcessor::Attribute GrFillRRectOp::Processor::kVertexAttribs[];
+
+// Our coverage geometry consists of an inset octagon with solid coverage, surrounded by linear
 // coverage ramps on the horizontal and vertical edges, and "arc coverage" pieces on the diagonal
 // edges. The Vertex struct tells the shader where to place its vertex within a normalized
 // ([l, t, r, b] = [-1, -1, +1, +1]) space, and how to calculate coverage. See onEmitCode.
-struct Vertex {
+struct CoverageVertex {
     std::array<float, 4> fRadiiSelector;
     std::array<float, 2> fCorner;
     std::array<float, 2> fRadiusOutset;
@@ -152,7 +257,7 @@
 // rectangles.
 static constexpr float kOctoOffset = 1/(1 + SK_ScalarRoot2Over2);
 
-static constexpr Vertex kVertexData[] = {
+static constexpr CoverageVertex kCoverageVertexData[] = {
         // Left inset edge.
         {{{0,0,0,1}},  {{-1,+1}},  {{0,-1}},  {{+1,0}},  1,  1},
         {{{1,0,0,0}},  {{-1,-1}},  {{0,+1}},  {{+1,0}},  1,  1},
@@ -219,9 +324,9 @@
         {{{0,0,0,1}},  {{-1,+1}},  {{0,-kOctoOffset}},  {{-1,+1}},  0,  0},
         {{{0,0,0,1}},  {{-1,+1}},  {{+kOctoOffset,0}},  {{-1,+1}},  0,  0}};
 
-GR_DECLARE_STATIC_UNIQUE_KEY(gVertexBufferKey);
+GR_DECLARE_STATIC_UNIQUE_KEY(gCoverageVertexBufferKey);
 
-static constexpr uint16_t kIndexData[] = {
+static constexpr uint16_t kCoverageIndexData[] = {
         // Inset octagon (solid coverage).
         0, 1, 7,
         1, 2, 7,
@@ -260,64 +365,18 @@
         39, 36, 38,
         36, 38, 37};
 
-GR_DECLARE_STATIC_UNIQUE_KEY(gIndexBufferKey);
+GR_DECLARE_STATIC_UNIQUE_KEY(gCoverageIndexBufferKey);
 
-}
-
-class GrFillRRectOp::Processor : public GrGeometryProcessor {
-public:
-    Processor(Flags flags)
-            : GrGeometryProcessor(kGrFillRRectOp_Processor_ClassID)
-            , fFlags(flags) {
-        this->setVertexAttributes(kVertexAttribs, 3);
-        fInSkew = { "skew", kFloat4_GrVertexAttribType, kFloat4_GrSLType };
-        fInTranslate = { "translate", kFloat2_GrVertexAttribType, kFloat2_GrSLType };
-        fInRadiiX = { "radii_x", kFloat4_GrVertexAttribType, kFloat4_GrSLType };
-        fInRadiiY = { "radii_y", kFloat4_GrVertexAttribType, kFloat4_GrSLType };
-        fInColor = MakeColorAttribute("color", (flags & Flags::kWideColor));
-        fInLocalRect = {"local_rect", kFloat4_GrVertexAttribType, kFloat4_GrSLType};
-
-        this->setInstanceAttributes(&fInSkew, (flags & Flags::kHasLocalCoords) ? 6 : 5);
-        SkASSERT(this->vertexStride() == sizeof(Vertex));
-    }
-
-    const char* name() const override { return "GrFillRRectOp::Processor"; }
-
-    void getGLSLProcessorKey(const GrShaderCaps& caps, GrProcessorKeyBuilder* b) const override {
-        b->add32(static_cast<uint32_t>(fFlags));
-    }
-
-    GrGLSLPrimitiveProcessor* createGLSLInstance(const GrShaderCaps&) const override;
-
-private:
-    static constexpr Attribute kVertexAttribs[] = {
-            {"radii_selector", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
-            {"corner_and_radius_outsets", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
-            {"aa_bloat_and_coverage", kFloat4_GrVertexAttribType, kFloat4_GrSLType}};
-
-    Attribute fInSkew;
-    Attribute fInTranslate;
-    Attribute fInRadiiX;
-    Attribute fInRadiiY;
-    Attribute fInColor;
-    Attribute fInLocalRect;  // Conditional.
-
-    const Flags fFlags;
-
-    class Impl;
-};
-
-constexpr GrPrimitiveProcessor::Attribute GrFillRRectOp::Processor::kVertexAttribs[];
-
-class GrFillRRectOp::Processor::Impl : public GrGLSLGeometryProcessor {
-public:
+class GrFillRRectOp::Processor::CoverageImpl : public GrGLSLGeometryProcessor {
     void onEmitCode(EmitArgs& args, GrGPArgs* gpArgs) override {
         const auto& proc = args.fGP.cast<Processor>();
         bool useHWDerivatives = (proc.fFlags & Flags::kUseHWDerivatives);
 
+        SkASSERT(proc.vertexStride() == sizeof(CoverageVertex));
+
         GrGLSLVaryingHandler* varyings = args.fVaryingHandler;
         varyings->emitAttributes(proc);
-        varyings->addPassThroughAttribute(proc.fInColor, args.fOutputColor,
+        varyings->addPassThroughAttribute(*proc.fColorAttrib, args.fOutputColor,
                                           GrGLSLVaryingHandler::Interpolation::kCanBeFlat);
 
         // Emit the vertex shader.
@@ -388,6 +447,7 @@
                              args.fFPCoordTransformHandler);
 
         // Transform to device space.
+        SkASSERT(!(proc.fFlags & Flags::kHasPerspective));
         v->codeAppend("float2x2 skewmatrix = float2x2(skew.xy, skew.zw);");
         v->codeAppend("float2 devcoord = vertexpos * skewmatrix + translate;");
         gpArgs->fPositionVar.set(kFloat2_GrSLType, "devcoord");
@@ -420,7 +480,7 @@
         f->codeAppendf("float x_plus_1=%s.x, y=%s.y;", arcCoord.fsIn(), arcCoord.fsIn());
         f->codeAppendf("half coverage;");
         f->codeAppendf("if (0 == x_plus_1) {");
-        f->codeAppendf(    "coverage = half(y);");  // We are a non-arc pixel (i.e., linear coverage).
+        f->codeAppendf(    "coverage = half(y);");  // We are a non-arc pixel (linear coverage).
         f->codeAppendf("} else {");
         f->codeAppendf(    "float fn = x_plus_1 * (x_plus_1 - 2);");  // fn = (x+1)*(x-1) = x^2-1
         f->codeAppendf(    "fn = fma(y,y, fn);");  // fn = x^2 + y^2 - 1
@@ -443,9 +503,195 @@
     }
 };
 
+// Our MSAA geometry consists of an inset octagon with full sample mask coverage, circumscribed
+// by a larger octagon that modifies the sample mask for the arc at each corresponding corner.
+struct MSAAVertex {
+    std::array<float, 4> fRadiiSelector;  // Picks one corner's radii; negative = non-arc.
+    std::array<float, 2> fCorner;  // Rect corner, in normalized [-1,+1] space.
+    std::array<float, 2> fRadiusOutset;  // Offset from fCorner, in multiples of the radii.
+};
+
+static constexpr MSAAVertex kMSAAVertexData[] = {  // [a-b] = vertex index range.
+        // Left edge [0-1]. (Negative radii selector indicates this is not an arc section.)
+        {{{0,0,0,-1}},  {{-1,+1}},  {{0,-1}}},
+        {{{-1,0,0,0}},  {{-1,-1}},  {{0,+1}}},
+
+        // Top edge [2-3].
+        {{{-1,0,0,0}},  {{-1,-1}},  {{+1,0}}},
+        {{{0,-1,0,0}},  {{+1,-1}},  {{-1,0}}},
+
+        // Right edge [4-5].
+        {{{0,-1,0,0}},  {{+1,-1}},  {{0,+1}}},
+        {{{0,0,-1,0}},  {{+1,+1}},  {{0,-1}}},
+
+        // Bottom edge [6-7].
+        {{{0,0,-1,0}},  {{+1,+1}},  {{-1,0}}},
+        {{{0,0,0,-1}},  {{-1,+1}},  {{+1,0}}},
+
+        // Top-left corner [8-11]. (Positive radii selector: these vertices bound the arc.)
+        {{{1,0,0,0}},  {{-1,-1}},  {{0,+1}}},
+        {{{1,0,0,0}},  {{-1,-1}},  {{0,+kOctoOffset}}},
+        {{{1,0,0,0}},  {{-1,-1}},  {{+1,0}}},
+        {{{1,0,0,0}},  {{-1,-1}},  {{+kOctoOffset,0}}},
+
+        // Top-right corner [12-15].
+        {{{0,1,0,0}},  {{+1,-1}},  {{-1,0}}},
+        {{{0,1,0,0}},  {{+1,-1}},  {{-kOctoOffset,0}}},
+        {{{0,1,0,0}},  {{+1,-1}},  {{0,+1}}},
+        {{{0,1,0,0}},  {{+1,-1}},  {{0,+kOctoOffset}}},
+
+        // Bottom-right corner [16-19].
+        {{{0,0,1,0}},  {{+1,+1}},  {{0,-1}}},
+        {{{0,0,1,0}},  {{+1,+1}},  {{0,-kOctoOffset}}},
+        {{{0,0,1,0}},  {{+1,+1}},  {{-1,0}}},
+        {{{0,0,1,0}},  {{+1,+1}},  {{-kOctoOffset,0}}},
+
+        // Bottom-left corner [20-23].
+        {{{0,0,0,1}},  {{-1,+1}},  {{+1,0}}},
+        {{{0,0,0,1}},  {{-1,+1}},  {{+kOctoOffset,0}}},
+        {{{0,0,0,1}},  {{-1,+1}},  {{0,-1}}},
+        {{{0,0,0,1}},  {{-1,+1}},  {{0,-kOctoOffset}}}};
+
+GR_DECLARE_STATIC_UNIQUE_KEY(gMSAAVertexBufferKey);
+
+static constexpr uint16_t kMSAAIndexData[] = {  // Indices into kMSAAVertexData.
+        // Inset octagon: six triangles over edge vertices 0-7. (Full sample mask.)
+        0, 1, 2,
+        0, 2, 3,
+        0, 3, 6,
+        3, 4, 5,
+        3, 5, 6,
+        6, 7, 0,
+
+        // Top-left arc: two triangles over vertices 8-11. (Sample mask is set to the arc.)
+         8,  9, 10,
+         9, 11, 10,
+
+        // Top-right arc: two triangles over vertices 12-15.
+        12, 13, 14,
+        13, 15, 14,
+
+        // Bottom-right arc: two triangles over vertices 16-19.
+        16, 17, 18,
+        17, 19, 18,
+
+        // Bottom-left arc: two triangles over vertices 20-23.
+        20, 21, 22,
+        21, 23, 22};
+
+GR_DECLARE_STATIC_UNIQUE_KEY(gMSAAIndexBufferKey);
+
+class GrFillRRectOp::Processor::MSAAImpl : public GrGLSLGeometryProcessor {
+    void onEmitCode(EmitArgs& args, GrGPArgs* gpArgs) override {  // Shared by MSAA and non-aa.
+        const auto& proc = args.fGP.cast<Processor>();
+        bool useHWDerivatives = (proc.fFlags & Flags::kUseHWDerivatives);
+        bool hasPerspective = (proc.fFlags & Flags::kHasPerspective);
+        bool hasLocalCoords = (proc.fFlags & Flags::kHasLocalCoords);
+        SkASSERT(useHWDerivatives == hasPerspective);  // HW derivatives iff perspective.
+
+        SkASSERT(proc.vertexStride() == sizeof(MSAAVertex));
+
+        // Emit the vertex shader.
+        GrGLSLVertexBuilder* v = args.fVertBuilder;
+
+        GrGLSLVaryingHandler* varyings = args.fVaryingHandler;
+        varyings->emitAttributes(proc);
+        varyings->addPassThroughAttribute(*proc.fColorAttrib, args.fOutputColor,
+                                          GrGLSLVaryingHandler::Interpolation::kCanBeFlat);
+
+        // Unpack vertex attribs.
+        v->codeAppendf("float2 corner = corner_and_radius_outsets.xy;");
+        v->codeAppendf("float2 radius_outset = corner_and_radius_outsets.zw;");
+
+        // Identify our radii. A negative selector flags a non-arc vertex; abs() undoes the sign.
+        v->codeAppend("float2 radii;");
+        v->codeAppend("radii.x = dot(radii_selector, radii_x);");
+        v->codeAppend("radii.y = dot(radii_selector, radii_y);");
+        v->codeAppendf("bool is_arc_section = (radii.x > 0);");
+        v->codeAppendf("radii = abs(radii);");
+
+        // Find our vertex position, adjusted for radii. Our rect is drawn in normalized
+        // [-1,-1,+1,+1] space.
+        v->codeAppend("float2 vertexpos = corner + radius_outset * radii;");
+
+        // Emit transforms.
+        GrShaderVar localCoord("", kFloat2_GrSLType);
+        if (hasLocalCoords) {
+            v->codeAppend("float2 localcoord = (local_rect.xy * (1 - vertexpos) + "
+                                               "local_rect.zw * (1 + vertexpos)) * .5;");
+            localCoord.set(kFloat2_GrSLType, "localcoord");
+        }
+        this->emitTransforms(v, varyings, args.fUniformHandler, localCoord,
+                             args.fFPCoordTransformHandler);
+
+        // Transform to device space.
+        if (!hasPerspective) {
+            v->codeAppend("float2x2 skewmatrix = float2x2(skew.xy, skew.zw);");
+            v->codeAppend("float2 devcoord = vertexpos * skewmatrix + translate;");
+            gpArgs->fPositionVar.set(kFloat2_GrSLType, "devcoord");
+        } else {
+            v->codeAppend("float3x3 persp_matrix = float3x3(persp_x, persp_y, persp_z);");
+            v->codeAppend("float3 devcoord = float3(vertexpos, 1) * persp_matrix;");
+            gpArgs->fPositionVar.set(kFloat3_GrSLType, "devcoord");
+        }
+
+        // Determine normalized arc coordinates for the implicit function.
+        GrGLSLVarying arcCoord((useHWDerivatives) ? kFloat2_GrSLType : kFloat4_GrSLType);
+        varyings->addVarying("arccoord", &arcCoord);
+        v->codeAppendf("if (is_arc_section) {");
+        v->codeAppendf(    "%s.xy = 1 - abs(radius_outset);", arcCoord.vsOut());
+        if (!useHWDerivatives) {
+            // The gradient is order-1: Interpolate it across arccoord.zw.
+            // This doesn't work with perspective.
+            SkASSERT(!hasPerspective);
+            v->codeAppendf("float2x2 derivatives = inverse(skewmatrix);");
+            v->codeAppendf("%s.zw = derivatives * (%s.xy/radii * corner * 2);",
+                           arcCoord.vsOut(), arcCoord.vsOut());
+        }
+        v->codeAppendf("} else {");
+        if (useHWDerivatives) {
+            v->codeAppendf("%s = float2(0);", arcCoord.vsOut());
+        } else {
+            v->codeAppendf("%s = float4(0);", arcCoord.vsOut());
+        }
+        v->codeAppendf("}");
+
+        // Emit the fragment shader.
+        GrGLSLFPFragmentBuilder* f = args.fFragBuilder;
+
+        f->codeAppendf("%s = half4(1);", args.fOutputCoverage);
+
+        // If x,y == 0, then we are drawing a triangle that does not track an arc.
+        f->codeAppendf("if (float2(0) != %s.xy) {", arcCoord.fsIn());
+        f->codeAppendf(    "float fn = dot(%s.xy, %s.xy) - 1;", arcCoord.fsIn(), arcCoord.fsIn());
+        if (GrAAType::kMSAA == proc.fAAType) {  // MSAA: apply fn to the sample mask.
+            using ScopeFlags = GrGLSLFPFragmentBuilder::ScopeFlags;
+            if (!useHWDerivatives) {
+                f->codeAppendf("float2 grad = %s.zw;", arcCoord.fsIn());
+                f->applyFnToMultisampleMask("fn", "grad", ScopeFlags::kInsidePerPrimitiveBranch);
+            } else {
+                f->applyFnToMultisampleMask("fn", nullptr, ScopeFlags::kInsidePerPrimitiveBranch);
+            }
+        } else {  // Non-MSAA: zero the coverage where fn > 0.
+            f->codeAppendf("if (fn > 0) {");
+            f->codeAppendf(    "%s = half4(0);", args.fOutputCoverage);
+            f->codeAppendf("}");
+        }
+        f->codeAppendf("}");
+    }
+
+    void setData(const GrGLSLProgramDataManager& pdman, const GrPrimitiveProcessor&,
+                 FPCoordTransformIter&& transformIter) override {
+        this->setTransformDataHelper(SkMatrix::I(), pdman, &transformIter);
+    }
+};
+
 GrGLSLPrimitiveProcessor* GrFillRRectOp::Processor::createGLSLInstance(
         const GrShaderCaps&) const {
-    return new Impl();
+    if (GrAAType::kCoverage != fAAType) {
+        return new MSAAImpl();
+    }
+    return new CoverageImpl();
 }
 
 void GrFillRRectOp::onExecute(GrOpFlushState* flushState, const SkRect& chainBounds) {
@@ -453,26 +699,50 @@
         return;  // Setup failed.
     }
 
-    GR_DEFINE_STATIC_UNIQUE_KEY(gIndexBufferKey);
+    sk_sp<const GrBuffer> indexBuffer, vertexBuffer;
+    int indexCount;
 
-    sk_sp<const GrBuffer> indexBuffer = flushState->resourceProvider()->findOrMakeStaticBuffer(
-            GrGpuBufferType::kIndex, sizeof(kIndexData), kIndexData, gIndexBufferKey);
-    if (!indexBuffer) {
+    if (GrAAType::kCoverage == fAAType) {
+        GR_DEFINE_STATIC_UNIQUE_KEY(gCoverageIndexBufferKey);
+
+        indexBuffer = flushState->resourceProvider()->findOrMakeStaticBuffer(
+                GrGpuBufferType::kIndex, sizeof(kCoverageIndexData), kCoverageIndexData,
+                gCoverageIndexBufferKey);
+
+        GR_DEFINE_STATIC_UNIQUE_KEY(gCoverageVertexBufferKey);
+
+        vertexBuffer = flushState->resourceProvider()->findOrMakeStaticBuffer(
+                GrGpuBufferType::kVertex, sizeof(kCoverageVertexData), kCoverageVertexData,
+                gCoverageVertexBufferKey);
+
+        indexCount = SK_ARRAY_COUNT(kCoverageIndexData);
+    } else {
+        GR_DEFINE_STATIC_UNIQUE_KEY(gMSAAIndexBufferKey);
+
+        indexBuffer = flushState->resourceProvider()->findOrMakeStaticBuffer(
+                GrGpuBufferType::kIndex, sizeof(kMSAAIndexData), kMSAAIndexData,
+                gMSAAIndexBufferKey);
+
+        GR_DEFINE_STATIC_UNIQUE_KEY(gMSAAVertexBufferKey);
+
+        vertexBuffer = flushState->resourceProvider()->findOrMakeStaticBuffer(
+                GrGpuBufferType::kVertex, sizeof(kMSAAVertexData), kMSAAVertexData,
+                gMSAAVertexBufferKey);
+
+        indexCount = SK_ARRAY_COUNT(kMSAAIndexData);
+    }
+
+    if (!indexBuffer || !vertexBuffer) {
         return;
     }
 
-    GR_DEFINE_STATIC_UNIQUE_KEY(gVertexBufferKey);
-
-    sk_sp<const GrBuffer> vertexBuffer = flushState->resourceProvider()->findOrMakeStaticBuffer(
-            GrGpuBufferType::kVertex, sizeof(kVertexData), kVertexData, gVertexBufferKey);
-    if (!vertexBuffer) {
-        return;
-    }
-
-    Processor proc(fFlags);
+    Processor proc(fAAType, fFlags);
     SkASSERT(proc.instanceStride() == (size_t)fInstanceStride);
 
     GrPipeline::InitArgs initArgs;
+    if (GrAAType::kMSAA == fAAType) {
+        initArgs.fFlags = GrPipeline::kHWAntialias_Flag;
+    }
     initArgs.fCaps = &flushState->caps();
     initArgs.fResourceProvider = flushState->resourceProvider();
     initArgs.fDstProxy = flushState->drawOpArgs().fDstProxy;
@@ -481,15 +751,16 @@
     GrPipeline pipeline(initArgs, std::move(fProcessors), std::move(clip));
 
     GrMesh mesh(GrPrimitiveType::kTriangles);
-    mesh.setIndexedInstanced(std::move(indexBuffer), SK_ARRAY_COUNT(kIndexData), fInstanceBuffer,
-                             fInstanceCount, fBaseInstance, GrPrimitiveRestart::kNo);
+    mesh.setIndexedInstanced(
+            std::move(indexBuffer), indexCount, fInstanceBuffer, fInstanceCount, fBaseInstance,
+            GrPrimitiveRestart::kNo);
     mesh.setVertexData(std::move(vertexBuffer));
-    flushState->rtCommandBuffer()->draw(proc, pipeline, &fixedDynamicState, nullptr, &mesh, 1,
-                                        this->bounds());
+    flushState->rtCommandBuffer()->draw(
+            proc, pipeline, &fixedDynamicState, nullptr, &mesh, 1, this->bounds());
 }
 
 // Will the given corner look good if we use HW derivatives?
-static bool can_use_hw_derivatives(const Sk2f& devScale, const Sk2f& cornerRadii) {
+static bool can_use_hw_derivatives_with_coverage(const Sk2f& devScale, const Sk2f& cornerRadii) {
     Sk2f devRadii = devScale * cornerRadii;
     if (devRadii[1] < devRadii[0]) {
         devRadii = SkNx_shuffle<1,0>(devRadii);
@@ -500,13 +771,14 @@
     return minDevRadius * minDevRadius * 5 > devRadii[1];
 }
 
-static bool can_use_hw_derivatives(const Sk2f& devScale, const SkVector& cornerRadii) {
-    return can_use_hw_derivatives(devScale, Sk2f::Load(&cornerRadii));
+static bool can_use_hw_derivatives_with_coverage(
+        const Sk2f& devScale, const SkVector& cornerRadii) {
+    return can_use_hw_derivatives_with_coverage(devScale, Sk2f::Load(&cornerRadii));
 }
 
 // Will the given round rect look good if we use HW derivatives?
-static bool can_use_hw_derivatives(const GrShaderCaps& shaderCaps, const SkMatrix& viewMatrix,
-                                   const SkRRect& rrect) {
+static bool can_use_hw_derivatives_with_coverage(
+        const GrShaderCaps& shaderCaps, const SkMatrix& viewMatrix, const SkRRect& rrect) {
     if (!shaderCaps.shaderDerivativeSupport()) {
         return false;
     }
@@ -521,27 +793,27 @@
 
         case SkRRect::kOval_Type:
         case SkRRect::kSimple_Type:
-            return can_use_hw_derivatives(devScale, rrect.getSimpleRadii());
+            return can_use_hw_derivatives_with_coverage(devScale, rrect.getSimpleRadii());
 
         case SkRRect::kNinePatch_Type: {
             Sk2f r0 = Sk2f::Load(SkRRectPriv::GetRadiiArray(rrect));
             Sk2f r1 = Sk2f::Load(SkRRectPriv::GetRadiiArray(rrect) + 2);
             Sk2f minRadii = Sk2f::Min(r0, r1);
             Sk2f maxRadii = Sk2f::Max(r0, r1);
-            return can_use_hw_derivatives(devScale, Sk2f(minRadii[0], maxRadii[1])) &&
-                   can_use_hw_derivatives(devScale, Sk2f(maxRadii[0], minRadii[1]));
+            return can_use_hw_derivatives_with_coverage(devScale, Sk2f(minRadii[0], maxRadii[1])) &&
+                   can_use_hw_derivatives_with_coverage(devScale, Sk2f(maxRadii[0], minRadii[1]));
         }
 
         case SkRRect::kComplex_Type: {
             for (int i = 0; i < 4; ++i) {
                 auto corner = static_cast<SkRRect::Corner>(i);
-                if (!can_use_hw_derivatives(devScale, rrect.radii(corner))) {
+                if (!can_use_hw_derivatives_with_coverage(devScale, rrect.radii(corner))) {
                     return false;
                 }
             }
             return true;
         }
     }
-    SK_ABORT("Unreachable code.");
+    SK_ABORT("Invalid round rect type.");
     return false;  // Add this return to keep GCC happy.
 }
diff --git a/src/gpu/ops/GrFillRRectOp.h b/src/gpu/ops/GrFillRRectOp.h
index 3aad818..9bc88a0 100644
--- a/src/gpu/ops/GrFillRRectOp.h
+++ b/src/gpu/ops/GrFillRRectOp.h
@@ -16,11 +16,16 @@
 public:
     DEFINE_OP_CLASS_ID
 
-    static std::unique_ptr<GrFillRRectOp> Make(GrRecordingContext*, const SkMatrix&,
-                                               const SkRRect&, const GrCaps&, GrPaint&&);
+    static std::unique_ptr<GrFillRRectOp> Make(
+            GrRecordingContext*, GrAAType, const SkMatrix& viewMatrix, const SkRRect&,
+            const GrCaps&, GrPaint&&);
 
     const char* name() const override { return "GrFillRRectOp"; }
-    FixedFunctionFlags fixedFunctionFlags() const override { return FixedFunctionFlags::kNone; }
+    FixedFunctionFlags fixedFunctionFlags() const override {
+        return (GrAAType::kMSAA == fAAType)
+                ? FixedFunctionFlags::kUsesHWAA
+                : FixedFunctionFlags::kNone;
+    }
     GrProcessorSet::Analysis finalize(
             const GrCaps&, const GrAppliedClip*, GrFSAAType, GrClampType) override;
     CombineResult onCombineIfPossible(GrOp*, const GrCaps&) override;
@@ -35,22 +40,24 @@
     enum class Flags {
         kNone = 0,
         kUseHWDerivatives = 1 << 0,
-        kHasLocalCoords = 1 << 1,
-        kWideColor = 1 << 2
+        kHasPerspective = 1 << 1,
+        kHasLocalCoords = 1 << 2,
+        kWideColor = 1 << 3
     };
 
     GR_DECL_BITFIELD_CLASS_OPS_FRIENDS(Flags);
 
     class Processor;
 
-    GrFillRRectOp(const GrShaderCaps&, const SkMatrix&, const SkRRect&, GrPaint&&);
+    GrFillRRectOp(GrAAType, const SkRRect&, Flags, const SkMatrix& totalShapeMatrix,
+                  GrPaint&&, const SkRect& devBounds);
 
     // These methods are used to append data of various POD types to our internal array of instance
     // data. The actual layout of the instance buffer can vary from Op to Op.
-    template <typename T> inline void* appendInstanceData(int count) {
+    template <typename T> inline T* appendInstanceData(int count) {
         static_assert(std::is_pod<T>::value, "");
         static_assert(4 == alignof(T), "");
-        return fInstanceData.push_back_n(sizeof(T) * count);
+        return reinterpret_cast<T*>(fInstanceData.push_back_n(sizeof(T) * count));
     }
 
     template <typename T, typename... Args>
@@ -61,9 +68,10 @@
 
     void writeInstanceData() {}  // Halt condition.
 
+    const GrAAType fAAType;
     const SkPMColor4f fOriginalColor;
     const SkRect fLocalRect;
-    Flags fFlags = Flags::kNone;
+    Flags fFlags;
     GrProcessorSet fProcessors;
 
     SkSTArray<sizeof(float) * 16 * 4, char, /*MEM_MOVE=*/ true> fInstanceData;
diff --git a/src/gpu/vk/GrVkCaps.cpp b/src/gpu/vk/GrVkCaps.cpp
index 2c71f72..6f9eed7 100644
--- a/src/gpu/vk/GrVkCaps.cpp
+++ b/src/gpu/vk/GrVkCaps.cpp
@@ -386,6 +386,11 @@
         fAvoidUpdateBuffers = true;
     }
 
+    if (kARM_VkVendor == properties.vendorID) {
+        // ARM seems to do better with more fine triangles as opposed to using the sample mask.
+        // (At least in our current round rect op.)
+        fPreferTrianglesOverSampleMask = true;
+    }
 
     this->initConfigTable(vkInterface, physDev, properties);
     this->initStencilFormat(vkInterface, physDev);