diff --git a/src/gpu/GrOnFlushResourceProvider.cpp b/src/gpu/GrOnFlushResourceProvider.cpp
index 33d116c..bbf1029 100644
--- a/src/gpu/GrOnFlushResourceProvider.cpp
+++ b/src/gpu/GrOnFlushResourceProvider.cpp
@@ -85,7 +85,7 @@
     sk_sp<const GrBuffer> buffer = resourceProvider->findOrMakeStaticBuffer(intendedType, size,
                                                                             data, key);
     // Static buffers should never have pending IO.
-    SkASSERT(!buffer->resourcePriv().hasPendingIO_debugOnly());
+    SkASSERT(!buffer || !buffer->resourcePriv().hasPendingIO_debugOnly());
     return buffer;
 }
 
diff --git a/src/gpu/GrProcessor.h b/src/gpu/GrProcessor.h
index 82f0b80..de3925d 100644
--- a/src/gpu/GrProcessor.h
+++ b/src/gpu/GrProcessor.h
@@ -84,6 +84,7 @@
         kEllipticalRRectEffect_ClassID,
         kGP_ClassID,
         kVertexColorSpaceBenchGP_ClassID,
+        kGrAAFillRRectOp_Processor_ClassID,
         kGrAARectEffect_ClassID,
         kGrAlphaThresholdFragmentProcessor_ClassID,
         kGrArithmeticFP_ClassID,
diff --git a/src/gpu/GrProcessorSet.h b/src/gpu/GrProcessorSet.h
index c70caba..09c082a 100644
--- a/src/gpu/GrProcessorSet.h
+++ b/src/gpu/GrProcessorSet.h
@@ -129,7 +129,7 @@
      * This analyzes the processors given an op's input color and coverage as well as a clip. The
      * state of the processor set may change to an equivalent but more optimal set of processors.
      * This new state requires that the caller respect the returned 'inputColorOverride'. This is
-     * indicated by the returned Analysis's inputColorIsOverriden(). 'inputColorOverride' will not
+     * indicated by the returned Analysis's inputColorIsOverridden(). 'inputColorOverride' will not
      * be written if the analysis does not override the input color.
      *
      * This must be called before the processor set is used to construct a GrPipeline and may only
diff --git a/src/gpu/GrRenderTargetContext.cpp b/src/gpu/GrRenderTargetContext.cpp
index 614415d..60a3b03 100644
--- a/src/gpu/GrRenderTargetContext.cpp
+++ b/src/gpu/GrRenderTargetContext.cpp
@@ -44,6 +44,7 @@
 #include "ops/GrDrawAtlasOp.h"
 #include "ops/GrDrawOp.h"
 #include "ops/GrDrawVerticesOp.h"
+#include "ops/GrAAFillRRectOp.h"
 #include "ops/GrLatticeOp.h"
 #include "ops/GrOp.h"
 #include "ops/GrOvalOpFactory.h"
@@ -971,13 +972,16 @@
 
     GrAAType aaType = this->chooseAAType(aa, GrAllowMixedSamples::kNo);
     if (GrAAType::kCoverage == aaType) {
-        const GrShaderCaps* shaderCaps = this->caps()->shaderCaps();
-        std::unique_ptr<GrDrawOp> op = GrOvalOpFactory::MakeRRectOp(fContext,
-                                                                    std::move(paint),
-                                                                    viewMatrix,
-                                                                    rrect,
-                                                                    stroke,
-                                                                    shaderCaps);
+        std::unique_ptr<GrDrawOp> op;
+        if (style.isSimpleFill()) {
+            op = GrAAFillRRectOp::Make(fContext, viewMatrix, rrect, *this->caps(),
+                                       std::move(paint));
+        }
+        if (!op) {
+            op = GrOvalOpFactory::MakeRRectOp(fContext, std::move(paint), viewMatrix, rrect, stroke,
+                                              this->caps()->shaderCaps());
+        }
+
         if (op) {
             this->addDrawOp(*clip, std::move(op));
             return;
diff --git a/src/gpu/GrResourceProvider.cpp b/src/gpu/GrResourceProvider.cpp
index b0a9136..78bd97b 100644
--- a/src/gpu/GrResourceProvider.cpp
+++ b/src/gpu/GrResourceProvider.cpp
@@ -275,7 +275,7 @@
     }
     if (auto buffer = this->createBuffer(size, intendedType, kStatic_GrAccessPattern, Flags::kNone,
                                          data)) {
-        // We shouldn't bin and/or cachestatic buffers.
+        // We shouldn't bin and/or cache static buffers.
         SkASSERT(buffer->sizeInBytes() == size);
         SkASSERT(!buffer->resourcePriv().getScratchKey().isValid());
         SkASSERT(!buffer->resourcePriv().hasPendingIO_debugOnly());
diff --git a/src/gpu/ops/GrAAFillRRectOp.cpp b/src/gpu/ops/GrAAFillRRectOp.cpp
new file mode 100644
index 0000000..3c98570
--- /dev/null
+++ b/src/gpu/ops/GrAAFillRRectOp.cpp
@@ -0,0 +1,544 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "GrAAFillRRectOp.h"
+
+#include "GrCaps.h"
+#include "GrContextPriv.h"
+#include "GrGpuCommandBuffer.h"
+#include "GrMemoryPool.h"
+#include "SkRRectPriv.h"
+#include "glsl/GrGLSLFragmentShaderBuilder.h"
+#include "glsl/GrGLSLGeometryProcessor.h"
+#include "glsl/GrGLSLVarying.h"
+#include "glsl/GrGLSLVertexGeoBuilder.h"
+
+// Hardware derivatives are not always accurate enough for highly elliptical corners. This method
+// checks to make sure the corners will still all look good if we use HW derivatives.
+static bool can_use_hw_derivatives(const GrShaderCaps&, const SkMatrix&, const SkRRect&);
+
+std::unique_ptr<GrAAFillRRectOp> GrAAFillRRectOp::Make(
+        GrContext* ctx, const SkMatrix& viewMatrix, const SkRRect& rrect, const GrCaps& caps,
+        GrPaint&& paint) {
+    if (!caps.instanceAttribSupport()) {  // The geometry processor uses instance attributes.
+        return nullptr;  // 'paint' has not been consumed on this path.
+    }
+
+    // TODO: Support perspective in a follow-on CL. This shouldn't be difficult, since we already
+    // use HW derivatives. The only trick will be adjusting the AA outset to account for
+    // perspective.  (i.e., outset = 0.5 * z.)
+    if (viewMatrix.hasPerspective()) {
+        return nullptr;  // 'paint' has not been consumed on this path.
+    }
+
+    GrOpMemoryPool* pool = ctx->contextPriv().opMemoryPool();
+    return pool->allocate<GrAAFillRRectOp>(*caps.shaderCaps(), viewMatrix, rrect, std::move(paint));
+}
+
+GrAAFillRRectOp::GrAAFillRRectOp(const GrShaderCaps& shaderCaps, const SkMatrix& viewMatrix,
+                                 const SkRRect& rrect, GrPaint&& paint)
+        : GrDrawOp(ClassID())
+        , fOriginalColor(paint.getColor4f())
+        , fLocalRect(rrect.rect())
+        , fProcessors(std::move(paint)) {
+    if (can_use_hw_derivatives(shaderCaps, viewMatrix, rrect)) {
+        fFlags |= Flags::kUseHWDerivatives;
+    }
+
+    // Produce a matrix that draws the round rect from normalized [-1, -1, +1, +1] space.
+    float l = rrect.rect().left(), r = rrect.rect().right(),
+          t = rrect.rect().top(), b = rrect.rect().bottom();
+    SkMatrix m;
+    // Unmap the normalized rect [-1, -1, +1, +1] back to [l, t, r, b].
+    m.setScaleTranslate((r - l)/2, (b - t)/2, (l + r)/2, (t + b)/2);
+    // Map to device space.
+    m.postConcat(viewMatrix);
+
+    // Since m is an affine matrix that maps the rect [-1, -1, +1, +1] into the shape's
+    // device-space quad, it's quite simple to find the bounding rectangle:
+    SkASSERT(!m.hasPerspective());
+    SkRect bounds = SkRect::MakeXYWH(m.getTranslateX(), m.getTranslateY(), 0, 0);
+    bounds.outset(SkScalarAbs(m.getScaleX()) + SkScalarAbs(m.getSkewX()),
+                  SkScalarAbs(m.getSkewY()) + SkScalarAbs(m.getScaleY()));
+    this->setBounds(bounds, GrOp::HasAABloat::kYes, GrOp::IsZeroArea::kNo);
+
+    // Write the matrix attribs: the 2x2 part ("skew"), then the translation ("translate").
+    this->writeInstanceData(m.getScaleX(), m.getSkewX(), m.getSkewY(), m.getScaleY());
+    this->writeInstanceData(m.getTranslateX(), m.getTranslateY());
+
+    // Convert the radii to [-1, -1, +1, +1] space and write their attribs.
+    Sk4f radiiX, radiiY;  // NOTE(review): the scales below divide by zero if r == l or b == t.
+    Sk4f::Load2(SkRRectPriv::GetRadiiArray(rrect), &radiiX, &radiiY);
+    (radiiX * (2/(r - l))).store(this->appendInstanceData<float>(4));
+    (radiiY * (2/(b - t))).store(this->appendInstanceData<float>(4));
+
+    // We will write the color and local rect attribs during finalize().
+}
+
+GrDrawOp::RequiresDstTexture GrAAFillRRectOp::finalize(const GrCaps& caps,
+                                                       const GrAppliedClip* clip) {
+    SkASSERT(1 == fInstanceCount);
+
+    SkPMColor4f overrideColor;
+    const GrProcessorSet::Analysis& analysis = fProcessors.finalize(
+            fOriginalColor, GrProcessorAnalysisCoverage::kSingleChannel, clip, false, caps,
+            &overrideColor);
+
+    // Finish writing the instance attribs: color, then the local rect only when it is used.
+    this->writeInstanceData(
+            (analysis.inputColorIsOverridden() ? overrideColor : fOriginalColor).toBytes_RGBA());
+    if (analysis.usesLocalCoords()) {
+        this->writeInstanceData(fLocalRect);
+        fFlags |= Flags::kHasLocalCoords;
+    }
+    fInstanceStride = fInstanceData.count();  // Byte size of the single instance written so far.
+
+    return RequiresDstTexture(analysis.requiresDstTexture());
+}
+
+GrDrawOp::CombineResult GrAAFillRRectOp::onCombineIfPossible(GrOp* op, const GrCaps&) {
+    const auto& that = *op->cast<GrAAFillRRectOp>();
+    if (fFlags != that.fFlags || fProcessors != that.fProcessors ||
+        fInstanceData.count() > std::numeric_limits<int>::max() - that.fInstanceData.count()) {
+        return CombineResult::kCannotCombine;  // Incompatible, or byte count would overflow.
+    }
+
+    fInstanceData.push_back_n(that.fInstanceData.count(), that.fInstanceData.begin());
+    fInstanceCount += that.fInstanceCount;  // The other op's instance bytes were appended above.
+    SkASSERT(fInstanceStride == that.fInstanceStride);
+    return CombineResult::kMerged;
+}
+
+void GrAAFillRRectOp::onPrepare(GrOpFlushState* flushState) {
+    if (void* instanceData = flushState->makeVertexSpace(fInstanceStride, fInstanceCount,
+                                                         &fInstanceBuffer, &fBaseInstance)) {
+        SkASSERT(fInstanceStride * fInstanceCount == fInstanceData.count());
+        memcpy(instanceData, fInstanceData.begin(), fInstanceData.count());
+    }  // If no space was returned, onExecute() skips the draw when fInstanceBuffer is null.
+}
+
+namespace {
+
+// Our round rect geometry consists of an inset octagon with solid coverage, surrounded by linear
+// coverage ramps on the horizontal and vertical edges, and "arc coverage" pieces on the diagonal
+// edges. The Vertex struct tells the shader where to place its vertex within a normalized
+// ([l, t, r, b] = [-1, -1, +1, +1]) space, and how to calculate coverage. See onEmitCode.
+struct Vertex {
+    std::array<float, 4> fRadiiSelector;  // One-hot mask that picks this vertex's corner radii.
+    std::array<float, 2> fCorner;  // Bounding-box corner in normalized space.
+    std::array<float, 2> fRadiusOutset;  // Multiplied by the radii and added to fCorner.
+    std::array<float, 2> fAABloatDirection;  // Multiplied by the AA bloat radius and added too.
+    float fCoverage;  // Coverage value used on the linear codepath.
+    float fIsLinearCoverage;  // Nonzero selects linear coverage instead of arc coverage.
+    std::array<float, 4> fArcCoordMatrix;  // See kArcMatrices.
+};
+
+// This is the offset (when multiplied by radii) from the corners of a bounding box to the vertices
+// of its inscribed octagon. We draw the outside portion of arcs with quarter-octagons rather than
+// rectangles.
+static constexpr float kOctoOffset = 1/(1 + SK_ScalarRoot2Over2);
+
+// These matrices are used to calculate normalized arc coordinates (i.e., the arc's edge lies
+// where arccoord.x^2 + arccoord.y^2 == 1). The formula to find the arc coord is:
+//
+//   arccoord = arc_coord_matrix.xz * aa_bloatradius/radii + arc_coord_matrix.yw;
+//
+// See kVertexData and onEmitCode.
+static constexpr std::array<float, 4> kArcMatrices[] = {
+        {{+1, 1,  0, 0}},
+        {{-1, 1,  0, 0}},
+        {{ 0, 0, -1, 1}},
+        {{ 0, 0, +1, 1}},
+        {{+1, 1-kOctoOffset, +1, 1}},
+        {{+1, 1, +1, 1-kOctoOffset}}};
+
+static constexpr Vertex kVertexData[] = {
+        // Left inset edge (vertices 0-1).
+        {{{0,0,0,1}},  {{-1,+1}},  {{0,-1}},  {{+1,0}},  1,  1,  {{0,0,0,0}}},
+        {{{1,0,0,0}},  {{-1,-1}},  {{0,+1}},  {{+1,0}},  1,  1,  {{0,0,0,0}}},
+
+        // Top inset edge (vertices 2-3).
+        {{{1,0,0,0}},  {{-1,-1}},  {{+1,0}},  {{0,+1}},  1,  1,  {{0,0,0,0}}},
+        {{{0,1,0,0}},  {{+1,-1}},  {{-1,0}},  {{0,+1}},  1,  1,  {{0,0,0,0}}},
+
+        // Right inset edge (vertices 4-5).
+        {{{0,1,0,0}},  {{+1,-1}},  {{0,+1}},  {{-1,0}},  1,  1,  {{0,0,0,0}}},
+        {{{0,0,1,0}},  {{+1,+1}},  {{0,-1}},  {{-1,0}},  1,  1,  {{0,0,0,0}}},
+
+        // Bottom inset edge (vertices 6-7).
+        {{{0,0,1,0}},  {{+1,+1}},  {{-1,0}},  {{0,-1}},  1,  1,  {{0,0,0,0}}},
+        {{{0,0,0,1}},  {{-1,+1}},  {{+1,0}},  {{0,-1}},  1,  1,  {{0,0,0,0}}},
+
+
+        // Left outset edge (vertices 8-9).
+        {{{0,0,0,1}},  {{-1,+1}},  {{0,-1}},  {{-1,0}},  0,  1,  {{0,0,0,0}}},
+        {{{1,0,0,0}},  {{-1,-1}},  {{0,+1}},  {{-1,0}},  0,  1,  {{0,0,0,0}}},
+
+        // Top outset edge (vertices 10-11).
+        {{{1,0,0,0}},  {{-1,-1}},  {{+1,0}},  {{0,-1}},  0,  1,  {{0,0,0,0}}},
+        {{{0,1,0,0}},  {{+1,-1}},  {{-1,0}},  {{0,-1}},  0,  1,  {{0,0,0,0}}},
+
+        // Right outset edge (vertices 12-13).
+        {{{0,1,0,0}},  {{+1,-1}},  {{0,+1}},  {{+1,0}},  0,  1,  {{0,0,0,0}}},
+        {{{0,0,1,0}},  {{+1,+1}},  {{0,-1}},  {{+1,0}},  0,  1,  {{0,0,0,0}}},
+
+        // Bottom outset edge (vertices 14-15).
+        {{{0,0,1,0}},  {{+1,+1}},  {{-1,0}},  {{0,+1}},  0,  1,  {{0,0,0,0}}},
+        {{{0,0,0,1}},  {{-1,+1}},  {{+1,0}},  {{0,+1}},  0,  1,  {{0,0,0,0}}},
+
+
+        // Top-left corner (vertices 16-21; the last two are the octagon vertices).
+        {{{1,0,0,0}},  {{-1,-1}},  {{ 0,+1}},  {{-1, 0}},  0,  0,  kArcMatrices[0]},
+        {{{1,0,0,0}},  {{-1,-1}},  {{ 0,+1}},  {{+1, 0}},  1,  0,  kArcMatrices[1]},
+        {{{1,0,0,0}},  {{-1,-1}},  {{+1, 0}},  {{ 0,+1}},  1,  0,  kArcMatrices[2]},
+        {{{1,0,0,0}},  {{-1,-1}},  {{+1, 0}},  {{ 0,-1}},  0,  0,  kArcMatrices[3]},
+        {{{1,0,0,0}},  {{-1,-1}},  {{+kOctoOffset,0}},  {{-1,-1}},  0,  0,  kArcMatrices[4]},
+        {{{1,0,0,0}},  {{-1,-1}},  {{0,+kOctoOffset}},  {{-1,-1}},  0,  0,  kArcMatrices[5]},
+
+        // Top-right corner (vertices 22-27; the last two are the octagon vertices).
+        {{{0,1,0,0}},  {{+1,-1}},  {{-1, 0}},  {{ 0,-1}},  0,  0,  kArcMatrices[3]},
+        {{{0,1,0,0}},  {{+1,-1}},  {{-1, 0}},  {{ 0,+1}},  1,  0,  kArcMatrices[2]},
+        {{{0,1,0,0}},  {{+1,-1}},  {{ 0,+1}},  {{-1, 0}},  1,  0,  kArcMatrices[1]},
+        {{{0,1,0,0}},  {{+1,-1}},  {{ 0,+1}},  {{+1, 0}},  0,  0,  kArcMatrices[0]},
+        {{{0,1,0,0}},  {{+1,-1}},  {{0,+kOctoOffset}},  {{+1,-1}},  0,  0,  kArcMatrices[5]},
+        {{{0,1,0,0}},  {{+1,-1}},  {{-kOctoOffset,0}},  {{+1,-1}},  0,  0,  kArcMatrices[4]},
+
+        // Bottom-right corner (vertices 28-33; the last two are the octagon vertices).
+        {{{0,0,1,0}},  {{+1,+1}},  {{ 0,-1}},  {{+1, 0}},  0,  0,  kArcMatrices[0]},
+        {{{0,0,1,0}},  {{+1,+1}},  {{ 0,-1}},  {{-1, 0}},  1,  0,  kArcMatrices[1]},
+        {{{0,0,1,0}},  {{+1,+1}},  {{-1, 0}},  {{ 0,-1}},  1,  0,  kArcMatrices[2]},
+        {{{0,0,1,0}},  {{+1,+1}},  {{-1, 0}},  {{ 0,+1}},  0,  0,  kArcMatrices[3]},
+        {{{0,0,1,0}},  {{+1,+1}},  {{-kOctoOffset,0}},  {{+1,+1}},  0,  0,  kArcMatrices[4]},
+        {{{0,0,1,0}},  {{+1,+1}},  {{0,-kOctoOffset}},  {{+1,+1}},  0,  0,  kArcMatrices[5]},
+
+        // Bottom-left corner (vertices 34-39; octagon vertices bloat diagonally, as elsewhere).
+        {{{0,0,0,1}},  {{-1,+1}},  {{+1, 0}},  {{ 0,+1}},  0,  0,  kArcMatrices[3]},
+        {{{0,0,0,1}},  {{-1,+1}},  {{+1, 0}},  {{ 0,-1}},  1,  0,  kArcMatrices[2]},
+        {{{0,0,0,1}},  {{-1,+1}},  {{ 0,-1}},  {{+1, 0}},  1,  0,  kArcMatrices[1]},
+        {{{0,0,0,1}},  {{-1,+1}},  {{ 0,-1}},  {{-1, 0}},  0,  0,  kArcMatrices[0]},
+        {{{0,0,0,1}},  {{-1,+1}},  {{0,-kOctoOffset}},  {{-1,+1}},  0,  0,  kArcMatrices[5]},
+        {{{0,0,0,1}},  {{-1,+1}},  {{+kOctoOffset,0}},  {{-1,+1}},  0,  0,  kArcMatrices[4]}};
+
+GR_DECLARE_STATIC_UNIQUE_KEY(gVertexBufferKey);
+
+static constexpr uint16_t kIndexData[] = {
+        // Inset octagon (solid coverage), built from inset-edge vertices 0-7.
+        0, 1, 7,
+        1, 2, 7,
+        7, 2, 6,
+        2, 3, 6,
+        6, 3, 5,
+        3, 4, 5,
+
+        // AA borders (linear coverage): each inset edge (0-7) joined to its outset edge (8-15).
+        0, 1, 8, 1, 9, 8,
+        2, 3, 10, 3, 11, 10,
+        4, 5, 12, 5, 13, 12,
+        6, 7, 14, 7, 15, 14,
+
+        // Top-left arc (vertices 16-21).
+        16, 17, 21,
+        17, 21, 18,
+        21, 18, 20,
+        18, 20, 19,
+
+        // Top-right arc (vertices 22-27).
+        22, 23, 27,
+        23, 27, 24,
+        27, 24, 26,
+        24, 26, 25,
+
+        // Bottom-right arc (vertices 28-33).
+        28, 29, 33,
+        29, 33, 30,
+        33, 30, 32,
+        30, 32, 31,
+
+        // Bottom-left arc (vertices 34-39).
+        34, 35, 39,
+        35, 39, 36,
+        39, 36, 38,
+        36, 38, 37};
+
+GR_DECLARE_STATIC_UNIQUE_KEY(gIndexBufferKey);
+
+}
+
+class GrAAFillRRectOp::Processor : public GrGeometryProcessor {
+public:
+    Processor(Flags flags)
+            : GrGeometryProcessor(kGrAAFillRRectOp_Processor_ClassID)
+            , fFlags(flags) {
+        this->setVertexAttributes(kVertexAttribs, 4);  // All four are always bound.
+        this->setInstanceAttributes(kInstanceAttribs, (flags & Flags::kHasLocalCoords) ? 6 : 5);
+        SkASSERT(this->vertexStride() == sizeof(Vertex));  // Attribs must mirror struct Vertex.
+    }
+
+    const char* name() const override { return "GrAAFillRRectOp::Processor"; }
+
+    void getGLSLProcessorKey(const GrShaderCaps& caps, GrProcessorKeyBuilder* b) const override {
+        b->add32(static_cast<uint32_t>(fFlags));  // onEmitCode only varies with these flags.
+    }
+
+    GrGLSLPrimitiveProcessor* createGLSLInstance(const GrShaderCaps&) const override;
+
+private:
+    static constexpr Attribute kVertexAttribs[] = {
+            {"radii_selector", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"corner_and_radius_outsets", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"aa_bloat_and_coverage", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"arc_coord_matrix", kFloat4_GrVertexAttribType, kFloat4_GrSLType}};
+
+    static constexpr Attribute kInstanceAttribs[] = {
+            {"skew", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"translate", kFloat2_GrVertexAttribType, kFloat2_GrSLType},
+            {"radii_x", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"radii_y", kFloat4_GrVertexAttribType, kFloat4_GrSLType},
+            {"color", kUByte4_norm_GrVertexAttribType, kHalf4_GrSLType},
+            {"local_rect", kFloat4_GrVertexAttribType, kFloat4_GrSLType}};  // Conditional.
+
+    static constexpr int kColorAttribIdx = 4;  // Index of "color" in kInstanceAttribs.
+
+    const Flags fFlags;
+
+    class Impl;
+};
+
+constexpr GrPrimitiveProcessor::Attribute GrAAFillRRectOp::Processor::kVertexAttribs[];
+constexpr GrPrimitiveProcessor::Attribute GrAAFillRRectOp::Processor::kInstanceAttribs[];
+
+class GrAAFillRRectOp::Processor::Impl : public GrGLSLGeometryProcessor {
+public:
+    void onEmitCode(EmitArgs& args, GrGPArgs* gpArgs) override {
+        const auto& proc = args.fGP.cast<Processor>();
+        bool useHWDerivatives = (proc.fFlags & Flags::kUseHWDerivatives);
+
+        GrGLSLVaryingHandler* varyings = args.fVaryingHandler;
+        varyings->emitAttributes(proc);
+        varyings->addPassThroughAttribute(proc.kInstanceAttribs[kColorAttribIdx], args.fOutputColor,
+                                          GrGLSLVaryingHandler::Interpolation::kCanBeFlat);
+
+        // Emit the vertex shader.
+        GrGLSLVertexBuilder* v = args.fVertBuilder;
+
+        // Unpack the vertex attribs (two Vertex fields are packed into each float4).
+        v->codeAppend("float2 corner = corner_and_radius_outsets.xy;");
+        v->codeAppend("float2 radius_outset = corner_and_radius_outsets.zw;");
+        v->codeAppend("float2 aa_bloat_direction = aa_bloat_and_coverage.xy;");
+        v->codeAppend("float coverage = aa_bloat_and_coverage.z;");
+        v->codeAppend("float is_linear_coverage = aa_bloat_and_coverage.w;");
+
+        // Find the amount to bloat each edge for AA (in source space).
+        v->codeAppend("float2 pixellength = inversesqrt("
+                              "float2(dot(skew.xz, skew.xz), dot(skew.yw, skew.yw)));");
+        v->codeAppend("float4 normalized_axis_dirs = skew * pixellength.xyxy;");
+        v->codeAppend("float2 axiswidths = (abs(normalized_axis_dirs.xy) + "
+                                           "abs(normalized_axis_dirs.zw));");
+        v->codeAppend("float2 aa_bloatradius = axiswidths * pixellength * .5;");
+
+        // Identify our radii.
+        v->codeAppend("float2 radii = float2(dot(radii_selector, radii_x), "
+                                            "dot(radii_selector, radii_y));");
+
+        v->codeAppend("if (any(greaterThan(aa_bloatradius, float2(1)))) {");
+                          // The rrect is narrower than an AA coverage ramp. We can't draw it as-is
+                          // or else opposite AA borders will overlap. Instead, fudge the size up to
+                          // the width of a coverage ramp, and then reduce total coverage to make
+                          // the rect appear thinner.
+        v->codeAppend(    "corner = max(abs(corner), aa_bloatradius) * sign(corner);");
+        v->codeAppend(    "coverage /= max(aa_bloatradius.x, 1) * max(aa_bloatradius.y, 1);");
+                          // Set radii to zero to ensure we take the "linear coverage" codepath.
+                          // (The "coverage" variable only has effect in the linear codepath.)
+        v->codeAppend(    "radii = float2(0);");
+        v->codeAppend("}");
+
+        v->codeAppend("if (any(lessThan(radii, aa_bloatradius * 1.25))) {");
+                          // The radii are very small. Demote this arc to a sharp 90 degree corner.
+        v->codeAppend(    "radii = aa_bloatradius;");
+                          // Snap octagon vertices to the corner of the bounding box.
+        v->codeAppend(    "radius_outset = floor(abs(radius_outset)) * radius_outset;");
+        v->codeAppend(    "is_linear_coverage = 1;");
+        v->codeAppend("} else {");
+                          // Don't let actual arc radii get smaller than a pixel.
+        v->codeAppend(    "radii = clamp(radii, pixellength, 2 - pixellength);");
+        v->codeAppend("}");
+        // Bias radii slightly inward to avoid accidental overlap of geometries from fp rounding.
+        v->codeAppend("radii -= aa_bloatradius * 1e-3;");
+
+        // Find our vertex position, adjusted for radii and bloated for AA. Our rect is drawn in
+        // normalized [-1,-1,+1,+1] space.
+        v->codeAppend("float2 vertexpos = corner"
+                              "+ radius_outset * radii"
+                              "+ aa_bloat_direction.xy * aa_bloatradius;");
+
+        // Emit transforms. Local coords interpolate local_rect across the normalized position.
+        GrShaderVar localCoord("", kFloat2_GrSLType);
+        if (proc.fFlags & Flags::kHasLocalCoords) {
+            v->codeAppend("float2 localcoord = (local_rect.xy * (1 - vertexpos) + "
+                                               "local_rect.zw * (1 + vertexpos)) * .5;");
+            localCoord.set(kFloat2_GrSLType, "localcoord");
+        }
+        this->emitTransforms(v, varyings, args.fUniformHandler, localCoord,
+                             args.fFPCoordTransformHandler);
+
+        // Transform to device space.
+        v->codeAppend("float2x2 skewmatrix = float2x2(skew.xy, skew.zw);");
+        v->codeAppend("float2 devcoord = vertexpos * skewmatrix + translate;");
+        gpArgs->fPositionVar.set(kFloat2_GrSLType, "devcoord");
+
+        // Setup interpolants for coverage (.zw carries the gradient without HW derivatives).
+        GrGLSLVarying arcCoord(useHWDerivatives ? kFloat2_GrSLType : kFloat4_GrSLType);
+        varyings->addVarying("arccoord", &arcCoord);
+        v->codeAppend("if (0 != is_linear_coverage) {");
+                           // We are a non-corner piece: Set x=0 to indicate built-in coverage, and
+                           // interpolate linear coverage across y.
+        v->codeAppendf(    "%s.xy = float2(0, coverage);", arcCoord.vsOut());
+        v->codeAppend("} else {");
+        v->codeAppend(    "float2 arccoord = "
+                               "arc_coord_matrix.xz * aa_bloatradius/radii + arc_coord_matrix.yw;");
+                           // We are a corner piece: Interpolate the arc coordinates for coverage.
+                           // Emit x+1 to ensure no pixel in the arc has a x value of 0 (since x=0
+                           // instructs the fragment shader to use linear coverage).
+        v->codeAppendf(    "%s.xy = float2(arccoord.x+1, arccoord.y);", arcCoord.vsOut());
+        if (!useHWDerivatives) {
+            // The gradient is order-1: Interpolate it across arccoord.zw.
+            v->codeAppendf("float2x2 derivatives = inverse(skewmatrix);");
+            v->codeAppendf("%s.zw = derivatives * (arccoord/radii * 2);", arcCoord.vsOut());
+        }
+        v->codeAppend("}");
+
+        // Emit the fragment shader.
+        GrGLSLFPFragmentBuilder* f = args.fFragBuilder;
+
+        f->codeAppendf("float x_plus_1=%s.x, y=%s.y;", arcCoord.fsIn(), arcCoord.fsIn());
+        f->codeAppendf("half coverage;");
+        f->codeAppendf("if (0 == x_plus_1) {");
+        f->codeAppendf(    "coverage = y;");  // We are a non-arc pixel (i.e., linear coverage).
+        f->codeAppendf("} else {");
+        f->codeAppendf(    "float fn = x_plus_1 * (x_plus_1 - 2);");  // fn = (x+1)*(x-1) = x^2-1
+        f->codeAppendf(    "fn = fma(y,y, fn);");  // fn = x^2 + y^2 - 1
+        if (useHWDerivatives) {
+            f->codeAppendf("float fnwidth = fwidth(fn);");
+        } else {
+            // The gradient is interpolated across arccoord.zw.
+            f->codeAppendf("float gx=%s.z, gy=%s.w;", arcCoord.fsIn(), arcCoord.fsIn());
+            f->codeAppendf("float fnwidth = abs(gx) + abs(gy);");
+        }
+        f->codeAppendf(    "half d = fn/fnwidth;");
+        f->codeAppendf(    "coverage = clamp(.5 - d, 0, 1);");
+        f->codeAppendf("}");
+        f->codeAppendf("%s = half4(coverage);", args.fOutputCoverage);
+    }
+
+    void setData(const GrGLSLProgramDataManager& pdman, const GrPrimitiveProcessor&,
+                 FPCoordTransformIter&& transformIter) override {
+        this->setTransformDataHelper(SkMatrix::I(), pdman, &transformIter);
+    }
+};
+
+GrGLSLPrimitiveProcessor* GrAAFillRRectOp::Processor::createGLSLInstance(
+        const GrShaderCaps&) const {
+    return new Impl();  // Impl has no members; it reads the Processor's flags in onEmitCode.
+}
+
+void GrAAFillRRectOp::onExecute(GrOpFlushState* flushState, const SkRect& chainBounds) {
+    if (!fInstanceBuffer) {
+        return;  // Setup failed: onPrepare() did not obtain instance buffer space.
+    }
+
+    GR_DEFINE_STATIC_UNIQUE_KEY(gIndexBufferKey);
+
+    sk_sp<const GrBuffer> indexBuffer =
+            flushState->resourceProvider()->findOrMakeStaticBuffer(
+                    kIndex_GrBufferType, sizeof(kIndexData), kIndexData, gIndexBufferKey);
+    if (!indexBuffer) {
+        return;  // No index buffer was found or created; skip the draw.
+    }
+
+    GR_DEFINE_STATIC_UNIQUE_KEY(gVertexBufferKey);
+
+    sk_sp<const GrBuffer> vertexBuffer =
+            flushState->resourceProvider()->findOrMakeStaticBuffer(
+                    kVertex_GrBufferType, sizeof(kVertexData), kVertexData, gVertexBufferKey);
+    if (!vertexBuffer) {
+        return;  // No vertex buffer was found or created; skip the draw.
+    }
+
+    Processor proc(fFlags);
+    SkASSERT(proc.instanceStride() == (size_t)fInstanceStride);
+
+    GrPipeline::InitArgs initArgs;  // NOTE: fProcessors is moved into the pipeline below.
+    initArgs.fProxy = flushState->drawOpArgs().fProxy;
+    initArgs.fCaps = &flushState->caps();
+    initArgs.fResourceProvider = flushState->resourceProvider();
+    initArgs.fDstProxy = flushState->drawOpArgs().fDstProxy;
+    GrPipeline pipeline(initArgs, std::move(fProcessors), flushState->detachAppliedClip());
+
+    GrMesh mesh(GrPrimitiveType::kTriangles);  // One indexed, instanced draw for all rrects.
+    mesh.setIndexedInstanced(indexBuffer.get(), SK_ARRAY_COUNT(kIndexData), fInstanceBuffer,
+                             fInstanceCount, fBaseInstance, GrPrimitiveRestart::kNo);
+    mesh.setVertexData(vertexBuffer.get());
+    flushState->rtCommandBuffer()->draw(proc, pipeline, nullptr, nullptr, &mesh, 1, this->bounds());
+}
+
+// Will the given corner look good if we use HW derivatives?
+static bool can_use_hw_derivatives(const Sk2f& devScale, const Sk2f& cornerRadii) {
+    Sk2f devRadii = devScale * cornerRadii;
+    if (devRadii[1] < devRadii[0]) {
+        devRadii = SkNx_shuffle<1,0>(devRadii);  // Put the smaller radius first.
+    }
+    float minDevRadius = SkTMax(devRadii[0], 1.f);  // Shader clamps radius at a minimum of 1.
+    // Is the gradient smooth enough for this corner to look ok if we use hardware derivatives?
+    // This threshold was arrived at subjectively on an NVIDIA chip.
+    return minDevRadius * minDevRadius * 5 > devRadii[1];
+}
+
+static bool can_use_hw_derivatives(const Sk2f& devScale, const SkVector& cornerRadii) {
+    return can_use_hw_derivatives(devScale, Sk2f::Load(&cornerRadii));  // SkVector -> Sk2f.
+}
+
+// Will the given round rect look good if we use HW derivatives?
+static bool can_use_hw_derivatives(const GrShaderCaps& shaderCaps, const SkMatrix& viewMatrix,
+                                   const SkRRect& rrect) {
+    if (!shaderCaps.shaderDerivativeSupport()) {
+        return false;
+    }
+
+    Sk2f x = Sk2f(viewMatrix.getScaleX(), viewMatrix.getSkewX());
+    Sk2f y = Sk2f(viewMatrix.getSkewY(), viewMatrix.getScaleY());
+    Sk2f devScale = (x*x + y*y).sqrt();  // Lengths of the view matrix's two columns.
+    switch (rrect.getType()) {
+        case SkRRect::kEmpty_Type:
+        case SkRRect::kRect_Type:
+            return true;
+
+        case SkRRect::kOval_Type:
+        case SkRRect::kSimple_Type:
+            return can_use_hw_derivatives(devScale, rrect.getSimpleRadii());
+
+        case SkRRect::kNinePatch_Type: {  // Test both mixed min/max radius pairings.
+            Sk2f r0 = Sk2f::Load(SkRRectPriv::GetRadiiArray(rrect));
+            Sk2f r1 = Sk2f::Load(SkRRectPriv::GetRadiiArray(rrect) + 2);
+            Sk2f minRadii = Sk2f::Min(r0, r1);
+            Sk2f maxRadii = Sk2f::Max(r0, r1);
+            return can_use_hw_derivatives(devScale, Sk2f(minRadii[0], maxRadii[1])) &&
+                   can_use_hw_derivatives(devScale, Sk2f(maxRadii[0], minRadii[1]));
+        }
+
+        case SkRRect::kComplex_Type: {  // Every corner must pass individually.
+            for (int i = 0; i < 4; ++i) {
+                auto corner = static_cast<SkRRect::Corner>(i);
+                if (!can_use_hw_derivatives(devScale, rrect.radii(corner))) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+    SK_ABORT("Unreachable code.");
+    return false;  // Add this return to keep GCC happy.
+}
diff --git a/src/gpu/ops/GrAAFillRRectOp.h b/src/gpu/ops/GrAAFillRRectOp.h
new file mode 100644
index 0000000..b1428cf
--- /dev/null
+++ b/src/gpu/ops/GrAAFillRRectOp.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrAAFillRRectOp_DEFINED
+#define GrAAFillRRectOp_DEFINED
+
+#include "GrDrawOp.h"
+
+class GrAAFillRRectOp : public GrDrawOp {
+public:
+    DEFINE_OP_CLASS_ID
+
+    static std::unique_ptr<GrAAFillRRectOp> Make(GrContext*, const SkMatrix&, const SkRRect&,
+                                                 const GrCaps&, GrPaint&&);
+
+    const char* name() const override { return "GrAAFillRRectOp"; }
+    FixedFunctionFlags fixedFunctionFlags() const override { return FixedFunctionFlags::kNone; }
+    RequiresDstTexture finalize(const GrCaps&, const GrAppliedClip*) override;
+    CombineResult onCombineIfPossible(GrOp*, const GrCaps&) override;
+    void visitProxies(const VisitProxyFunc& fn, VisitorType) const override {
+        fProcessors.visitProxies(fn);
+    }
+    void onPrepare(GrOpFlushState*) override;
+
+    void onExecute(GrOpFlushState*, const SkRect& chainBounds) override;
+
+private:
+    enum class Flags {
+        kNone = 0,
+        kUseHWDerivatives = 1 << 0,  // Decided in the constructor.
+        kHasLocalCoords = 1 << 1  // Decided in finalize().
+    };
+
+    GR_DECL_BITFIELD_CLASS_OPS_FRIENDS(Flags)
+
+    class Processor;
+
+    GrAAFillRRectOp(const GrShaderCaps&, const SkMatrix&, const SkRRect&, GrPaint&&);
+
+    // These methods are used to append data of various POD types to our internal array of instance
+    // data. The actual layout of the instance buffer can vary from Op to Op.
+    template <typename T> inline void* appendInstanceData(int count) {
+        static_assert(std::is_pod<T>::value, "");
+        static_assert(4 == alignof(T), "");
+        return fInstanceData.push_back_n(sizeof(T) * count);
+    }
+
+    template <typename T, typename... Args>
+    inline void writeInstanceData(const T& val, const Args&... remainder) {
+        memcpy(this->appendInstanceData<T>(1), &val, sizeof(T));
+        this->writeInstanceData(remainder...);
+    }
+
+    void writeInstanceData() {}  // Halt condition for the variadic overload above.
+
+    const SkPMColor4f fOriginalColor;
+    const SkRect fLocalRect;
+    Flags fFlags = Flags::kNone;
+    GrProcessorSet fProcessors;
+
+    SkSTArray<sizeof(float) * 16 * 4, char, /*MEM_MOVE=*/ true> fInstanceData;
+    int fInstanceCount = 1;  // Grows when other ops are merged in.
+    int fInstanceStride = 0;  // Bytes per instance; set in finalize().
+
+    const GrBuffer* fInstanceBuffer = nullptr;  // Filled in by onPrepare().
+    int fBaseInstance;  // Output of makeVertexSpace() in onPrepare(); uninitialized before then.
+
+    friend class GrOpMemoryPool;
+};
+
+GR_MAKE_BITFIELD_CLASS_OPS(GrAAFillRRectOp::Flags)
+
+#endif
