Clear and discard stencil buffers on tilers, take 2

Bug: skia:
Change-Id: I6a3bc7e50f8d4f211ed4afb2a614581b2778b27a
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/245300
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Chris Dalton <csmartdalton@google.com>
diff --git a/src/gpu/GrCaps.cpp b/src/gpu/GrCaps.cpp
index 4b14a91..60ff6f7 100644
--- a/src/gpu/GrCaps.cpp
+++ b/src/gpu/GrCaps.cpp
@@ -177,7 +177,7 @@
     writer->appendBool("MSAA Resolves Automatically", fMSAAResolvesAutomatically);
     writer->appendBool("Use primitive restart", fUsePrimitiveRestart);
     writer->appendBool("Prefer client-side dynamic buffers", fPreferClientSideDynamicBuffers);
-    writer->appendBool("Prefer fullscreen clears", fPreferFullscreenClears);
+    writer->appendBool("Prefer fullscreen clears (and stencil discard)", fPreferFullscreenClears);
     writer->appendBool("Must clear buffer memory", fMustClearUploadedBufferData);
     writer->appendBool("Should initialize textures", fShouldInitializeTextures);
     writer->appendBool("Supports importing AHardwareBuffers", fSupportsAHardwareBufferImages);
diff --git a/src/gpu/GrCaps.h b/src/gpu/GrCaps.h
index 6785a0b..23e690d 100644
--- a/src/gpu/GrCaps.h
+++ b/src/gpu/GrCaps.h
@@ -64,6 +64,14 @@
     // initialize each tile with a constant value rather than loading each pixel from memory.
     bool preferFullscreenClears() const { return fPreferFullscreenClears; }
 
+    // Should we discard stencil values after a render pass? (Tilers get better performance if we
+    // always load stencil buffers with a "clear" op, and then discard the content when finished.)
+    bool discardStencilValuesAfterRenderPass() const {
+        // This method is actually just a duplicate of preferFullscreenClears(), with a descriptive
+        // name for the sake of readability.
+        return this->preferFullscreenClears();
+    }
+
     bool preferVRAMUseOverFlushes() const { return fPreferVRAMUseOverFlushes; }
 
     bool preferTrianglesOverSampleMask() const { return fPreferTrianglesOverSampleMask; }
diff --git a/src/gpu/GrOpsTask.cpp b/src/gpu/GrOpsTask.cpp
index ff66b93..f7ceca5 100644
--- a/src/gpu/GrOpsTask.cpp
+++ b/src/gpu/GrOpsTask.cpp
@@ -18,8 +18,11 @@
 #include "src/gpu/GrOpFlushState.h"
 #include "src/gpu/GrOpsRenderPass.h"
 #include "src/gpu/GrRecordingContextPriv.h"
+#include "src/gpu/GrRenderTarget.h"
 #include "src/gpu/GrRenderTargetContext.h"
+#include "src/gpu/GrRenderTargetPriv.h"
 #include "src/gpu/GrResourceAllocator.h"
+#include "src/gpu/GrStencilAttachment.h"
 #include "src/gpu/GrTexturePriv.h"
 #include "src/gpu/geometry/GrRect.h"
 #include "src/gpu/ops/GrClearOp.h"
@@ -427,7 +430,7 @@
 static GrOpsRenderPass* create_render_pass(
         GrGpu* gpu, GrRenderTarget* rt, GrSurfaceOrigin origin, const SkIRect& bounds,
         GrLoadOp colorLoadOp, const SkPMColor4f& loadClearColor, GrLoadOp stencilLoadOp,
-        const SkTArray<GrTextureProxy*, true>& sampledProxies) {
+        GrStoreOp stencilStoreOp, const SkTArray<GrTextureProxy*, true>& sampledProxies) {
     const GrOpsRenderPass::LoadAndStoreInfo kColorLoadStoreInfo {
         colorLoadOp,
         GrStoreOp::kStore,
@@ -441,7 +444,7 @@
     // lower level (inside the VK command buffer).
     const GrOpsRenderPass::StencilLoadAndStoreInfo stencilLoadAndStoreInfo {
         stencilLoadOp,
-        GrStoreOp::kStore,
+        stencilStoreOp,
     };
 
     return gpu->getOpsRenderPass(rt, origin, bounds, kColorLoadStoreInfo, stencilLoadAndStoreInfo,
@@ -463,23 +466,60 @@
     SkASSERT(fTarget->peekRenderTarget());
     TRACE_EVENT0("skia.gpu", TRACE_FUNC);
 
-    // TODO: at the very least, we want the stencil store op to always be discard (at this
-    // level). In Vulkan, sub-command buffers would still need to load & store the stencil buffer.
-
     // Make sure load ops are not kClear if the GPU needs to use draws for clears
     SkASSERT(fColorLoadOp != GrLoadOp::kClear ||
              !flushState->gpu()->caps()->performColorClearsAsDraws());
-    SkASSERT(fStencilLoadOp != GrLoadOp::kClear ||
-             !flushState->gpu()->caps()->performStencilClearsAsDraws());
+
+    const GrCaps& caps = *flushState->gpu()->caps();
+    GrRenderTarget* renderTarget = fTarget.get()->peekRenderTarget();
+    SkASSERT(renderTarget);
+    GrStencilAttachment* stencil = renderTarget->renderTargetPriv().getStencilAttachment();
+
+    GrLoadOp stencilLoadOp;
+    switch (fInitialStencilContent) {
+        case StencilContent::kDontCare:
+            stencilLoadOp = GrLoadOp::kDiscard;
+            break;
+        case StencilContent::kUserBitsCleared:
+            SkASSERT(!caps.performStencilClearsAsDraws());
+            SkASSERT(stencil);
+            if (caps.discardStencilValuesAfterRenderPass()) {
+                // Always clear the stencil if it is being discarded after render passes. This is
+                // also an optimization because we are on a tiler and it avoids loading the values
+                // from memory.
+                stencilLoadOp = GrLoadOp::kClear;
+                break;
+            }
+            if (!stencil->hasPerformedInitialClear()) {
+                stencilLoadOp = GrLoadOp::kClear;
+                stencil->markHasPerformedInitialClear();
+                break;
+            }
+            // renderTargetContexts are required to leave the user stencil bits in a cleared state
+            // once finished, meaning the stencil values will always remain cleared after the
+            // initial clear. Just fall through to reloading the existing (cleared) stencil values
+            // from memory.
+        case StencilContent::kPreserved:
+            SkASSERT(stencil);
+            stencilLoadOp = GrLoadOp::kLoad;
+            break;
+    }
+
+    // NOTE: If fMustPreserveStencil is set, then we are executing a renderTargetContext that split
+    // its opsTask.
+    //
+    // FIXME: We don't currently flag render passes that don't use stencil at all. In that case
+    // their store op might be "discard", and we currently make the assumption that a discard will
+    // not invalidate what's already in main memory. This is probably ok for now, but certainly
+    // something we want to address soon.
+    GrStoreOp stencilStoreOp = (caps.discardStencilValuesAfterRenderPass() && !fMustPreserveStencil)
+            ? GrStoreOp::kDiscard
+            : GrStoreOp::kStore;
+
     GrOpsRenderPass* renderPass = create_render_pass(
-                                                    flushState->gpu(),
-                                                    fTarget->peekRenderTarget(),
-                                                    fTarget->origin(),
-                                                    fClippedContentBounds,
-                                                    fColorLoadOp,
-                                                    fLoadClearColor,
-                                                    fStencilLoadOp,
-                                                    fSampledProxies);
+            flushState->gpu(), fTarget->peekRenderTarget(), fTarget->origin(),
+            fClippedContentBounds, fColorLoadOp, fLoadClearColor, stencilLoadOp, stencilStoreOp,
+            fSampledProxies);
     flushState->setOpsRenderPass(renderPass);
     renderPass->begin();
 
@@ -547,7 +587,7 @@
     // opsTasks' color & stencil load ops.
     if (this->isEmpty()) {
         fColorLoadOp = GrLoadOp::kDiscard;
-        fStencilLoadOp = GrLoadOp::kDiscard;
+        fInitialStencilContent = StencilContent::kDontCare;
         fTotalBounds.setEmpty();
     }
 }
@@ -555,17 +595,34 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifdef SK_DEBUG
-static const char* load_op_to_name(GrLoadOp op) {
-    return GrLoadOp::kLoad == op ? "load" : GrLoadOp::kClear == op ? "clear" : "discard";
-}
-
 void GrOpsTask::dump(bool printDependencies) const {
     GrRenderTask::dump(printDependencies);
 
-    SkDebugf("ColorLoadOp: %s %x StencilLoadOp: %s\n",
-             load_op_to_name(fColorLoadOp),
-             GrLoadOp::kClear == fColorLoadOp ? fLoadClearColor.toBytes_RGBA() : 0x0,
-             load_op_to_name(fStencilLoadOp));
+    SkDebugf("fColorLoadOp: ");
+    switch (fColorLoadOp) {
+        case GrLoadOp::kLoad:
+            SkDebugf("kLoad\n");
+            break;
+        case GrLoadOp::kClear:
+            SkDebugf("kClear (0x%x)\n", fLoadClearColor.toBytes_RGBA());
+            break;
+        case GrLoadOp::kDiscard:
+            SkDebugf("kDiscard\n");
+            break;
+    }
+
+    SkDebugf("fInitialStencilContent: ");
+    switch (fInitialStencilContent) {
+        case StencilContent::kDontCare:
+            SkDebugf("kDontCare\n");
+            break;
+        case StencilContent::kUserBitsCleared:
+            SkDebugf("kUserBitsCleared\n");
+            break;
+        case StencilContent::kPreserved:
+            SkDebugf("kPreserved\n");
+            break;
+    }
 
     SkDebugf("ops (%d):\n", fOpChains.count());
     for (int i = 0; i < fOpChains.count(); ++i) {
diff --git a/src/gpu/GrOpsTask.h b/src/gpu/GrOpsTask.h
index a8588ce..80e4ffd 100644
--- a/src/gpu/GrOpsTask.h
+++ b/src/gpu/GrOpsTask.h
@@ -117,8 +117,30 @@
 
     void deleteOps();
 
-    // Must only be called if native stencil buffer clearing is enabled
-    void setStencilLoadOp(GrLoadOp op) { fStencilLoadOp = op; }
+    enum class StencilContent {
+        kDontCare,
+        kUserBitsCleared,  // User bits: cleared
+                           // Clip bit: don't care (Ganesh always pre-clears the clip bit.)
+        kPreserved
+    };
+
+    // Lets the caller specify what the content of the stencil buffer should be at the beginning
+    // of the render pass.
+    //
+    // When requesting kClear: Tilers will load the stencil buffer with a "clear" op; non-tilers
+    // will clear the stencil on first load, and then preserve it on subsequent loads. (Preserving
+    // works because renderTargetContexts are required to leave the user bits in a cleared state
+    // once finished.)
+    //
+    // NOTE: initialContent must not be kClear if caps.performStencilClearsAsDraws() is true.
+    void setInitialStencilContent(StencilContent initialContent) {
+        fInitialStencilContent = initialContent;
+    }
+
+    // If a renderTargetContext splits its opsTask, it uses this method to guarantee stencil values
+    // get preserved across its split tasks.
+    void setMustPreserveStencil() { fMustPreserveStencil = true; }
+
     // Must only be called if native color buffer clearing is enabled.
     void setColorLoadOp(GrLoadOp op, const SkPMColor4f& color);
     // Sets the clear color to transparent black
@@ -247,7 +269,8 @@
 
     GrLoadOp fColorLoadOp = GrLoadOp::kLoad;
     SkPMColor4f fLoadClearColor = SK_PMColor4fTRANSPARENT;
-    GrLoadOp fStencilLoadOp = GrLoadOp::kLoad;
+    StencilContent fInitialStencilContent = StencilContent::kDontCare;
+    bool fMustPreserveStencil = false;
 
     uint32_t fLastClipStackGenID;
     SkIRect fLastDevClipBounds;
diff --git a/src/gpu/GrRenderTargetContext.cpp b/src/gpu/GrRenderTargetContext.cpp
index af6792c..039f4d1 100644
--- a/src/gpu/GrRenderTargetContext.cpp
+++ b/src/gpu/GrRenderTargetContext.cpp
@@ -206,7 +206,20 @@
     SkDEBUGCODE(this->validate();)
 
     if (!fOpsTask || fOpsTask->isClosed()) {
-        fOpsTask = this->drawingManager()->newOpsTask(fRenderTargetProxy, fManagedOpsTask);
+        sk_sp<GrOpsTask> newOpsTask =
+                this->drawingManager()->newOpsTask(fRenderTargetProxy, fManagedOpsTask);
+        if (fNumStencilSamples > 0) {
+            // fNumStencilSamples is initially 0, and it should only get modified after the point
+            // that we've started building an opsTask.
+            SkASSERT(fOpsTask);
+            // Store the stencil values to memory upon completion of fOpsTask.
+            fOpsTask->setMustPreserveStencil();
+            // Reload the stencil buffer content at the beginning of newOpsTask.
+            // FIXME: Could the topo sort insert a task between these two that modifies the stencil
+            // values?
+            newOpsTask->setInitialStencilContent(GrOpsTask::StencilContent::kPreserved);
+        }
+        fOpsTask = std::move(newOpsTask);
     }
 
     return fOpsTask.get();
@@ -739,7 +752,7 @@
 void GrRenderTargetContext::setNeedsStencil(bool multisampled) {
     // Don't clear stencil until after we've changed fNumStencilSamples. This ensures we don't loop
     // forever in the event that there are driver bugs and we need to clear as a draw.
-    bool needsStencilClear = !fNumStencilSamples;
+    bool hasInitializedStencil = fNumStencilSamples > 0;
 
     int numRequiredSamples = this->numSamples();
     if (multisampled && 1 == numRequiredSamples) {
@@ -756,17 +769,14 @@
         fRenderTargetProxy->setNeedsStencil(fNumStencilSamples);
     }
 
-    if (needsStencilClear) {
+    if (!hasInitializedStencil) {
         if (this->caps()->performStencilClearsAsDraws()) {
             // There is a driver bug with clearing stencil. We must use an op to manually clear the
             // stencil buffer before the op that required 'setNeedsStencil'.
             this->internalStencilClear(GrFixedClip::Disabled(), /* inside mask */ false);
         } else {
-            // Setting the clear stencil load op is preferable. On non-tilers, this lets the flush
-            // code note when the instantiated stencil buffer is already clear and skip the clear
-            // altogether. And on tilers, loading the stencil buffer cleared is even faster than
-            // preserving the previous contents.
-            this->getOpsTask()->setStencilLoadOp(GrLoadOp::kClear);
+            this->getOpsTask()->setInitialStencilContent(
+                    GrOpsTask::StencilContent::kUserBitsCleared);
         }
     }
 }
diff --git a/src/gpu/GrStencilAttachment.h b/src/gpu/GrStencilAttachment.h
index 22595f5..5a56d30 100644
--- a/src/gpu/GrStencilAttachment.h
+++ b/src/gpu/GrStencilAttachment.h
@@ -25,9 +25,9 @@
     int height() const { return fHeight; }
     int bits() const { return fBits; }
     int numSamples() const { return fSampleCnt; }
-    bool isDirty() const { return fIsDirty; }
 
-    void cleared() { fIsDirty = false; }
+    bool hasPerformedInitialClear() const { return fHasPerformedInitialClear; }
+    void markHasPerformedInitialClear() { fHasPerformedInitialClear = true; }
 
     // We create a unique stencil buffer at each width, height and sampleCnt and share it for
     // all render targets that require a stencil with those params.
@@ -40,8 +40,7 @@
             , fWidth(width)
             , fHeight(height)
             , fBits(bits)
-            , fSampleCnt(sampleCnt)
-            , fIsDirty(true) {
+            , fSampleCnt(sampleCnt) {
     }
 
 private:
@@ -51,7 +50,7 @@
     int fHeight;
     int fBits;
     int fSampleCnt;
-    bool fIsDirty;
+    bool fHasPerformedInitialClear = false;
 
     typedef GrGpuResource INHERITED;
 };
diff --git a/src/gpu/ccpr/GrStencilAtlasOp.cpp b/src/gpu/ccpr/GrStencilAtlasOp.cpp
index 4f2e357..f5b16ba 100644
--- a/src/gpu/ccpr/GrStencilAtlasOp.cpp
+++ b/src/gpu/ccpr/GrStencilAtlasOp.cpp
@@ -88,7 +88,19 @@
         0xffff,                        0xffff>()
 );
 
-// Resolves stencil winding counts to A8 coverage and resets stencil values to zero.
+// Resolves stencil winding counts to A8 coverage. Leaves stencil values untouched.
+static constexpr GrUserStencilSettings kResolveStencilCoverage(
+    GrUserStencilSettings::StaticInitSeparate<
+        0x0000,                           0x0000,
+        GrUserStencilTest::kNotEqual,     GrUserStencilTest::kNotEqual,
+        0xffff,                           0x1,
+        GrUserStencilOp::kKeep,           GrUserStencilOp::kKeep,
+        GrUserStencilOp::kKeep,           GrUserStencilOp::kKeep,
+        0xffff,                           0xffff>()
+);
+
+// Same as above, but also resets stencil values to zero. This is better for non-tilers
+// where we prefer to not clear the stencil buffer at the beginning of every render pass.
 static constexpr GrUserStencilSettings kResolveStencilCoverageAndReset(
     GrUserStencilSettings::StaticInitSeparate<
         0x0000,                           0x0000,
@@ -119,17 +131,22 @@
     // not necessary, and will even cause artifacts if using mixed samples.
     constexpr auto noHWAA = GrPipeline::InputFlags::kNone;
 
-    GrPipeline resolvePipeline(
-            GrScissorTest::kEnabled, SkBlendMode::kSrc, flushState->drawOpArgs().fOutputSwizzle,
-            noHWAA, &kResolveStencilCoverageAndReset);
+    const auto* stencilResolveSettings = (flushState->caps().discardStencilValuesAfterRenderPass())
+            // The next draw will be the final op in the renderTargetContext. So if Ganesh is
+            // planning to discard the stencil values anyway, we don't actually need to reset them
+            // back to zero.
+            ? &kResolveStencilCoverage
+            : &kResolveStencilCoverageAndReset;
+
+    GrPipeline resolvePipeline(GrScissorTest::kEnabled, SkBlendMode::kSrc,
+                               flushState->drawOpArgs().fOutputSwizzle, noHWAA,
+                               stencilResolveSettings);
     GrPipeline::FixedDynamicState scissorRectState(drawBoundsRect);
 
     GrMesh mesh(GrPrimitiveType::kTriangleStrip);
-    mesh.setInstanced(
-            fResources->refStencilResolveBuffer(),
-            fEndStencilResolveInstance - fBaseStencilResolveInstance, fBaseStencilResolveInstance,
-            4);
-    flushState->opsRenderPass()->draw(
-            StencilResolveProcessor(), resolvePipeline, &scissorRectState, nullptr, &mesh, 1,
-            SkRect::Make(drawBoundsRect));
+    mesh.setInstanced(fResources->refStencilResolveBuffer(),
+                      fEndStencilResolveInstance - fBaseStencilResolveInstance,
+                      fBaseStencilResolveInstance, 4);
+    flushState->opsRenderPass()->draw(StencilResolveProcessor(), resolvePipeline, &scissorRectState,
+                                      nullptr, &mesh, 1, SkRect::Make(drawBoundsRect));
 }
diff --git a/src/gpu/gl/GrGLGpu.cpp b/src/gpu/gl/GrGLGpu.cpp
index 1897480..af40b95 100644
--- a/src/gpu/gl/GrGLGpu.cpp
+++ b/src/gpu/gl/GrGLGpu.cpp
@@ -1289,7 +1289,7 @@
             this->disableScissor();
             this->disableWindowRectangles();
             this->flushColorWrite(true);
-            this->flushClearColor(0, 0, 0, 0);
+            this->flushClearColor(SK_PMColor4fTRANSPARENT);
             for (int i = 0; i < mipLevelCount; ++i) {
                 if (levelClearMask & (1U << i)) {
                     this->bindSurfaceFBOForPixelOps(tex.get(), i, GR_GL_FRAMEBUFFER,
@@ -1846,16 +1846,7 @@
     this->flushScissor(clip.scissorState(), glRT->width(), glRT->height(), origin);
     this->flushWindowRectangles(clip.windowRectsState(), glRT, origin);
     this->flushColorWrite(true);
-
-    GrGLfloat r = color.fR, g = color.fG, b = color.fB, a = color.fA;
-    if (this->glCaps().clearToBoundaryValuesIsBroken() &&
-        (1 == r || 0 == r) && (1 == g || 0 == g) && (1 == b || 0 == b) && (1 == a || 0 == a)) {
-        static const GrGLfloat safeAlpha1 = nextafter(1.f, 2.f);
-        static const GrGLfloat safeAlpha0 = nextafter(0.f, -1.f);
-        a = (1 == a) ? safeAlpha1 : safeAlpha0;
-    }
-    this->flushClearColor(r, g, b, a);
-
+    this->flushClearColor(color);
     GL_CALL(Clear(GR_GL_COLOR_BUFFER_BIT));
 }
 
@@ -1866,10 +1857,8 @@
         return;
     }
 
-    GrStencilAttachment* sb = target->renderTargetPriv().getStencilAttachment();
-    // this should only be called internally when we know we have a
-    // stencil buffer.
-    SkASSERT(sb);
+    // This should only be called internally when we know we have a stencil buffer.
+    SkASSERT(target->renderTargetPriv().getStencilAttachment());
 
     GrGLRenderTarget* glRT = static_cast<GrGLRenderTarget*>(target);
     this->flushRenderTargetNoColorWrites(glRT);
@@ -1881,9 +1870,78 @@
     GL_CALL(ClearStencil(clearValue));
     GL_CALL(Clear(GR_GL_STENCIL_BUFFER_BIT));
     fHWStencilSettings.invalidate();
-    if (!clearValue) {
-        sb->cleared();
+}
+
+void GrGLGpu::beginCommandBuffer(GrRenderTarget* rt,
+                                 const GrOpsRenderPass::LoadAndStoreInfo& colorLoadStore,
+                                 const GrOpsRenderPass::StencilLoadAndStoreInfo& stencilLoadStore) {
+    SkASSERT(!fIsExecutingCommandBuffer_DebugOnly);
+
+    this->handleDirtyContext();
+
+    auto glRT = static_cast<GrGLRenderTarget*>(rt);
+    this->flushRenderTarget(glRT);
+    SkDEBUGCODE(fIsExecutingCommandBuffer_DebugOnly = true);
+
+    GrGLbitfield clearMask = 0;
+    if (GrLoadOp::kClear == colorLoadStore.fLoadOp) {
+        SkASSERT(!this->caps()->performColorClearsAsDraws());
+        this->flushClearColor(colorLoadStore.fClearColor);
+        this->flushColorWrite(true);
+        clearMask |= GR_GL_COLOR_BUFFER_BIT;
     }
+    if (GrLoadOp::kClear == stencilLoadStore.fLoadOp) {
+        SkASSERT(!this->caps()->performStencilClearsAsDraws());
+        GL_CALL(StencilMask(0xffffffff));
+        GL_CALL(ClearStencil(0));
+        clearMask |= GR_GL_STENCIL_BUFFER_BIT;
+    }
+    if (clearMask) {
+        this->disableScissor();
+        this->disableWindowRectangles();
+        GL_CALL(Clear(clearMask));
+    }
+}
+
+void GrGLGpu::endCommandBuffer(GrRenderTarget* rt,
+                               const GrOpsRenderPass::LoadAndStoreInfo& colorLoadStore,
+                               const GrOpsRenderPass::StencilLoadAndStoreInfo& stencilLoadStore) {
+    SkASSERT(fIsExecutingCommandBuffer_DebugOnly);
+
+    this->handleDirtyContext();
+
+    if (rt->uniqueID() != fHWBoundRenderTargetUniqueID) {
+        // The framebuffer binding changed in the middle of a command buffer. We should have already
+        // printed a warning during onFBOChanged.
+        return;
+    }
+
+    if (GrGLCaps::kNone_InvalidateFBType != this->glCaps().invalidateFBType()) {
+        auto glRT = static_cast<GrGLRenderTarget*>(rt);
+
+        SkSTArray<2, GrGLenum> discardAttachments;
+        if (GrStoreOp::kDiscard == colorLoadStore.fStoreOp) {
+            discardAttachments.push_back(
+                    (0 == glRT->renderFBOID()) ? GR_GL_COLOR : GR_GL_COLOR_ATTACHMENT0);
+        }
+        if (GrStoreOp::kDiscard == stencilLoadStore.fStoreOp) {
+            discardAttachments.push_back(
+                    (0 == glRT->renderFBOID()) ? GR_GL_STENCIL : GR_GL_STENCIL_ATTACHMENT);
+        }
+
+        if (!discardAttachments.empty()) {
+            if (GrGLCaps::kInvalidate_InvalidateFBType == this->glCaps().invalidateFBType()) {
+                GL_CALL(InvalidateFramebuffer(GR_GL_FRAMEBUFFER, discardAttachments.count(),
+                                              discardAttachments.begin()));
+            } else {
+                SkASSERT(GrGLCaps::kDiscard_InvalidateFBType == this->glCaps().invalidateFBType());
+                GL_CALL(DiscardFramebuffer(GR_GL_FRAMEBUFFER, discardAttachments.count(),
+                                           discardAttachments.begin()));
+            }
+        }
+    }
+
+    SkDEBUGCODE(fIsExecutingCommandBuffer_DebugOnly = false);
 }
 
 void GrGLGpu::clearStencilClip(const GrFixedClip& clip,
@@ -2689,7 +2747,14 @@
     }
 }
 
-void GrGLGpu::flushClearColor(GrGLfloat r, GrGLfloat g, GrGLfloat b, GrGLfloat a) {
+void GrGLGpu::flushClearColor(const SkPMColor4f& color) {
+    GrGLfloat r = color.fR, g = color.fG, b = color.fB, a = color.fA;
+    if (this->glCaps().clearToBoundaryValuesIsBroken() &&
+        (1 == r || 0 == r) && (1 == g || 0 == g) && (1 == b || 0 == b) && (1 == a || 0 == a)) {
+        static const GrGLfloat safeAlpha1 = nextafter(1.f, 2.f);
+        static const GrGLfloat safeAlpha0 = nextafter(0.f, -1.f);
+        a = (1 == a) ? safeAlpha1 : safeAlpha0;
+    }
     if (r != fHWClearColor[0] || g != fHWClearColor[1] ||
         b != fHWClearColor[2] || a != fHWClearColor[3]) {
         GL_CALL(ClearColor(r, g, b, a));
@@ -2847,6 +2912,12 @@
         this->caps()->workarounds().restore_scissor_on_fbo_change) {
         GL_CALL(Flush());
     }
+#ifdef SK_DEBUG
+    if (fIsExecutingCommandBuffer_DebugOnly) {
+        SkDebugf("WARNING: GL FBO binding changed while executing a command buffer. "
+                 "This will severely hurt performance.\n");
+    }
+#endif
 }
 
 void GrGLGpu::bindFramebuffer(GrGLenum target, GrGLuint fboid) {
diff --git a/src/gpu/gl/GrGLGpu.h b/src/gpu/gl/GrGLGpu.h
index 7bb25fe..7c77892 100644
--- a/src/gpu/gl/GrGLGpu.h
+++ b/src/gpu/gl/GrGLGpu.h
@@ -117,6 +117,13 @@
     // stencil buffer as not dirty?
     void clearStencil(GrRenderTarget*, int clearValue);
 
+    void beginCommandBuffer(GrRenderTarget*,
+                            const GrOpsRenderPass::LoadAndStoreInfo& colorLoadStore,
+                            const GrOpsRenderPass::StencilLoadAndStoreInfo& stencilLoadStore);
+
+    void endCommandBuffer(GrRenderTarget*, const GrOpsRenderPass::LoadAndStoreInfo& colorLoadStore,
+                          const GrOpsRenderPass::StencilLoadAndStoreInfo& stencilLoadStore);
+
     GrOpsRenderPass* getOpsRenderPass(
             GrRenderTarget*, GrSurfaceOrigin, const SkIRect&,
             const GrOpsRenderPass::LoadAndStoreInfo&,
@@ -335,7 +342,7 @@
     };
 
     void flushColorWrite(bool writeColor);
-    void flushClearColor(GrGLfloat r, GrGLfloat g, GrGLfloat b, GrGLfloat a);
+    void flushClearColor(const SkPMColor4f&);
 
     // flushes the scissor. see the note on flushBoundTextureAndParams about
     // flushing the scissor after that function is called.
@@ -658,6 +665,9 @@
         GrGLsync fSync;
     };
     std::list<FinishCallback> fFinishCallbacks;
+
+    SkDEBUGCODE(bool fIsExecutingCommandBuffer_DebugOnly = false);
+
     friend class GrGLPathRendering; // For accessing setTextureUnit.
 
     typedef GrGpu INHERITED;
diff --git a/src/gpu/gl/GrGLOpsRenderPass.cpp b/src/gpu/gl/GrGLOpsRenderPass.cpp
index dd24c68..717d71d 100644
--- a/src/gpu/gl/GrGLOpsRenderPass.cpp
+++ b/src/gpu/gl/GrGLOpsRenderPass.cpp
@@ -11,22 +11,9 @@
 #include "src/gpu/GrFixedClip.h"
 #include "src/gpu/GrRenderTargetPriv.h"
 
-void GrGLOpsRenderPass::begin() {
-    if (GrLoadOp::kClear == fColorLoadAndStoreInfo.fLoadOp) {
-        fGpu->clear(GrFixedClip::Disabled(), fColorLoadAndStoreInfo.fClearColor,
-                    fRenderTarget, fOrigin);
-    }
-    if (GrLoadOp::kClear == fStencilLoadAndStoreInfo.fLoadOp) {
-        GrStencilAttachment* sb = fRenderTarget->renderTargetPriv().getStencilAttachment();
-        if (sb && (sb->isDirty() || fRenderTarget->alwaysClearStencil())) {
-            fGpu->clearStencil(fRenderTarget, 0x0);
-        }
-    }
-}
-
 void GrGLOpsRenderPass::set(GrRenderTarget* rt, GrSurfaceOrigin origin,
-                            const GrOpsRenderPass::LoadAndStoreInfo& colorInfo,
-                            const GrOpsRenderPass::StencilLoadAndStoreInfo& stencilInfo) {
+                            const LoadAndStoreInfo& colorInfo,
+                            const StencilLoadAndStoreInfo& stencilInfo) {
     SkASSERT(fGpu);
     SkASSERT(!fRenderTarget);
     SkASSERT(fGpu == rt->getContext()->priv().getGpu());
diff --git a/src/gpu/gl/GrGLOpsRenderPass.h b/src/gpu/gl/GrGLOpsRenderPass.h
index 998ef24..7a29d84 100644
--- a/src/gpu/gl/GrGLOpsRenderPass.h
+++ b/src/gpu/gl/GrGLOpsRenderPass.h
@@ -26,8 +26,13 @@
 public:
     GrGLOpsRenderPass(GrGLGpu* gpu) : fGpu(gpu) {}
 
-    void begin() override;
-    void end() override {}
+    void begin() override {
+        fGpu->beginCommandBuffer(fRenderTarget, fColorLoadAndStoreInfo, fStencilLoadAndStoreInfo);
+    }
+
+    void end() override {
+        fGpu->endCommandBuffer(fRenderTarget, fColorLoadAndStoreInfo, fStencilLoadAndStoreInfo);
+    }
 
     void insertEventMarker(const char* msg) override {
         fGpu->insertEventMarker(msg);
@@ -37,9 +42,8 @@
         state->doUpload(upload);
     }
 
-    void set(GrRenderTarget*, GrSurfaceOrigin,
-             const GrOpsRenderPass::LoadAndStoreInfo&,
-             const GrOpsRenderPass::StencilLoadAndStoreInfo&);
+    void set(GrRenderTarget*, GrSurfaceOrigin, const LoadAndStoreInfo&,
+             const StencilLoadAndStoreInfo&);
 
     void reset() {
         fRenderTarget = nullptr;