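Clear and discard stencil buffers on tilers

Adds GrCaps::discardStencilAfterCommandBuffer(), which is enabled in the
GL and Vulkan caps wherever fPreferFullscreenClears is already set.

GrRenderTargetContext now initializes the stencil buffer the first time
an op uses stencil (clear-on-load, or a draw when stencil clears must be
performed as draws) and only requests a stencil store when the new cap
is false. When an opList is split after stencil has been initialized,
the previous opList stores stencil and the new one loads it.

GrRenderTargetOpList tracks a stencil store op in addition to the load
op and passes both to the command buffer. A clear-on-load is downgraded
to a load when the attachment's user bits are already clean, unless the
caps prefer fullscreen clears.

GrGLGpu gains beginCommandBuffer()/endCommandBuffer(). The former issues
the color and stencil load clears in a single glClear; the latter
invalidates/discards any attachment whose store op is not kStore when
discardRenderTargetSupport() is available.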

Bug: skia:
Change-Id: I42e99cb75567825ac7751fc0ec56e4c45132628b
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/210425
Reviewed-by: Greg Daniel <egdaniel@google.com>
Commit-Queue: Chris Dalton <csmartdalton@google.com>
diff --git a/src/gpu/GrCaps.cpp b/src/gpu/GrCaps.cpp
index 7543820..26b3c59 100644
--- a/src/gpu/GrCaps.cpp
+++ b/src/gpu/GrCaps.cpp
@@ -32,6 +32,7 @@
     fUsePrimitiveRestart = false;
     fPreferClientSideDynamicBuffers = false;
     fPreferFullscreenClears = false;
+    fDiscardStencilAfterCommandBuffer = false;
     fMustClearUploadedBufferData = false;
     fShouldInitializeTextures = false;
     fSupportsAHardwareBufferImages = false;
@@ -207,6 +208,8 @@
     writer->appendBool("Use primitive restart", fUsePrimitiveRestart);
     writer->appendBool("Prefer client-side dynamic buffers", fPreferClientSideDynamicBuffers);
     writer->appendBool("Prefer fullscreen clears", fPreferFullscreenClears);
+    writer->appendBool("Discard stencil values after executing a command buffer",
+                       fDiscardStencilAfterCommandBuffer);
     writer->appendBool("Must clear buffer memory", fMustClearUploadedBufferData);
     writer->appendBool("Should initialize textures", fShouldInitializeTextures);
     writer->appendBool("Supports importing AHardwareBuffers", fSupportsAHardwareBufferImages);
diff --git a/src/gpu/GrCaps.h b/src/gpu/GrCaps.h
index 9dd739c..8f31f7c 100644
--- a/src/gpu/GrCaps.h
+++ b/src/gpu/GrCaps.h
@@ -71,6 +71,11 @@
     // initialize each tile with a constant value rather than loading each pixel from memory.
     bool preferFullscreenClears() const { return fPreferFullscreenClears; }
 
+    // On tilers we can save memory bandwidth by clearing the stencil buffer at the beginning of
+    // a command buffer, and discarding it at the end. This encourages the driver to only use fast,
+    // on-chip tile memory and never actually transfer stencil values to and from main memory.
+    bool discardStencilAfterCommandBuffer() const { return fDiscardStencilAfterCommandBuffer; }
+
     bool preferVRAMUseOverFlushes() const { return fPreferVRAMUseOverFlushes; }
 
     bool preferTrianglesOverSampleMask() const { return fPreferTrianglesOverSampleMask; }
@@ -374,6 +379,7 @@
     bool fUsePrimitiveRestart                        : 1;
     bool fPreferClientSideDynamicBuffers             : 1;
     bool fPreferFullscreenClears                     : 1;
+    bool fDiscardStencilAfterCommandBuffer           : 1;
     bool fMustClearUploadedBufferData                : 1;
     bool fShouldInitializeTextures                   : 1;
     bool fSupportsAHardwareBufferImages              : 1;
diff --git a/src/gpu/GrOpList.cpp b/src/gpu/GrOpList.cpp
index 109289b..5cee97d 100644
--- a/src/gpu/GrOpList.cpp
+++ b/src/gpu/GrOpList.cpp
@@ -187,9 +187,6 @@
 }
 
 #ifdef SK_DEBUG
-static const char* op_to_name(GrLoadOp op) {
-    return GrLoadOp::kLoad == op ? "load" : GrLoadOp::kClear == op ? "clear" : "discard";
-}
 
 void GrOpList::dump(bool printDependencies) const {
     SkDebugf("--------------------------------------------------------------\n");
@@ -198,10 +195,9 @@
              fTarget.get() && fTarget.get()->peekSurface()
                      ? fTarget.get()->peekSurface()->uniqueID().asUInt()
                      : -1);
-    SkDebugf("ColorLoadOp: %s %x StencilLoadOp: %s\n",
-             op_to_name(fColorLoadOp),
-             GrLoadOp::kClear == fColorLoadOp ? fLoadClearColor.toBytes_RGBA() : 0x0,
-             op_to_name(fStencilLoadOp));
+    SkDebugf("ColorLoadOp: %s %x\n",
+             GrLoadOpName(fColorLoadOp),
+             GrLoadOp::kClear == fColorLoadOp ? fLoadClearColor.toBytes_RGBA() : 0x0);
 
     if (printDependencies) {
         SkDebugf("I rely On (%d): ", fDependencies.count());
diff --git a/src/gpu/GrRenderTargetContext.cpp b/src/gpu/GrRenderTargetContext.cpp
index 7f372b1..7ecab05 100644
--- a/src/gpu/GrRenderTargetContext.cpp
+++ b/src/gpu/GrRenderTargetContext.cpp
@@ -234,7 +234,21 @@
     SkDEBUGCODE(this->validate();)
 
     if (!fOpList || fOpList->isClosed()) {
-        fOpList = this->drawingManager()->newRTOpList(fRenderTargetProxy, fManagedOpList);
+        sk_sp<GrRenderTargetOpList> newOpList =
+                this->drawingManager()->newRTOpList(fRenderTargetProxy, fManagedOpList);
+        if (fHasInitializedStencil) {
+            SkASSERT(fOpList);
+            // Always load/store stencil between opList splits. (We know this must be a split
+            // because the stencil is already initialized.)
+            // FIXME: In addition to simply reducing the split frequency, we might want to think
+            // about cases where we can safely skip this heavy-handed load/store solution.
+            fOpList->setStencilStoreOp(GrStoreOp::kStore);  // Store stencil after previous opList.
+            newOpList->setStencilLoadOp(GrLoadOp::kLoad);  // Load stencil before next opList.
+            if (!this->caps()->discardStencilAfterCommandBuffer()) {
+                newOpList->setStencilStoreOp(GrStoreOp::kStore);
+            }
+        }
+        fOpList = std::move(newOpList);
     }
 
     return fOpList.get();
@@ -843,9 +857,6 @@
         paint.setColor4f({0.f, 0.f, 0.f, 0.f});
         paint.setPorterDuffXPFactory(SkBlendMode::kSrcOver);
 
-        // Mark stencil usage here before addDrawOp() so that it doesn't try to re-call
-        // internalStencilClear() just because the op has stencil settings.
-        this->setNeedsStencil();
         this->addDrawOp(clip, GrFillRectOp::Make(fContext, std::move(paint),
                         GrAAType::kNone, SkMatrix::I(), rtRect, ss));
     } else {
@@ -883,8 +894,6 @@
         return;
     }
 
-    fRenderTargetContext->setNeedsStencil();
-
     std::unique_ptr<GrOp> op = GrStencilPathOp::Make(fRenderTargetContext->fContext,
                                                      viewMatrix,
                                                      GrAA::kYes == doStencilMSAA,
@@ -2252,20 +2261,26 @@
         return;
     }
 
-    if (fixedFunctionFlags & GrDrawOp::FixedFunctionFlags::kUsesStencil ||
-        appliedClip.hasStencilClip()) {
-        if (this->caps()->performStencilClearsAsDraws()) {
-            // Must use an op to perform the clear of the stencil buffer before this op, but only
-            // have to clear the first time any draw needs it (this also ensures we don't loop
-            // forever when the internal stencil clear adds a draw op that has stencil settings).
-            if (!fRenderTargetProxy->needsStencil()) {
-                // Send false so that the stencil buffer is fully cleared to 0
+    if (fixedFunctionFlags & GrDrawOp::FixedFunctionFlags::kUsesStencil) {
+        if (!fHasInitializedStencil) {
+            GrRenderTargetOpList* opList = this->getRTOpList();
+
+            // Do this first, to ensure we don't recurse forever if the internal stencil clear gets
+            // triggered and adds a draw op that has stencil settings.
+            fHasInitializedStencil = true;
+
+            if (this->caps()->performStencilClearsAsDraws()) {
+                // Send false so that the stencil buffer is fully cleared to 0.
                 this->internalStencilClear(GrFixedClip::Disabled(), /* inside mask */ false);
+            } else {
+                opList->setStencilLoadOp(GrLoadOp::kClear);
             }
-        } else {
-            // Just make sure the stencil buffer is cleared before the draw op, easy to do it as
-            // a load at the start
-            this->getRTOpList()->setStencilLoadOp(GrLoadOp::kClear);
+
+            if (!this->caps()->discardStencilAfterCommandBuffer()) {
+                // Preserve stencil data if we aren't on a tiler. The opList will notice this, track
+                // that the user bits are clean, and potentially skip future clear-on-load ops.
+                opList->setStencilStoreOp(GrStoreOp::kStore);
+            }
         }
 
         this->setNeedsStencil();
diff --git a/src/gpu/GrRenderTargetContext.h b/src/gpu/GrRenderTargetContext.h
index 6f39325..6b058d2 100644
--- a/src/gpu/GrRenderTargetContext.h
+++ b/src/gpu/GrRenderTargetContext.h
@@ -553,6 +553,8 @@
     SkSurfaceProps fSurfaceProps;
     bool fManagedOpList;
 
+    bool fHasInitializedStencil = false;
+
     typedef GrSurfaceContext INHERITED;
 };
 
diff --git a/src/gpu/GrRenderTargetOpList.cpp b/src/gpu/GrRenderTargetOpList.cpp
index 84fc9cb..f5b1753 100644
--- a/src/gpu/GrRenderTargetOpList.cpp
+++ b/src/gpu/GrRenderTargetOpList.cpp
@@ -5,6 +5,7 @@
  * found in the LICENSE file.
  */
 
+#include "include/gpu/GrRenderTarget.h"
 #include "include/private/GrAuditTrail.h"
 #include "include/private/GrRecordingContext.h"
 #include "src/core/SkExchange.h"
@@ -18,7 +19,9 @@
 #include "src/gpu/GrRect.h"
 #include "src/gpu/GrRenderTargetContext.h"
 #include "src/gpu/GrRenderTargetOpList.h"
+#include "src/gpu/GrRenderTargetPriv.h"
 #include "src/gpu/GrResourceAllocator.h"
+#include "src/gpu/GrStencilAttachment.h"
 #include "src/gpu/ops/GrClearOp.h"
 #include "src/gpu/ops/GrCopySurfaceOp.h"
 
@@ -373,6 +376,9 @@
 void GrRenderTargetOpList::dump(bool printDependencies) const {
     INHERITED::dump(printDependencies);
 
+    SkDebugf("fStencilLoadOp: %s\n", GrLoadOpName(fStencilLoadOp));
+    SkDebugf("fStencilStoreOp: %s\n", GrStoreOpName(fStencilStoreOp));
+
     SkDebugf("ops (%d):\n", fOpChains.count());
     for (int i = 0; i < fOpChains.count(); ++i) {
         SkDebugf("*******************************\n");
@@ -428,13 +434,10 @@
     }
 }
 
-static GrGpuRTCommandBuffer* create_command_buffer(GrGpu* gpu,
-                                                   GrRenderTarget* rt,
-                                                   GrSurfaceOrigin origin,
-                                                   const SkRect& bounds,
-                                                   GrLoadOp colorLoadOp,
-                                                   const SkPMColor4f& loadClearColor,
-                                                   GrLoadOp stencilLoadOp) {
+static GrGpuRTCommandBuffer* create_command_buffer(
+        GrGpu* gpu, GrRenderTarget* rt, GrSurfaceOrigin origin, const SkRect& bounds,
+        GrLoadOp colorLoadOp, const SkPMColor4f& loadClearColor, GrLoadOp stencilLoadOp, GrStoreOp
+        stencilStoreOp) {
     const GrGpuRTCommandBuffer::LoadAndStoreInfo kColorLoadStoreInfo {
         colorLoadOp,
         GrStoreOp::kStore,
@@ -448,7 +451,7 @@
     // lower level (inside the VK command buffer).
     const GrGpuRTCommandBuffer::StencilLoadAndStoreInfo stencilLoadAndStoreInfo {
         stencilLoadOp,
-        GrStoreOp::kStore,
+        stencilStoreOp,
     };
 
     return gpu->getCommandBuffer(rt, origin, bounds, kColorLoadStoreInfo, stencilLoadAndStoreInfo);
@@ -472,22 +475,25 @@
     SkASSERT(fTarget.get()->peekRenderTarget());
     TRACE_EVENT0("skia", TRACE_FUNC);
 
-    // TODO: at the very least, we want the stencil store op to always be discard (at this
-    // level). In Vulkan, sub-command buffers would still need to load & store the stencil buffer.
-
     // Make sure load ops are not kClear if the GPU needs to use draws for clears
     SkASSERT(fColorLoadOp != GrLoadOp::kClear ||
              !flushState->gpu()->caps()->performColorClearsAsDraws());
     SkASSERT(fStencilLoadOp != GrLoadOp::kClear ||
              !flushState->gpu()->caps()->performStencilClearsAsDraws());
+
+    GrRenderTarget* renderTarget = fTarget.get()->peekRenderTarget();
+    GrStencilAttachment* stencil = renderTarget->renderTargetPriv().getStencilAttachment();
+    GrLoadOp stencilLoadOp = fStencilLoadOp;
+    if (stencil && GrLoadOp::kClear == stencilLoadOp && !stencil->userBitsAreDirty()) {
+        if (!flushState->caps().preferFullscreenClears()) {
+            stencilLoadOp = GrLoadOp::kLoad;
+        }
+    }
+
     GrGpuRTCommandBuffer* commandBuffer = create_command_buffer(
-                                                    flushState->gpu(),
-                                                    fTarget.get()->peekRenderTarget(),
-                                                    fTarget.get()->origin(),
-                                                    fTarget.get()->getBoundsRect(),
-                                                    fColorLoadOp,
-                                                    fLoadClearColor,
-                                                    fStencilLoadOp);
+            flushState->gpu(), renderTarget, fTarget.get()->origin(),
+            fTarget.get()->getBoundsRect(), fColorLoadOp, fLoadClearColor, stencilLoadOp,
+            fStencilStoreOp);
     flushState->setCommandBuffer(commandBuffer);
     commandBuffer->begin();
 
@@ -516,6 +522,16 @@
     flushState->gpu()->submit(commandBuffer);
     flushState->setCommandBuffer(nullptr);
 
+    if (stencil && GrStoreOp::kStore == fStencilStoreOp) {
+        // The user stencil bits are always initialized and kept at zero for the duration of a
+        // command buffer. So if we store the stencil, we know we're storing clean user bits.
+        stencil->userBitsCleared();
+    }
+    // FIXME: We don't currently have a way to flag command buffers that don't use stencil at all.
+    // In that case, their store op will be discard, and we currently make the assumption that a
+    // store op of "discard" will not invalidate what's already in main memory. This is probably ok
+    // for now, but certainly something we want to address soon.
+
     return true;
 }
 
@@ -535,10 +551,6 @@
     }
 }
 
-void GrRenderTargetOpList::setStencilLoadOp(GrLoadOp op) {
-    fStencilLoadOp = op;
-}
-
 void GrRenderTargetOpList::setColorLoadOp(GrLoadOp op, const SkPMColor4f& color) {
     fColorLoadOp = op;
     fLoadClearColor = color;
diff --git a/src/gpu/GrRenderTargetOpList.h b/src/gpu/GrRenderTargetOpList.h
index e27f694..739cda3 100644
--- a/src/gpu/GrRenderTargetOpList.h
+++ b/src/gpu/GrRenderTargetOpList.h
@@ -127,8 +127,10 @@
 
     bool onIsUsed(GrSurfaceProxy*) const override;
 
-    // Must only be called if native stencil buffer clearing is enabled
-    void setStencilLoadOp(GrLoadOp op);
+    // Load op must not be kClear if caps.performStencilClearsAsDraws() is true.
+    void setStencilLoadOp(GrLoadOp op) { fStencilLoadOp = op; }
+    void setStencilStoreOp(GrStoreOp op) { fStencilStoreOp = op; }
+
     // Must only be called if native color buffer clearing is enabled.
     void setColorLoadOp(GrLoadOp op, const SkPMColor4f& color);
     // Sets the clear color to transparent black
@@ -224,9 +226,12 @@
 
     void forwardCombine(const GrCaps&);
 
-    uint32_t                       fLastClipStackGenID;
-    SkIRect                        fLastDevClipBounds;
-    int                            fLastClipNumAnalyticFPs;
+    uint32_t fLastClipStackGenID;
+    SkIRect fLastDevClipBounds;
+    int fLastClipNumAnalyticFPs;
+
+    GrLoadOp fStencilLoadOp = GrLoadOp::kDiscard;
+    GrStoreOp fStencilStoreOp = GrStoreOp::kDiscard;
 
     // We must track if we have a wait op so that we don't delete the op when we have a full clear.
     bool fHasWaitOp = false;;
@@ -236,8 +241,8 @@
 
     // MDB TODO: 4096 for the first allocation of the clip space will be huge overkill.
     // Gather statistics to determine the correct size.
-    SkArenaAlloc                   fClipAllocator{4096};
-    SkDEBUGCODE(int                fNumClips;)
+    SkArenaAlloc fClipAllocator{4096};
+    SkDEBUGCODE(int fNumClips;)
 
     typedef GrOpList INHERITED;
 };
diff --git a/src/gpu/GrStencilAttachment.h b/src/gpu/GrStencilAttachment.h
index 22595f5..fad6e89 100644
--- a/src/gpu/GrStencilAttachment.h
+++ b/src/gpu/GrStencilAttachment.h
@@ -25,9 +25,9 @@
     int height() const { return fHeight; }
     int bits() const { return fBits; }
     int numSamples() const { return fSampleCnt; }
-    bool isDirty() const { return fIsDirty; }
+    bool userBitsAreDirty() const { return fUserBitsAreDirty; }
 
-    void cleared() { fIsDirty = false; }
+    void userBitsCleared() { fUserBitsAreDirty = false; }
 
     // We create a unique stencil buffer at each width, height and sampleCnt and share it for
     // all render targets that require a stencil with those params.
@@ -41,7 +41,7 @@
             , fHeight(height)
             , fBits(bits)
             , fSampleCnt(sampleCnt)
-            , fIsDirty(true) {
+            , fUserBitsAreDirty(true) {
     }
 
 private:
@@ -51,7 +51,7 @@
     int fHeight;
     int fBits;
     int fSampleCnt;
-    bool fIsDirty;
+    bool fUserBitsAreDirty;
 
     typedef GrGpuResource INHERITED;
 };
diff --git a/src/gpu/gl/GrGLCaps.cpp b/src/gpu/gl/GrGLCaps.cpp
index ce0233c..8bccc42 100644
--- a/src/gpu/gl/GrGLCaps.cpp
+++ b/src/gpu/gl/GrGLCaps.cpp
@@ -170,6 +170,7 @@
         kImagination_GrGLVendor == ctxInfo.vendor() ||
         kQualcomm_GrGLVendor == ctxInfo.vendor() ) {
         fPreferFullscreenClears = true;
+        fDiscardStencilAfterCommandBuffer = true;
     }
 
     if (GR_IS_GR_GL(standard)) {
diff --git a/src/gpu/gl/GrGLGpu.cpp b/src/gpu/gl/GrGLGpu.cpp
index 577ab0a..aead9d2 100644
--- a/src/gpu/gl/GrGLGpu.cpp
+++ b/src/gpu/gl/GrGLGpu.cpp
@@ -1646,7 +1646,7 @@
             this->disableScissor();
             this->disableWindowRectangles();
             this->flushColorWrite(true);
-            this->flushClearColor(0, 0, 0, 0);
+            this->flushClearColor({0, 0, 0, 0});
             GL_CALL(Clear(GR_GL_COLOR_BUFFER_BIT));
             this->unbindTextureFBOForPixelOps(GR_GL_FRAMEBUFFER, tex.get());
             fHWBoundRenderTargetUniqueID.makeInvalid();
@@ -2194,16 +2194,7 @@
     this->flushScissor(clip.scissorState(), glRT->getViewport(), origin);
     this->flushWindowRectangles(clip.windowRectsState(), glRT, origin);
     this->flushColorWrite(true);
-
-    GrGLfloat r = color.fR, g = color.fG, b = color.fB, a = color.fA;
-    if (this->glCaps().clearToBoundaryValuesIsBroken() &&
-        (1 == r || 0 == r) && (1 == g || 0 == g) && (1 == b || 0 == b) && (1 == a || 0 == a)) {
-        static const GrGLfloat safeAlpha1 = nextafter(1.f, 2.f);
-        static const GrGLfloat safeAlpha0 = nextafter(0.f, -1.f);
-        a = (1 == a) ? safeAlpha1 : safeAlpha0;
-    }
-    this->flushClearColor(r, g, b, a);
-
+    this->flushClearColor(color);
     GL_CALL(Clear(GR_GL_COLOR_BUFFER_BIT));
 }
 
@@ -2214,10 +2205,8 @@
         return;
     }
 
-    GrStencilAttachment* sb = target->renderTargetPriv().getStencilAttachment();
-    // this should only be called internally when we know we have a
-    // stencil buffer.
-    SkASSERT(sb);
+    // This should only be called internally when we know we have a stencil buffer.
+    SkASSERT(target->renderTargetPriv().getStencilAttachment());
 
     GrGLRenderTarget* glRT = static_cast<GrGLRenderTarget*>(target);
     this->flushRenderTargetNoColorWrites(glRT);
@@ -2229,9 +2218,79 @@
     GL_CALL(ClearStencil(clearValue));
     GL_CALL(Clear(GR_GL_STENCIL_BUFFER_BIT));
     fHWStencilSettings.invalidate();
-    if (!clearValue) {
-        sb->cleared();
+}
+
+void GrGLGpu::beginCommandBuffer(GrRenderTarget* rt, const ColorLoadAndStoreInfo& colorLoadStore,
+                                 const StencilLoadAndStoreInfo& stencilLoadStore) {
+    SkASSERT(!fIsExecutingCommandBuffer_DebugOnly);
+
+    this->handleDirtyContext();
+
+    auto glRT = static_cast<GrGLRenderTarget*>(rt);
+    this->flushRenderTarget(glRT);
+    SkDEBUGCODE(fIsExecutingCommandBuffer_DebugOnly = true);
+
+    GrGLbitfield clearMask = 0;
+    if (GrLoadOp::kClear == colorLoadStore.fLoadOp) {
+        SkASSERT(!this->caps()->performColorClearsAsDraws());
+        this->flushClearColor(colorLoadStore.fClearColor);
+        this->flushColorWrite(true);
+        clearMask |= GR_GL_COLOR_BUFFER_BIT;
     }
+    if (GrLoadOp::kClear == stencilLoadStore.fLoadOp) {
+        SkASSERT(!this->caps()->performStencilClearsAsDraws());
+        GL_CALL(StencilMask(0xffffffff));
+        GL_CALL(ClearStencil(0));
+        clearMask |= GR_GL_STENCIL_BUFFER_BIT;
+    }
+    if (clearMask) {
+        this->disableScissor();
+        this->disableWindowRectangles();
+        GL_CALL(Clear(clearMask));
+    }
+}
+
+void GrGLGpu::endCommandBuffer(GrRenderTarget* rt, const ColorLoadAndStoreInfo& colorLoadStore,
+                               const StencilLoadAndStoreInfo& stencilLoadStore) {
+    SkASSERT(fIsExecutingCommandBuffer_DebugOnly);
+
+    this->handleDirtyContext();
+
+    if (rt->uniqueID() != fHWBoundRenderTargetUniqueID) {
+#ifdef SK_DEBUG
+        SkDebugf("WARNING: GL framebuffer changed in the middle of a command buffer. This will "
+                 "kill performance.\n");
+        fIsExecutingCommandBuffer_DebugOnly = false;
+#endif
+        return;
+    }
+
+    if (this->caps()->discardRenderTargetSupport()) {
+        auto glRT = static_cast<GrGLRenderTarget*>(rt);
+
+        SkSTArray<2, GrGLenum> discardAttachments;
+        if (GrStoreOp::kStore != colorLoadStore.fStoreOp) {
+            discardAttachments.push_back(
+                    (0 == glRT->renderFBOID()) ? GR_GL_COLOR : GR_GL_COLOR_ATTACHMENT0);
+        }
+        if (GrStoreOp::kStore != stencilLoadStore.fStoreOp) {
+            discardAttachments.push_back(
+                    (0 == glRT->renderFBOID()) ? GR_GL_STENCIL : GR_GL_STENCIL_ATTACHMENT);
+        }
+
+        if (!discardAttachments.empty()) {
+            if (GrGLCaps::kInvalidate_InvalidateFBType == this->glCaps().invalidateFBType()) {
+                GL_CALL(InvalidateFramebuffer(GR_GL_FRAMEBUFFER, discardAttachments.count(),
+                                              discardAttachments.begin()));
+            } else {
+                SkASSERT(GrGLCaps::kDiscard_InvalidateFBType == this->glCaps().invalidateFBType());
+                GL_CALL(DiscardFramebuffer(GR_GL_FRAMEBUFFER, discardAttachments.count(),
+                                           discardAttachments.begin()));
+            }
+        }
+    }
+
+    SkDEBUGCODE(fIsExecutingCommandBuffer_DebugOnly = false);
 }
 
 void GrGLGpu::clearStencilClip(const GrFixedClip& clip,
@@ -3136,7 +3195,14 @@
     }
 }
 
-void GrGLGpu::flushClearColor(GrGLfloat r, GrGLfloat g, GrGLfloat b, GrGLfloat a) {
+void GrGLGpu::flushClearColor(const SkPMColor4f& color) {
+    GrGLfloat r = color.fR, g = color.fG, b = color.fB, a = color.fA;
+    if (this->glCaps().clearToBoundaryValuesIsBroken() &&
+        (1 == r || 0 == r) && (1 == g || 0 == g) && (1 == b || 0 == b) && (1 == a || 0 == a)) {
+        static const GrGLfloat safeAlpha1 = nextafter(1.f, 2.f);
+        static const GrGLfloat safeAlpha0 = nextafter(0.f, -1.f);
+        a = (1 == a) ? safeAlpha1 : safeAlpha0;
+    }
     if (r != fHWClearColor[0] || g != fHWClearColor[1] ||
         b != fHWClearColor[2] || a != fHWClearColor[3]) {
         GL_CALL(ClearColor(r, g, b, a));
diff --git a/src/gpu/gl/GrGLGpu.h b/src/gpu/gl/GrGLGpu.h
index e8d5be1..85c6773 100644
--- a/src/gpu/gl/GrGLGpu.h
+++ b/src/gpu/gl/GrGLGpu.h
@@ -122,6 +122,15 @@
     // stencil buffer as not dirty?
     void clearStencil(GrRenderTarget*, int clearValue);
 
+    using ColorLoadAndStoreInfo = GrGpuRTCommandBuffer::LoadAndStoreInfo;
+    using StencilLoadAndStoreInfo = GrGpuRTCommandBuffer::StencilLoadAndStoreInfo;
+
+    void beginCommandBuffer(
+            GrRenderTarget*, const ColorLoadAndStoreInfo&, const StencilLoadAndStoreInfo&);
+
+    void endCommandBuffer(
+            GrRenderTarget*, const ColorLoadAndStoreInfo&, const StencilLoadAndStoreInfo&);
+
     GrGpuRTCommandBuffer* getCommandBuffer(
             GrRenderTarget*, GrSurfaceOrigin, const SkRect&,
             const GrGpuRTCommandBuffer::LoadAndStoreInfo&,
@@ -352,7 +361,7 @@
     };
 
     void flushColorWrite(bool writeColor);
-    void flushClearColor(GrGLfloat r, GrGLfloat g, GrGLfloat b, GrGLfloat a);
+    void flushClearColor(const SkPMColor4f&);
 
     // flushes the scissor. see the note on flushBoundTextureAndParams about
     // flushing the scissor after that function is called.
@@ -676,6 +685,9 @@
         GrGLsync fSync;
     };
     std::list<FinishCallback> fFinishCallbacks;
+
+    SkDEBUGCODE(bool fIsExecutingCommandBuffer_DebugOnly = false);
+
     friend class GrGLPathRendering; // For accessing setTextureUnit.
 
     typedef GrGpu INHERITED;
diff --git a/src/gpu/gl/GrGLGpuCommandBuffer.cpp b/src/gpu/gl/GrGLGpuCommandBuffer.cpp
index 1412aa3..655b915 100644
--- a/src/gpu/gl/GrGLGpuCommandBuffer.cpp
+++ b/src/gpu/gl/GrGLGpuCommandBuffer.cpp
@@ -11,22 +11,9 @@
 #include "src/gpu/GrFixedClip.h"
 #include "src/gpu/GrRenderTargetPriv.h"
 
-void GrGLGpuRTCommandBuffer::begin() {
-    if (GrLoadOp::kClear == fColorLoadAndStoreInfo.fLoadOp) {
-        fGpu->clear(GrFixedClip::Disabled(), fColorLoadAndStoreInfo.fClearColor,
-                    fRenderTarget, fOrigin);
-    }
-    if (GrLoadOp::kClear == fStencilLoadAndStoreInfo.fLoadOp) {
-        GrStencilAttachment* sb = fRenderTarget->renderTargetPriv().getStencilAttachment();
-        if (sb && (sb->isDirty() || fRenderTarget->alwaysClearStencil())) {
-            fGpu->clearStencil(fRenderTarget, 0x0);
-        }
-    }
-}
-
-void GrGLGpuRTCommandBuffer::set(GrRenderTarget* rt, GrSurfaceOrigin origin,
-                                 const GrGpuRTCommandBuffer::LoadAndStoreInfo& colorInfo,
-                                 const GrGpuRTCommandBuffer::StencilLoadAndStoreInfo& stencilInfo) {
+void GrGLGpuRTCommandBuffer::set(
+        GrRenderTarget* rt, GrSurfaceOrigin origin, const LoadAndStoreInfo& colorInfo,
+        const StencilLoadAndStoreInfo& stencilInfo) {
     SkASSERT(fGpu);
     SkASSERT(!fRenderTarget);
     SkASSERT(fGpu == rt->getContext()->priv().getGpu());
diff --git a/src/gpu/gl/GrGLGpuCommandBuffer.h b/src/gpu/gl/GrGLGpuCommandBuffer.h
index 0f9370f..9831b63 100644
--- a/src/gpu/gl/GrGLGpuCommandBuffer.h
+++ b/src/gpu/gl/GrGLGpuCommandBuffer.h
@@ -55,8 +55,13 @@
 public:
     GrGLGpuRTCommandBuffer(GrGLGpu* gpu) : fGpu(gpu) {}
 
-    void begin() override;
-    void end() override {}
+    void begin() override {
+        fGpu->beginCommandBuffer(fRenderTarget, fColorLoadAndStoreInfo, fStencilLoadAndStoreInfo);
+    }
+
+    void end() override {
+        fGpu->endCommandBuffer(fRenderTarget, fColorLoadAndStoreInfo, fStencilLoadAndStoreInfo);
+    }
 
     void discard() override { }
 
@@ -79,9 +84,8 @@
                                  srcRect.height(), bufferColorType, transferBuffer, offset);
     }
 
-    void set(GrRenderTarget*, GrSurfaceOrigin,
-             const GrGpuRTCommandBuffer::LoadAndStoreInfo&,
-             const GrGpuRTCommandBuffer::StencilLoadAndStoreInfo&);
+    void set(GrRenderTarget*, GrSurfaceOrigin, const LoadAndStoreInfo&, const
+             StencilLoadAndStoreInfo&);
 
     void reset() {
         fRenderTarget = nullptr;
diff --git a/src/gpu/vk/GrVkCaps.cpp b/src/gpu/vk/GrVkCaps.cpp
index 7a50e1e..391aaa5 100644
--- a/src/gpu/vk/GrVkCaps.cpp
+++ b/src/gpu/vk/GrVkCaps.cpp
@@ -417,6 +417,7 @@
         // On NVIDIA and Intel, the discard load followed by clear is faster.
         // TODO: Evaluate on ARM, Imagination, and ATI.
         fPreferFullscreenClears = true;
+        fDiscardStencilAfterCommandBuffer = true;
     }
 
     if (kQualcomm_VkVendor == properties.vendorID || kARM_VkVendor == properties.vendorID) {