Revert "Clear and discard stencil buffers on tilers"

This reverts commit 7b2c85577d79bf1bda6b57d9af8ad793e2fc4862.

Reason for revert: nanobench hangs on chromebook bots during mipmap regeneration

Original change's description:
> Clear and discard stencil buffers on tilers
>
> Bug: skia:
> Change-Id: I42e99cb75567825ac7751fc0ec56e4c45132628b
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/210425
> Reviewed-by: Greg Daniel <egdaniel@google.com>
> Commit-Queue: Chris Dalton <csmartdalton@google.com>

TBR=egdaniel@google.com,robertphillips@google.com,csmartdalton@google.com

Change-Id: I69e04c4797b6bde934d86c83bf0348c8abadcfc3
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: skia:
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/213824
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
diff --git a/src/gpu/GrCaps.cpp b/src/gpu/GrCaps.cpp
index 26b3c59..7543820 100644
--- a/src/gpu/GrCaps.cpp
+++ b/src/gpu/GrCaps.cpp
@@ -32,7 +32,6 @@
     fUsePrimitiveRestart = false;
     fPreferClientSideDynamicBuffers = false;
     fPreferFullscreenClears = false;
-    fDiscardStencilAfterCommandBuffer = false;
     fMustClearUploadedBufferData = false;
     fShouldInitializeTextures = false;
     fSupportsAHardwareBufferImages = false;
@@ -208,8 +207,6 @@
     writer->appendBool("Use primitive restart", fUsePrimitiveRestart);
     writer->appendBool("Prefer client-side dynamic buffers", fPreferClientSideDynamicBuffers);
     writer->appendBool("Prefer fullscreen clears", fPreferFullscreenClears);
-    writer->appendBool("Discard stencil values after executing a command buffer",
-                       fDiscardStencilAfterCommandBuffer);
     writer->appendBool("Must clear buffer memory", fMustClearUploadedBufferData);
     writer->appendBool("Should initialize textures", fShouldInitializeTextures);
     writer->appendBool("Supports importing AHardwareBuffers", fSupportsAHardwareBufferImages);
diff --git a/src/gpu/GrCaps.h b/src/gpu/GrCaps.h
index 8f31f7c..9dd739c 100644
--- a/src/gpu/GrCaps.h
+++ b/src/gpu/GrCaps.h
@@ -71,11 +71,6 @@
     // initialize each tile with a constant value rather than loading each pixel from memory.
     bool preferFullscreenClears() const { return fPreferFullscreenClears; }
 
-    // On tilers we can save memory bandwidth by clearing the stencil buffer at the beginning of
-    // a command buffer, and discarding it at the end. This encourages the driver to only use fast,
-    // on-chip tile memory and never actually transfer stencil values to and from main memory.
-    bool discardStencilAfterCommandBuffer() const { return fDiscardStencilAfterCommandBuffer; }
-
     bool preferVRAMUseOverFlushes() const { return fPreferVRAMUseOverFlushes; }
 
     bool preferTrianglesOverSampleMask() const { return fPreferTrianglesOverSampleMask; }
@@ -379,7 +374,6 @@
     bool fUsePrimitiveRestart                        : 1;
     bool fPreferClientSideDynamicBuffers             : 1;
     bool fPreferFullscreenClears                     : 1;
-    bool fDiscardStencilAfterCommandBuffer           : 1;
     bool fMustClearUploadedBufferData                : 1;
     bool fShouldInitializeTextures                   : 1;
     bool fSupportsAHardwareBufferImages              : 1;
diff --git a/src/gpu/GrOpList.cpp b/src/gpu/GrOpList.cpp
index 5cee97d..109289b 100644
--- a/src/gpu/GrOpList.cpp
+++ b/src/gpu/GrOpList.cpp
@@ -187,6 +187,9 @@
 }
 
 #ifdef SK_DEBUG
+static const char* op_to_name(GrLoadOp op) {
+    return GrLoadOp::kLoad == op ? "load" : GrLoadOp::kClear == op ? "clear" : "discard";
+}
 
 void GrOpList::dump(bool printDependencies) const {
     SkDebugf("--------------------------------------------------------------\n");
@@ -195,9 +198,10 @@
              fTarget.get() && fTarget.get()->peekSurface()
                      ? fTarget.get()->peekSurface()->uniqueID().asUInt()
                      : -1);
-    SkDebugf("ColorLoadOp: %s %x\n",
-             GrLoadOpName(fColorLoadOp),
-             GrLoadOp::kClear == fColorLoadOp ? fLoadClearColor.toBytes_RGBA() : 0x0);
+    SkDebugf("ColorLoadOp: %s %x StencilLoadOp: %s\n",
+             op_to_name(fColorLoadOp),
+             GrLoadOp::kClear == fColorLoadOp ? fLoadClearColor.toBytes_RGBA() : 0x0,
+             op_to_name(fStencilLoadOp));
 
     if (printDependencies) {
         SkDebugf("I rely On (%d): ", fDependencies.count());
diff --git a/src/gpu/GrRenderTargetContext.cpp b/src/gpu/GrRenderTargetContext.cpp
index b2a08de..ce8945b 100644
--- a/src/gpu/GrRenderTargetContext.cpp
+++ b/src/gpu/GrRenderTargetContext.cpp
@@ -234,21 +234,7 @@
     SkDEBUGCODE(this->validate();)
 
     if (!fOpList || fOpList->isClosed()) {
-        sk_sp<GrRenderTargetOpList> newOpList =
-                this->drawingManager()->newRTOpList(fRenderTargetProxy, fManagedOpList);
-        if (fHasInitializedStencil) {
-            SkASSERT(fOpList);
-            // Always load/store stencil between opList splits. (We know this must be a split
-            // because the stencil is already initialized.)
-            // FIXME: In addition to simply reducing the split frequency, we might want to think
-            // about cases where we can safely skip this heavy-handed load/store solution.
-            fOpList->setStencilStoreOp(GrStoreOp::kStore);  // Store stencil after previous opList.
-            newOpList->setStencilLoadOp(GrLoadOp::kLoad);  // Load stencil before next opList.
-            if (!this->caps()->discardStencilAfterCommandBuffer()) {
-                newOpList->setStencilStoreOp(GrStoreOp::kStore);
-            }
-        }
-        fOpList = std::move(newOpList);
+        fOpList = this->drawingManager()->newRTOpList(fRenderTargetProxy, fManagedOpList);
     }
 
     return fOpList.get();
@@ -857,6 +843,9 @@
         paint.setColor4f({0.f, 0.f, 0.f, 0.f});
         paint.setPorterDuffXPFactory(SkBlendMode::kSrcOver);
 
+        // Mark stencil usage here before addDrawOp() so that it doesn't try to re-call
+        // internalStencilClear() just because the op has stencil settings.
+        this->setNeedsStencil();
         this->addDrawOp(clip, GrFillRectOp::Make(fContext, std::move(paint),
                         GrAAType::kNone, SkMatrix::I(), rtRect, ss));
     } else {
@@ -894,6 +883,8 @@
         return;
     }
 
+    fRenderTargetContext->setNeedsStencil();
+
     std::unique_ptr<GrOp> op = GrStencilPathOp::Make(fRenderTargetContext->fContext,
                                                      viewMatrix,
                                                      GrAA::kYes == doStencilMSAA,
@@ -2261,26 +2252,20 @@
         return;
     }
 
-    if (fixedFunctionFlags & GrDrawOp::FixedFunctionFlags::kUsesStencil) {
-        if (!fHasInitializedStencil) {
-            GrRenderTargetOpList* opList = this->getRTOpList();
-
-            // Do this first, to ensure we don't recurse forever if the internal stencil clear gets
-            // triggered and adds a draw op that has stencil settings.
-            fHasInitializedStencil = true;
-
-            if (this->caps()->performStencilClearsAsDraws()) {
-                // Send false so that the stencil buffer is fully cleared to 0.
+    if (fixedFunctionFlags & GrDrawOp::FixedFunctionFlags::kUsesStencil ||
+        appliedClip.hasStencilClip()) {
+        if (this->caps()->performStencilClearsAsDraws()) {
+            // Must use an op to perform the clear of the stencil buffer before this op, but only
+            // have to clear the first time any draw needs it (this also ensures we don't loop
+            // forever when the internal stencil clear adds a draw op that has stencil settings).
+            if (!fRenderTargetProxy->needsStencil()) {
+                // Send false so that the stencil buffer is fully cleared to 0.
                 this->internalStencilClear(GrFixedClip::Disabled(), /* inside mask */ false);
-            } else {
-                opList->setStencilLoadOp(GrLoadOp::kClear);
             }
-
-            if (!this->caps()->discardStencilAfterCommandBuffer()) {
-                // Preserve stencil data if we aren't on a tiler. The opList will notice this, track
-                // that the user bits are clean, and potentially skip future clear-on-load ops.
-                opList->setStencilStoreOp(GrStoreOp::kStore);
-            }
+        } else {
+            // Just make sure the stencil buffer is cleared before the draw op. This is easy to do
+            // as a load at the start.
+            this->getRTOpList()->setStencilLoadOp(GrLoadOp::kClear);
         }
 
         this->setNeedsStencil();
diff --git a/src/gpu/GrRenderTargetContext.h b/src/gpu/GrRenderTargetContext.h
index 6b058d2..6f39325 100644
--- a/src/gpu/GrRenderTargetContext.h
+++ b/src/gpu/GrRenderTargetContext.h
@@ -553,8 +553,6 @@
     SkSurfaceProps fSurfaceProps;
     bool fManagedOpList;
 
-    bool fHasInitializedStencil = false;
-
     typedef GrSurfaceContext INHERITED;
 };
 
diff --git a/src/gpu/GrRenderTargetOpList.cpp b/src/gpu/GrRenderTargetOpList.cpp
index f5b1753..84fc9cb 100644
--- a/src/gpu/GrRenderTargetOpList.cpp
+++ b/src/gpu/GrRenderTargetOpList.cpp
@@ -5,7 +5,6 @@
  * found in the LICENSE file.
  */
 
-#include "include/gpu/GrRenderTarget.h"
 #include "include/private/GrAuditTrail.h"
 #include "include/private/GrRecordingContext.h"
 #include "src/core/SkExchange.h"
@@ -19,9 +18,7 @@
 #include "src/gpu/GrRect.h"
 #include "src/gpu/GrRenderTargetContext.h"
 #include "src/gpu/GrRenderTargetOpList.h"
-#include "src/gpu/GrRenderTargetPriv.h"
 #include "src/gpu/GrResourceAllocator.h"
-#include "src/gpu/GrStencilAttachment.h"
 #include "src/gpu/ops/GrClearOp.h"
 #include "src/gpu/ops/GrCopySurfaceOp.h"
 
@@ -376,9 +373,6 @@
 void GrRenderTargetOpList::dump(bool printDependencies) const {
     INHERITED::dump(printDependencies);
 
-    SkDebugf("fStencilLoadOp: %s\n", GrLoadOpName(fStencilLoadOp));
-    SkDebugf("fStencilStoreOp: %s\n", GrStoreOpName(fStencilStoreOp));
-
     SkDebugf("ops (%d):\n", fOpChains.count());
     for (int i = 0; i < fOpChains.count(); ++i) {
         SkDebugf("*******************************\n");
@@ -434,10 +428,13 @@
     }
 }
 
-static GrGpuRTCommandBuffer* create_command_buffer(
-        GrGpu* gpu, GrRenderTarget* rt, GrSurfaceOrigin origin, const SkRect& bounds,
-        GrLoadOp colorLoadOp, const SkPMColor4f& loadClearColor, GrLoadOp stencilLoadOp, GrStoreOp
-        stencilStoreOp) {
+static GrGpuRTCommandBuffer* create_command_buffer(GrGpu* gpu,
+                                                   GrRenderTarget* rt,
+                                                   GrSurfaceOrigin origin,
+                                                   const SkRect& bounds,
+                                                   GrLoadOp colorLoadOp,
+                                                   const SkPMColor4f& loadClearColor,
+                                                   GrLoadOp stencilLoadOp) {
     const GrGpuRTCommandBuffer::LoadAndStoreInfo kColorLoadStoreInfo {
         colorLoadOp,
         GrStoreOp::kStore,
@@ -451,7 +448,7 @@
     // lower level (inside the VK command buffer).
     const GrGpuRTCommandBuffer::StencilLoadAndStoreInfo stencilLoadAndStoreInfo {
         stencilLoadOp,
-        stencilStoreOp,
+        GrStoreOp::kStore,
     };
 
     return gpu->getCommandBuffer(rt, origin, bounds, kColorLoadStoreInfo, stencilLoadAndStoreInfo);
@@ -475,25 +472,22 @@
     SkASSERT(fTarget.get()->peekRenderTarget());
     TRACE_EVENT0("skia", TRACE_FUNC);
 
+    // TODO: at the very least, we want the stencil store op to always be discard (at this
+    // level). In Vulkan, sub-command buffers would still need to load & store the stencil buffer.
+
     // Make sure load ops are not kClear if the GPU needs to use draws for clears
     SkASSERT(fColorLoadOp != GrLoadOp::kClear ||
              !flushState->gpu()->caps()->performColorClearsAsDraws());
     SkASSERT(fStencilLoadOp != GrLoadOp::kClear ||
              !flushState->gpu()->caps()->performStencilClearsAsDraws());
-
-    GrRenderTarget* renderTarget = fTarget.get()->peekRenderTarget();
-    GrStencilAttachment* stencil = renderTarget->renderTargetPriv().getStencilAttachment();
-    GrLoadOp stencilLoadOp = fStencilLoadOp;
-    if (stencil && GrLoadOp::kClear == stencilLoadOp && !stencil->userBitsAreDirty()) {
-        if (!flushState->caps().preferFullscreenClears()) {
-            stencilLoadOp = GrLoadOp::kLoad;
-        }
-    }
-
     GrGpuRTCommandBuffer* commandBuffer = create_command_buffer(
-            flushState->gpu(), renderTarget, fTarget.get()->origin(),
-            fTarget.get()->getBoundsRect(), fColorLoadOp, fLoadClearColor, stencilLoadOp,
-            fStencilStoreOp);
+                                                    flushState->gpu(),
+                                                    fTarget.get()->peekRenderTarget(),
+                                                    fTarget.get()->origin(),
+                                                    fTarget.get()->getBoundsRect(),
+                                                    fColorLoadOp,
+                                                    fLoadClearColor,
+                                                    fStencilLoadOp);
     flushState->setCommandBuffer(commandBuffer);
     commandBuffer->begin();
 
@@ -522,16 +516,6 @@
     flushState->gpu()->submit(commandBuffer);
     flushState->setCommandBuffer(nullptr);
 
-    if (stencil && GrStoreOp::kStore == fStencilStoreOp) {
-        // The user stencil bits are always initialized and kept at zero for the duration of a
-        // command buffer. So if we store the stencil, we know we're storing clean user bits.
-        stencil->userBitsCleared();
-    }
-    // FIXME: We don't currently have a way to flag command buffers that don't use stencil at all.
-    // In that case, their store op will be discard, and we currently make the assumption that a
-    // store op of "discard" will not invalidate what's already in main memory. This is probably ok
-    // for now, but certainly something we want to address soon.
-
     return true;
 }
 
@@ -551,6 +535,10 @@
     }
 }
 
+void GrRenderTargetOpList::setStencilLoadOp(GrLoadOp op) {
+    fStencilLoadOp = op;
+}
+
 void GrRenderTargetOpList::setColorLoadOp(GrLoadOp op, const SkPMColor4f& color) {
     fColorLoadOp = op;
     fLoadClearColor = color;
diff --git a/src/gpu/GrRenderTargetOpList.h b/src/gpu/GrRenderTargetOpList.h
index 739cda3..e27f694 100644
--- a/src/gpu/GrRenderTargetOpList.h
+++ b/src/gpu/GrRenderTargetOpList.h
@@ -127,10 +127,8 @@
 
     bool onIsUsed(GrSurfaceProxy*) const override;
 
-    // Load op must not be kClear if caps.performStencilClearsAsDraws() is true.
-    void setStencilLoadOp(GrLoadOp op) { fStencilLoadOp = op; }
-    void setStencilStoreOp(GrStoreOp op) { fStencilStoreOp = op; }
-
+    // Must only be called if native stencil buffer clearing is enabled.
+    void setStencilLoadOp(GrLoadOp op);
     // Must only be called if native color buffer clearing is enabled.
     void setColorLoadOp(GrLoadOp op, const SkPMColor4f& color);
     // Sets the clear color to transparent black
@@ -226,12 +224,9 @@
 
     void forwardCombine(const GrCaps&);
 
-    uint32_t fLastClipStackGenID;
-    SkIRect fLastDevClipBounds;
-    int fLastClipNumAnalyticFPs;
-
-    GrLoadOp fStencilLoadOp = GrLoadOp::kDiscard;
-    GrStoreOp fStencilStoreOp = GrStoreOp::kDiscard;
+    uint32_t                       fLastClipStackGenID;
+    SkIRect                        fLastDevClipBounds;
+    int                            fLastClipNumAnalyticFPs;
 
     // We must track if we have a wait op so that we don't delete the op when we have a full clear.
     bool fHasWaitOp = false;;
@@ -241,8 +236,8 @@
 
     // MDB TODO: 4096 for the first allocation of the clip space will be huge overkill.
     // Gather statistics to determine the correct size.
-    SkArenaAlloc fClipAllocator{4096};
-    SkDEBUGCODE(int fNumClips;)
+    SkArenaAlloc                   fClipAllocator{4096};
+    SkDEBUGCODE(int                fNumClips;)
 
     typedef GrOpList INHERITED;
 };
diff --git a/src/gpu/GrStencilAttachment.h b/src/gpu/GrStencilAttachment.h
index fad6e89..22595f5 100644
--- a/src/gpu/GrStencilAttachment.h
+++ b/src/gpu/GrStencilAttachment.h
@@ -25,9 +25,9 @@
     int height() const { return fHeight; }
     int bits() const { return fBits; }
     int numSamples() const { return fSampleCnt; }
-    bool userBitsAreDirty() const { return fUserBitsAreDirty; }
+    bool isDirty() const { return fIsDirty; }
 
-    void userBitsCleared() { fUserBitsAreDirty = false; }
+    void cleared() { fIsDirty = false; }
 
     // We create a unique stencil buffer at each width, height and sampleCnt and share it for
     // all render targets that require a stencil with those params.
@@ -41,7 +41,7 @@
             , fHeight(height)
             , fBits(bits)
             , fSampleCnt(sampleCnt)
-            , fUserBitsAreDirty(true) {
+            , fIsDirty(true) {
     }
 
 private:
@@ -51,7 +51,7 @@
     int fHeight;
     int fBits;
     int fSampleCnt;
-    bool fUserBitsAreDirty;
+    bool fIsDirty;
 
     typedef GrGpuResource INHERITED;
 };
diff --git a/src/gpu/gl/GrGLCaps.cpp b/src/gpu/gl/GrGLCaps.cpp
index 4822bf4..d235c77 100644
--- a/src/gpu/gl/GrGLCaps.cpp
+++ b/src/gpu/gl/GrGLCaps.cpp
@@ -170,7 +170,6 @@
         kImagination_GrGLVendor == ctxInfo.vendor() ||
         kQualcomm_GrGLVendor == ctxInfo.vendor() ) {
         fPreferFullscreenClears = true;
-        fDiscardStencilAfterCommandBuffer = true;
     }
 
     if (GR_IS_GR_GL(standard)) {
diff --git a/src/gpu/gl/GrGLGpu.cpp b/src/gpu/gl/GrGLGpu.cpp
index 67f58b2..5798f72 100644
--- a/src/gpu/gl/GrGLGpu.cpp
+++ b/src/gpu/gl/GrGLGpu.cpp
@@ -1685,7 +1685,7 @@
             this->disableScissor();
             this->disableWindowRectangles();
             this->flushColorWrite(true);
-            this->flushClearColor({0, 0, 0, 0});
+            this->flushClearColor(0, 0, 0, 0);
             GL_CALL(Clear(GR_GL_COLOR_BUFFER_BIT));
             this->unbindTextureFBOForPixelOps(GR_GL_FRAMEBUFFER, tex.get());
             fHWBoundRenderTargetUniqueID.makeInvalid();
@@ -2233,7 +2233,16 @@
     this->flushScissor(clip.scissorState(), glRT->getViewport(), origin);
     this->flushWindowRectangles(clip.windowRectsState(), glRT, origin);
     this->flushColorWrite(true);
-    this->flushClearColor(color);
+
+    GrGLfloat r = color.fR, g = color.fG, b = color.fB, a = color.fA;
+    if (this->glCaps().clearToBoundaryValuesIsBroken() &&
+        (1 == r || 0 == r) && (1 == g || 0 == g) && (1 == b || 0 == b) && (1 == a || 0 == a)) {
+        static const GrGLfloat safeAlpha1 = nextafter(1.f, 2.f);
+        static const GrGLfloat safeAlpha0 = nextafter(0.f, -1.f);
+        a = (1 == a) ? safeAlpha1 : safeAlpha0;
+    }
+    this->flushClearColor(r, g, b, a);
+
     GL_CALL(Clear(GR_GL_COLOR_BUFFER_BIT));
 }
 
@@ -2244,8 +2253,10 @@
         return;
     }
 
-    // this should only be called internally when we know we have a stencil buffer.
-    SkASSERT(target->renderTargetPriv().getStencilAttachment());
+    GrStencilAttachment* sb = target->renderTargetPriv().getStencilAttachment();
+    // this should only be called internally when we know we have a
+    // stencil buffer.
+    SkASSERT(sb);
 
     GrGLRenderTarget* glRT = static_cast<GrGLRenderTarget*>(target);
     this->flushRenderTargetNoColorWrites(glRT);
@@ -2257,79 +2268,9 @@
     GL_CALL(ClearStencil(clearValue));
     GL_CALL(Clear(GR_GL_STENCIL_BUFFER_BIT));
     fHWStencilSettings.invalidate();
-}
-
-void GrGLGpu::beginCommandBuffer(GrRenderTarget* rt, const ColorLoadAndStoreInfo& colorLoadStore,
-                                 const StencilLoadAndStoreInfo& stencilLoadStore) {
-    SkASSERT(!fIsExecutingCommandBuffer_DebugOnly);
-
-    this->handleDirtyContext();
-
-    auto glRT = static_cast<GrGLRenderTarget*>(rt);
-    this->flushRenderTarget(glRT);
-    SkDEBUGCODE(fIsExecutingCommandBuffer_DebugOnly = true);
-
-    GrGLbitfield clearMask = 0;
-    if (GrLoadOp::kClear == colorLoadStore.fLoadOp) {
-        SkASSERT(!this->caps()->performColorClearsAsDraws());
-        this->flushClearColor(colorLoadStore.fClearColor);
-        this->flushColorWrite(true);
-        clearMask |= GR_GL_COLOR_BUFFER_BIT;
+    if (!clearValue) {
+        sb->cleared();
     }
-    if (GrLoadOp::kClear == stencilLoadStore.fLoadOp) {
-        SkASSERT(!this->caps()->performStencilClearsAsDraws());
-        GL_CALL(StencilMask(0xffffffff));
-        GL_CALL(ClearStencil(0));
-        clearMask |= GR_GL_STENCIL_BUFFER_BIT;
-    }
-    if (clearMask) {
-        this->disableScissor();
-        this->disableWindowRectangles();
-        GL_CALL(Clear(clearMask));
-    }
-}
-
-void GrGLGpu::endCommandBuffer(GrRenderTarget* rt, const ColorLoadAndStoreInfo& colorLoadStore,
-                               const StencilLoadAndStoreInfo& stencilLoadStore) {
-    SkASSERT(fIsExecutingCommandBuffer_DebugOnly);
-
-    this->handleDirtyContext();
-
-    if (rt->uniqueID() != fHWBoundRenderTargetUniqueID) {
-#ifdef SK_DEBUG
-        SkDebugf("WARNING: GL framebuffer changed in the middle of a command buffer. This will "
-                 "kill performance.\n");
-        fIsExecutingCommandBuffer_DebugOnly = false;
-#endif
-        return;
-    }
-
-    if (this->caps()->discardRenderTargetSupport()) {
-        auto glRT = static_cast<GrGLRenderTarget*>(rt);
-
-        SkSTArray<2, GrGLenum> discardAttachments;
-        if (GrStoreOp::kStore != colorLoadStore.fStoreOp) {
-            discardAttachments.push_back(
-                    (0 == glRT->renderFBOID()) ? GR_GL_COLOR : GR_GL_COLOR_ATTACHMENT0);
-        }
-        if (GrStoreOp::kStore != stencilLoadStore.fStoreOp) {
-            discardAttachments.push_back(
-                    (0 == glRT->renderFBOID()) ? GR_GL_STENCIL : GR_GL_STENCIL_ATTACHMENT);
-        }
-
-        if (!discardAttachments.empty()) {
-            if (GrGLCaps::kInvalidate_InvalidateFBType == this->glCaps().invalidateFBType()) {
-                GL_CALL(InvalidateFramebuffer(GR_GL_FRAMEBUFFER, discardAttachments.count(),
-                                              discardAttachments.begin()));
-            } else {
-                SkASSERT(GrGLCaps::kDiscard_InvalidateFBType == this->glCaps().invalidateFBType());
-                GL_CALL(DiscardFramebuffer(GR_GL_FRAMEBUFFER, discardAttachments.count(),
-                                           discardAttachments.begin()));
-            }
-        }
-    }
-
-    SkDEBUGCODE(fIsExecutingCommandBuffer_DebugOnly = false);
 }
 
 void GrGLGpu::clearStencilClip(const GrFixedClip& clip,
@@ -3234,14 +3175,7 @@
     }
 }
 
-void GrGLGpu::flushClearColor(const SkPMColor4f& color) {
-    GrGLfloat r = color.fR, g = color.fG, b = color.fB, a = color.fA;
-    if (this->glCaps().clearToBoundaryValuesIsBroken() &&
-        (1 == r || 0 == r) && (1 == g || 0 == g) && (1 == b || 0 == b) && (1 == a || 0 == a)) {
-        static const GrGLfloat safeAlpha1 = nextafter(1.f, 2.f);
-        static const GrGLfloat safeAlpha0 = nextafter(0.f, -1.f);
-        a = (1 == a) ? safeAlpha1 : safeAlpha0;
-    }
+void GrGLGpu::flushClearColor(GrGLfloat r, GrGLfloat g, GrGLfloat b, GrGLfloat a) {
     if (r != fHWClearColor[0] || g != fHWClearColor[1] ||
         b != fHWClearColor[2] || a != fHWClearColor[3]) {
         GL_CALL(ClearColor(r, g, b, a));
diff --git a/src/gpu/gl/GrGLGpu.h b/src/gpu/gl/GrGLGpu.h
index 85c6773..e8d5be1 100644
--- a/src/gpu/gl/GrGLGpu.h
+++ b/src/gpu/gl/GrGLGpu.h
@@ -122,15 +122,6 @@
     // stencil buffer as not dirty?
     void clearStencil(GrRenderTarget*, int clearValue);
 
-    using ColorLoadAndStoreInfo = GrGpuRTCommandBuffer::LoadAndStoreInfo;
-    using StencilLoadAndStoreInfo = GrGpuRTCommandBuffer::StencilLoadAndStoreInfo;
-
-    void beginCommandBuffer(
-            GrRenderTarget*, const ColorLoadAndStoreInfo&, const StencilLoadAndStoreInfo&);
-
-    void endCommandBuffer(
-            GrRenderTarget*, const ColorLoadAndStoreInfo&, const StencilLoadAndStoreInfo&);
-
     GrGpuRTCommandBuffer* getCommandBuffer(
             GrRenderTarget*, GrSurfaceOrigin, const SkRect&,
             const GrGpuRTCommandBuffer::LoadAndStoreInfo&,
@@ -361,7 +352,7 @@
     };
 
     void flushColorWrite(bool writeColor);
-    void flushClearColor(const SkPMColor4f&);
+    void flushClearColor(GrGLfloat r, GrGLfloat g, GrGLfloat b, GrGLfloat a);
 
     // flushes the scissor. see the note on flushBoundTextureAndParams about
     // flushing the scissor after that function is called.
@@ -685,9 +676,6 @@
         GrGLsync fSync;
     };
     std::list<FinishCallback> fFinishCallbacks;
-
-    SkDEBUGCODE(bool fIsExecutingCommandBuffer_DebugOnly = false);
-
     friend class GrGLPathRendering; // For accessing setTextureUnit.
 
     typedef GrGpu INHERITED;
diff --git a/src/gpu/gl/GrGLGpuCommandBuffer.cpp b/src/gpu/gl/GrGLGpuCommandBuffer.cpp
index 655b915..1412aa3 100644
--- a/src/gpu/gl/GrGLGpuCommandBuffer.cpp
+++ b/src/gpu/gl/GrGLGpuCommandBuffer.cpp
@@ -11,9 +11,22 @@
 #include "src/gpu/GrFixedClip.h"
 #include "src/gpu/GrRenderTargetPriv.h"
 
-void GrGLGpuRTCommandBuffer::set(
-        GrRenderTarget* rt, GrSurfaceOrigin origin, const LoadAndStoreInfo& colorInfo,
-        const StencilLoadAndStoreInfo& stencilInfo) {
+void GrGLGpuRTCommandBuffer::begin() {
+    if (GrLoadOp::kClear == fColorLoadAndStoreInfo.fLoadOp) {
+        fGpu->clear(GrFixedClip::Disabled(), fColorLoadAndStoreInfo.fClearColor,
+                    fRenderTarget, fOrigin);
+    }
+    if (GrLoadOp::kClear == fStencilLoadAndStoreInfo.fLoadOp) {
+        GrStencilAttachment* sb = fRenderTarget->renderTargetPriv().getStencilAttachment();
+        if (sb && (sb->isDirty() || fRenderTarget->alwaysClearStencil())) {
+            fGpu->clearStencil(fRenderTarget, 0x0);
+        }
+    }
+}
+
+void GrGLGpuRTCommandBuffer::set(GrRenderTarget* rt, GrSurfaceOrigin origin,
+                                 const GrGpuRTCommandBuffer::LoadAndStoreInfo& colorInfo,
+                                 const GrGpuRTCommandBuffer::StencilLoadAndStoreInfo& stencilInfo) {
     SkASSERT(fGpu);
     SkASSERT(!fRenderTarget);
     SkASSERT(fGpu == rt->getContext()->priv().getGpu());
diff --git a/src/gpu/gl/GrGLGpuCommandBuffer.h b/src/gpu/gl/GrGLGpuCommandBuffer.h
index 9831b63..0f9370f 100644
--- a/src/gpu/gl/GrGLGpuCommandBuffer.h
+++ b/src/gpu/gl/GrGLGpuCommandBuffer.h
@@ -55,13 +55,8 @@
 public:
     GrGLGpuRTCommandBuffer(GrGLGpu* gpu) : fGpu(gpu) {}
 
-    void begin() override {
-        fGpu->beginCommandBuffer(fRenderTarget, fColorLoadAndStoreInfo, fStencilLoadAndStoreInfo);
-    }
-
-    void end() override {
-        fGpu->endCommandBuffer(fRenderTarget, fColorLoadAndStoreInfo, fStencilLoadAndStoreInfo);
-    }
+    void begin() override;
+    void end() override {}
 
     void discard() override { }
 
@@ -84,8 +79,9 @@
                                  srcRect.height(), bufferColorType, transferBuffer, offset);
     }
 
-    void set(GrRenderTarget*, GrSurfaceOrigin, const LoadAndStoreInfo&, const
-             StencilLoadAndStoreInfo&);
+    void set(GrRenderTarget*, GrSurfaceOrigin,
+             const GrGpuRTCommandBuffer::LoadAndStoreInfo&,
+             const GrGpuRTCommandBuffer::StencilLoadAndStoreInfo&);
 
     void reset() {
         fRenderTarget = nullptr;
diff --git a/src/gpu/vk/GrVkCaps.cpp b/src/gpu/vk/GrVkCaps.cpp
index 9305358..b20595f 100644
--- a/src/gpu/vk/GrVkCaps.cpp
+++ b/src/gpu/vk/GrVkCaps.cpp
@@ -417,7 +417,6 @@
         // On NVIDIA and Intel, the discard load followed by clear is faster.
         // TODO: Evaluate on ARM, Imagination, and ATI.
         fPreferFullscreenClears = true;
-        fDiscardStencilAfterCommandBuffer = true;
     }
 
     if (kQualcomm_VkVendor == properties.vendorID || kARM_VkVendor == properties.vendorID) {