Reland "Metal: Hold refs for input buffers from bindBuffer calls."

This is a reland of a75dc33a677f130df1b24855df2f30fccb2e5137

Original change's description:
> Metal: Hold refs for input buffers from bindBuffer calls.
> 
> Mirrors what we have in Vulkan and Direct3D.
> Also adds command buffer tracking, again like Vulkan and Direct3D.
> 
> Change-Id: I2280d92274d81830aec7950afc64a0147e38c317
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/305396
> Reviewed-by: Greg Daniel <egdaniel@google.com>
> Commit-Queue: Jim Van Verth <jvanverth@google.com>

Change-Id: I26ebad195fe044c82ea9ad5684ef1ac6d03cbc37
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/306537
Reviewed-by: Jim Van Verth <jvanverth@google.com>
Commit-Queue: Jim Van Verth <jvanverth@google.com>
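
For context, the scheme this reland introduces works as in the sketch below: buffers
bound for a draw are ref'd by the live command buffer, and each submitted command
buffer is queued alongside a fence until the GPU signals completion, at which point
the refs are released. This is only an illustrative sketch using hypothetical
stand-in types (GpuBuffer, Fence, CommandBuffer, Gpu), not Skia's actual classes:

    #include <cstdint>
    #include <deque>
    #include <memory>
    #include <vector>

    // Hypothetical stand-ins for GrBuffer and GrFence.
    struct GpuBuffer {};
    using Fence = uint64_t;

    class CommandBuffer {
    public:
        // Hold a ref to each bound buffer so it can't be freed while the GPU
        // may still be reading from it.
        void addBuffer(std::shared_ptr<const GpuBuffer> buffer) {
            fTrackedBuffers.push_back(std::move(buffer));
        }
    private:
        std::vector<std::shared_ptr<const GpuBuffer>> fTrackedBuffers;
    };

    class Gpu {
    public:
        void submit(std::shared_ptr<CommandBuffer> cmdBuffer) {
            // Pair the submitted command buffer with a fence; the tracked refs
            // stay alive until that fence signals.
            fOutstanding.push_back({std::move(cmdBuffer), this->insertFence()});
        }

        void checkForFinished() {
            // Entries are oldest-first: once one fence hasn't signaled, no
            // later one has, so we can stop there.
            while (!fOutstanding.empty() &&
                   this->fenceSignaled(fOutstanding.front().fFence)) {
                fOutstanding.pop_front();  // releases that submission's refs
            }
        }

    private:
        struct Outstanding {
            std::shared_ptr<CommandBuffer> fCmdBuffer;
            Fence fFence;
        };
        Fence insertFence() const { return 0; }           // backend-specific
        bool fenceSignaled(Fence) const { return true; }  // backend-specific
        std::deque<Outstanding> fOutstanding;
    };

Skia's version differs in the details (SkDeque with placement new rather than
std::deque, sk_sp rather than std::shared_ptr), but the lifetime guarantee is
the same.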
diff --git a/src/gpu/mtl/GrMtlBuffer.h b/src/gpu/mtl/GrMtlBuffer.h
index 7f11d78..1376823 100644
--- a/src/gpu/mtl/GrMtlBuffer.h
+++ b/src/gpu/mtl/GrMtlBuffer.h
@@ -25,7 +25,6 @@
 
     id<MTLBuffer> mtlBuffer() const { return fMtlBuffer; }
     size_t offset() const { return fOffset; }
-    void bind(); // for initial binding of XferGpuToCpu buffers
 
 protected:
     GrMtlBuffer(GrMtlGpu*, size_t size, GrGpuBufferType intendedType, GrAccessPattern);
diff --git a/src/gpu/mtl/GrMtlBuffer.mm b/src/gpu/mtl/GrMtlBuffer.mm
index 8f0843b..46fea8e 100644
--- a/src/gpu/mtl/GrMtlBuffer.mm
+++ b/src/gpu/mtl/GrMtlBuffer.mm
@@ -35,21 +35,26 @@
         : INHERITED(gpu, size, intendedType, accessPattern)
         , fIsDynamic(accessPattern != kStatic_GrAccessPattern)
         , fOffset(0) {
-    // In most cases, we'll allocate dynamic buffers when we map them, below.
-    if (!fIsDynamic) {
-        NSUInteger options = 0;
-        if (@available(macOS 10.11, iOS 9.0, *)) {
+    NSUInteger options = 0;
+    if (@available(macOS 10.11, iOS 9.0, *)) {
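+        // Dynamic buffers are written by the CPU, so they need managed (macOS) or shared (iOS)
+        // storage; static buffers are only accessed by the GPU, so they can live in private
+        // storage.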
+        if (fIsDynamic) {
+#ifdef SK_BUILD_FOR_MAC
+            options |= MTLResourceStorageModeManaged;
+#else
+            options |= MTLResourceStorageModeShared;
+#endif
+        } else {
             options |= MTLResourceStorageModePrivate;
         }
-#ifdef SK_BUILD_FOR_MAC
-        // Mac requires 4-byte alignment for copies so we need
-        // to ensure we have space for the extra data
-        size = SkAlign4(size);
-#endif
-        fMtlBuffer = size == 0 ? nil :
-                [gpu->device() newBufferWithLength: size
-                                           options: options];
     }
+#ifdef SK_BUILD_FOR_MAC
+    // Mac requires 4-byte alignment for copies so we need
+    // to ensure we have space for the extra data
+    size = SkAlign4(size);
+#endif
+    fMtlBuffer = size == 0 ? nil :
+            [gpu->device() newBufferWithLength: size
+                                       options: options];
     this->registerWithCache(SkBudgeted::kYes);
     VALIDATE();
 }
@@ -60,11 +65,6 @@
     SkASSERT(fMapPtr == nullptr);
 }
 
-void GrMtlBuffer::bind() {
-    SkASSERT(fIsDynamic && GrGpuBufferType::kXferGpuToCpu == this->intendedType());
-    fMtlBuffer = this->mtlGpu()->resourceProvider().getDynamicBuffer(this->size(), &fOffset);
-}
-
 bool GrMtlBuffer::onUpdateData(const void* src, size_t srcInBytes) {
     if (!fIsDynamic) {
         if (fMtlBuffer == nil) {
@@ -122,9 +122,6 @@
     VALIDATE();
     SkASSERT(!this->isMapped());
     if (fIsDynamic) {
-        if (GrGpuBufferType::kXferGpuToCpu != this->intendedType()) {
-            fMtlBuffer = this->mtlGpu()->resourceProvider().getDynamicBuffer(sizeInBytes, &fOffset);
-        }
         fMappedBuffer = fMtlBuffer;
         fMapPtr = static_cast<char*>(fMtlBuffer.contents) + fOffset;
     } else {
diff --git a/src/gpu/mtl/GrMtlCommandBuffer.h b/src/gpu/mtl/GrMtlCommandBuffer.h
index 959cff5..50373fb 100644
--- a/src/gpu/mtl/GrMtlCommandBuffer.h
+++ b/src/gpu/mtl/GrMtlCommandBuffer.h
@@ -11,15 +11,16 @@
 #import <Metal/Metal.h>
 
 #include "include/core/SkRefCnt.h"
+#include "src/gpu/GrBuffer.h"
 #include "src/gpu/mtl/GrMtlUtil.h"
 
 class GrMtlGpu;
 class GrMtlPipelineState;
 class GrMtlOpsRenderPass;
 
-class GrMtlCommandBuffer {
+class GrMtlCommandBuffer : public SkRefCnt {
 public:
-    static GrMtlCommandBuffer* Create(id<MTLCommandQueue> queue);
+    static sk_sp<GrMtlCommandBuffer> Make(id<MTLCommandQueue> queue);
     ~GrMtlCommandBuffer();
 
     void commit(bool waitUntilCompleted);
@@ -33,10 +34,16 @@
         [fCmdBuffer addCompletedHandler:block];
     }
 
+    void addGrBuffer(sk_sp<const GrBuffer> buffer) {
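+        // Hold a ref to the buffer so it can't be deleted while the GPU may still be using it.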
+        fTrackedGrBuffers.push_back(std::move(buffer));
+    }
+
     void encodeSignalEvent(id<MTLEvent>, uint64_t value) SK_API_AVAILABLE(macos(10.14), ios(12.0));
     void encodeWaitForEvent(id<MTLEvent>, uint64_t value) SK_API_AVAILABLE(macos(10.14), ios(12.0));
 
 private:
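+    // Inline capacity of fTrackedGrBuffers; tracking more buffers than this spills to the heap.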
+    static const int kInitialTrackedResourcesCount = 32;
+
     GrMtlCommandBuffer(id<MTLCommandBuffer> cmdBuffer)
         : fCmdBuffer(cmdBuffer)
         , fPreviousRenderPassDescriptor(nil) {}
@@ -47,6 +54,8 @@
     id<MTLBlitCommandEncoder>   fActiveBlitCommandEncoder;
     id<MTLRenderCommandEncoder> fActiveRenderCommandEncoder;
     MTLRenderPassDescriptor*    fPreviousRenderPassDescriptor;
+
+    SkSTArray<kInitialTrackedResourcesCount, sk_sp<const GrBuffer>> fTrackedGrBuffers;
 };
 
 #endif
diff --git a/src/gpu/mtl/GrMtlCommandBuffer.mm b/src/gpu/mtl/GrMtlCommandBuffer.mm
index 85b5328..7c04021 100644
--- a/src/gpu/mtl/GrMtlCommandBuffer.mm
+++ b/src/gpu/mtl/GrMtlCommandBuffer.mm
@@ -15,7 +15,7 @@
 #error This file must be compiled with Arc. Use -fobjc-arc flag
 #endif
 
-GrMtlCommandBuffer* GrMtlCommandBuffer::Create(id<MTLCommandQueue> queue) {
+sk_sp<GrMtlCommandBuffer> GrMtlCommandBuffer::Make(id<MTLCommandQueue> queue) {
     id<MTLCommandBuffer> mtlCommandBuffer;
     mtlCommandBuffer = [queue commandBuffer];
     if (nil == mtlCommandBuffer) {
@@ -24,11 +24,12 @@
 
     mtlCommandBuffer.label = @"GrMtlCommandBuffer::Create";
 
-    return new GrMtlCommandBuffer(mtlCommandBuffer);
+    return sk_sp<GrMtlCommandBuffer>(new GrMtlCommandBuffer(mtlCommandBuffer));
 }
 
 GrMtlCommandBuffer::~GrMtlCommandBuffer() {
     this->endAllEncoding();
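+    // Release the refs on all tracked buffers now that this command buffer is going away.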
+    fTrackedGrBuffers.reset();
     fCmdBuffer = nil;
 }
 
diff --git a/src/gpu/mtl/GrMtlGpu.h b/src/gpu/mtl/GrMtlGpu.h
index 5299ecb..6dbc566 100644
--- a/src/gpu/mtl/GrMtlGpu.h
+++ b/src/gpu/mtl/GrMtlGpu.h
@@ -8,6 +8,7 @@
 #ifndef GrMtlGpu_DEFINED
 #define GrMtlGpu_DEFINED
 
+#include "include/private/SkDeque.h"
 #include "src/gpu/GrFinishCallbacks.h"
 #include "src/gpu/GrGpu.h"
 #include "src/gpu/GrRenderTarget.h"
@@ -15,6 +16,7 @@
 #include "src/gpu/GrTexture.h"
 
 #include "src/gpu/mtl/GrMtlCaps.h"
+#include "src/gpu/mtl/GrMtlCommandBuffer.h"
 #include "src/gpu/mtl/GrMtlResourceProvider.h"
 #include "src/gpu/mtl/GrMtlStencilAttachment.h"
 #include "src/gpu/mtl/GrMtlUtil.h"
@@ -52,11 +54,6 @@
         kSkip_SyncQueue
     };
 
-    // Commits the current command buffer to the queue and then creates a new command buffer. If
-    // sync is set to kForce_SyncQueue, the function will wait for all work in the committed
-    // command buffer to finish before returning.
-    void submitCommandBuffer(SyncQueue sync);
-
     void deleteBackendTexture(const GrBackendTexture&) override;
 
     bool compile(const GrProgramDesc&, const GrProgramInfo&) override;
@@ -214,6 +211,13 @@
 
     bool onSubmitToGpu(bool syncCpu) override;
 
+    // Commits the current command buffer to the queue and then creates a new command buffer. If
+    // sync is set to kForce_SyncQueue, the function will wait for all work in the committed
+    // command buffer to finish before returning.
+    void submitCommandBuffer(SyncQueue sync);
+
+    void checkForFinishedCommandBuffers();
+
     // Function that uploads data onto textures with private storage mode (GPU access only).
     bool uploadToTexture(GrMtlTexture* tex, int left, int top, int width, int height,
                          GrColorType dataColorType, const GrMipLevel texels[], int mipLevels);
@@ -247,7 +251,16 @@
     id<MTLDevice> fDevice;
     id<MTLCommandQueue> fQueue;
 
-    GrMtlCommandBuffer* fCmdBuffer;
+    sk_sp<GrMtlCommandBuffer> fCurrentCmdBuffer;
+
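+    // Pairs each submitted command buffer with the fence that tells us when the GPU has finished
+    // executing it.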
+    struct OutstandingCommandBuffer {
+        OutstandingCommandBuffer(sk_sp<GrMtlCommandBuffer> commandBuffer, GrFence fence)
+            : fCommandBuffer(std::move(commandBuffer))
+            , fFence(fence) {}
+        sk_sp<GrMtlCommandBuffer> fCommandBuffer;
+        GrFence fFence;
+    };
+    SkDeque fOutstandingCommandBuffers;
 
     std::unique_ptr<SkSL::Compiler> fCompiler;
 
diff --git a/src/gpu/mtl/GrMtlGpu.mm b/src/gpu/mtl/GrMtlGpu.mm
index 0449781..932c694 100644
--- a/src/gpu/mtl/GrMtlGpu.mm
+++ b/src/gpu/mtl/GrMtlGpu.mm
@@ -112,12 +112,18 @@
     return sk_sp<GrGpu>(new GrMtlGpu(direct, options, device, queue, featureSet));
 }
 
+// This constant determines how many OutstandingCommandBuffers are allocated together as a block in
+// the deque. As such, it needs to balance allocating too much memory against incurring
+// allocation/deallocation thrashing. It should roughly correspond to the maximum number of
+// outstanding command buffers we expect to see.
+static const int kDefaultOutstandingAllocCnt = 8;
+
 GrMtlGpu::GrMtlGpu(GrDirectContext* direct, const GrContextOptions& options,
                    id<MTLDevice> device, id<MTLCommandQueue> queue, MTLFeatureSet featureSet)
         : INHERITED(direct)
         , fDevice(device)
         , fQueue(queue)
-        , fCmdBuffer(nullptr)
+        , fOutstandingCommandBuffers(sizeof(OutstandingCommandBuffer), kDefaultOutstandingAllocCnt)
         , fCompiler(new SkSL::Compiler())
         , fResourceProvider(this)
         , fDisconnected(false)
@@ -135,24 +141,25 @@
 void GrMtlGpu::disconnect(DisconnectType type) {
     INHERITED::disconnect(type);
 
-    if (DisconnectType::kCleanup == type) {
+    if (!fDisconnected) {
         this->destroyResources();
-    } else {
-        delete fCmdBuffer;
-        fCmdBuffer = nullptr;
-
-        fResourceProvider.destroyResources();
-
-        fQueue = nil;
-        fDevice = nil;
-
         fDisconnected = true;
     }
 }
 
 void GrMtlGpu::destroyResources() {
-    // Will implicitly delete the command buffer
     this->submitCommandBuffer(SyncQueue::kForce_SyncQueue);
+
+    // We used placement new for each object in fOutstandingCommandBuffers, so we're responsible
+    // for calling the destructor on each of them as well.
+    while (!fOutstandingCommandBuffers.empty()) {
+        OutstandingCommandBuffer* buffer =
+                (OutstandingCommandBuffer*)fOutstandingCommandBuffers.front();
+        this->deleteFence(buffer->fFence);
+        buffer->~OutstandingCommandBuffer();
+        fOutstandingCommandBuffers.pop_front();
+    }
+
     fResourceProvider.destroyResources();
 
     fQueue = nil;
@@ -175,18 +182,44 @@
 }
 
 GrMtlCommandBuffer* GrMtlGpu::commandBuffer() {
-    if (!fCmdBuffer) {
-        fCmdBuffer = GrMtlCommandBuffer::Create(fQueue);
+    if (!fCurrentCmdBuffer) {
+        fCurrentCmdBuffer = GrMtlCommandBuffer::Make(fQueue);
+
+        // This should be done after we have a new command buffer in case the freeing of any
+        // resources held by a finished command buffer causes us to send a new command to the gpu
+        // (like changing the resource state).
+        this->checkForFinishedCommandBuffers();
     }
-    return fCmdBuffer;
+    return fCurrentCmdBuffer.get();
 }
 
 void GrMtlGpu::submitCommandBuffer(SyncQueue sync) {
-    if (fCmdBuffer) {
-        fResourceProvider.addBufferCompletionHandler(fCmdBuffer);
-        fCmdBuffer->commit(SyncQueue::kForce_SyncQueue == sync);
-        delete fCmdBuffer;
-        fCmdBuffer = nullptr;
+    // TODO: handle sync with empty command buffer
+    if (fCurrentCmdBuffer) {
+        fResourceProvider.addBufferCompletionHandler(fCurrentCmdBuffer.get());
+
+        GrFence fence = this->insertFence();
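+        // SkDeque::push_back() hands back raw storage, so construct the new entry with placement
+        // new; the matching manual destructor calls happen when entries are popped.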
+        new (fOutstandingCommandBuffers.push_back()) OutstandingCommandBuffer(
+                fCurrentCmdBuffer, fence);
+
+        fCurrentCmdBuffer->commit(SyncQueue::kForce_SyncQueue == sync);
+        fCurrentCmdBuffer.reset();
+    }
+}
+
+void GrMtlGpu::checkForFinishedCommandBuffers() {
+    // Iterate over the outstanding command buffers to see if any have finished. The command
+    // buffers are ordered oldest to newest, so we start at the front and check whether its fence
+    // has signaled. If so, we pop it off and move on to the next.
+    // Repeat until we find a command buffer that has not finished yet (every one submitted after
+    // it is then guaranteed not to have finished either).
+    OutstandingCommandBuffer* front = (OutstandingCommandBuffer*)fOutstandingCommandBuffers.front();
+    while (front && this->waitFence(front->fFence)) {
+        // Since we used placement new, we are responsible for calling the destructor manually.
+        this->deleteFence(front->fFence);
+        front->~OutstandingCommandBuffer();
+        fOutstandingCommandBuffers.pop_front();
+        front = (OutstandingCommandBuffer*)fOutstandingCommandBuffers.front();
     }
 }
 
@@ -1254,7 +1287,6 @@
     }
 
     GrMtlBuffer* grMtlBuffer = static_cast<GrMtlBuffer*>(transferBuffer);
-    grMtlBuffer->bind();
 
     size_t transBufferRowBytes = bpp * width;
     size_t transBufferImageBytes = transBufferRowBytes * height;
diff --git a/src/gpu/mtl/GrMtlOpsRenderPass.mm b/src/gpu/mtl/GrMtlOpsRenderPass.mm
index aa06422..b74717c 100644
--- a/src/gpu/mtl/GrMtlOpsRenderPass.mm
+++ b/src/gpu/mtl/GrMtlOpsRenderPass.mm
@@ -263,6 +263,7 @@
         SkASSERT(!vertexBuffer->isCpuBuffer());
         SkASSERT(!static_cast<const GrGpuBuffer*>(vertexBuffer.get())->isMapped());
         fActiveVertexBuffer = std::move(vertexBuffer);
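+        // Register the buffer with the current command buffer so a ref is held until the GPU has
+        // finished with it.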
+        fGpu->commandBuffer()->addGrBuffer(fActiveVertexBuffer);
         ++inputBufferIndex;
     }
     if (instanceBuffer) {
@@ -270,11 +271,13 @@
         SkASSERT(!static_cast<const GrGpuBuffer*>(instanceBuffer.get())->isMapped());
         this->setVertexBuffer(fActiveRenderCmdEncoder, instanceBuffer.get(), 0, inputBufferIndex++);
         fActiveInstanceBuffer = std::move(instanceBuffer);
+        fGpu->commandBuffer()->addGrBuffer(fActiveInstanceBuffer);
     }
     if (indexBuffer) {
         SkASSERT(!indexBuffer->isCpuBuffer());
         SkASSERT(!static_cast<const GrGpuBuffer*>(indexBuffer.get())->isMapped());
         fActiveIndexBuffer = std::move(indexBuffer);
+        fGpu->commandBuffer()->addGrBuffer(fActiveIndexBuffer);
     }
 }