Cache CPU memory buffers used for client side arrays.

Use same cache for CPU-side copy of data when using GPU buffers.

Change-Id: I09f2837211a30aabc50e9897c090f5fbc6d90492
Reviewed-on: https://skia-review.googlesource.com/c/189484
Commit-Queue: Brian Salomon <bsalomon@google.com>
Reviewed-by: Robert Phillips <robertphillips@google.com>
diff --git a/src/gpu/GrBufferAllocPool.cpp b/src/gpu/GrBufferAllocPool.cpp
index cf986bc..36c0431 100644
--- a/src/gpu/GrBufferAllocPool.cpp
+++ b/src/gpu/GrBufferAllocPool.cpp
@@ -18,6 +18,56 @@
 #include "SkSafeMath.h"
 #include "SkTraceEvent.h"
 
+sk_sp<GrBufferAllocPool::CpuBufferCache> GrBufferAllocPool::CpuBufferCache::Make(
+        int maxBuffersToCache) {
+    return sk_sp<CpuBufferCache>(new CpuBufferCache(maxBuffersToCache));
+}
+
+GrBufferAllocPool::CpuBufferCache::CpuBufferCache(int maxBuffersToCache)
+        : fMaxBuffersToCache(maxBuffersToCache) {
+    if (fMaxBuffersToCache) {
+        fBuffers.reset(new Buffer[fMaxBuffersToCache]);
+    }
+}
+
+sk_sp<GrCpuBuffer> GrBufferAllocPool::CpuBufferCache::makeBuffer(size_t size,
+                                                                 bool mustBeInitialized) {
+    SkASSERT(size > 0);
+    Buffer* result = nullptr;
+    if (size == kDefaultBufferSize) {
+        int i = 0;
+        for (; i < fMaxBuffersToCache && fBuffers[i].fBuffer; ++i) {
+            SkASSERT(fBuffers[i].fBuffer->size() == kDefaultBufferSize);
+            if (fBuffers[i].fBuffer->unique()) {
+                result = &fBuffers[i];
+            }
+        }
+        if (!result && i < fMaxBuffersToCache) {
+            fBuffers[i].fBuffer = GrCpuBuffer::Make(size);
+            result = &fBuffers[i];
+        }
+    }
+    Buffer tempResult;
+    if (!result) {
+        tempResult.fBuffer = GrCpuBuffer::Make(size);
+        result = &tempResult;
+    }
+    if (mustBeInitialized && !result->fCleared) {
+        result->fCleared = true;
+        memset(result->fBuffer->data(), 0, result->fBuffer->size());
+    }
+    return result->fBuffer;
+}
+
+void GrBufferAllocPool::CpuBufferCache::releaseAll() {
+    for (int i = 0; i < fMaxBuffersToCache && fBuffers[i].fBuffer; ++i) {
+        fBuffers[i].fBuffer.reset();
+        fBuffers[i].fCleared = false;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
 #ifdef SK_DEBUG
     #define VALIDATE validate
 #else
@@ -35,13 +85,12 @@
 
 constexpr size_t GrBufferAllocPool::kDefaultBufferSize;
 
-GrBufferAllocPool::GrBufferAllocPool(GrGpu* gpu, GrGpuBufferType bufferType, void* initialBuffer)
-        : fBlocks(8), fGpu(gpu), fBufferType(bufferType), fInitialCpuData(initialBuffer) {
-    if (fInitialCpuData) {
-        fCpuDataSize = kDefaultBufferSize;
-        fCpuData = fInitialCpuData;
-    }
-}
+GrBufferAllocPool::GrBufferAllocPool(GrGpu* gpu, GrGpuBufferType bufferType,
+                                     sk_sp<CpuBufferCache> cpuBufferCache)
+        : fBlocks(8)
+        , fCpuBufferCache(std::move(cpuBufferCache))
+        , fGpu(gpu)
+        , fBufferType(bufferType) {}
 
 void GrBufferAllocPool::deleteBlocks() {
     if (fBlocks.count()) {
@@ -59,9 +108,6 @@
 GrBufferAllocPool::~GrBufferAllocPool() {
     VALIDATE();
     this->deleteBlocks();
-    if (fCpuData != fInitialCpuData) {
-        sk_free(fCpuData);
-    }
 }
 
 void GrBufferAllocPool::reset() {
@@ -98,7 +144,7 @@
         SkASSERT(!fBlocks.empty());
         const GrBuffer* buffer = fBlocks.back().fBuffer.get();
         if (!buffer->isCpuBuffer() && !static_cast<const GrGpuBuffer*>(buffer)->isMapped()) {
-            SkASSERT(fCpuData == fBufferPtr);
+            SkASSERT(fCpuStagingBuffer && fCpuStagingBuffer->data() == fBufferPtr);
         }
     } else if (!fBlocks.empty()) {
         const GrBuffer* buffer = fBlocks.back().fBuffer.get();
@@ -321,7 +367,8 @@
         }
     }
     if (!fBufferPtr) {
-        fBufferPtr = this->resetCpuData(block.fBytesFree);
+        this->resetCpuData(block.fBytesFree);
+        fBufferPtr = fCpuStagingBuffer->data();
     }
 
     VALIDATE(true);
@@ -337,30 +384,26 @@
     fBufferPtr = nullptr;
 }
 
-void* GrBufferAllocPool::resetCpuData(size_t newSize) {
-    if (newSize <= fCpuDataSize) {
-        SkASSERT(!newSize || fCpuData);
-        return fCpuData;
+void GrBufferAllocPool::resetCpuData(size_t newSize) {
+    SkASSERT(newSize >= kDefaultBufferSize || !newSize);
+    if (!newSize) {
+        fCpuStagingBuffer.reset();
+        return;
     }
-    if (fCpuData != fInitialCpuData) {
-        sk_free(fCpuData);
+    if (fCpuStagingBuffer && newSize <= fCpuStagingBuffer->size()) {
+        return;
     }
-    if (fGpu->caps()->mustClearUploadedBufferData()) {
-        fCpuData = sk_calloc_throw(newSize);
-    } else {
-        fCpuData = sk_malloc_throw(newSize);
-    }
-    fCpuDataSize = newSize;
-    return fCpuData;
+    bool mustInitialize = fGpu->caps()->mustClearUploadedBufferData();
+    fCpuStagingBuffer = fCpuBufferCache ? fCpuBufferCache->makeBuffer(newSize, mustInitialize)
+                                        : GrCpuBuffer::Make(newSize);
 }
 
-
 void GrBufferAllocPool::flushCpuData(const BufferBlock& block, size_t flushSize) {
     SkASSERT(block.fBuffer.get());
     SkASSERT(!block.fBuffer.get()->isCpuBuffer());
     GrGpuBuffer* buffer = static_cast<GrGpuBuffer*>(block.fBuffer.get());
     SkASSERT(!buffer->isMapped());
-    SkASSERT(fCpuData == fBufferPtr);
+    SkASSERT(fCpuStagingBuffer && fCpuStagingBuffer->data() == fBufferPtr);
     SkASSERT(flushSize <= buffer->size());
     VALIDATE(true);
 
@@ -381,15 +424,17 @@
     auto resourceProvider = fGpu->getContext()->priv().resourceProvider();
 
     if (fGpu->caps()->preferClientSideDynamicBuffers()) {
-        return GrCpuBuffer::Make(size);
+        bool mustInitialize = fGpu->caps()->mustClearUploadedBufferData();
+        return fCpuBufferCache ? fCpuBufferCache->makeBuffer(size, mustInitialize)
+                               : GrCpuBuffer::Make(size);
     }
     return resourceProvider->createBuffer(size, fBufferType, kDynamic_GrAccessPattern);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-GrVertexBufferAllocPool::GrVertexBufferAllocPool(GrGpu* gpu, void* initialCpuBuffer)
-        : GrBufferAllocPool(gpu, GrGpuBufferType::kVertex, initialCpuBuffer) {}
+GrVertexBufferAllocPool::GrVertexBufferAllocPool(GrGpu* gpu, sk_sp<CpuBufferCache> cpuBufferCache)
+        : GrBufferAllocPool(gpu, GrGpuBufferType::kVertex, std::move(cpuBufferCache)) {}
 
 void* GrVertexBufferAllocPool::makeSpace(size_t vertexSize,
                                          int vertexCount,
@@ -441,8 +486,8 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-GrIndexBufferAllocPool::GrIndexBufferAllocPool(GrGpu* gpu, void* initialCpuBuffer)
-        : GrBufferAllocPool(gpu, GrGpuBufferType::kIndex, initialCpuBuffer) {}
+GrIndexBufferAllocPool::GrIndexBufferAllocPool(GrGpu* gpu, sk_sp<CpuBufferCache> cpuBufferCache)
+        : GrBufferAllocPool(gpu, GrGpuBufferType::kIndex, std::move(cpuBufferCache)) {}
 
 void* GrIndexBufferAllocPool::makeSpace(int indexCount, sk_sp<const GrBuffer>* buffer,
                                         int* startIndex) {
diff --git a/src/gpu/GrBufferAllocPool.h b/src/gpu/GrBufferAllocPool.h
index d0fda19..b499e80 100644
--- a/src/gpu/GrBufferAllocPool.h
+++ b/src/gpu/GrBufferAllocPool.h
@@ -8,14 +8,14 @@
 #ifndef GrBufferAllocPool_DEFINED
 #define GrBufferAllocPool_DEFINED
 
-#include "GrGpuBuffer.h"
+#include "GrCpuBuffer.h"
+#include "GrNonAtomicRef.h"
 #include "GrTypesPriv.h"
 #include "SkNoncopyable.h"
 #include "SkTArray.h"
 #include "SkTDArray.h"
 #include "SkTypes.h"
 
-
 class GrGpu;
 
 /**
@@ -35,6 +35,28 @@
     static constexpr size_t kDefaultBufferSize = 1 << 15;
 
     /**
+     * A cache object that can be shared by multiple GrBufferAllocPool instances. It caches
+     * cpu buffer allocations to avoid reallocating them.
+     */
+    class CpuBufferCache : public GrNonAtomicRef<CpuBufferCache> {
+    public:
+        static sk_sp<CpuBufferCache> Make(int maxBuffersToCache);
+
+        sk_sp<GrCpuBuffer> makeBuffer(size_t size, bool mustBeInitialized);
+        void releaseAll();
+
+    private:
+        CpuBufferCache(int maxBuffersToCache);
+
+        struct Buffer {
+            sk_sp<GrCpuBuffer> fBuffer;
+            bool fCleared = false;
+        };
+        std::unique_ptr<Buffer[]> fBuffers;
+        int fMaxBuffersToCache = 0;
+    };
+
+    /**
      * Ensures all buffers are unmapped and have all data written to them.
      * Call before drawing using buffers from the pool.
      */
@@ -56,11 +78,11 @@
      *
      * @param gpu                   The GrGpu used to create the buffers.
      * @param bufferType            The type of buffers to create.
-     * @param initialBuffer         If non-null this should be a kDefaultBufferSize byte allocation.
-     *                              This parameter can be used to avoid malloc/free when all
-     *                              usages can be satisfied with default-sized buffers.
+     * @param cpuBufferCache        If non-null a cache for client side array buffers
+     *                              or staging buffers used before data is uploaded to
+     *                              GPU buffer objects.
      */
-    GrBufferAllocPool(GrGpu* gpu, GrGpuBufferType bufferType, void* initialBuffer);
+    GrBufferAllocPool(GrGpu* gpu, GrGpuBufferType bufferType, sk_sp<CpuBufferCache> cpuBufferCache);
 
     virtual ~GrBufferAllocPool();
 
@@ -129,18 +151,17 @@
     void destroyBlock();
     void deleteBlocks();
     void flushCpuData(const BufferBlock& block, size_t flushSize);
-    void* resetCpuData(size_t newSize);
+    void resetCpuData(size_t newSize);
 #ifdef SK_DEBUG
     void validate(bool unusedBlockAllowed = false) const;
 #endif
     size_t fBytesInUse = 0;
 
     SkTArray<BufferBlock> fBlocks;
+    sk_sp<CpuBufferCache> fCpuBufferCache;
+    sk_sp<GrCpuBuffer> fCpuStagingBuffer;
     GrGpu* fGpu;
     GrGpuBufferType fBufferType;
-    void* fInitialCpuData = nullptr;
-    void* fCpuData = nullptr;
-    size_t fCpuDataSize = 0;
     void* fBufferPtr = nullptr;
 };
 
@@ -153,11 +174,11 @@
      * Constructor
      *
      * @param gpu                   The GrGpu used to create the vertex buffers.
-     * @param initialBuffer         If non-null this should be a kDefaultBufferSize byte allocation.
-     *                              This parameter can be used to avoid malloc/free when all
-     *                              usages can be satisfied with default-sized buffers.
+     * @param cpuBufferCache        If non-null a cache for client side array buffers
+     *                              or staging buffers used before data is uploaded to
+     *                              GPU buffer objects.
      */
-    GrVertexBufferAllocPool(GrGpu* gpu, void* initialBuffer);
+    GrVertexBufferAllocPool(GrGpu* gpu, sk_sp<CpuBufferCache> cpuBufferCache);
 
     /**
      * Returns a block of memory to hold vertices. A buffer designated to hold
@@ -232,11 +253,11 @@
      * Constructor
      *
      * @param gpu                   The GrGpu used to create the index buffers.
-     * @param initialBuffer         If non-null this should be a kDefaultBufferSize byte allocation.
-     *                              This parameter can be used to avoid malloc/free when all
-     *                              usages can be satisfied with default-sized buffers.
+     * @param cpuBufferCache        If non-null a cache for client side array buffers
+     *                              or staging buffers used before data is uploaded to
+     *                              GPU buffer objects.
      */
-    GrIndexBufferAllocPool(GrGpu* gpu, void* initialBuffer);
+    GrIndexBufferAllocPool(GrGpu* gpu, sk_sp<CpuBufferCache> cpuBufferCache);
 
     /**
      * Returns a block of memory to hold indices. A buffer designated to hold
diff --git a/src/gpu/GrDrawingManager.cpp b/src/gpu/GrDrawingManager.cpp
index 3b4c73f..29c72d1 100644
--- a/src/gpu/GrDrawingManager.cpp
+++ b/src/gpu/GrDrawingManager.cpp
@@ -229,14 +229,16 @@
     fActiveOpList = nullptr;
 
     fDAG.prepForFlush();
-    SkASSERT(SkToBool(fVertexBufferSpace) == SkToBool(fIndexBufferSpace));
-    if (!fVertexBufferSpace) {
-        fVertexBufferSpace.reset(new char[GrBufferAllocPool::kDefaultBufferSize]());
-        fIndexBufferSpace.reset(new char[GrBufferAllocPool::kDefaultBufferSize]());
+    if (!fCpuBufferCache) {
+        // We cache more buffers when the backend is using client side arrays. Otherwise, we
+        // expect each pool will use a CPU buffer as a staging buffer before uploading to a GPU
+        // buffer object. Each pool only requires one staging buffer at a time.
+        int maxCachedBuffers = fContext->priv().caps()->preferClientSideDynamicBuffers() ? 2 : 6;
+        fCpuBufferCache = GrBufferAllocPool::CpuBufferCache::Make(maxCachedBuffers);
     }
 
     GrOpFlushState flushState(gpu, fContext->priv().resourceProvider(), &fTokenTracker,
-                              fVertexBufferSpace.get(), fIndexBufferSpace.get());
+                              fCpuBufferCache);
 
     GrOnFlushResourceProvider onFlushProvider(this);
     // TODO: AFAICT the only reason fFlushState is on GrDrawingManager rather than on the
diff --git a/src/gpu/GrDrawingManager.h b/src/gpu/GrDrawingManager.h
index 7b17917..e543ed7 100644
--- a/src/gpu/GrDrawingManager.h
+++ b/src/gpu/GrDrawingManager.h
@@ -8,6 +8,7 @@
 #ifndef GrDrawingManager_DEFINED
 #define GrDrawingManager_DEFINED
 
+#include "GrBufferAllocPool.h"
 #include "GrDeferredUpload.h"
 #include "GrPathRenderer.h"
 #include "GrPathRendererChain.h"
@@ -161,9 +162,9 @@
     GrContext*                        fContext;
     GrPathRendererChain::Options      fOptionsForPathRendererChain;
     GrTextContext::Options            fOptionsForTextContext;
-
-    std::unique_ptr<char[]>           fVertexBufferSpace;
-    std::unique_ptr<char[]>           fIndexBufferSpace;
+    // This cache is used by both the vertex and index pools. It reuses memory across multiple
+    // flushes.
+    sk_sp<GrBufferAllocPool::CpuBufferCache> fCpuBufferCache;
     // In debug builds we guard against improper thread handling
     GrSingleOwner*                    fSingleOwner;
 
diff --git a/src/gpu/GrOpFlushState.cpp b/src/gpu/GrOpFlushState.cpp
index 37ee176..7b983c4 100644
--- a/src/gpu/GrOpFlushState.cpp
+++ b/src/gpu/GrOpFlushState.cpp
@@ -16,9 +16,10 @@
 //////////////////////////////////////////////////////////////////////////////
 
 GrOpFlushState::GrOpFlushState(GrGpu* gpu, GrResourceProvider* resourceProvider,
-                               GrTokenTracker* tokenTracker, void* vertexSpace, void* indexSpace)
-        : fVertexPool(gpu, vertexSpace)
-        , fIndexPool(gpu, indexSpace)
+                               GrTokenTracker* tokenTracker,
+                               sk_sp<GrBufferAllocPool::CpuBufferCache> cpuBufferCache)
+        : fVertexPool(gpu, cpuBufferCache)
+        , fIndexPool(gpu, std::move(cpuBufferCache))
         , fGpu(gpu)
         , fResourceProvider(resourceProvider)
         , fTokenTracker(tokenTracker) {}
diff --git a/src/gpu/GrOpFlushState.h b/src/gpu/GrOpFlushState.h
index 9159d49..54aa4dd 100644
--- a/src/gpu/GrOpFlushState.h
+++ b/src/gpu/GrOpFlushState.h
@@ -29,8 +29,7 @@
-    // vertexSpace and indexSpace may either be null or an alloation of size
-    // GrBufferAllocPool::kDefaultBufferSize. If the latter, then CPU memory is only allocated for
-    // vertices/indices when a buffer larger than kDefaultBufferSize is required.
-    GrOpFlushState(GrGpu*, GrResourceProvider*, GrTokenTracker*, void* vertexSpace,
-                   void* indexSpace);
+    // cpuBufferCache may be null. If non-null it is a cache for CPU buffers used as client side
+    // arrays or as staging buffers before upload to GPU buffer objects, shared by both pools.
+    GrOpFlushState(GrGpu*, GrResourceProvider*, GrTokenTracker*,
+                   sk_sp<GrBufferAllocPool::CpuBufferCache> = nullptr);
 
     ~GrOpFlushState() final { this->reset(); }