Make GrMemoryPool play nice with bucketing allocators.

Some memory allocators have very coarse size buckets. For example, on
Android (jemalloc) an attempt to allocate 32 KiB + 1 byte ends up
allocating 40 KiB, wasting 8 KiB.

The GrMemoryPool ctor takes two arguments that specify prealloc / block
sizes, and then inflates them to accommodate some bookkeeping structures.
Since most places create GrMemoryPools with pow2 sizes (which map to exact
buckets in most allocators), the inflation causes the allocator to select
the next size bucket, wasting memory.

This CL makes GrMemoryPool stop inflating the sizes it was created with,
and instead allocate the specified amounts exactly. Part of the allocated
memory is then used for the bookkeeping structures. Additionally, a
GrObjectMemoryPool template is provided, which takes prealloc / block
object counts (instead of sizes) and guarantees that the specified number
of objects will fit in the prealloc / block spaces.
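
For illustration, compare the InstancedRendering draw pool before and after
this CL (a sketch; the exact waste depends on the allocator's buckets):

    // Before: byte sizes, inflated internally by header / padding overhead,
    // so a pow2-sized block could spill into the next allocator bucket.
    GrMemoryPool fDrawPool(1024 * sizeof(Batch::Draw), 1024 * sizeof(Batch::Draw));

    // After: object counts; the pool allocates exactly what it asks for and
    // still guarantees that 1024 draws fit per block despite bookkeeping.
    GrObjectMemoryPool<Batch::Draw> fDrawPool(1024, 1024);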

BUG=651872
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2525773002

Review-Url: https://codereview.chromium.org/2525773002
diff --git a/src/gpu/GrMemoryPool.cpp b/src/gpu/GrMemoryPool.cpp
index 6bc0f54..c95b59f 100644
--- a/src/gpu/GrMemoryPool.cpp
+++ b/src/gpu/GrMemoryPool.cpp
@@ -13,17 +13,19 @@
     #define VALIDATE
 #endif
 
+constexpr size_t GrMemoryPool::kSmallestMinAllocSize;
+
 GrMemoryPool::GrMemoryPool(size_t preallocSize, size_t minAllocSize) {
     SkDEBUGCODE(fAllocationCnt = 0);
     SkDEBUGCODE(fAllocBlockCnt = 0);
 
-    minAllocSize = SkTMax<size_t>(minAllocSize, 1 << 10);
-    fMinAllocSize = GrSizeAlignUp(minAllocSize + kPerAllocPad, kAlignment);
-    fPreallocSize = GrSizeAlignUp(preallocSize + kPerAllocPad, kAlignment);
-    fPreallocSize = SkTMax(fPreallocSize, fMinAllocSize);
+    minAllocSize = SkTMax<size_t>(GrSizeAlignUp(minAllocSize, kAlignment), kSmallestMinAllocSize);
+    preallocSize = SkTMax<size_t>(GrSizeAlignUp(preallocSize, kAlignment), minAllocSize);
+
+    fMinAllocSize = minAllocSize;
     fSize = 0;
 
-    fHead = CreateBlock(fPreallocSize);
+    fHead = CreateBlock(preallocSize);
     fTail = fHead;
     fHead->fNext = nullptr;
     fHead->fPrev = nullptr;
@@ -43,7 +45,7 @@
     size += kPerAllocPad;
     size = GrSizeAlignUp(size, kAlignment);
     if (fTail->fFreeSize < size) {
-        size_t blockSize = size;
+        size_t blockSize = size + kHeaderSize;
         blockSize = SkTMax<size_t>(blockSize, fMinAllocSize);
         BlockHeader* block = CreateBlock(blockSize);
 
@@ -87,7 +89,7 @@
         if (fHead == block) {
             fHead->fCurrPtr = reinterpret_cast<intptr_t>(fHead) + kHeaderSize;
             fHead->fLiveCount = 0;
-            fHead->fFreeSize = fPreallocSize;
+            fHead->fFreeSize = fHead->fSize - kHeaderSize;
         } else {
             BlockHeader* prev = block->fPrev;
             BlockHeader* next = block->fNext;
@@ -115,18 +117,18 @@
     VALIDATE;
 }
 
-GrMemoryPool::BlockHeader* GrMemoryPool::CreateBlock(size_t size) {
-    size_t paddedSize = size + kHeaderSize;
+GrMemoryPool::BlockHeader* GrMemoryPool::CreateBlock(size_t blockSize) {
+    blockSize = SkTMax<size_t>(blockSize, kHeaderSize);
     BlockHeader* block =
-        reinterpret_cast<BlockHeader*>(sk_malloc_throw(paddedSize));
+        reinterpret_cast<BlockHeader*>(sk_malloc_throw(blockSize));
     // we assume malloc gives us aligned memory
     SkASSERT(!(reinterpret_cast<intptr_t>(block) % kAlignment));
     SkDEBUGCODE(block->fBlockSentinal = kAssignedMarker);
     block->fLiveCount = 0;
-    block->fFreeSize = size;
+    block->fFreeSize = blockSize - kHeaderSize;
     block->fCurrPtr = reinterpret_cast<intptr_t>(block) + kHeaderSize;
     block->fPrevPtr = 0; // gcc warns on assigning nullptr to an intptr_t.
-    block->fSize = paddedSize;
+    block->fSize = blockSize;
     return block;
 }
 
@@ -153,18 +155,16 @@
         intptr_t b = reinterpret_cast<intptr_t>(block);
         size_t ptrOffset = block->fCurrPtr - b;
         size_t totalSize = ptrOffset + block->fFreeSize;
-        size_t userSize = totalSize - kHeaderSize;
         intptr_t userStart = b + kHeaderSize;
 
         SkASSERT(!(b % kAlignment));
         SkASSERT(!(totalSize % kAlignment));
-        SkASSERT(!(userSize % kAlignment));
         SkASSERT(!(block->fCurrPtr % kAlignment));
         if (fHead != block) {
             SkASSERT(block->fLiveCount);
-            SkASSERT(userSize >= fMinAllocSize);
+            SkASSERT(totalSize >= fMinAllocSize);
         } else {
-            SkASSERT(userSize == fPreallocSize);
+            SkASSERT(totalSize == block->fSize);
         }
         if (!block->fLiveCount) {
             SkASSERT(ptrOffset ==  kHeaderSize);
diff --git a/src/gpu/GrMemoryPool.h b/src/gpu/GrMemoryPool.h
index 43826d3..e483aab 100644
--- a/src/gpu/GrMemoryPool.h
+++ b/src/gpu/GrMemoryPool.h
@@ -13,16 +13,23 @@
 /**
  * Allocates memory in blocks and parcels out space in the blocks for allocation
  * requests. It is optimized for allocate / release speed over memory
- * effeciency. The interface is designed to be used to implement operator new
+ * efficiency. The interface is designed to be used to implement operator new
  * and delete overrides. All allocations are expected to be released before the
  * pool's destructor is called. Allocations will be 8-byte aligned.
  */
 class GrMemoryPool {
 public:
     /**
-     * Prealloc size is the amount of space to make available at pool creation
-     * time and keep around until pool destruction. The min alloc size is the
-     * smallest allowed size of additional allocations.
+     * Prealloc size is the amount of space to allocate at pool creation
+     * time and keep around until pool destruction. The min alloc size is
+     * the smallest allowed size of additional allocations. Both sizes are
+     * adjusted to ensure that:
+     *   1. they are 8-byte aligned
+     *   2. minAllocSize >= kSmallestMinAllocSize
+     *   3. preallocSize >= minAllocSize
+     *
+     * Both sizes are exactly what the pool will allocate from the system;
+     * portions of the allocated memory are used for internal bookkeeping.
      */
     GrMemoryPool(size_t preallocSize, size_t minAllocSize);
 
@@ -48,6 +55,16 @@
      */
     size_t size() const { return fSize; }
 
+    /**
+     * Returns the preallocated size of the GrMemoryPool
+     */
+    size_t preallocSize() const { return fHead->fSize; }
+
+    /**
+     * Minimum value of minAllocSize constructor argument.
+     */
+    constexpr static size_t kSmallestMinAllocSize = 1 << 10;
+
 private:
     struct BlockHeader;
 
@@ -81,14 +98,7 @@
         BlockHeader* fHeader;    ///< pointer back to the block header in which an alloc resides
     };
 
-    enum {
-        // We assume this alignment is good enough for everybody.
-        kAlignment    = 8,
-        kHeaderSize   = GR_CT_ALIGN_UP(sizeof(BlockHeader), kAlignment),
-        kPerAllocPad  = GR_CT_ALIGN_UP(sizeof(AllocHeader), kAlignment),
-    };
     size_t                            fSize;
-    size_t                            fPreallocSize;
     size_t                            fMinAllocSize;
     BlockHeader*                      fHead;
     BlockHeader*                      fTail;
@@ -96,6 +106,79 @@
     int                               fAllocationCnt;
     int                               fAllocBlockCnt;
 #endif
+
+protected:
+    enum {
+        // We assume this alignment is good enough for everybody.
+        kAlignment    = 8,
+        kHeaderSize   = GR_CT_ALIGN_UP(sizeof(BlockHeader), kAlignment),
+        kPerAllocPad  = GR_CT_ALIGN_UP(sizeof(AllocHeader), kAlignment),
+    };
 };
 
+/**
+ * Variant of GrMemoryPool that can only allocate objects of a single type. It is
+ * not as flexible as GrMemoryPool, but it has a more convenient allocate() method,
+ * and more importantly, it guarantees the number of objects that are preallocated
+ * at construction or when adding a new memory block. For example,
+ *
+ * GrMemoryPool pool(3 * sizeof(T), 1000 * sizeof(T));
+ * pool.allocate(sizeof(T));
+ * pool.allocate(sizeof(T));
+ * pool.allocate(sizeof(T));
+ *
+ * will preallocate 3 * sizeof(T) bytes and use some of those bytes for internal
+ * structures. Because of that, the last allocate() call will end up allocating
+ * a new block of 1000 * sizeof(T) bytes. In contrast,
+ *
+ * GrObjectMemoryPool<T> pool(3, 1000);
+ * pool.allocate();
+ * pool.allocate();
+ * pool.allocate();
+ *
+ * guarantees to preallocate enough memory for 3 objects of sizeof(T), so the last
+ * allocate() will use preallocated memory and won't cause allocation of a new block.
+ *
+ * The same is true for the second (minAlloc) ctor argument: this class guarantees
+ * that a newly added block will have enough space for 1000 objects of sizeof(T),
+ * while GrMemoryPool does not.
+ */
+template <class T>
+class GrObjectMemoryPool: public GrMemoryPool {
+public:
+    /**
+     * Preallocates memory for preallocCount objects, and sets new block size to be
+     * enough to hold minAllocCount objects.
+     */
+    GrObjectMemoryPool(size_t preallocCount, size_t minAllocCount)
+        : GrMemoryPool(CountToSize(preallocCount),
+                       CountToSize(SkTMax(minAllocCount, kSmallestMinAllocCount))) {
+    }
+
+    /**
+     * Allocates memory for an object, but doesn't construct or otherwise initialize it.
+     * The memory must be freed with release().
+     */
+    T* allocate() { return static_cast<T*>(GrMemoryPool::allocate(sizeof(T))); }
+
+private:
+    constexpr static size_t kTotalObjectSize =
+        kPerAllocPad + GR_CT_ALIGN_UP(sizeof(T), kAlignment);
+
+    constexpr static size_t CountToSize(size_t count) {
+        return kHeaderSize + count * kTotalObjectSize;
+    }
+
+public:
+    /**
+     * Minimum value of minAllocCount constructor argument.
+     */
+    constexpr static size_t kSmallestMinAllocCount =
+        (GrMemoryPool::kSmallestMinAllocSize - kHeaderSize + kTotalObjectSize - 1) /
+            kTotalObjectSize;
+};
+
+template <class T>
+constexpr size_t GrObjectMemoryPool<T>::kSmallestMinAllocCount;
+
 #endif
diff --git a/src/gpu/instanced/InstancedRendering.cpp b/src/gpu/instanced/InstancedRendering.cpp
index 30ec5ce..99a749a 100644
--- a/src/gpu/instanced/InstancedRendering.cpp
+++ b/src/gpu/instanced/InstancedRendering.cpp
@@ -18,7 +18,7 @@
 InstancedRendering::InstancedRendering(GrGpu* gpu)
     : fGpu(SkRef(gpu)),
       fState(State::kRecordingDraws),
-      fDrawPool(1024 * sizeof(Batch::Draw), 1024 * sizeof(Batch::Draw)) {
+      fDrawPool(1024, 1024) {
 }
 
 GrDrawBatch* InstancedRendering::recordRect(const SkRect& rect, const SkMatrix& viewMatrix,
@@ -241,7 +241,7 @@
       fIsTracked(false),
       fNumDraws(1),
       fNumChangesInGeometry(0) {
-    fHeadDraw = fTailDraw = (Draw*)fInstancedRendering->fDrawPool.allocate(sizeof(Draw));
+    fHeadDraw = fTailDraw = fInstancedRendering->fDrawPool.allocate();
 #ifdef SK_DEBUG
     fHeadDraw->fGeometry = {-1, 0};
 #endif
diff --git a/src/gpu/instanced/InstancedRendering.h b/src/gpu/instanced/InstancedRendering.h
index 1325370..97d6e19 100644
--- a/src/gpu/instanced/InstancedRendering.h
+++ b/src/gpu/instanced/InstancedRendering.h
@@ -173,7 +173,7 @@
 
     const sk_sp<GrGpu>                   fGpu;
     State                                fState;
-    GrMemoryPool                         fDrawPool;
+    GrObjectMemoryPool<Batch::Draw>      fDrawPool;
     SkSTArray<1024, ParamsTexel, true>   fParams;
     BatchList                            fTrackedBatches;
     sk_sp<const GrBuffer>                fVertexBuffer;
diff --git a/tests/GrMemoryPoolTest.cpp b/tests/GrMemoryPoolTest.cpp
index 863574a..4eb5ca7 100644
--- a/tests/GrMemoryPoolTest.cpp
+++ b/tests/GrMemoryPoolTest.cpp
@@ -10,6 +10,7 @@
 #if SK_SUPPORT_GPU
 #include "GrMemoryPool.h"
 #include "SkRandom.h"
+#include "SkTArray.h"
 #include "SkTDArray.h"
 #include "SkTemplates.h"
 
@@ -227,4 +228,174 @@
     }
 }
 
+// GrMemoryPool requires that it's empty at the point of destruction. This class
+// helps achieve that by releasing all added memory in its destructor.
+class AutoPoolReleaser {
+public:
+    AutoPoolReleaser(GrMemoryPool& pool): fPool(pool) {
+    }
+    ~AutoPoolReleaser() {
+        for (void* ptr: fAllocated) {
+            fPool.release(ptr);
+        }
+    }
+    void add(void* ptr) {
+        fAllocated.push_back(ptr);
+    }
+private:
+    GrMemoryPool& fPool;
+    SkTArray<void*> fAllocated;
+};
+
+DEF_TEST(GrMemoryPoolAPI, reporter) {
+    constexpr size_t kSmallestMinAllocSize = GrMemoryPool::kSmallestMinAllocSize;
+
+    // Allocates memory until the pool adds a new block (pool.size() changes).
+    auto allocateMemory = [](GrMemoryPool& pool, AutoPoolReleaser& r) {
+        size_t origPoolSize = pool.size();
+        while (pool.size() == origPoolSize) {
+            r.add(pool.allocate(31));
+        }
+    };
+
+    // Effective prealloc space capacity is >= kSmallestMinAllocSize.
+    {
+        GrMemoryPool pool(0, 0);
+        REPORTER_ASSERT(reporter, pool.preallocSize() == kSmallestMinAllocSize);
+    }
+
+    // Effective prealloc space capacity is >= minAllocSize.
+    {
+        constexpr size_t kMinAllocSize = kSmallestMinAllocSize * 2;
+        GrMemoryPool pool(kSmallestMinAllocSize, kMinAllocSize);
+        REPORTER_ASSERT(reporter, pool.preallocSize() == kMinAllocSize);
+    }
+
+    // Effective block size capacity is >= kSmallestMinAllocSize.
+    {
+        GrMemoryPool pool(kSmallestMinAllocSize, kSmallestMinAllocSize / 2);
+        AutoPoolReleaser r(pool);
+
+        allocateMemory(pool, r);
+        REPORTER_ASSERT(reporter, pool.size() == kSmallestMinAllocSize);
+    }
+
+    // Pool allocates exactly preallocSize on creation.
+    {
+        constexpr size_t kPreallocSize = kSmallestMinAllocSize * 5;
+        GrMemoryPool pool(kPreallocSize, 0);
+        REPORTER_ASSERT(reporter, pool.preallocSize() == kPreallocSize);
+    }
+
+    // Pool allocates exactly minAllocSize when it expands.
+    {
+        constexpr size_t kMinAllocSize = kSmallestMinAllocSize * 7;
+        GrMemoryPool pool(0, kMinAllocSize);
+        AutoPoolReleaser r(pool);
+
+        allocateMemory(pool, r);
+        REPORTER_ASSERT(reporter, pool.size() == kMinAllocSize);
+
+        allocateMemory(pool, r);
+        REPORTER_ASSERT(reporter, pool.size() == 2 * kMinAllocSize);
+    }
+
+    // When asked to allocate an amount > minAllocSize, the pool allocates a larger
+    // block to accommodate all internal structures.
+    {
+        constexpr size_t kMinAllocSize = kSmallestMinAllocSize * 2;
+        GrMemoryPool pool(kSmallestMinAllocSize, kMinAllocSize);
+        AutoPoolReleaser r(pool);
+
+        REPORTER_ASSERT(reporter, pool.size() == 0);
+
+        constexpr size_t hugeSize = 10 * kMinAllocSize;
+        r.add(pool.allocate(hugeSize));
+        REPORTER_ASSERT(reporter, pool.size() > hugeSize);
+
+        // The block allocated to accommodate the huge request doesn't include any
+        // extra space, so the next allocation request allocates a new block.
+        size_t hugeBlockSize = pool.size();
+        r.add(pool.allocate(0));
+        REPORTER_ASSERT(reporter, pool.size() == hugeBlockSize + kMinAllocSize);
+    }
+}
+
+DEF_TEST(GrObjectMemoryPoolAPI, reporter) {
+    struct Data {
+        int value[5];
+    };
+    using DataObjectPool = GrObjectMemoryPool<Data>;
+    constexpr size_t kSmallestMinAllocCount = DataObjectPool::kSmallestMinAllocCount;
+
+    // Allocates objects until the pool adds a new block (pool.size() changes).
+    // Returns the number of objects that fit into the current block (i.e. before
+    // pool.size() changed; the new block always has one object allocated from it).
+    auto allocateObjects = [](DataObjectPool& pool, AutoPoolReleaser& r) -> size_t {
+        size_t count = 0;
+        size_t origPoolSize = pool.size();
+        while (pool.size() == origPoolSize) {
+            r.add(pool.allocate());
+            count++;
+        }
+        return count - 1;
+    };
+
+    // Effective prealloc space capacity is >= kSmallestMinAllocCount.
+    {
+        DataObjectPool pool(kSmallestMinAllocCount / 3, 0);
+        AutoPoolReleaser r(pool);
+
+        size_t preallocCount = allocateObjects(pool, r);
+        REPORTER_ASSERT(reporter, preallocCount == kSmallestMinAllocCount);
+    }
+
+    // Effective prealloc space capacity is >= minAllocCount.
+    {
+        DataObjectPool pool(kSmallestMinAllocCount, 2 * kSmallestMinAllocCount);
+        AutoPoolReleaser r(pool);
+
+        size_t preallocCount = allocateObjects(pool, r);
+        REPORTER_ASSERT(reporter, preallocCount == 2 * kSmallestMinAllocCount);
+    }
+
+    // Effective block capacity is >= kSmallestMinAllocCount.
+    {
+        DataObjectPool pool(kSmallestMinAllocCount, kSmallestMinAllocCount / 2);
+        AutoPoolReleaser r(pool);
+
+        // Fill prealloc space
+        allocateObjects(pool, r);
+
+        size_t minAllocCount = 1 + allocateObjects(pool, r);
+        REPORTER_ASSERT(reporter, minAllocCount == kSmallestMinAllocCount);
+    }
+
+    // Pool allocates space for exactly preallocCount objects on creation.
+    {
+        constexpr size_t kPreallocCount = kSmallestMinAllocCount * 7 / 3;
+        DataObjectPool pool(kPreallocCount, 0);
+        AutoPoolReleaser r(pool);
+
+        size_t preallocCount = allocateObjects(pool, r);
+        REPORTER_ASSERT(reporter, preallocCount == kPreallocCount);
+    }
+
+    // Pool allocates space for minAllocCount objects when it adds a new block.
+    {
+        constexpr size_t kMinAllocCount = kSmallestMinAllocCount * 11 / 3;
+        DataObjectPool pool(0, kMinAllocCount);
+        AutoPoolReleaser r(pool);
+
+        // Fill prealloc space
+        allocateObjects(pool, r);
+
+        size_t firstBlockCount = 1 + allocateObjects(pool, r);
+        REPORTER_ASSERT(reporter, firstBlockCount == kMinAllocCount);
+
+        size_t secondBlockCount = 1 + allocateObjects(pool, r);
+        REPORTER_ASSERT(reporter, secondBlockCount == kMinAllocCount);
+    }
+}
+
 #endif