Pull tessellation forward into GrTextureOp::onPrePrepareDraws

This trades memory and DDL-record time for GPU-flush time. That is, by tessellating in onPrePrepareDraws we consume more memory (to hold the vertices) and spend more time when recording the DDL but, hopefully, spend less time when drawing the DDL.

Change-Id: I97dd1a00486c7a1da53add2ea203bf3f90c20162
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/250178
Commit-Queue: Robert Phillips <robertphillips@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
diff --git a/src/gpu/ops/GrTextureOp.cpp b/src/gpu/ops/GrTextureOp.cpp
index 8f1d4e3..66ed0fd 100644
--- a/src/gpu/ops/GrTextureOp.cpp
+++ b/src/gpu/ops/GrTextureOp.cpp
@@ -267,17 +267,78 @@
         int fQuadCnt;
     };
 
-    // This descriptor is created in onPrePrepare. It is allocated in the creation-time opData
-    // arena. The actual data for the fDynamicStateArrays and fFixedDynamicState members will be
-    // allocated in the arena passed to 'allocate'.
+    // This descriptor is used in both onPrePrepareDraws and onPrepareDraws.
+    //
+    // In the onPrePrepareDraws case it is allocated in the creation-time opData
+    // arena. Both allocateCommon and allocatePrePrepareOnly are called and they also allocate
+    // their memory in the creation-time opData arena.
+    //
+    // In the onPrepareDraws case this descriptor is created on the stack and only
+    // allocateCommon is called. In this case the common memory fields are allocated
+    // in the flush-time arena (i.e., as part of the flushState).
     struct PrePreparedDesc {
-        GrPipeline::DynamicStateArrays* fDynamicStateArrays = nullptr;
-        GrPipeline::FixedDynamicState*  fFixedDynamicState = nullptr;
         VertexSpec                      fVertexSpec;
         int                             fNumProxies = 0;
         int                             fNumTotalQuads = 0;
+        GrPipeline::DynamicStateArrays* fDynamicStateArrays = nullptr;
+        GrPipeline::FixedDynamicState*  fFixedDynamicState = nullptr;
 
-        void allocate(SkArenaAlloc* arena, const GrAppliedClip* clip, GrTextureProxy* firstProxy) {
+        // These two member variables are only allocated and filled in by 'onPrePrepareDraws'
+        // ('onPrepareDraws' only reads them when pre-prepared). The prior five are used by both.
+        // TODO: we could just recompute 'fVertexOffsets' in onPrepareDraws
+        int*                            fVertexOffsets = nullptr;
+        char*                           fVertices = nullptr;
+
+        // How big should 'fVertices' be to hold all the vertex data?
+        size_t totalSizeInBytes() const {
+            return fNumTotalQuads * fVertexSpec.verticesPerQuad() * fVertexSpec.vertexSize();
+        }
+
+#ifdef SK_DEBUG
+        int totalNumVertices() const {
+            return fNumTotalQuads * fVertexSpec.verticesPerQuad();
+        }
+#endif
+
+        // Helper to fill in the fFixedDynamicState and fDynamicStateArrays. If there is more
+        // than one mesh/proxy they are stored in fDynamicStateArrays but if there is only one
+        // it is stored in fFixedDynamicState.
+        void setMeshProxy(int index, GrTextureProxy* proxy) {
+            SkASSERT(index < fNumProxies);
+
+            if (fDynamicStateArrays) {
+                SkASSERT(fDynamicStateArrays->fPrimitiveProcessorTextures);
+                SkASSERT(fNumProxies > 1);
+
+                fDynamicStateArrays->fPrimitiveProcessorTextures[index] = proxy;
+            } else {
+                SkASSERT(fFixedDynamicState);
+                SkASSERT(fNumProxies == 1);
+
+                fFixedDynamicState->fPrimitiveProcessorTextures[index] = proxy;
+            }
+        }
+
+#ifdef SK_DEBUG
+        GrTextureProxy* getMeshProxy(int index) {
+            SkASSERT(index < fNumProxies);
+
+            if (fDynamicStateArrays) {
+                SkASSERT(fDynamicStateArrays->fPrimitiveProcessorTextures);
+                SkASSERT(fNumProxies > 1);
+
+                return fDynamicStateArrays->fPrimitiveProcessorTextures[index];
+            } else {
+                SkASSERT(fFixedDynamicState);
+                SkASSERT(fNumProxies == 1);
+
+                return fFixedDynamicState->fPrimitiveProcessorTextures[index];
+            }
+        }
+#endif
+
+        // Allocate the fields required in both onPrePrepareDraws and onPrepareDraws
+        void allocateCommon(SkArenaAlloc* arena, const GrAppliedClip* clip) {
             // We'll use a dynamic state array for the GP textures when there are multiple ops.
             // Otherwise, we use fixed dynamic state to specify the single op's proxy.
             if (fNumProxies > 1) {
@@ -285,9 +346,15 @@
                 fFixedDynamicState = Target::MakeFixedDynamicState(arena, clip, 0);
             } else {
                 fFixedDynamicState = Target::MakeFixedDynamicState(arena, clip, 1);
-                fFixedDynamicState->fPrimitiveProcessorTextures[0] = firstProxy;
             }
         }
+
+        // Allocate the fields only needed by onPrePrepareDraws
+        void allocatePrePrepareOnly(SkArenaAlloc* arena) {
+            fVertexOffsets = arena->makeArrayDefault<int>(fNumProxies);
+            fVertices = arena->makeArrayDefault<char>(this->totalSizeInBytes());
+        }
+
     };
 
     // dstQuad should be the geometry transformed by the view matrix. If domainRect
@@ -419,18 +486,20 @@
         fDomain = static_cast<unsigned>(netDomain);
     }
 
-    void tess(void* v, const VertexSpec& spec, const GrTextureProxy* proxy,
-              GrQuadBuffer<ColorDomainAndAA>::Iter* iter, int cnt) const {
+    static void Tess(void* v, const VertexSpec& spec, const GrTextureProxy* proxy,
+                     GrQuadBuffer<ColorDomainAndAA>::Iter* iter, int cnt,
+                     GrSamplerState::Filter filter) {
         TRACE_EVENT0("skia.gpu", TRACE_FUNC);
         auto origin = proxy->origin();
-        const auto* texture = proxy->peekTexture();
+        SkISize dimensions = proxy->backingStoreDimensions();
+
         float iw, ih, h;
         if (proxy->textureType() == GrTextureType::kRectangle) {
             iw = ih = 1.f;
-            h = texture->height();
+            h = dimensions.height();
         } else {
-            iw = 1.f / texture->width();
-            ih = 1.f / texture->height();
+            iw = 1.f / dimensions.width();
+            ih = 1.f / dimensions.height();
             h = 1.f;
         }
 
@@ -444,7 +513,7 @@
             // Must correct the texture coordinates and domain now that the real texture size
             // is known
             compute_src_quad(origin, iter->localQuad(), iw, ih, h, &srcQuad);
-            compute_domain(info.domain(), this->filter(), origin, info.fDomainRect, iw, ih, h,
+            compute_domain(info.domain(), filter, origin, info.fDomainRect, iw, ih, h,
                            &domain);
             v = GrQuadPerEdgeAA::Tessellate(v, spec, iter->deviceQuad(), info.fColor, srcQuad,
                                             domain, info.aaFlags());
@@ -464,9 +533,51 @@
 
         fPrePreparedDesc->fVertexSpec = this->characterize(&fPrePreparedDesc->fNumProxies,
                                                            &fPrePreparedDesc->fNumTotalQuads);
-        fPrePreparedDesc->allocate(arena, clip, fProxyCountPairs[0].fProxy);
+        fPrePreparedDesc->allocateCommon(arena, clip);
 
-        // Pull forward the tessellation of the quads to here
+        fPrePreparedDesc->allocatePrePrepareOnly(arena);
+
+        {
+            SkDEBUGCODE(int totQuadsSeen = 0;)
+            SkDEBUGCODE(int totVerticesSeen = 0;)
+            int vertexOffsetInBuffer = 0;
+            char* dst = fPrePreparedDesc->fVertices;
+            const size_t vertexSize = fPrePreparedDesc->fVertexSpec.vertexSize();
+
+            int meshIndex = 0;
+            for (const auto& op : ChainRange<TextureOp>(this)) {
+                auto iter = op.fQuads.iterator();
+                for (unsigned p = 0; p < op.fProxyCnt; ++p) {
+                    GrTextureProxy* proxy = op.fProxyCountPairs[p].fProxy;
+
+                    int quadCnt = op.fProxyCountPairs[p].fQuadCnt;
+                    SkDEBUGCODE(totQuadsSeen += quadCnt;)
+
+                    int meshVertexCnt = quadCnt * fPrePreparedDesc->fVertexSpec.verticesPerQuad();
+                    SkDEBUGCODE(totVerticesSeen += meshVertexCnt);
+
+                    Tess(dst, fPrePreparedDesc->fVertexSpec, proxy, &iter, quadCnt, op.filter());
+
+                    fPrePreparedDesc->fVertexOffsets[meshIndex] = vertexOffsetInBuffer;
+                    SkASSERT(vertexOffsetInBuffer * vertexSize ==
+                             (size_t)(dst - fPrePreparedDesc->fVertices));
+                    fPrePreparedDesc->setMeshProxy(meshIndex, proxy);
+                    ++meshIndex;
+
+                    vertexOffsetInBuffer += meshVertexCnt;
+                    dst += vertexSize * meshVertexCnt;
+                }
+                // If quad counts per proxy were calculated correctly, the entire iterator
+                // should have been consumed.
+                SkASSERT(!iter.next());
+            }
+
+            SkASSERT(fPrePreparedDesc->totalSizeInBytes() ==
+                                                     (size_t)(dst - fPrePreparedDesc->fVertices));
+            SkASSERT(meshIndex == fPrePreparedDesc->fNumProxies);
+            SkASSERT(totQuadsSeen == fPrePreparedDesc->fNumTotalQuads);
+            SkASSERT(totVerticesSeen == fPrePreparedDesc->totalNumVertices());
+        }
     }
 
 #ifdef SK_DEBUG
@@ -542,7 +653,9 @@
             SkArenaAlloc* arena = target->allocator();
 
             desc.fVertexSpec = this->characterize(&desc.fNumProxies, &desc.fNumTotalQuads);
-            desc.allocate(arena, target->appliedClip(), fProxyCountPairs[0].fProxy);
+            desc.allocateCommon(arena, target->appliedClip());
+
+            SkASSERT(!desc.fVertexOffsets && !desc.fVertices);
         }
 
         size_t vertexSize = desc.fVertexSpec.vertexSize();
@@ -573,7 +686,16 @@
                 }
                 SkASSERT(numAllocatedVertices >= meshVertexCnt);
 
-                op.tess(vdata, desc.fVertexSpec, proxy, &iter, quadCnt);
+                if (fPrePreparedDesc) {
+                    // TODO: when we've prePrepared the vertex data should we just allocate
+                    // all the vertices together and just do one memcpy?
+                    size_t offset = desc.fVertexOffsets[meshIndex] * vertexSize;
+                    memcpy(vdata, &desc.fVertices[offset], meshVertexCnt * vertexSize);
+                    SkASSERT(proxy == desc.getMeshProxy(meshIndex));
+                } else {
+                    Tess(vdata, desc.fVertexSpec, proxy, &iter, quadCnt, op.filter());
+                    desc.setMeshProxy(meshIndex, proxy);
+                }
 
                 SkASSERT(meshIndex < desc.fNumProxies);
 
@@ -583,18 +705,17 @@
                     return;
                 }
                 meshes[meshIndex].setVertexData(vbuffer, vertexOffsetInBuffer);
-                if (desc.fDynamicStateArrays) {
-                    desc.fDynamicStateArrays->fPrimitiveProcessorTextures[meshIndex] = proxy;
-                }
                 ++meshIndex;
+
                 numAllocatedVertices -= meshVertexCnt;
                 numQuadVerticesLeft -= meshVertexCnt;
                 vertexOffsetInBuffer += meshVertexCnt;
                 vdata = reinterpret_cast<char*>(vdata) + vertexSize * meshVertexCnt;
             }
-            // If quad counts per proxy were calculated correctly, the entire iterator should have
-            // been consumed.
-            SkASSERT(!iter.next());
+
+            // If quad counts per proxy were calculated correctly, the entire iterator should
+            // have been consumed.
+            SkASSERT(fPrePreparedDesc || !iter.next());
         }
         SkASSERT(!numQuadVerticesLeft);
         SkASSERT(!numAllocatedVertices);