Pull tessellation forward into GrTextureOp::onPrePrepareDraws
This trades memory and DDL-record time for GPU-flush time. That is, by tessellating in onPrePrepareDraws we will consume more memory (to hold the vertices) and spend more time at DDL-record time but, hopefully, spend less time when drawing the DDL.
Change-Id: I97dd1a00486c7a1da53add2ea203bf3f90c20162
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/250178
Commit-Queue: Robert Phillips <robertphillips@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
diff --git a/src/gpu/ops/GrTextureOp.cpp b/src/gpu/ops/GrTextureOp.cpp
index 8f1d4e3..66ed0fd 100644
--- a/src/gpu/ops/GrTextureOp.cpp
+++ b/src/gpu/ops/GrTextureOp.cpp
@@ -267,17 +267,78 @@
int fQuadCnt;
};
- // This descriptor is created in onPrePrepare. It is allocated in the creation-time opData
- // arena. The actual data for the fDynamicStateArrays and fFixedDynamicState members will be
- // allocated in the arena passed to 'allocate'.
+ // This descriptor is used in both onPrePrepareDraws and onPrepareDraws.
+ //
+ // In the onPrePrepareDraws case it is allocated in the creation-time opData
+ // arena. Both allocateCommon and allocatePrePrepareOnly are called and they also allocate
+ // their memory in the creation-time opData arena.
+ //
+ // In the onPrepareDraws case this descriptor is created on the stack and only
+ // allocateCommon is called. In this case the common memory fields are allocated
+ // in the flush-time arena (i.e., as part of the flushState).
struct PrePreparedDesc {
- GrPipeline::DynamicStateArrays* fDynamicStateArrays = nullptr;
- GrPipeline::FixedDynamicState* fFixedDynamicState = nullptr;
VertexSpec fVertexSpec;
int fNumProxies = 0;
int fNumTotalQuads = 0;
+ GrPipeline::DynamicStateArrays* fDynamicStateArrays = nullptr;
+ GrPipeline::FixedDynamicState* fFixedDynamicState = nullptr;
- void allocate(SkArenaAlloc* arena, const GrAppliedClip* clip, GrTextureProxy* firstProxy) {
+ // These two member variables are only allocated and filled in by 'onPrePrepareDraws';
+ // 'onPrepareDraws' only reads them back. The prior five are also set up by 'onPrepareDraws'.
+ // TODO: we could just recompute 'fVertexOffsets' in onPrepareDraws
+ int* fVertexOffsets = nullptr;
+ char* fVertices = nullptr;
+
+ // How big should 'fVertices' be to hold all the vertex data?
+ size_t totalSizeInBytes() const {
+ return fNumTotalQuads * fVertexSpec.verticesPerQuad() * fVertexSpec.vertexSize();
+ }
+
+#ifdef SK_DEBUG
+ int totalNumVertices() const {
+ return fNumTotalQuads * fVertexSpec.verticesPerQuad();
+ }
+#endif
+
+ // Helper to fill in the fFixedDynamicState and fDynamicStateArrays. If there is more
+ // than one mesh/proxy they are stored in fDynamicStateArrays but if there is only one
+ // it is stored in fFixedDynamicState.
+ void setMeshProxy(int index, GrTextureProxy* proxy) {
+ SkASSERT(index < fNumProxies);
+
+ if (fDynamicStateArrays) {
+ SkASSERT(fDynamicStateArrays->fPrimitiveProcessorTextures);
+ SkASSERT(fNumProxies > 1);
+
+ fDynamicStateArrays->fPrimitiveProcessorTextures[index] = proxy;
+ } else {
+ SkASSERT(fFixedDynamicState);
+ SkASSERT(fNumProxies == 1);
+
+ fFixedDynamicState->fPrimitiveProcessorTextures[index] = proxy;
+ }
+ }
+
+#ifdef SK_DEBUG
+ GrTextureProxy* getMeshProxy(int index) {
+ SkASSERT(index < fNumProxies);
+
+ if (fDynamicStateArrays) {
+ SkASSERT(fDynamicStateArrays->fPrimitiveProcessorTextures);
+ SkASSERT(fNumProxies > 1);
+
+ return fDynamicStateArrays->fPrimitiveProcessorTextures[index];
+ } else {
+ SkASSERT(fFixedDynamicState);
+ SkASSERT(fNumProxies == 1);
+
+ return fFixedDynamicState->fPrimitiveProcessorTextures[index];
+ }
+ }
+#endif
+
+ // Allocate the fields required in both onPrePrepareDraws and onPrepareDraws
+ void allocateCommon(SkArenaAlloc* arena, const GrAppliedClip* clip) {
// We'll use a dynamic state array for the GP textures when there are multiple ops.
// Otherwise, we use fixed dynamic state to specify the single op's proxy.
if (fNumProxies > 1) {
@@ -285,9 +346,15 @@
fFixedDynamicState = Target::MakeFixedDynamicState(arena, clip, 0);
} else {
fFixedDynamicState = Target::MakeFixedDynamicState(arena, clip, 1);
- fFixedDynamicState->fPrimitiveProcessorTextures[0] = firstProxy;
}
}
+
+ // Allocate the fields only needed by onPrePrepareDraws
+ void allocatePrePrepareOnly(SkArenaAlloc* arena) {
+ fVertexOffsets = arena->makeArrayDefault<int>(fNumProxies);
+ fVertices = arena->makeArrayDefault<char>(this->totalSizeInBytes());
+ }
+
};
// dstQuad should be the geometry transformed by the view matrix. If domainRect
@@ -419,18 +486,20 @@
fDomain = static_cast<unsigned>(netDomain);
}
- void tess(void* v, const VertexSpec& spec, const GrTextureProxy* proxy,
- GrQuadBuffer<ColorDomainAndAA>::Iter* iter, int cnt) const {
+ static void Tess(void* v, const VertexSpec& spec, const GrTextureProxy* proxy,
+ GrQuadBuffer<ColorDomainAndAA>::Iter* iter, int cnt,
+ GrSamplerState::Filter filter) {
TRACE_EVENT0("skia.gpu", TRACE_FUNC);
auto origin = proxy->origin();
- const auto* texture = proxy->peekTexture();
+ SkISize dimensions = proxy->backingStoreDimensions();
+
float iw, ih, h;
if (proxy->textureType() == GrTextureType::kRectangle) {
iw = ih = 1.f;
- h = texture->height();
+ h = dimensions.height();
} else {
- iw = 1.f / texture->width();
- ih = 1.f / texture->height();
+ iw = 1.f / dimensions.width();
+ ih = 1.f / dimensions.height();
h = 1.f;
}
@@ -444,7 +513,7 @@
// Must correct the texture coordinates and domain now that the real texture size
// is known
compute_src_quad(origin, iter->localQuad(), iw, ih, h, &srcQuad);
- compute_domain(info.domain(), this->filter(), origin, info.fDomainRect, iw, ih, h,
+ compute_domain(info.domain(), filter, origin, info.fDomainRect, iw, ih, h,
&domain);
v = GrQuadPerEdgeAA::Tessellate(v, spec, iter->deviceQuad(), info.fColor, srcQuad,
domain, info.aaFlags());
@@ -464,9 +533,51 @@
fPrePreparedDesc->fVertexSpec = this->characterize(&fPrePreparedDesc->fNumProxies,
&fPrePreparedDesc->fNumTotalQuads);
- fPrePreparedDesc->allocate(arena, clip, fProxyCountPairs[0].fProxy);
+ fPrePreparedDesc->allocateCommon(arena, clip);
- // Pull forward the tessellation of the quads to here
+ fPrePreparedDesc->allocatePrePrepareOnly(arena);
+
+ {
+ SkDEBUGCODE(int totQuadsSeen = 0;)
+ SkDEBUGCODE(int totVerticesSeen = 0;)
+ int vertexOffsetInBuffer = 0;
+ char* dst = fPrePreparedDesc->fVertices;
+ const size_t vertexSize = fPrePreparedDesc->fVertexSpec.vertexSize();
+
+ int meshIndex = 0;
+ for (const auto& op : ChainRange<TextureOp>(this)) {
+ auto iter = op.fQuads.iterator();
+ for (unsigned p = 0; p < op.fProxyCnt; ++p) {
+ GrTextureProxy* proxy = op.fProxyCountPairs[p].fProxy;
+
+ int quadCnt = op.fProxyCountPairs[p].fQuadCnt;
+ SkDEBUGCODE(totQuadsSeen += quadCnt;)
+
+ int meshVertexCnt = quadCnt * fPrePreparedDesc->fVertexSpec.verticesPerQuad();
+ SkDEBUGCODE(totVerticesSeen += meshVertexCnt);
+
+ Tess(dst, fPrePreparedDesc->fVertexSpec, proxy, &iter, quadCnt, op.filter());
+
+ fPrePreparedDesc->fVertexOffsets[meshIndex] = vertexOffsetInBuffer;
+ SkASSERT(vertexOffsetInBuffer * vertexSize ==
+ (size_t)(dst - fPrePreparedDesc->fVertices));
+ fPrePreparedDesc->setMeshProxy(meshIndex, proxy);
+ ++meshIndex;
+
+ vertexOffsetInBuffer += meshVertexCnt;
+ dst += vertexSize * meshVertexCnt;
+ }
+ // If quad counts per proxy were calculated correctly, the entire iterator
+ // should have been consumed.
+ SkASSERT(!iter.next());
+ }
+
+ SkASSERT(fPrePreparedDesc->totalSizeInBytes() ==
+ (size_t)(dst - fPrePreparedDesc->fVertices));
+ SkASSERT(meshIndex == fPrePreparedDesc->fNumProxies);
+ SkASSERT(totQuadsSeen == fPrePreparedDesc->fNumTotalQuads);
+ SkASSERT(totVerticesSeen == fPrePreparedDesc->totalNumVertices());
+ }
}
#ifdef SK_DEBUG
@@ -542,7 +653,9 @@
SkArenaAlloc* arena = target->allocator();
desc.fVertexSpec = this->characterize(&desc.fNumProxies, &desc.fNumTotalQuads);
- desc.allocate(arena, target->appliedClip(), fProxyCountPairs[0].fProxy);
+ desc.allocateCommon(arena, target->appliedClip());
+
+ SkASSERT(!desc.fVertexOffsets && !desc.fVertices);
}
size_t vertexSize = desc.fVertexSpec.vertexSize();
@@ -573,7 +686,16 @@
}
SkASSERT(numAllocatedVertices >= meshVertexCnt);
- op.tess(vdata, desc.fVertexSpec, proxy, &iter, quadCnt);
+ if (fPrePreparedDesc) {
+ // TODO: when we've prePrepared the vertex data should we just allocate
+ // all the vertices together and just do one memcpy?
+ size_t offset = desc.fVertexOffsets[meshIndex] * vertexSize;
+ memcpy(vdata, &desc.fVertices[offset], meshVertexCnt * vertexSize);
+ SkASSERT(proxy == desc.getMeshProxy(meshIndex));
+ } else {
+ Tess(vdata, desc.fVertexSpec, proxy, &iter, quadCnt, op.filter());
+ desc.setMeshProxy(meshIndex, proxy);
+ }
SkASSERT(meshIndex < desc.fNumProxies);
@@ -583,18 +705,17 @@
return;
}
meshes[meshIndex].setVertexData(vbuffer, vertexOffsetInBuffer);
- if (desc.fDynamicStateArrays) {
- desc.fDynamicStateArrays->fPrimitiveProcessorTextures[meshIndex] = proxy;
- }
++meshIndex;
+
numAllocatedVertices -= meshVertexCnt;
numQuadVerticesLeft -= meshVertexCnt;
vertexOffsetInBuffer += meshVertexCnt;
vdata = reinterpret_cast<char*>(vdata) + vertexSize * meshVertexCnt;
}
- // If quad counts per proxy were calculated correctly, the entire iterator should have
- // been consumed.
- SkASSERT(!iter.next());
+
+ // If quad counts per proxy were calculated correctly, the entire iterator should
+ // have been consumed.
+ SkASSERT(fPrePreparedDesc || !iter.next());
}
SkASSERT(!numQuadVerticesLeft);
SkASSERT(!numAllocatedVertices);