Reland "Use specialized quad lists in rectangle ops"
This is a reland of 5820b0c3f3ceba23b9d80415e77a9db124b409b8
It is updated in patchset 2 to clean up pointers passed into memcpy, and to
optimize the bounds calculation in GrPerspQuad. This should fix a performance
regression caused by the move away from caching 1/w. The Sk4f::invert() does not
always preserve 1/1 == 1, which led to bounds slightly outside of clips and
thus forced Skia to keep the scissor test enabled. The fix also restores the
optimization of skipping the 1/w division when the quad is known to be 2D.
Original change's description:
> Use specialized quad lists in rectangle ops
>
> Hopefully reduces memory footprint of GrFillRectOp and GrTextureOp
>
> The original rect code (GrAAFillRectOp) stored 2 SkMatrices (18 floats), 2
> SkRects (8 floats) an SkPMColor4f (4 floats) and a flag (1 int) for a total
> of 124 bytes per quad that was stored in the op.
>
> The first pass at the rectangle consolidation switched to storing device and
> local quads as GrPerspQuads (32 floats), an SkPMColor4f (4 floats) and a flag
> (1 int) for a total of 148 bytes per quad. After landing, several memory
> regressions appeared in Chrome and our perf monitor.
>
> Several intertwined approaches are taken here. First, GrPerspQuad no longer
> caches 1/w, which makes a quad 12 floats instead of 16. Second, a specialized
> list type is defined that allows storing the x, y, and extra metadata together
> for quads, but keeps the w components separate. When the quad type isn't
> perspective, w is not stored at all since it is implicitly 1 and can be
> reconstituted at tessellation time. This brings the total per quad to either
> 84 or 116 bytes, depending on if the op list needs perspective information.
>
> Bug: chromium:915025
> Bug: chromium:917242
> Change-Id: If37ee122847b0c32604bb45dc2a1326b544f9cf6
> Reviewed-on: https://skia-review.googlesource.com/c/180644
> Commit-Queue: Michael Ludwig <michaelludwig@google.com>
> Reviewed-by: Robert Phillips <robertphillips@google.com>
Bug: chromium:915025, chromium:917242
Change-Id: I98a1bf83fd7d393604823d567c57d7e06fad5e55
Reviewed-on: https://skia-review.googlesource.com/c/182203
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Chris Dalton <csmartdalton@google.com>
diff --git a/src/gpu/ops/GrFillRectOp.cpp b/src/gpu/ops/GrFillRectOp.cpp
index 8efbcbd..ccd55c1 100644
--- a/src/gpu/ops/GrFillRectOp.cpp
+++ b/src/gpu/ops/GrFillRectOp.cpp
@@ -24,54 +24,32 @@
using VertexSpec = GrQuadPerEdgeAA::VertexSpec;
using ColorType = GrQuadPerEdgeAA::ColorType;
-// NOTE: This info structure is intentionally modeled after GrTextureOps' Quad so that they can
-// more easily be integrated together in the future.
-class TransformedQuad {
-public:
- TransformedQuad(const GrPerspQuad& deviceQuad, const GrPerspQuad& localQuad,
- const SkPMColor4f& color, GrQuadAAFlags aaFlags)
- : fDeviceQuad(deviceQuad)
- , fLocalQuad(localQuad)
- , fColor(color)
- , fAAFlags(aaFlags) {}
-
- const GrPerspQuad& deviceQuad() const { return fDeviceQuad; }
- const GrPerspQuad& localQuad() const { return fLocalQuad; }
- const SkPMColor4f& color() const { return fColor; }
- GrQuadAAFlags aaFlags() const { return fAAFlags; }
-
- void setColor(const SkPMColor4f& color) { fColor = color; }
-
- SkString dumpInfo(int index) const {
- SkString str;
- str.appendf("%d: Color: [%.2f, %.2f, %.2f, %.2f], Edge AA: l%u_t%u_r%u_b%u, \n"
- " device quad: [(%.2f, %2.f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), "
- "(%.2f, %.2f, %.2f)],\n"
- " local quad: [(%.2f, %2.f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), "
- "(%.2f, %.2f, %.2f)]\n",
- index, fColor.fR, fColor.fG, fColor.fB, fColor.fA,
- (uint32_t) (fAAFlags & GrQuadAAFlags::kLeft),
- (uint32_t) (fAAFlags & GrQuadAAFlags::kTop),
- (uint32_t) (fAAFlags & GrQuadAAFlags::kRight),
- (uint32_t) (fAAFlags & GrQuadAAFlags::kBottom),
- fDeviceQuad.x(0), fDeviceQuad.y(0), fDeviceQuad.w(0),
- fDeviceQuad.x(1), fDeviceQuad.y(1), fDeviceQuad.w(1),
- fDeviceQuad.x(2), fDeviceQuad.y(2), fDeviceQuad.w(2),
- fDeviceQuad.x(3), fDeviceQuad.y(3), fDeviceQuad.w(3),
- fLocalQuad.x(0), fLocalQuad.y(0), fLocalQuad.w(0),
- fLocalQuad.x(1), fLocalQuad.y(1), fLocalQuad.w(1),
- fLocalQuad.x(2), fLocalQuad.y(2), fLocalQuad.w(2),
- fLocalQuad.x(3), fLocalQuad.y(3), fLocalQuad.w(3));
- return str;
- }
-private:
- // NOTE: The TransformedQuad does not store the types for device and local. The owning op tracks
- // the most general type for device and local across all of its merged quads.
- GrPerspQuad fDeviceQuad; // In device space, allowing rects to be combined across view matrices
- GrPerspQuad fLocalQuad; // Original rect transformed by its local matrix
- SkPMColor4f fColor;
- GrQuadAAFlags fAAFlags;
-};
+#ifdef SK_DEBUG
+static SkString dump_quad_info(int index, const GrPerspQuad& deviceQuad,
+ const GrPerspQuad& localQuad, const SkPMColor4f& color,
+ GrQuadAAFlags aaFlags) {
+ SkString str;
+ str.appendf("%d: Color: [%.2f, %.2f, %.2f, %.2f], Edge AA: l%u_t%u_r%u_b%u, \n"
+ " device quad: [(%.2f, %2.f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), "
+ "(%.2f, %.2f, %.2f)],\n"
+ " local quad: [(%.2f, %2.f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), "
+ "(%.2f, %.2f, %.2f)]\n",
+ index, color.fR, color.fG, color.fB, color.fA,
+ (uint32_t) (aaFlags & GrQuadAAFlags::kLeft),
+ (uint32_t) (aaFlags & GrQuadAAFlags::kTop),
+ (uint32_t) (aaFlags & GrQuadAAFlags::kRight),
+ (uint32_t) (aaFlags & GrQuadAAFlags::kBottom),
+ deviceQuad.x(0), deviceQuad.y(0), deviceQuad.w(0),
+ deviceQuad.x(1), deviceQuad.y(1), deviceQuad.w(1),
+ deviceQuad.x(2), deviceQuad.y(2), deviceQuad.w(2),
+ deviceQuad.x(3), deviceQuad.y(3), deviceQuad.w(3),
+ localQuad.x(0), localQuad.y(0), localQuad.w(0),
+ localQuad.x(1), localQuad.y(1), localQuad.w(1),
+ localQuad.x(2), localQuad.y(2), localQuad.w(2),
+ localQuad.x(3), localQuad.y(3), localQuad.w(3));
+ return str;
+}
+#endif
class FillRectOp final : public GrMeshDrawOp {
private:
@@ -113,9 +91,7 @@
const GrPerspQuad& deviceQuad, GrQuadType deviceQuadType,
const GrPerspQuad& localQuad, GrQuadType localQuadType)
: INHERITED(ClassID())
- , fHelper(args, aaType, stencil)
- , fDeviceQuadType(static_cast<unsigned>(deviceQuadType))
- , fLocalQuadType(static_cast<unsigned>(localQuadType)) {
+ , fHelper(args, aaType, stencil) {
if (constBlendColor) {
// The GrPaint is compatible with clearing, and the constant blend color overrides the
// paint color (although in most cases they are probably the same)
@@ -133,9 +109,10 @@
// The color stored with the quad is the clear color if a scissor-clear is decided upon
// when executing the op.
- fQuads.emplace_back(deviceQuad, localQuad, paintColor, edgeFlags);
- this->setBounds(deviceQuad.bounds(), HasAABloat(aaType == GrAAType::kCoverage),
- IsZeroArea::kNo);
+ fDeviceQuads.push_back(deviceQuad, deviceQuadType, { paintColor, edgeFlags });
+ fLocalQuads.push_back(localQuad, localQuadType);
+ this->setBounds(deviceQuad.bounds(deviceQuadType),
+ HasAABloat(aaType == GrAAType::kCoverage), IsZeroArea::kNo);
}
const char* name() const override { return "FillRectOp"; }
@@ -147,14 +124,16 @@
#ifdef SK_DEBUG
SkString dumpInfo() const override {
SkString str;
- str.appendf("# draws: %d\n", fQuads.count());
- str.appendf("Clear compatible: %u\n", static_cast<bool>(fClearCompatible));
+ str.appendf("# draws: %u\n", this->quadCount());
str.appendf("Device quad type: %u, local quad type: %u\n",
- fDeviceQuadType, fLocalQuadType);
+ (uint32_t) fDeviceQuads.quadType(), (uint32_t) fLocalQuads.quadType());
str += fHelper.dumpInfo();
- for (int i = 0; i < fQuads.count(); i++) {
- str += fQuads[i].dumpInfo(i);
-
+ GrPerspQuad device, local;
+ for (int i = 0; i < this->quadCount(); i++) {
+ device = fDeviceQuads[i];
+ const ColorAndAA& info = fDeviceQuads.metadata(i);
+ local = fLocalQuads[i];
+ str += dump_quad_info(i, device, local, info.fColor, info.fAAFlags);
}
str += INHERITED::dumpInfo();
return str;
@@ -163,11 +142,12 @@
RequiresDstTexture finalize(const GrCaps& caps, const GrAppliedClip* clip) override {
// Initialize aggregate color analysis with the first quad's color (which always exists)
- SkASSERT(fQuads.count() > 0);
- GrProcessorAnalysisColor quadColors(fQuads[0].color());
+ SkASSERT(this->quadCount() > 0);
+ GrProcessorAnalysisColor quadColors(fDeviceQuads.metadata(0).fColor);
// Then combine the colors of any additional quads (e.g. from MakeSet)
- for (int i = 1; i < fQuads.count(); ++i) {
- quadColors = GrProcessorAnalysisColor::Combine(quadColors, fQuads[i].color());
+ for (int i = 1; i < this->quadCount(); ++i) {
+ quadColors = GrProcessorAnalysisColor::Combine(quadColors,
+ fDeviceQuads.metadata(i).fColor);
if (quadColors.isUnknown()) {
// No point in accumulating additional starting colors, combining cannot make it
// less unknown.
@@ -185,8 +165,8 @@
// to the same color (even if they started out with different colors).
SkPMColor4f colorOverride;
if (quadColors.isConstant(&colorOverride)) {
- for (int i = 0; i < fQuads.count(); ++i) {
- fQuads[i].setColor(colorOverride);
+ for (int i = 0; i < this->quadCount(); ++i) {
+ fDeviceQuads.metadata(i).fColor = colorOverride;
}
}
@@ -203,21 +183,20 @@
private:
// For GrFillRectOp::MakeSet's use of addQuad
- // FIXME(reviewer): better to just make addQuad public?
friend std::unique_ptr<GrDrawOp> GrFillRectOp::MakeSet(GrContext* context, GrPaint&& paint,
GrAAType aaType, const SkMatrix& viewMatrix,
const GrRenderTargetContext::QuadSetEntry quads[], int quadCount,
const GrUserStencilSettings* stencilSettings);
- void onPrepareDraws(Target* target) override {
+ void onPrepareDraws(Target* target) override {
TRACE_EVENT0("skia", TRACE_FUNC);
using Domain = GrQuadPerEdgeAA::Domain;
static constexpr SkRect kEmptyDomain = SkRect::MakeEmpty();
- VertexSpec vertexSpec(this->deviceQuadType(),
+ VertexSpec vertexSpec(fDeviceQuads.quadType(),
fWideColor ? ColorType::kHalf : ColorType::kByte,
- this->localQuadType(), fHelper.usesLocalCoords(), Domain::kNo,
+ fLocalQuads.quadType(), fHelper.usesLocalCoords(), Domain::kNo,
fHelper.aaType(), fHelper.compatibleWithAlphaAsCoverage());
sk_sp<GrGeometryProcessor> gp = GrQuadPerEdgeAA::MakeProcessor(vertexSpec);
@@ -228,7 +207,7 @@
// Fill the allocated vertex data
void* vdata = target->makeVertexSpace(
- vertexSize, fQuads.count() * vertexSpec.verticesPerQuad(),
+ vertexSize, this->quadCount() * vertexSpec.verticesPerQuad(),
&vbuffer, &vertexOffsetInBuffer);
if (!vdata) {
SkDebugf("Could not allocate vertices\n");
@@ -237,15 +216,18 @@
// vertices pointer advances through vdata based on Tessellate's return value
void* vertices = vdata;
- for (int i = 0; i < fQuads.count(); ++i) {
- const auto& q = fQuads[i];
- vertices = GrQuadPerEdgeAA::Tessellate(vertices, vertexSpec, q.deviceQuad(), q.color(),
- q.localQuad(), kEmptyDomain, q.aaFlags());
+ for (int i = 0; i < this->quadCount(); ++i) {
+ const GrPerspQuad& device = fDeviceQuads[i];
+ const ColorAndAA& info = fDeviceQuads.metadata(i);
+ const GrPerspQuad& local = fLocalQuads[i];
+
+ vertices = GrQuadPerEdgeAA::Tessellate(vertices, vertexSpec, device, info.fColor, local,
+ kEmptyDomain, info.fAAFlags);
}
// Configure the mesh for the vertex data
GrMesh* mesh = target->allocMeshes(1);
- if (!GrQuadPerEdgeAA::ConfigureMeshIndices(target, mesh, vertexSpec, fQuads.count())) {
+ if (!GrQuadPerEdgeAA::ConfigureMeshIndices(target, mesh, vertexSpec, this->quadCount())) {
SkDebugf("Could not allocate indices\n");
return;
}
@@ -261,13 +243,13 @@
if ((fHelper.aaType() == GrAAType::kCoverage ||
that->fHelper.aaType() == GrAAType::kCoverage) &&
- fQuads.count() + that->fQuads.count() > GrQuadPerEdgeAA::kNumAAQuadsInIndexBuffer) {
+ this->quadCount() + that->quadCount() > GrQuadPerEdgeAA::kNumAAQuadsInIndexBuffer) {
// This limit on batch size seems to help on Adreno devices
return CombineResult::kCannotCombine;
}
- // Unlike most users of the draw op helper, this op can merge none-aa and coverage-aa
- // draw ops together, so pass true as the last argument.
+ // Unlike most users of the draw op helper, this op can merge none-aa and coverage-aa draw
+ // ops together, so pass true as the last argument.
if (!fHelper.isCompatible(that->fHelper, caps, this->bounds(), that->bounds(), true)) {
return CombineResult::kCannotCombine;
}
@@ -275,13 +257,6 @@
// If the processor sets are compatible, the two ops are always compatible; it just needs
// to adjust the state of the op to be the more general quad and aa types of the two ops.
- // The GrQuadType enum is ordered such that higher values are more general quad types
- if (that->fDeviceQuadType > fDeviceQuadType) {
- fDeviceQuadType = that->fDeviceQuadType;
- }
- if (that->fLocalQuadType > fLocalQuadType) {
- fLocalQuadType = that->fLocalQuadType;
- }
fClearCompatible &= that->fClearCompatible;
fWideColor |= that->fWideColor;
@@ -292,7 +267,8 @@
fHelper.setAAType(GrAAType::kCoverage);
}
- fQuads.push_back_n(that->fQuads.count(), that->fQuads.begin());
+ fDeviceQuads.concat(that->fDeviceQuads);
+ fLocalQuads.concat(that->fLocalQuads);
return CombineResult::kMerged;
}
@@ -300,8 +276,10 @@
// But since it's avoiding the op list management, it must update the op's bounds. This is only
// used with quad sets, which uses the same view matrix for each quad so this assumes that the
// device quad type of the new quad is the same as the op's.
- void addQuad(TransformedQuad&& quad, GrQuadType localQuadType, GrAAType aaType) {
- SkASSERT(quad.deviceQuad().quadType() <= this->deviceQuadType());
+ void addQuad(const GrPerspQuad& deviceQuad, const GrPerspQuad& localQuad,
+ GrQuadType localQuadType, const SkPMColor4f& color, GrQuadAAFlags edgeAA,
+ GrAAType aaType) {
+ SkASSERT(deviceQuad.quadType() <= fDeviceQuads.quadType());
// The new quad's aa type should be the same as the first quad's or none, except when the
// first quad's aa type was already downgraded to none, in which case the stored type must
@@ -316,33 +294,35 @@
// reset the op's accumulated aa type.
}
- // The new quad's local coordinates could differ
- if (localQuadType > this->localQuadType()) {
- fLocalQuadType = static_cast<unsigned>(localQuadType);
- }
-
// clear compatible won't need to be updated, since device quad type and paint is the same,
// but this quad has a new color, so maybe update wide color
- fWideColor |= !SkPMColor4fFitsInBytes(quad.color());
+ fWideColor |= !SkPMColor4fFitsInBytes(color);
// Update the bounds and add the quad to this op's storage
SkRect newBounds = this->bounds();
- newBounds.joinPossiblyEmptyRect(quad.deviceQuad().bounds());
+ newBounds.joinPossiblyEmptyRect(deviceQuad.bounds(fDeviceQuads.quadType()));
this->setBounds(newBounds, HasAABloat(fHelper.aaType() == GrAAType::kCoverage),
IsZeroArea::kNo);
- fQuads.push_back(std::move(quad));
+ fDeviceQuads.push_back(deviceQuad, fDeviceQuads.quadType(), { color, edgeAA });
+ fLocalQuads.push_back(localQuad, localQuadType);
}
- GrQuadType deviceQuadType() const { return static_cast<GrQuadType>(fDeviceQuadType); }
- GrQuadType localQuadType() const { return static_cast<GrQuadType>(fLocalQuadType); }
+ int quadCount() const {
+ // Sanity check that the parallel arrays for quad properties all have the same size
+ SkASSERT(fDeviceQuads.count() == fLocalQuads.count());
+ return fDeviceQuads.count();
+ }
+
+ struct ColorAndAA {
+ SkPMColor4f fColor;
+ GrQuadAAFlags fAAFlags;
+ };
Helper fHelper;
- SkSTArray<1, TransformedQuad, true> fQuads;
+ GrTQuadList<ColorAndAA> fDeviceQuads;
+ // No metadata attached to the local quads
+ GrQuadList fLocalQuads;
- // While we always store full GrPerspQuads in memory, if the type is known to be simpler we can
- // optimize our geometry generation.
- unsigned fDeviceQuadType: 2;
- unsigned fLocalQuadType: 2;
unsigned fWideColor: 1;
// True if fQuad produced by a rectangle-preserving view matrix, is pixel aligned or non-AA,
@@ -422,9 +402,9 @@
GrResolveAATypeForQuad(aaType, quads[i].fAAFlags, deviceQuad, deviceQuadType,
&resolvedAA, &resolvedEdgeFlags);
- fillRects->addQuad({ deviceQuad, GrPerspQuad(quads[i].fRect, quads[i].fLocalMatrix),
- quads[i].fColor, resolvedEdgeFlags },
- GrQuadTypeForTransformedRect(quads[i].fLocalMatrix), resolvedAA);
+ fillRects->addQuad(deviceQuad, GrPerspQuad(quads[i].fRect, quads[i].fLocalMatrix),
+ GrQuadTypeForTransformedRect(quads[i].fLocalMatrix), quads[i].fColor,
+ resolvedEdgeFlags,resolvedAA);
}
return op;