Merge consecutive entries that share proxy in bulk texture op

Previously, a batch draw that reused the same proxy consecutively
would create a ViewCountPair for each set entry, with its count == 1.
This turned into 1 draw per entry, so although there'd still be a single
pipeline, it didn't take advantage of merging those consecutive entries
into a larger draw to reduce draw count as well.

Initially, the thinking for the batch API was that it was for tilers
that used unique images for each tile or render pass. However, Chrome's
compositor is also responsible for rendering 9-patches as part of the UI.
These appear as 9 consecutive entries in the image set that all refer to
the same texture. With this CL the texture op will automatically merge
such occurrences into one ViewCountPair with a count of 9.

The bulkrect_1000_[grid|random]_sharedimage_batch benchmarks leverage this case.
Before this CL its op would hold 1000 view count pairs that each drew
one quad. Now its op will hold 1 view count pair with a count of 1000.
On my linux workstation, the bulkrect_1000_grid_sharedimage_batch time
went from 377us to 206us. For reference, the _ref variant (which already
had 1 view count pair with ct == 1000 due to merging of each op) has
a time of 497us. The difference between 497us and 206us represents the
overhead of calling through SkCanvas, op creation, and quad optimization
analysis 1000x.

Interestingly the bulkrect_1000_random_sharedimage_batch benchmark did not
change on my workstation. My conjecture is that it is bottlenecked by
overdraw of the many overlapping rectangles.

Change-Id: Icc4195de0bcb2219f424fdaa79728281c0418558
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/258418
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Brian Salomon <bsalomon@google.com>
diff --git a/src/gpu/GrRenderTargetContext.cpp b/src/gpu/GrRenderTargetContext.cpp
index 5314508..dc5c60c 100644
--- a/src/gpu/GrRenderTargetContext.cpp
+++ b/src/gpu/GrRenderTargetContext.cpp
@@ -863,7 +863,8 @@
     fRenderTargetContext->addOp(std::move(op));
 }
 
-void GrRenderTargetContext::drawTextureSet(const GrClip& clip, TextureSetEntry set[], int cnt,
+void GrRenderTargetContext::drawTextureSet(const GrClip& clip, TextureSetEntry set[],
+                                           int cnt, int proxyRunCnt,
                                            GrSamplerState::Filter filter, SkBlendMode mode,
                                            GrAA aa, SkCanvas::SrcRectConstraint constraint,
                                            const SkMatrix& viewMatrix,
@@ -880,8 +881,8 @@
     auto clampType = GrColorTypeClampType(this->colorInfo().colorType());
     auto saturate = clampType == GrClampType::kManual ? GrTextureOp::Saturate::kYes
                                                       : GrTextureOp::Saturate::kNo;
-    GrTextureOp::AddTextureSetOps(this, clip, fContext, set, cnt, filter, saturate, mode, aaType,
-                                  constraint, viewMatrix, std::move(texXform));
+    GrTextureOp::AddTextureSetOps(this, clip, fContext, set, cnt, proxyRunCnt, filter, saturate,
+                                  mode, aaType, constraint, viewMatrix, std::move(texXform));
 }
 
 void GrRenderTargetContext::drawVertices(const GrClip& clip,
diff --git a/src/gpu/GrRenderTargetContext.h b/src/gpu/GrRenderTargetContext.h
index 1dd854b..d1d6ed0 100644
--- a/src/gpu/GrRenderTargetContext.h
+++ b/src/gpu/GrRenderTargetContext.h
@@ -248,10 +248,15 @@
      *
      * If any entries provide a non-null fDstClip array, it will be read from immediately based on
      * fDstClipCount, so the pointer can become invalid after this returns.
+     *
+     * 'proxyCnt' is the number of runs of consecutive entries sharing a proxy in the entry array
+     * (the first entry starts a run). Technically this can be inferred from the array within this
+     * function, but SkGpuDevice already knows it, so no need to iterate over the array again.
      */
-    void drawTextureSet(const GrClip&, TextureSetEntry[], int cnt, GrSamplerState::Filter,
-                        SkBlendMode mode, GrAA aa, SkCanvas::SrcRectConstraint,
-                        const SkMatrix& viewMatrix, sk_sp<GrColorSpaceXform> texXform);
+    void drawTextureSet(const GrClip&, TextureSetEntry[], int cnt, int proxyCnt,
+                        GrSamplerState::Filter, SkBlendMode mode, GrAA aa,
+                        SkCanvas::SrcRectConstraint, const SkMatrix& viewMatrix,
+                        sk_sp<GrColorSpaceXform> texXform);
 
     /**
      * Draw a roundrect using a paint.
diff --git a/src/gpu/SkGpuDevice_drawTexture.cpp b/src/gpu/SkGpuDevice_drawTexture.cpp
index 72fbbfe..faf23b5 100644
--- a/src/gpu/SkGpuDevice_drawTexture.cpp
+++ b/src/gpu/SkGpuDevice_drawTexture.cpp
@@ -489,17 +489,21 @@
 
     SkAutoTArray<GrRenderTargetContext::TextureSetEntry> textures(count);
     // We accumulate compatible proxies until we find an an incompatible one or reach the end and
-    // issue the accumulated 'n' draws starting at 'base'.
-    int base = 0, n = 0;
-    auto draw = [&] {
+    // issue the accumulated 'n' draws starting at 'base'. 'p' represents the number of proxy
+    // switches that occur within the 'n' entries.
+    int base = 0, n = 0, p = 0;
+    auto draw = [&](int nextBase) {
         if (n > 0) {
             auto textureXform = GrColorSpaceXform::Make(
                     set[base].fImage->colorSpace(), set[base].fImage->alphaType(),
                     fRenderTargetContext->colorInfo().colorSpace(), kPremul_SkAlphaType);
-            fRenderTargetContext->drawTextureSet(this->clip(), textures.get() + base, n,
+            fRenderTargetContext->drawTextureSet(this->clip(), textures.get() + base, n, p,
                                                  filter, mode, GrAA::kYes, constraint,
                                                  this->localToDevice(), std::move(textureXform));
         }
+        base = nextBase;
+        n = 0;
+        p = 0;
     };
     int dstClipIndex = 0;
     for (int i = 0; i < count; ++i) {
@@ -514,9 +518,7 @@
         // The default SkBaseDevice implementation is based on drawImageRect which does not allow
         // non-sorted src rects. TODO: Decide this is OK or make sure we handle it.
         if (!set[i].fSrcRect.isSorted()) {
-            draw();
-            base = i + 1;
-            n = 0;
+            draw(i + 1);
             continue;
         }
 
@@ -536,9 +538,7 @@
         if (!proxy) {
             // This image can't go through the texture op, send through general image pipeline
             // after flushing current batch.
-            draw();
-            base = i + 1;
-            n = 0;
+            draw(i + 1);
             SkTCopyOnFirstWrite<SkPaint> entryPaint(paint);
             if (set[i].fAlpha != 1.f) {
                 auto paintAlpha = paint.getAlphaf();
@@ -571,14 +571,18 @@
                     textures[base].fProxyView.proxy()) ||
              set[i].fImage->alphaType() != set[base].fImage->alphaType() ||
              !SkColorSpace::Equals(set[i].fImage->colorSpace(), set[base].fImage->colorSpace()))) {
-            draw();
-            base = i;
-            n = 1;
-        } else {
-            ++n;
+            draw(i);
+        }
+        // Whether or not we submitted a draw in the above if(), this ith entry is in the current
+        // set being accumulated so increment n, and increment p if proxies are different.
+        ++n;
+        if (n == 1 || textures[i - 1].fProxyView.proxy() != textures[i].fProxyView.proxy()) {
+            // First proxy or a different proxy (that is compatible, otherwise we'd have drawn up
+            // to i - 1).
+            ++p;
         }
     }
-    draw();
+    draw(count);
 }
 
 // TODO (michaelludwig) - to be removed when drawBitmapRect doesn't need it anymore
diff --git a/src/gpu/ops/GrTextureOp.cpp b/src/gpu/ops/GrTextureOp.cpp
index b6e39cb..1243a6b 100644
--- a/src/gpu/ops/GrTextureOp.cpp
+++ b/src/gpu/ops/GrTextureOp.cpp
@@ -174,6 +174,21 @@
     ys.store(srcQuad->ys());
 }
 
+// Count the number of proxy runs in the entry set. This usually is already computed by
+// SkGpuDevice, but when the BatchLengthLimiter chops the set up it must determine a new proxy count
+// for each split.
+static int proxy_run_count(const GrRenderTargetContext::TextureSetEntry set[], int count) {
+    int actualProxyRunCount = 0;
+    const GrSurfaceProxy* lastProxy = nullptr;
+    for (int i = 0; i < count; ++i) {
+        if (set[i].fProxyView.proxy() != lastProxy) {
+            actualProxyRunCount++;
+            lastProxy = set[i].fProxyView.proxy();
+        }
+    }
+    return actualProxyRunCount;
+}
+
 /**
  * Op that implements GrTextureOp::Make. It draws textured quads. Each quad can modulate against a
  * the texture by color. The blend with the destination is always src-over. The edges are non-AA.
@@ -200,18 +215,22 @@
     static std::unique_ptr<GrDrawOp> Make(GrRecordingContext* context,
                                           GrRenderTargetContext::TextureSetEntry set[],
                                           int cnt,
+                                          int proxyRunCnt,
                                           GrSamplerState::Filter filter,
                                           GrTextureOp::Saturate saturate,
                                           GrAAType aaType,
                                           SkCanvas::SrcRectConstraint constraint,
                                           const SkMatrix& viewMatrix,
                                           sk_sp<GrColorSpaceXform> textureColorSpaceXform) {
-        size_t size = sizeof(TextureOp) + sizeof(ViewCountPair) * (cnt - 1);
+        // Allocate size based on proxyRunCnt, since that determines number of ViewCountPairs.
+        SkASSERT(proxyRunCnt <= cnt);
+
+        size_t size = sizeof(TextureOp) + sizeof(ViewCountPair) * (proxyRunCnt - 1);
         GrOpMemoryPool* pool = context->priv().opMemoryPool();
         void* mem = pool->allocate(size);
-        return std::unique_ptr<GrDrawOp>(new (mem) TextureOp(set, cnt, filter, saturate, aaType,
-                                                             constraint, viewMatrix,
-                                                             std::move(textureColorSpaceXform)));
+        return std::unique_ptr<GrDrawOp>(
+                new (mem) TextureOp(set, cnt, proxyRunCnt, filter, saturate, aaType, constraint,
+                                    viewMatrix, std::move(textureColorSpaceXform)));
     }
 
     ~TextureOp() override {
@@ -473,6 +492,7 @@
 
     TextureOp(GrRenderTargetContext::TextureSetEntry set[],
               int cnt,
+              int proxyRunCnt,
               GrSamplerState::Filter filter,
               GrTextureOp::Saturate saturate,
               GrAAType aaType,
@@ -486,7 +506,7 @@
             , fMetadata(set[0].fProxyView.swizzle(), GrSamplerState::Filter::kNearest,
                         Domain::kNo, saturate) {
         // Update counts to reflect the batch op
-        fMetadata.fProxyCount = SkToUInt(cnt);
+        fMetadata.fProxyCount = SkToUInt(proxyRunCnt);
         fMetadata.fTotalQuadCount = SkToUInt(cnt);
 
         SkRect bounds = SkRectPriv::MakeLargestInverted();
@@ -500,42 +520,46 @@
         // GrQuadBuffer must be updated to reflect the 1/2px inset required. All quads appended
         // afterwards will properly take that into account.
         int correctDomainUpToIndex = 0;
-        const GrSurfaceProxy* curProxy;
-        for (unsigned p = 0; p < fMetadata.fProxyCount; ++p) {
-            if (p == 0) {
+        const GrSurfaceProxy* curProxy = nullptr;
+        // 'q' is the index in 'set' and fQuadBuffer; 'p' is the index in fViewCountPairs and only
+        // increases when set[q]'s proxy changes.
+        unsigned p = 0;
+        for (unsigned q = 0; q < fMetadata.fTotalQuadCount; ++q) {
+            if (q == 0) {
                 // We do not placement new the first ViewCountPair since that one is allocated and
                 // initialized as part of the GrTextureOp creation.
-                fViewCountPairs[p].fProxy = set[p].fProxyView.detachProxy();
-                fViewCountPairs[p].fQuadCnt = 1;
-            } else {
+                fViewCountPairs[0].fProxy = set[0].fProxyView.detachProxy();
+                fViewCountPairs[0].fQuadCnt = 0;
+                curProxy = fViewCountPairs[0].fProxy.get();
+            } else if (set[q].fProxyView.proxy() != curProxy) {
                 // We must placement new the ViewCountPairs here so that the sk_sps in the
                 // GrSurfaceProxyView get initialized properly.
-                new(&fViewCountPairs[p])ViewCountPair({set[p].fProxyView.detachProxy(), 1});
-            }
+                new(&fViewCountPairs[++p])ViewCountPair({set[q].fProxyView.detachProxy(), 0});
 
-            curProxy = fViewCountPairs[p].fProxy.get();
-            SkASSERT(curProxy->backendFormat().textureType() ==
-                     fViewCountPairs[0].fProxy->backendFormat().textureType());
-            SkASSERT(fMetadata.fSwizzle == set[p].fProxyView.swizzle());
-            SkASSERT(curProxy->config() == fViewCountPairs[0].fProxy->config());
+                curProxy = fViewCountPairs[p].fProxy.get();
+                SkASSERT(curProxy->backendFormat().textureType() ==
+                         fViewCountPairs[0].fProxy->backendFormat().textureType());
+                SkASSERT(fMetadata.fSwizzle == set[q].fProxyView.swizzle());
+                SkASSERT(curProxy->config() == fViewCountPairs[0].fProxy->config());
+            } // else another quad referencing the same proxy
 
             SkMatrix ctm = viewMatrix;
-            if (set[p].fPreViewMatrix) {
-                ctm.preConcat(*set[p].fPreViewMatrix);
+            if (set[q].fPreViewMatrix) {
+                ctm.preConcat(*set[q].fPreViewMatrix);
             }
 
             // Use dstRect/srcRect unless dstClip is provided, in which case derive new source
             // coordinates by mapping dstClipQuad by the dstRect to srcRect transform.
             GrQuad quad, srcQuad;
-            if (set[p].fDstClipQuad) {
-                quad = GrQuad::MakeFromSkQuad(set[p].fDstClipQuad, ctm);
+            if (set[q].fDstClipQuad) {
+                quad = GrQuad::MakeFromSkQuad(set[q].fDstClipQuad, ctm);
 
                 SkPoint srcPts[4];
-                GrMapRectPoints(set[p].fDstRect, set[p].fSrcRect, set[p].fDstClipQuad, srcPts, 4);
+                GrMapRectPoints(set[q].fDstRect, set[q].fSrcRect, set[q].fDstClipQuad, srcPts, 4);
                 srcQuad = GrQuad::MakeFromSkQuad(srcPts, SkMatrix::I());
             } else {
-                quad = GrQuad::MakeFromRect(set[p].fDstRect, ctm);
-                srcQuad = GrQuad(set[p].fSrcRect);
+                quad = GrQuad::MakeFromRect(set[q].fDstRect, ctm);
+                srcQuad = GrQuad(set[q].fSrcRect);
             }
 
             // Before normalizing the source coordinates, determine if bilerp is actually needed
@@ -545,14 +569,14 @@
                 SkASSERT(netFilter == GrSamplerState::Filter::kNearest &&
                          filter == GrSamplerState::Filter::kBilerp);
                 netFilter = GrSamplerState::Filter::kBilerp;
-                // All quads index < p with domains were calculated as if there was no filtering,
+                // All quads index < q with domains were calculated as if there was no filtering,
                 // which is no longer true.
-                correctDomainUpToIndex = p;
+                correctDomainUpToIndex = q;
             }
 
             // Normalize the src quads and apply origin
             NormalizationParams proxyParams = proxy_normalization_params(
-                    curProxy, set[p].fProxyView.origin());
+                    curProxy, set[q].fProxyView.origin());
             normalize_src_quad(proxyParams, &srcQuad);
 
             // Update overall bounds of the op as the union of all quads
@@ -561,7 +585,7 @@
             // Determine the AA type for the quad, then merge with net AA type
             GrQuadAAFlags aaFlags;
             GrAAType aaForQuad;
-            GrQuadUtils::ResolveAAType(aaType, set[p].fAAFlags, quad, &aaForQuad, &aaFlags);
+            GrQuadUtils::ResolveAAType(aaType, set[q].fAAFlags, quad, &aaForQuad, &aaFlags);
             // Resolve sets aaForQuad to aaType or None, there is never a change between aa methods
             SkASSERT(aaForQuad == GrAAType::kNone || aaForQuad == aaType);
             if (netAAType == GrAAType::kNone && aaForQuad != GrAAType::kNone) {
@@ -572,31 +596,45 @@
             const SkRect* domainForQuad = nullptr;
             if (constraint == SkCanvas::kStrict_SrcRectConstraint) {
                 // Check (briefly) if the strict constraint is needed for this set entry
-                if (!set[p].fSrcRect.contains(curProxy->backingStoreBoundsRect()) &&
+                if (!set[q].fSrcRect.contains(curProxy->backingStoreBoundsRect()) &&
                     (netFilter == GrSamplerState::Filter::kBilerp ||
                      aaForQuad == GrAAType::kCoverage)) {
                     // Can't rely on hardware clamping and the draw will access outer texels
                     // for AA and/or bilerp. Unlike filter quality, this op still has per-quad
                     // control over AA so that can check aaForQuad, not netAAType.
                     netDomain = Domain::kYes;
-                    domainForQuad = &set[p].fSrcRect;
+                    domainForQuad = &set[q].fSrcRect;
                 }
             }
 
+            // Always append a quad, it just may refer back to a prior ViewCountPair
+            // (this frequently happens when Chrome draws 9-patches).
             SkRect domain = normalize_domain(filter, proxyParams, domainForQuad);
-            float alpha = SkTPin(set[p].fAlpha, 0.f, 1.f);
+            float alpha = SkTPin(set[q].fAlpha, 0.f, 1.f);
             fQuads.append(quad, {{alpha, alpha, alpha, alpha}, domain, aaFlags}, &srcQuad);
+            fViewCountPairs[p].fQuadCnt++;
         }
+        // The # of proxy switches should match what was provided (-1 because we incremented p
+        // when a new proxy was encountered).
+        SkASSERT(p == fMetadata.fProxyCount - 1);
+        SkASSERT(fQuads.count() == fMetadata.fTotalQuadCount);
 
         // All the quads have been recorded, but some domains need to be fixed
         if (netDomain == Domain::kYes && correctDomainUpToIndex > 0) {
-            int p = 0;
+            int p = 0; // for fViewCountPairs
+            int q = 0; // for set/fQuads
+            int netVCt = 0;
             auto iter = fQuads.metadata();
-            while(p < correctDomainUpToIndex && iter.next()) {
+            while(q < correctDomainUpToIndex && iter.next()) {
                 NormalizationParams proxyParams = proxy_normalization_params(
-                        fViewCountPairs[p].fProxy.get(), set[p].fProxyView.origin());
+                        fViewCountPairs[p].fProxy.get(), set[q].fProxyView.origin());
                 correct_domain_for_bilerp(proxyParams, &(iter->fDomainRect));
-                p++;
+                q++;
+                if (q - netVCt >= fViewCountPairs[p].fQuadCnt) {
+                    // Advance to the next view count pair
+                    netVCt += fViewCountPairs[p].fQuadCnt;
+                    p++;
+                }
             }
         }
 
@@ -1067,8 +1105,9 @@
     void createOp(GrRenderTargetContext::TextureSetEntry set[],
                   int clumpSize,
                   GrAAType aaType) {
+        int clumpProxyCount = proxy_run_count(&set[fNumClumped], clumpSize);
         std::unique_ptr<GrDrawOp> op = TextureOp::Make(fContext, &set[fNumClumped], clumpSize,
-                                                       fFilter, fSaturate, aaType,
+                                                       clumpProxyCount, fFilter, fSaturate, aaType,
                                                        fConstraint, fViewMatrix,
                                                        fTextureColorSpaceXform);
         fRTC->addDrawOp(fClip, std::move(op));
@@ -1100,6 +1139,7 @@
                                    GrRecordingContext* context,
                                    GrRenderTargetContext::TextureSetEntry set[],
                                    int cnt,
+                                   int proxyRunCnt,
                                    GrSamplerState::Filter filter,
                                    Saturate saturate,
                                    SkBlendMode blendMode,
@@ -1107,6 +1147,12 @@
                                    SkCanvas::SrcRectConstraint constraint,
                                    const SkMatrix& viewMatrix,
                                    sk_sp<GrColorSpaceXform> textureColorSpaceXform) {
+    // Ensure that the index buffer limits are lower than the proxy and quad count limits of
+    // the op's metadata so we don't need to worry about overflow.
+    SkASSERT(GrResourceProvider::MaxNumNonAAQuads() <= UINT16_MAX &&
+             GrResourceProvider::MaxNumAAQuads() <= UINT16_MAX);
+    SkASSERT(proxy_run_count(set, cnt) == proxyRunCnt);
+
     // First check if we can support batches as a single op
     if (blendMode != SkBlendMode::kSrcOver ||
         !context->priv().caps()->dynamicStateArrayGeometryProcessorTextureSupport()) {
@@ -1144,16 +1190,11 @@
         return;
     }
 
-    // Ensure that the index buffer limits are lower than the proxy and quad count limits of
-    // the op's metadata so we don't need to worry about overflow.
-    SkASSERT(GrResourceProvider::MaxNumNonAAQuads() <= UINT16_MAX &&
-             GrResourceProvider::MaxNumAAQuads() <= UINT16_MAX);
-
     // Second check if we can always just make a single op and avoid the extra iteration
     // needed to clump things together.
     if (cnt <= SkTMin(GrResourceProvider::MaxNumNonAAQuads(),
                       GrResourceProvider::MaxNumAAQuads())) {
-        auto op = TextureOp::Make(context, set, cnt, filter, saturate, aaType,
+        auto op = TextureOp::Make(context, set, cnt, proxyRunCnt, filter, saturate, aaType,
                                   constraint, viewMatrix, std::move(textureColorSpaceXform));
         rtc->addDrawOp(clip, std::move(op));
         return;
diff --git a/src/gpu/ops/GrTextureOp.h b/src/gpu/ops/GrTextureOp.h
index 0a8cd36..079159b 100644
--- a/src/gpu/ops/GrTextureOp.h
+++ b/src/gpu/ops/GrTextureOp.h
@@ -55,12 +55,14 @@
                                           const SkRect* domain = nullptr);
 
     // Automatically falls back to using one GrFillRectOp per entry if dynamic states are not
-    // supported, or if the blend mode is not src-over.
+    // supported, or if the blend mode is not src-over. 'cnt' is the size of the entry array.
+    // 'proxyRunCnt' <= 'cnt' and represents the number of proxy runs within the array.
     static void AddTextureSetOps(GrRenderTargetContext*,
                                  const GrClip& clip,
                                  GrRecordingContext*,
                                  GrRenderTargetContext::TextureSetEntry[],
                                  int cnt,
+                                 int proxyRunCnt,
                                  GrSamplerState::Filter,
                                  Saturate,
                                  SkBlendMode,
diff --git a/tests/BulkRectTest.cpp b/tests/BulkRectTest.cpp
index f81c97b..9dbf812 100644
--- a/tests/BulkRectTest.cpp
+++ b/tests/BulkRectTest.cpp
@@ -109,12 +109,13 @@
     }
 
     GrTextureOp::AddTextureSetOps(rtc.get(), GrNoClip(), context, set, requestedTotNumQuads,
-                                     GrSamplerState::Filter::kNearest,
-                                     GrTextureOp::Saturate::kYes,
-                                     blendMode,
-                                     overallAA,
-                                     SkCanvas::kStrict_SrcRectConstraint,
-                                     SkMatrix::I(), nullptr);
+                                  requestedTotNumQuads, // We alternate so proxyCnt == cnt
+                                  GrSamplerState::Filter::kNearest,
+                                  GrTextureOp::Saturate::kYes,
+                                  blendMode,
+                                  overallAA,
+                                  SkCanvas::kStrict_SrcRectConstraint,
+                                  SkMatrix::I(), nullptr);
 
     GrOpsTask* opsTask = rtc->testingOnly_PeekLastOpsTask();
     int actualNumOps = opsTask->numOpChains();