Add code path that avoids large indexBuffer draws

Timings for the bulkrect_1000_random_uniqueimages_batch bench on a Nexus 6P/Adreno 430:

w/o this CL
curr/maxrss	loops	min	median	mean	max	stddev	samples   	config
 304/304 MB	1	151ms	159ms	158ms	163ms	3%	▆█▇▄▆▆▁▂█▅	gles

w/ this CL
curr/maxrss	loops	min	median	mean	max	stddev	samples   	config
 286/286 MB	1	18.1ms	18.1ms	18.1ms	18.1ms	0%	▂▄▅▃▅▅▃▄▁█	gles

Change-Id: I0f6d690b953444ec7a3176cb27c8a253caa55f5d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/255986
Commit-Queue: Robert Phillips <robertphillips@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
diff --git a/src/gpu/GrCaps.cpp b/src/gpu/GrCaps.cpp
index ee2ba50..48dc58c 100644
--- a/src/gpu/GrCaps.cpp
+++ b/src/gpu/GrCaps.cpp
@@ -40,6 +40,7 @@
     fDynamicStateArrayGeometryProcessorTextureSupport = false;
     fPerformPartialClearsAsDraws = false;
     fPerformColorClearsAsDraws = false;
+    fAvoidLargeIndexBufferDraws = false;
     fPerformStencilClearsAsDraws = false;
     fAllowCoverageCounting = false;
     fTransferBufferSupport = false;
@@ -203,6 +204,7 @@
                        fDynamicStateArrayGeometryProcessorTextureSupport);
     writer->appendBool("Use draws for partial clears", fPerformPartialClearsAsDraws);
     writer->appendBool("Use draws for color clears", fPerformColorClearsAsDraws);
+    writer->appendBool("Avoid Large IndexBuffer Draws", fAvoidLargeIndexBufferDraws);
     writer->appendBool("Use draws for stencil clip clears", fPerformStencilClearsAsDraws);
     writer->appendBool("Allow coverage counting shortcuts", fAllowCoverageCounting);
     writer->appendBool("Supports transfer buffers", fTransferBufferSupport);
diff --git a/src/gpu/GrCaps.h b/src/gpu/GrCaps.h
index d8be432..d501edc 100644
--- a/src/gpu/GrCaps.h
+++ b/src/gpu/GrCaps.h
@@ -332,6 +332,8 @@
     // Many drivers have issues with color clears.
     bool performColorClearsAsDraws() const { return fPerformColorClearsAsDraws; }
 
+    bool avoidLargeIndexBufferDraws() const { return fAvoidLargeIndexBufferDraws; }
+
     /// Adreno 4xx devices experience an issue when there are a large number of stencil clip bit
     /// clears. The minimal repro steps are not precisely known but drawing a rect with a stencil
     /// op instead of using glClear seems to resolve the issue.
@@ -487,6 +489,7 @@
     bool fClampToBorderSupport                       : 1;
     bool fPerformPartialClearsAsDraws                : 1;
     bool fPerformColorClearsAsDraws                  : 1;
+    bool fAvoidLargeIndexBufferDraws                 : 1;
     bool fPerformStencilClearsAsDraws                : 1;
     bool fAllowCoverageCounting                      : 1;
     bool fTransferBufferSupport                      : 1;
diff --git a/src/gpu/gl/GrGLCaps.cpp b/src/gpu/gl/GrGLCaps.cpp
index c25595c..5678ef7 100644
--- a/src/gpu/gl/GrGLCaps.cpp
+++ b/src/gpu/gl/GrGLCaps.cpp
@@ -3365,6 +3365,12 @@
         fPerformStencilClearsAsDraws = true;
     }
 
+    if (ctxInfo.vendor() == kQualcomm_GrGLVendor) {
+        // It appears that all the Adreno GPUs have less than optimal performance when
+        // drawing w/ large index buffers.
+        fAvoidLargeIndexBufferDraws = true;
+    }
+
     // This was reproduced on the following configurations:
     // - A Galaxy J5 (Adreno 306) running Android 6 with driver 140.0
     // - A Nexus 7 2013 (Adreno 320) running Android 5 with driver 104.0
diff --git a/src/gpu/ops/GrFillRectOp.cpp b/src/gpu/ops/GrFillRectOp.cpp
index e213283..0b35528 100644
--- a/src/gpu/ops/GrFillRectOp.cpp
+++ b/src/gpu/ops/GrFillRectOp.cpp
@@ -238,9 +238,9 @@
 
         // Configure the mesh for the vertex data
         GrMesh* mesh = target->allocMeshes(1);
-        GrQuadPerEdgeAA::ConfigureMesh(mesh, vertexSpec, 0, fQuads.count(), totalNumVertices,
-                                       std::move(vertexBuffer), std::move(indexBuffer),
-                                       vertexOffsetInBuffer);
+        GrQuadPerEdgeAA::ConfigureMesh(target->caps(), mesh, vertexSpec, 0, fQuads.count(),
+                                       totalNumVertices, std::move(vertexBuffer),
+                                       std::move(indexBuffer), vertexOffsetInBuffer);
         target->recordDraw(gp, mesh, 1, vertexSpec.primitiveType());
     }
 
diff --git a/src/gpu/ops/GrQuadPerEdgeAA.cpp b/src/gpu/ops/GrQuadPerEdgeAA.cpp
index 70a8607..ca0c7c0 100644
--- a/src/gpu/ops/GrQuadPerEdgeAA.cpp
+++ b/src/gpu/ops/GrQuadPerEdgeAA.cpp
@@ -381,7 +381,7 @@
     SkUNREACHABLE;
 }
 
-void ConfigureMesh(GrMesh* mesh, const VertexSpec& spec,
+void ConfigureMesh(const GrCaps& caps, GrMesh* mesh, const VertexSpec& spec,
                    int runningQuadCount, int quadsInDraw, int maxVerts,
                    sk_sp<const GrBuffer> vertexBuffer,
                    sk_sp<const GrBuffer> indexBuffer, int absVertBufferOffset) {
@@ -403,28 +403,42 @@
              spec.indexBufferOption() == IndexBufferOption::kIndexedRects);
     SkASSERT(indexBuffer);
 
-    int baseIndex, numIndicesToDraw;
-    int minVertex, maxVertex;
+    int maxNumQuads, numIndicesPerQuad, numVertsPerQuad;
 
     if (spec.indexBufferOption() == IndexBufferOption::kPictureFramed) {
-        SkASSERT(runningQuadCount + quadsInDraw <= GrResourceProvider::MaxNumAAQuads());
         // AA uses 8 vertices and 30 indices per quad, basically nested rectangles
-        baseIndex = runningQuadCount * GrResourceProvider::NumIndicesPerAAQuad();
-        numIndicesToDraw = quadsInDraw * GrResourceProvider::NumIndicesPerAAQuad();
-        minVertex = runningQuadCount * GrResourceProvider::NumVertsPerAAQuad();
-        maxVertex = (runningQuadCount + quadsInDraw) * GrResourceProvider::NumVertsPerAAQuad();
+        maxNumQuads = GrResourceProvider::MaxNumAAQuads();
+        numIndicesPerQuad = GrResourceProvider::NumIndicesPerAAQuad();
+        numVertsPerQuad = GrResourceProvider::NumVertsPerAAQuad();
     } else {
-        SkASSERT(runningQuadCount + quadsInDraw <= GrResourceProvider::MaxNumNonAAQuads());
         // Non-AA uses 4 vertices and 6 indices per quad
-        baseIndex = runningQuadCount * GrResourceProvider::NumIndicesPerNonAAQuad();
-        numIndicesToDraw = quadsInDraw * GrResourceProvider::NumIndicesPerNonAAQuad();
-        minVertex = runningQuadCount * GrResourceProvider::NumVertsPerNonAAQuad();
-        maxVertex = (runningQuadCount + quadsInDraw) * GrResourceProvider::NumVertsPerNonAAQuad();
+        maxNumQuads = GrResourceProvider::MaxNumNonAAQuads();
+        numIndicesPerQuad = GrResourceProvider::NumIndicesPerNonAAQuad();
+        numVertsPerQuad = GrResourceProvider::NumVertsPerNonAAQuad();
     }
 
-    mesh->setIndexed(std::move(indexBuffer), numIndicesToDraw, baseIndex, minVertex, maxVertex,
-                     GrPrimitiveRestart::kNo);
-    mesh->setVertexData(std::move(vertexBuffer), absVertBufferOffset);
+    SkASSERT(runningQuadCount + quadsInDraw <= maxNumQuads);
+
+    if (caps.avoidLargeIndexBufferDraws()) {
+        // To avoid a large index buffer draw we shift the base vertex of the draw rather than
+        // using a base index. In GL, changing the base vertex requires rebinding all vertex
+        // attrib arrays, which is why a base index is generally preferred otherwise.
+        int offset = absVertBufferOffset + runningQuadCount * numVertsPerQuad;
+
+        mesh->setIndexedPatterned(std::move(indexBuffer), numIndicesPerQuad,
+                                  numVertsPerQuad, quadsInDraw, maxNumQuads);
+        mesh->setVertexData(std::move(vertexBuffer), offset);
+    } else {
+        int baseIndex = runningQuadCount * numIndicesPerQuad;
+        int numIndicesToDraw = quadsInDraw * numIndicesPerQuad;
+
+        int minVertex = runningQuadCount * numVertsPerQuad;
+        int maxVertex = (runningQuadCount + quadsInDraw) * numVertsPerQuad;
+
+        mesh->setIndexed(std::move(indexBuffer), numIndicesToDraw,
+                         baseIndex, minVertex, maxVertex, GrPrimitiveRestart::kNo);
+        mesh->setVertexData(std::move(vertexBuffer), absVertBufferOffset);
+    }
 }
 
 ////////////////// VertexSpec Implementation
diff --git a/src/gpu/ops/GrQuadPerEdgeAA.h b/src/gpu/ops/GrQuadPerEdgeAA.h
index 2b99c68..8a77fa7 100644
--- a/src/gpu/ops/GrQuadPerEdgeAA.h
+++ b/src/gpu/ops/GrQuadPerEdgeAA.h
@@ -186,8 +186,8 @@
     // @param quadCount         the number of quads that will be drawn by the provided 'mesh'.
     //                          A subsequent ConfigureMesh call would the use
     //                          'runningQuadCount' + 'quadCount' for its new 'runningQuadCount'.
-    void ConfigureMesh(GrMesh* mesh, const VertexSpec&, int runningQuadCount, int quadCount,
-                       int maxVerts, sk_sp<const GrBuffer> vertexBuffer,
+    void ConfigureMesh(const GrCaps&, GrMesh*, const VertexSpec&, int runningQuadCount,
+                       int quadCount, int maxVerts, sk_sp<const GrBuffer> vertexBuffer,
                        sk_sp<const GrBuffer> indexBuffer, int absVertBufferOffset);
 
 } // namespace GrQuadPerEdgeAA
diff --git a/src/gpu/ops/GrTextureOp.cpp b/src/gpu/ops/GrTextureOp.cpp
index 47cdbe8..c91db17 100644
--- a/src/gpu/ops/GrTextureOp.cpp
+++ b/src/gpu/ops/GrTextureOp.cpp
@@ -574,11 +574,11 @@
 
         // At this juncture we only fill in the vertex data and state arrays. Filling in of
         // the meshes is left until onPrepareDraws.
-        SkAssertResult(FillInData(this, fPrePreparedDesc, fPrePreparedDesc->fVertices,
-                                  nullptr, 0, nullptr, nullptr));
+        SkAssertResult(FillInData(*context->priv().caps(), this, fPrePreparedDesc,
+                                  fPrePreparedDesc->fVertices, nullptr, 0, nullptr, nullptr));
     }
 
-    static bool FillInData(TextureOp* texOp, PrePreparedDesc* desc,
+    static bool FillInData(const GrCaps& caps, TextureOp* texOp, PrePreparedDesc* desc,
                            char* pVertexData, GrMesh* meshes, int absBufferOffset,
                            sk_sp<const GrBuffer> vertexBuffer,
                            sk_sp<const GrBuffer> indexBuffer) {
@@ -611,7 +611,7 @@
                 }
 
                 if (meshes) {
-                    GrQuadPerEdgeAA::ConfigureMesh(&(meshes[meshIndex]), desc->fVertexSpec,
+                    GrQuadPerEdgeAA::ConfigureMesh(caps, &(meshes[meshIndex]), desc->fVertexSpec,
                                                    totQuadsSeen, quadCnt, desc->totalNumVertices(),
                                                    vertexBuffer, indexBuffer, absBufferOffset);
                 }
@@ -793,12 +793,12 @@
             memcpy(vdata, desc.fVertices, desc.totalSizeInBytes());
             // The above memcpy filled in the vertex data - just call FillInData to fill in the
             // mesh data
-            result = FillInData(this, &desc, nullptr, meshes, vertexOffsetInBuffer,
+            result = FillInData(target->caps(), this, &desc, nullptr, meshes, vertexOffsetInBuffer,
                                 std::move(vbuffer), std::move(indexBuffer));
         } else {
             // Fills in both vertex data and mesh data
-            result = FillInData(this, &desc, (char*) vdata, meshes, vertexOffsetInBuffer,
-                                std::move(vbuffer), std::move(indexBuffer));
+            result = FillInData(target->caps(), this, &desc, (char*) vdata, meshes,
+                                vertexOffsetInBuffer, std::move(vbuffer), std::move(indexBuffer));
         }
 
         if (!result) {