Speed up the fixed count parametric/radial sort

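The fixed count stroke shader was executing a sort loop long enough to
handle 1024 (2^10) parametric segments, when in reality it will never
see more than 48. This CL cuts the per-vertex sorting work nearly in
half by reducing the number of loop iterations from 10 to 6, i.e.
ceil(log2(48)). The loop count is now passed to the shader by each
tessellator and is included in the shader key. (The indirect
tessellator continues to use 10 iterations, and the hardware
tessellator continues to use SkNextLog2(maxTessellationSegments).)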

Bug: skia:10496
Change-Id: Idc21d6015d449f63915780972a8d487c9c6de6fe
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/412496
Reviewed-by: Jim Van Verth <jvanverth@google.com>
Commit-Queue: Chris Dalton <csmartdalton@google.com>
diff --git a/bench/TessellateBench.cpp b/bench/TessellateBench.cpp
index 5cc70d8..fc8ad69 100644
--- a/bench/TessellateBench.cpp
+++ b/bench/TessellateBench.cpp
@@ -211,21 +211,24 @@
 }
 
 using PathStrokeList = GrStrokeTessellator::PathStrokeList;
-using MakeTessellatorFn = std::unique_ptr<GrStrokeTessellator>(*)(ShaderFlags, const SkMatrix&,
-                                                                  PathStrokeList*,
+using MakeTessellatorFn = std::unique_ptr<GrStrokeTessellator>(*)(ShaderFlags, const GrShaderCaps&,
+                                                                  const SkMatrix&, PathStrokeList*,
                                                                   std::array<float, 2>, const
                                                                   SkRect&);
 
 static std::unique_ptr<GrStrokeTessellator> make_hw_tessellator(
-        ShaderFlags shaderFlags, const SkMatrix& viewMatrix, PathStrokeList* pathStrokeList,
-        std::array<float, 2> matrixMinMaxScales, const SkRect& strokeCullBounds) {
-    return std::make_unique<GrStrokeHardwareTessellator>(shaderFlags, viewMatrix, pathStrokeList,
-                                                         matrixMinMaxScales, strokeCullBounds);
+        ShaderFlags shaderFlags, const GrShaderCaps& shaderCaps, const SkMatrix& viewMatrix,
+        PathStrokeList* pathStrokeList, std::array<float, 2> matrixMinMaxScales,
+        const SkRect& strokeCullBounds) {
+    return std::make_unique<GrStrokeHardwareTessellator>(shaderFlags, shaderCaps, viewMatrix,
+                                                         pathStrokeList, matrixMinMaxScales,
+                                                         strokeCullBounds);
 }
 
 static std::unique_ptr<GrStrokeTessellator> make_fixed_count_tessellator(
-        ShaderFlags shaderFlags, const SkMatrix& viewMatrix, PathStrokeList* pathStrokeList,
-        std::array<float, 2> matrixMinMaxScales, const SkRect& strokeCullBounds) {
+        ShaderFlags shaderFlags, const GrShaderCaps&, const SkMatrix& viewMatrix,
+        PathStrokeList* pathStrokeList, std::array<float, 2> matrixMinMaxScales,
+        const SkRect& strokeCullBounds) {
     return std::make_unique<GrStrokeFixedCountTessellator>(shaderFlags, viewMatrix, pathStrokeList,
                                                            matrixMinMaxScales, strokeCullBounds);
 }
@@ -327,7 +330,8 @@
             fTotalVerbCount += fPathStrokes[i].fPath.countVerbs();
         }
 
-        fTessellator = fMakeTessellatorFn(fShaderFlags, SkMatrix::Scale(fMatrixScale, fMatrixScale),
+        fTessellator = fMakeTessellatorFn(fShaderFlags, *fTarget->caps().shaderCaps(),
+                                          SkMatrix::Scale(fMatrixScale, fMatrixScale),
                                           fPathStrokes.data(), {fMatrixScale, fMatrixScale},
                                           SkRectPriv::MakeLargest());
     }
diff --git a/src/gpu/tessellate/GrStrokeFixedCountTessellator.cpp b/src/gpu/tessellate/GrStrokeFixedCountTessellator.cpp
index a17bdb4..e8865a3 100644
--- a/src/gpu/tessellate/GrStrokeFixedCountTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeFixedCountTessellator.cpp
@@ -16,6 +16,7 @@
 namespace {
 
 constexpr static float kMaxParametricSegments_pow4 = 48*48*48*48;  // 48^4
+constexpr static int8_t kMaxParametricSegments_log2 = 6;  // ceil(log2(48))
 
 // Writes out strokes to the given instance chunk array, chopping if necessary so that all instances
 // require 48 parametric segments or less. (We don't consider radial segments here. The tessellator
@@ -233,6 +234,16 @@
 
 }  // namespace
 
+GrStrokeFixedCountTessellator::GrStrokeFixedCountTessellator(ShaderFlags shaderFlags,
+                                                             const SkMatrix& viewMatrix,
+                                                             PathStrokeList* pathStrokeList,
+                                                             std::array<float,2> matrixMinMaxScales,
+                                                             const SkRect& strokeCullBounds)
+        : GrStrokeTessellator(GrStrokeShader::Mode::kFixedCount, shaderFlags,
+                              kMaxParametricSegments_log2, viewMatrix, pathStrokeList,
+                              matrixMinMaxScales, strokeCullBounds) {
+}
+
 void GrStrokeFixedCountTessellator::prepare(GrMeshDrawOp::Target* target,
                                             int totalCombinedVerbCnt) {
     int maxEdgesInJoin = 0;
diff --git a/src/gpu/tessellate/GrStrokeFixedCountTessellator.h b/src/gpu/tessellate/GrStrokeFixedCountTessellator.h
index aeabbd3..ce708f8 100644
--- a/src/gpu/tessellate/GrStrokeFixedCountTessellator.h
+++ b/src/gpu/tessellate/GrStrokeFixedCountTessellator.h
@@ -15,14 +15,9 @@
 // instance are emitted as degenerate triangles.
 class GrStrokeFixedCountTessellator : public GrStrokeTessellator {
 public:
-    GrStrokeFixedCountTessellator(ShaderFlags shaderFlags, const SkMatrix& viewMatrix,
-                                  PathStrokeList* pathStrokeList,
-                                  std::array<float, 2> matrixMinMaxScales,
-                                  const SkRect& strokeCullBounds)
-            : GrStrokeTessellator(GrStrokeShader::Mode::kFixedCount, shaderFlags,
-                                  viewMatrix, pathStrokeList, matrixMinMaxScales,
-                                  strokeCullBounds) {
-    }
+    GrStrokeFixedCountTessellator(ShaderFlags, const SkMatrix&, PathStrokeList*,
+                                  std::array<float,2> matrixMinMaxScales,
+                                  const SkRect& strokeCullBounds);
 
     void prepare(GrMeshDrawOp::Target*, int totalCombinedVerbCnt) override;
     void draw(GrOpFlushState*) const override;
diff --git a/src/gpu/tessellate/GrStrokeHardwareTessellator.h b/src/gpu/tessellate/GrStrokeHardwareTessellator.h
index 2c61fef..3c4b19d 100644
--- a/src/gpu/tessellate/GrStrokeHardwareTessellator.h
+++ b/src/gpu/tessellate/GrStrokeHardwareTessellator.h
@@ -16,13 +16,13 @@
 // MSAA if antialiasing is desired.
 class GrStrokeHardwareTessellator : public GrStrokeTessellator {
 public:
-    GrStrokeHardwareTessellator(ShaderFlags shaderFlags, const SkMatrix& viewMatrix,
-                                PathStrokeList* pathStrokeList,
-                                std::array<float, 2> matrixMinMaxScales,
-                                const SkRect& strokeCullBounds)
-            : GrStrokeTessellator(GrStrokeShader::Mode::kHardwareTessellation,
-                                  shaderFlags, viewMatrix, pathStrokeList, matrixMinMaxScales,
-                                  strokeCullBounds) {
+    GrStrokeHardwareTessellator(ShaderFlags shaderFlags, const GrShaderCaps& shaderCaps,
+                                const SkMatrix& viewMatrix, PathStrokeList* pathStrokeList,
+                                std::array<float,2> matrixMinMaxScales, const SkRect&
+                                strokeCullBounds)
+            : GrStrokeTessellator(GrStrokeShader::Mode::kHardwareTessellation, shaderFlags,
+                                  SkNextLog2(shaderCaps.maxTessellationSegments()), viewMatrix,
+                                  pathStrokeList, matrixMinMaxScales, strokeCullBounds) {
     }
 
     void prepare(GrMeshDrawOp::Target*, int totalCombinedVerbCnt) override;
diff --git a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
index ec5e60a..d49861e 100644
--- a/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
+++ b/src/gpu/tessellate/GrStrokeIndirectTessellator.cpp
@@ -66,7 +66,7 @@
 public:
     constexpr static int8_t kMaxResolveLevel = GrStrokeIndirectTessellator::kMaxResolveLevel;
 
-    ResolveLevelCounter(int* resolveLevelCounts, std::array<float, 2> matrixMinMaxScales)
+    ResolveLevelCounter(int* resolveLevelCounts, std::array<float,2> matrixMinMaxScales)
             : fResolveLevelCounts(resolveLevelCounts), fMatrixMinMaxScales(matrixMinMaxScales) {
     }
 
@@ -426,7 +426,7 @@
 
 #endif
     int* const fResolveLevelCounts;
-    std::array<float, 2> fMatrixMinMaxScales;
+    std::array<float,2> fMatrixMinMaxScales;
     GrStrokeTolerances fTolerances;
     int fResolveLevelForCircles;
     bool fIsRoundJoin;
@@ -434,15 +434,20 @@
 
 }  // namespace
 
+// GrTessellationPathRenderer crops paths that require more than 2^kMaxResolveLevel parametric
+// segments.
+constexpr static int8_t kMaxParametricSegments_log2 = GrTessellationPathRenderer::kMaxResolveLevel;
+
 GrStrokeIndirectTessellator::GrStrokeIndirectTessellator(ShaderFlags shaderFlags,
                                                          const SkMatrix& viewMatrix,
                                                          PathStrokeList* pathStrokeList,
-                                                         std::array<float, 2> matrixMinMaxScales,
+                                                         std::array<float,2> matrixMinMaxScales,
                                                          const SkRect& strokeCullBounds,
                                                          int totalCombinedVerbCnt,
                                                          SkArenaAlloc* alloc)
-        : GrStrokeTessellator(GrStrokeShader::Mode::kLog2Indirect, shaderFlags, viewMatrix,
-                              pathStrokeList, matrixMinMaxScales, strokeCullBounds) {
+        : GrStrokeTessellator(GrStrokeShader::Mode::kLog2Indirect, shaderFlags,
+                              kMaxParametricSegments_log2, viewMatrix, pathStrokeList,
+                              matrixMinMaxScales, strokeCullBounds) {
     // The maximum potential number of values we will need in fResolveLevels is:
     //
     //   * 3 segments per verb (from two chops)
diff --git a/src/gpu/tessellate/GrStrokeIndirectTessellator.h b/src/gpu/tessellate/GrStrokeIndirectTessellator.h
index b3cdb27..ce481dc 100644
--- a/src/gpu/tessellate/GrStrokeIndirectTessellator.h
+++ b/src/gpu/tessellate/GrStrokeIndirectTessellator.h
@@ -23,7 +23,7 @@
     constexpr static int8_t kMaxResolveLevel = 15;
 
     GrStrokeIndirectTessellator(ShaderFlags, const SkMatrix& viewMatrix, PathStrokeList*,
-                                std::array<float, 2> matrixMinMaxScales,
+                                std::array<float,2> matrixMinMaxScales,
                                 const SkRect& strokeCullBounds, int totalCombinedVerbCnt,
                                 SkArenaAlloc*);
 
diff --git a/src/gpu/tessellate/GrStrokeInstancedShaderImpl.cpp b/src/gpu/tessellate/GrStrokeInstancedShaderImpl.cpp
index 6a06059..10f74ff 100644
--- a/src/gpu/tessellate/GrStrokeInstancedShaderImpl.cpp
+++ b/src/gpu/tessellate/GrStrokeInstancedShaderImpl.cpp
@@ -18,9 +18,6 @@
     SkPaint::Join joinType = shader.stroke().getJoin();
     args.fVaryingHandler->emitAttributes(shader);
 
-    // Constants.
-    args.fVertBuilder->defineConstant("MAX_PARAMETRIC_SEGMENTS_LOG2",
-                                      GrTessellationPathRenderer::kMaxResolveLevel);
     args.fVertBuilder->defineConstant("float", "PI", "3.141592653589793238");
 
     // Helper functions.
@@ -108,9 +105,7 @@
 
     args.fVertBuilder->codeAppend(R"(
     // Find how many parametric segments this stroke requires.
-    float numParametricSegments = min(wangs_formula(PARAMETRIC_PRECISION,
-                                                    P[0], P[1], P[2], P[3], w),
-                                      float(1 << MAX_PARAMETRIC_SEGMENTS_LOG2));
+    float numParametricSegments = wangs_formula(PARAMETRIC_PRECISION, P[0], P[1], P[2], P[3], w);
     if (P[0] == P[1] && P[2] == P[3]) {
         // This is how we describe lines, but Wang's formula does not return 1 in this case.
         numParametricSegments = 1;
diff --git a/src/gpu/tessellate/GrStrokeShader.cpp b/src/gpu/tessellate/GrStrokeShader.cpp
index e0e6428..78d66f4 100644
--- a/src/gpu/tessellate/GrStrokeShader.cpp
+++ b/src/gpu/tessellate/GrStrokeShader.cpp
@@ -89,7 +89,7 @@
     //     float angle0;
     //     float strokeOutset;
     //
-    code->append(R"(
+    code->appendf(R"(
     float2 tangent, strokeCoord;
     if (combinedEdgeID != 0 && !isFinalEdge) {
         // Compute the location and tangent direction of the stroke edge with the integral id
@@ -138,7 +138,7 @@
         float2 tan0norm = normalize(tan0);
         float negAbsRadsPerSegment = -abs(radsPerSegment);
         float maxRotation0 = (1.0 + combinedEdgeID) * abs(radsPerSegment);
-        for (int exp = MAX_PARAMETRIC_SEGMENTS_LOG2 - 1; exp >= 0; --exp) {
+        for (int exp = %i - 1; exp >= 0; --exp) {
             // Test the parametric edge at lastParametricEdgeID + 2^exp.
             float testParametricID = lastParametricEdgeID + float(1 << exp);
             if (testParametricID <= maxParametricEdgeID) {
@@ -229,7 +229,7 @@
         // ensures crack-free seaming between instances.
         tangent = (combinedEdgeID == 0) ? tan0 : tan1;
         strokeCoord = (combinedEdgeID == 0) ? P[0] : P[3];
-    })");
+    })", shader.maxParametricSegments_log2() /* Parametric/radial sort loop count. */);
 
     code->append(R"(
     float2 ortho = normalize(float2(tangent.y, -tangent.x));
@@ -322,6 +322,7 @@
     key = (key << 2) | (uint32_t)fMode;
     key = (key << 2) | ((keyNeedsJoin) ? fStroke.getJoin() : 0);
     key = (key << 1) | (uint32_t)fStroke.isHairlineStyle();
+    key = (key << 8) | fMaxParametricSegments_log2;
     b->add32(key);
 }
 
diff --git a/src/gpu/tessellate/GrStrokeShader.h b/src/gpu/tessellate/GrStrokeShader.h
index 24adb89..00b732f 100644
--- a/src/gpu/tessellate/GrStrokeShader.h
+++ b/src/gpu/tessellate/GrStrokeShader.h
@@ -26,13 +26,13 @@
 class GrStrokeShader : public GrPathShader {
 public:
     // Are we using hardware tessellation or indirect draws?
-    enum class Mode {
+    enum class Mode : int8_t {
         kHardwareTessellation,
         kLog2Indirect,
         kFixedCount
     };
 
-    enum class ShaderFlags {
+    enum class ShaderFlags : uint8_t {
         kNone          = 0,
         kWideColor     = 1 << 0,
         kDynamicStroke = 1 << 1,  // Each patch or instance has its own stroke width and join type.
@@ -98,14 +98,15 @@
     };
 
     // 'viewMatrix' is applied to the geometry post tessellation. It cannot have perspective.
-    GrStrokeShader(Mode mode, ShaderFlags shaderFlags, const SkMatrix& viewMatrix,
-                   const SkStrokeRec& stroke, SkPMColor4f color)
+    GrStrokeShader(Mode mode, ShaderFlags shaderFlags, int8_t maxParametricSegments_log2,
+                   const SkMatrix& viewMatrix, const SkStrokeRec& stroke, SkPMColor4f color)
             : GrPathShader(kTessellate_GrStrokeShader_ClassID, viewMatrix,
                            (mode == Mode::kHardwareTessellation) ?
                                    GrPrimitiveType::kPatches : GrPrimitiveType::kTriangleStrip,
                            (mode == Mode::kHardwareTessellation) ? 1 : 0)
             , fMode(mode)
             , fShaderFlags(shaderFlags)
+            , fMaxParametricSegments_log2(maxParametricSegments_log2)
             , fStroke(stroke)
             , fColor(color) {
         if (fMode == Mode::kHardwareTessellation) {
@@ -167,6 +168,7 @@
 
     Mode mode() const { return fMode; }
     ShaderFlags flags() const { return fShaderFlags; }
+    int8_t maxParametricSegments_log2() const { return fMaxParametricSegments_log2; }
     bool hasDynamicStroke() const { return fShaderFlags & ShaderFlags::kDynamicStroke; }
     bool hasDynamicColor() const { return fShaderFlags & ShaderFlags::kDynamicColor; }
     const SkStrokeRec& stroke() const { return fStroke;}
@@ -187,6 +189,7 @@
 
     const Mode fMode;
     const ShaderFlags fShaderFlags;
+    const int8_t fMaxParametricSegments_log2;
     const SkStrokeRec fStroke;
     const SkPMColor4f fColor;
 
diff --git a/src/gpu/tessellate/GrStrokeTessellateOp.cpp b/src/gpu/tessellate/GrStrokeTessellateOp.cpp
index 8e2f881..c97a374 100644
--- a/src/gpu/tessellate/GrStrokeTessellateOp.cpp
+++ b/src/gpu/tessellate/GrStrokeTessellateOp.cpp
@@ -196,8 +196,8 @@
     if (this->canUseHardwareTessellation(fTotalCombinedVerbCnt, caps)) {
         // Only use hardware tessellation if we're drawing a somewhat large number of verbs.
         // Otherwise we seem to be better off using instanced draws.
-        fTessellator = arena->make<GrStrokeHardwareTessellator>(fShaderFlags, fViewMatrix,
-                                                                &fPathStrokeList,
+        fTessellator = arena->make<GrStrokeHardwareTessellator>(fShaderFlags, *caps.shaderCaps(),
+                                                                fViewMatrix, &fPathStrokeList,
                                                                 matrixMinMaxScales,
                                                                 strokeCullBounds);
     } else if (fTotalCombinedVerbCnt > 50 && !(fShaderFlags & ShaderFlags::kDynamicColor)) {
diff --git a/src/gpu/tessellate/GrStrokeTessellationShaderImpl.cpp b/src/gpu/tessellate/GrStrokeTessellationShaderImpl.cpp
index 9817aee..969932f 100644
--- a/src/gpu/tessellate/GrStrokeTessellationShaderImpl.cpp
+++ b/src/gpu/tessellate/GrStrokeTessellationShaderImpl.cpp
@@ -540,8 +540,6 @@
     code.appendf("#define float3x2 mat3x2\n");
     code.appendf("#define float4x2 mat4x2\n");
     code.appendf("#define PI 3.141592653589793238\n");
-    code.appendf("#define MAX_PARAMETRIC_SEGMENTS_LOG2 %i\n",
-                 SkNextLog2(shaderCaps.maxTessellationSegments()));
 
     if (!shader.hasDynamicStroke()) {
         const char* tessArgsName = uniformHandler.getUniformCStr(fTessControlArgsUniform);
diff --git a/src/gpu/tessellate/GrStrokeTessellator.h b/src/gpu/tessellate/GrStrokeTessellator.h
index f1e493c..4be1e36 100644
--- a/src/gpu/tessellate/GrStrokeTessellator.h
+++ b/src/gpu/tessellate/GrStrokeTessellator.h
@@ -26,10 +26,11 @@
     };
 
     GrStrokeTessellator(GrStrokeShader::Mode shaderMode, ShaderFlags shaderFlags,
-                        const SkMatrix& viewMatrix, PathStrokeList* pathStrokeList,
-                        std::array<float, 2> matrixMinMaxScales, const SkRect& strokeCullBounds)
-            : fShader(shaderMode, shaderFlags, viewMatrix, pathStrokeList->fStroke,
-                      pathStrokeList->fColor)
+                        int8_t maxParametricSegments_log2, const SkMatrix& viewMatrix,
+                        PathStrokeList* pathStrokeList, std::array<float,2> matrixMinMaxScales,
+                        const SkRect& strokeCullBounds)
+            : fShader(shaderMode, shaderFlags, maxParametricSegments_log2, viewMatrix,
+                      pathStrokeList->fStroke, pathStrokeList->fColor)
             , fPathStrokeList(pathStrokeList)
             , fMatrixMinMaxScales(matrixMinMaxScales)
             , fStrokeCullBounds(strokeCullBounds) {
@@ -49,7 +50,7 @@
 protected:
     GrStrokeShader fShader;
     PathStrokeList* fPathStrokeList;
-    const std::array<float, 2> fMatrixMinMaxScales;
+    const std::array<float,2> fMatrixMinMaxScales;
     const SkRect fStrokeCullBounds;  // See SkStrokeRec::inflationRadius.
 };