CCPR: Process quadratic flat edges without soft msaa

The artifacts previously thought to require msaa can be handled by
(1) converting near-linear quadratics into lines, and (2) ensuring all
quadratic segments are monotonic with respect to the vector of their
closing edge [P2 -> P0].

No. 1 was already in effect.

No. 2 is implemented by this change.

Now we only fall back on soft msaa for the two corner pixels.

This change also does some generic housekeeping in the quadratic
processor.

Bug: skia:
Change-Id: Ib3309c2ed86d3d8bec5f451125a69326e82eeb1c
Reviewed-on: https://skia-review.googlesource.com/29721
Commit-Queue: Chris Dalton <csmartdalton@google.com>
Reviewed-by: Greg Daniel <egdaniel@google.com>
diff --git a/src/gpu/GrPathUtils.cpp b/src/gpu/GrPathUtils.cpp
index b6711a0..9a79f20 100644
--- a/src/gpu/GrPathUtils.cpp
+++ b/src/gpu/GrPathUtils.cpp
@@ -567,6 +567,66 @@
     }
 }
 
+static inline Sk2f normalize(const Sk2f& n) {
+    Sk2f nn = n*n;
+    return n * (nn + SkNx_shuffle<1,0>(nn)).rsqrt();
+}
+
+bool GrPathUtils::chopMonotonicQuads(const SkPoint p[3], SkPoint dst[5]) {
+    GR_STATIC_ASSERT(SK_SCALAR_IS_FLOAT);
+    GR_STATIC_ASSERT(2 * sizeof(float) == sizeof(SkPoint));
+    GR_STATIC_ASSERT(0 == offsetof(SkPoint, fX));
+
+    Sk2f p0 = Sk2f::Load(&p[0]);
+    Sk2f p1 = Sk2f::Load(&p[1]);
+    Sk2f p2 = Sk2f::Load(&p[2]);
+
+    Sk2f tan0 = p1 - p0;
+    Sk2f tan1 = p2 - p1;
+    Sk2f v = p2 - p0;
+
+    // Check if the curve is already monotonic (i.e. (tan0 dot v) >= 0 and (tan1 dot v) >= 0).
+    // This should almost always be the case for well-behaved curves in the real world.
+    float dot0[2], dot1[2];
+    (tan0 * v).store(dot0);
+    (tan1 * v).store(dot1);
+    if (dot0[0] + dot0[1] >= 0 && dot1[0] + dot1[1] >= 0) {
+        return false;
+    }
+
+    // Chop the curve into two segments with equal curvature. To do this we find the T value whose
+    // tangent is perpendicular to the vector that bisects tan0 and -tan1.
+    Sk2f n = normalize(tan0) - normalize(tan1);
+
+    // This tangent can be found where (dQ(t) dot n) = 0:
+    //
+    //   0 = (dQ(t) dot n) = | 2*t  1 | * | p0 - 2*p1 + p2 | * | n |
+    //                                    | -2*p0 + 2*p1   |   | . |
+    //
+    //                     = | 2*t  1 | * | tan1 - tan0 | * | n |
+    //                                    | 2*tan0      |   | . |
+    //
+    //                     = 2*t * ((tan1 - tan0) dot n) + (2*tan0 dot n)
+    //
+    //   t = (tan0 dot n) / ((tan0 - tan1) dot n)
+    Sk2f dQ1n = (tan0 - tan1) * n;
+    Sk2f dQ0n = tan0 * n;
+    Sk2f t = (dQ0n + SkNx_shuffle<1,0>(dQ0n)) / (dQ1n + SkNx_shuffle<1,0>(dQ1n));
+    t = Sk2f::Min(Sk2f::Max(t, 0), 1); // Clamp for FP error.
+
+    Sk2f p01 = SkNx_fma(t, tan0, p0);
+    Sk2f p12 = SkNx_fma(t, tan1, p1);
+    Sk2f p012 = SkNx_fma(t, p12 - p01, p01);
+
+    p0.store(&dst[0]);
+    p01.store(&dst[1]);
+    p012.store(&dst[2]);
+    p12.store(&dst[3]);
+    p2.store(&dst[4]);
+
+    return true;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 /**
diff --git a/src/gpu/GrPathUtils.h b/src/gpu/GrPathUtils.h
index e9dee73..4643bff 100644
--- a/src/gpu/GrPathUtils.h
+++ b/src/gpu/GrPathUtils.h
@@ -124,6 +124,14 @@
                                                 SkPathPriv::FirstDirection dir,
                                                 SkTArray<SkPoint, true>* quads);
 
+    // Ensures that a quadratic bezier is monotonic with respect to its vector [P2 - P0] (the vector
+    // between its endpoints). In the event that the curve is not monotonic, it is chopped into two
+    // segments that are monotonic. This should be rare for well-behaved curves in the real world.
+    //
+    // Returns false if the curve was already monotonic.
+    //         true if it was chopped into two monotonic segments, now contained in dst.
+    bool chopMonotonicQuads(const SkPoint p[3], SkPoint dst[5]);
+
     // Computes the KLM linear functionals for the cubic implicit form. The "right" side of the
     // curve (when facing in the direction of increasing parameter values) will be the area that
     // satisfies:
diff --git a/src/gpu/ccpr/GrCCPRCoverageOpsBuilder.cpp b/src/gpu/ccpr/GrCCPRCoverageOpsBuilder.cpp
index 93e81ae..4c61570 100644
--- a/src/gpu/ccpr/GrCCPRCoverageOpsBuilder.cpp
+++ b/src/gpu/ccpr/GrCCPRCoverageOpsBuilder.cpp
@@ -11,6 +11,7 @@
 #include "GrGpuCommandBuffer.h"
 #include "GrOnFlushResourceProvider.h"
 #include "GrOpFlushState.h"
+#include "GrPathUtils.h"
 #include "SkGeometry.h"
 #include "SkMakeUnique.h"
 #include "SkMathPriv.h"
@@ -162,6 +163,9 @@
 
 void MaxBufferItems::countPathItems(GrCCPRCoverageOpsBuilder::ScissorMode scissorMode,
                                     const SkPath& path) {
+    static constexpr int kMaxQuadraticSegments = 2;
+    static constexpr int kMaxCubicSegments = 3;
+
     MaxPrimitives& maxPrimitives = fMaxPrimitives[(int)scissorMode];
     int currFanPts = 0;
 
@@ -179,23 +183,23 @@
                 continue;
             case SkPath::kQuad_Verb:
                 SkASSERT(currFanPts > 0);
-                ++currFanPts;
-                ++fMaxControlPoints;
-                ++maxPrimitives.fMaxQuadratics;
+                currFanPts += kMaxQuadraticSegments;
+                fMaxControlPoints += kMaxQuadraticSegments;
+                maxPrimitives.fMaxQuadratics += kMaxQuadraticSegments;
                 continue;
             case SkPath::kCubic_Verb:
+                GR_STATIC_ASSERT(kMaxCubicSegments >= kMaxQuadraticSegments);
                 SkASSERT(currFanPts > 0);
                 // Over-allocate for the worst case when the cubic is chopped into 3 segments.
-                enum { kMaxSegments = 3 };
-                currFanPts += kMaxSegments;
+                currFanPts += kMaxCubicSegments;
                 // Each cubic segment has two control points.
-                fMaxControlPoints += kMaxSegments * 2;
+                fMaxControlPoints += kMaxCubicSegments * 2;
                 // Each cubic segment also emits two root t,s values as "control points".
-                fMaxControlPoints += kMaxSegments * 2;
-                maxPrimitives.fMaxCubics += kMaxSegments;
+                fMaxControlPoints += kMaxCubicSegments * 2;
+                maxPrimitives.fMaxCubics += kMaxCubicSegments;
                 // The cubic may also turn out to be a quadratic. While we over-allocate by a fair
-                // amount, this is still a relatively small amount of space.
-                ++maxPrimitives.fMaxQuadratics;
+                // amount, this is still a relatively small amount of space compared to the atlas.
+                maxPrimitives.fMaxQuadratics += kMaxQuadraticSegments;
                 continue;
             case SkPath::kConic_Verb:
                 SkASSERT(currFanPts > 0);
@@ -305,11 +309,24 @@
 }
 
 void GrCCPRCoverageOpsBuilder::quadraticTo(SkPoint controlPt, SkPoint endPt) {
-    SkASSERT(fCurrPathIndices.fQuadratics < fBaseInstances[(int)fCurrScissorMode].fSerpentines);
+    SkASSERT(fCurrPathIndices.fQuadratics+2 <= fBaseInstances[(int)fCurrScissorMode].fSerpentines);
+
+    SkPoint P[3] = {fCurrFanPoint, controlPt, endPt};
+    SkPoint chopped[5];
+    if (GrPathUtils::chopMonotonicQuads(P, chopped)) {
+        this->fanTo(chopped[2]);
+        fPointsData[fControlPtsIdx++] = chopped[1];
+        fInstanceData[fCurrPathIndices.fQuadratics++].fQuadraticData = {
+            fControlPtsIdx - 1,
+            fFanPtsIdx - 2
+        };
+
+        controlPt = chopped[3];
+        SkASSERT(endPt == chopped[4]);
+    }
 
     this->fanTo(endPt);
     fPointsData[fControlPtsIdx++] = controlPt;
-
     fInstanceData[fCurrPathIndices.fQuadratics++].fQuadraticData = {
         fControlPtsIdx - 1,
         fFanPtsIdx - 2
@@ -515,7 +532,7 @@
     auto constexpr kQuadraticsGrPrimitiveType = GrCCPRCoverageProcessor::kQuadraticsGrPrimitiveType;
     this->drawMaskPrimitives(flushState, pipeline, Mode::kQuadraticHulls,
                              kQuadraticsGrPrimitiveType, 3, &PrimitiveTallies::fQuadratics);
-    this->drawMaskPrimitives(flushState, pipeline, Mode::kQuadraticFlatEdges,
+    this->drawMaskPrimitives(flushState, pipeline, Mode::kQuadraticCorners,
                              kQuadraticsGrPrimitiveType, 3, &PrimitiveTallies::fQuadratics);
 
     // Cubics.
diff --git a/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp b/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp
index 332a64c..05a3242 100644
--- a/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp
+++ b/src/gpu/ccpr/GrCCPRCoverageProcessor.cpp
@@ -28,8 +28,8 @@
             return "GrCCPRTriangleCornerProcessor";
         case Mode::kQuadraticHulls:
             return "GrCCPRQuadraticHullProcessor";
-        case Mode::kQuadraticFlatEdges:
-            return "GrCCPRQuadraticSharedEdgeProcessor";
+        case Mode::kQuadraticCorners:
+            return "GrCCPRQuadraticCornerProcessor";
         case Mode::kSerpentineInsets:
             return "GrCCPRCubicInsetProcessor (serpentine)";
         case Mode::kSerpentineBorders:
@@ -74,8 +74,8 @@
             return new GrCCPRTriangleCornerProcessor();
         case Mode::kQuadraticHulls:
             return new GrCCPRQuadraticHullProcessor();
-        case Mode::kQuadraticFlatEdges:
-            return new GrCCPRQuadraticSharedEdgeProcessor();
+        case Mode::kQuadraticCorners:
+            return new GrCCPRQuadraticCornerProcessor();
         case Mode::kSerpentineInsets:
             return new GrCCPRCubicInsetProcessor(GrCCPRCubicProcessor::Type::kSerpentine);
         case Mode::kSerpentineBorders:
@@ -300,6 +300,17 @@
     g->codeAppendf("%s = float3(-n, kk[1]) * scale;", outputDistanceEquation);
 }
 
+int PrimitiveProcessor::emitCornerGeometry(GrGLSLGeometryBuilder* g, const char* emitVertexFn,
+                                           const char* pt) const {
+    g->codeAppendf("%s(%s + float2(-bloat.x, -bloat.y), 1);", emitVertexFn, pt);
+    g->codeAppendf("%s(%s + float2(-bloat.x, +bloat.y), 1);", emitVertexFn, pt);
+    g->codeAppendf("%s(%s + float2(+bloat.x, -bloat.y), 1);", emitVertexFn, pt);
+    g->codeAppendf("%s(%s + float2(+bloat.x, +bloat.y), 1);", emitVertexFn, pt);
+    g->codeAppend ("EndPrimitive();");
+
+    return 4;
+}
+
 void PrimitiveProcessor::emitCoverage(const GrCCPRCoverageProcessor& proc, GrGLSLFragmentBuilder* f,
                                       const char* outputColor, const char* outputCoverage) const {
     switch (fCoverageType) {
diff --git a/src/gpu/ccpr/GrCCPRCoverageProcessor.h b/src/gpu/ccpr/GrCCPRCoverageProcessor.h
index befa1a9..c9198d3 100644
--- a/src/gpu/ccpr/GrCCPRCoverageProcessor.h
+++ b/src/gpu/ccpr/GrCCPRCoverageProcessor.h
@@ -75,7 +75,7 @@
 
         // Quadratics.
         kQuadraticHulls,
-        kQuadraticFlatEdges,
+        kQuadraticCorners,
 
         // Cubics.
         kSerpentineInsets,
@@ -220,6 +220,14 @@
     void emitEdgeDistanceEquation(GrGLSLGeometryBuilder*, const char* leftPt, const char* rightPt,
                                   const char* outputDistanceEquation) const;
 
+    // Emits the conservative raster of a single point (i.e. pixel-size box centered on the point).
+    // Coverage is +1 all around.
+    //
+    // Geometry shader must be configured to output triangle strips.
+    //
+    // Returns the number of vertices that were emitted.
+    int emitCornerGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn, const char* pt) const;
+
     // Defines a global float2 array that contains MSAA sample locations as offsets from pixel
     // center. Subclasses can use this for software multisampling.
     //
diff --git a/src/gpu/ccpr/GrCCPRQuadraticProcessor.cpp b/src/gpu/ccpr/GrCCPRQuadraticProcessor.cpp
index bc09fea..ed5f0f3 100644
--- a/src/gpu/ccpr/GrCCPRQuadraticProcessor.cpp
+++ b/src/gpu/ccpr/GrCCPRQuadraticProcessor.cpp
@@ -64,55 +64,41 @@
     g->codeAppendf("%s = float2x2(%s) * float2x2(%s.x, 0, 0, %s.z);",
                    fCanonicalDerivatives.c_str(), fCanonicalMatrix.c_str(), rtAdjust, rtAdjust);
 
-    this->emitQuadraticGeometry(g, emitVertexFn, wind, rtAdjust);
+    g->declareGlobal(fEdgeDistanceEquation);
+    g->codeAppendf("highp float2 edgept0 = bezierpts[%s > 0 ? 2 : 0];", wind);
+    g->codeAppendf("highp float2 edgept1 = bezierpts[%s > 0 ? 0 : 2];", wind);
+    this->emitEdgeDistanceEquation(g, "edgept0", "edgept1", fEdgeDistanceEquation.c_str());
+
+    this->emitQuadraticGeometry(g, emitVertexFn, rtAdjust);
 }
 
 void GrCCPRQuadraticProcessor::emitPerVertexGeometryCode(SkString* fnBody, const char* position,
                                                          const char* /*coverage*/,
                                                          const char* /*wind*/) const {
     fnBody->appendf("%s.xy = (%s * float3(%s, 1)).xy;",
-                    fCanonicalCoord.gsOut(), fCanonicalMatrix.c_str(), position);
-    fnBody->appendf("%s.zw = float2(2 * %s.x * %s[0].x - %s[0].y, "
-                                   "2 * %s.x * %s[1].x - %s[1].y);",
-                    fCanonicalCoord.gsOut(), fCanonicalCoord.gsOut(),
-                    fCanonicalDerivatives.c_str(), fCanonicalDerivatives.c_str(),
-                    fCanonicalCoord.gsOut(), fCanonicalDerivatives.c_str(),
-                    fCanonicalDerivatives.c_str());
-}
-
-void GrCCPRQuadraticProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
-                                                  const char* outputCoverage) const {
-    f->codeAppendf("highp float d = (%s.x * %s.x - %s.y) * inversesqrt(dot(%s.zw, %s.zw));",
-                   fCanonicalCoord.fsIn(), fCanonicalCoord.fsIn(), fCanonicalCoord.fsIn(),
-                   fCanonicalCoord.fsIn(), fCanonicalCoord.fsIn());
-    f->codeAppendf("%s = clamp(0.5 - d, 0, 1);", outputCoverage);
+                    fXYD.gsOut(), fCanonicalMatrix.c_str(), position);
+    fnBody->appendf("%s.z = dot(%s.xy, %s) + %s.z;",
+                    fXYD.gsOut(), fEdgeDistanceEquation.c_str(), position,
+                    fEdgeDistanceEquation.c_str());
+    this->onEmitPerVertexGeometryCode(fnBody);
 }
 
 void GrCCPRQuadraticHullProcessor::emitQuadraticGeometry(GrGLSLGeometryBuilder* g,
                                                          const char* emitVertexFn,
-                                                         const char* wind,
-                                                         const char* rtAdjust) const {
-    // Find the point on the curve whose tangent is halfway between the tangents at the endpionts.
-    // We defined bezierpts in onEmitGeometryShader.
-    g->codeAppend ("highp float2 n = (normalize(bezierpts[0] - bezierpts[1]) + "
-                                     "normalize(bezierpts[2] - bezierpts[1]));");
-    g->codeAppend ("highp float t = dot(bezierpts[0] - bezierpts[1], n) / "
-                                   "dot(bezierpts[2] - 2 * bezierpts[1] + bezierpts[0], n);");
-    g->codeAppend ("highp float2 pt = (1 - t) * (1 - t) * bezierpts[0] + "
-                                      "2 * t * (1 - t) * bezierpts[1] + "
-                                      "t * t * bezierpts[2];");
+                                                         const char* /*rtAdjust*/) const {
+    // Find the t value whose tangent is halfway between the tangents at the endpoints.
+    // (We defined bezierpts in onEmitGeometryShader.)
+    g->codeAppend ("highp float2 tan0 = bezierpts[1] - bezierpts[0];");
+    g->codeAppend ("highp float2 tan1 = bezierpts[2] - bezierpts[1];");
+    g->codeAppend ("highp float2 midnorm = normalize(tan0) - normalize(tan1);");
+    g->codeAppend ("highp float2 T = midnorm * float2x2(tan0 - tan1, tan0);");
+    g->codeAppend ("highp float t = clamp(T.t / T.s, 0, 1);"); // T.s=0 is weeded out by this point.
 
-    // Clip the triangle by the tangent line at this halfway point.
-    g->codeAppend ("highp float2x2 v = float2x2(bezierpts[0] - bezierpts[1], "
-                                               "bezierpts[2] - bezierpts[1]);");
-    g->codeAppend ("highp float2 nv = n * v;");
-    g->codeAppend ("highp float2 d = abs(nv[0]) > 0.1 * max(bloat.x, bloat.y) ? "
-                                    "(dot(n, pt - bezierpts[1])) / nv : float2(0);");
-
-    // Generate a 4-point hull of the curve from the clipped triangle.
+    // Clip the bezier triangle by the tangent at our new t value. This is a simple application of
+    // De Casteljau's algorithm.
     g->codeAppendf("highp float4x2 quadratic_hull = float4x2(bezierpts[0], "
-                                                            "bezierpts[1] + d[0] * v[0], "
-                                                            "bezierpts[1] + d[1] * v[1], "
+                                                            "bezierpts[0] + tan0 * t, "
+                                                            "bezierpts[1] + tan1 * t, "
                                                             "bezierpts[2]);");
 
     int maxVerts = this->emitHullGeometry(g, emitVertexFn, "quadratic_hull", 4, "sk_InvocationID");
@@ -122,59 +108,63 @@
                  maxVerts, 4);
 }
 
-void GrCCPRQuadraticSharedEdgeProcessor::emitQuadraticGeometry(GrGLSLGeometryBuilder* g,
-                                                               const char* emitVertexFn,
-                                                               const char* wind,
-                                                               const char* rtAdjust) const {
-    // We defined bezierpts in onEmitGeometryShader.
-    g->codeAppendf("int leftidx = %s > 0 ? 2 : 0;", wind);
-    g->codeAppendf("highp float2 left = bezierpts[leftidx];");
-    g->codeAppendf("highp float2 right = bezierpts[2 - leftidx];");
-    this->emitEdgeDistanceEquation(g, "left", "right", "highp float3 edge_distance_equation");
+void GrCCPRQuadraticHullProcessor::onEmitPerVertexGeometryCode(SkString* fnBody) const {
+    fnBody->appendf("%s = float2(2 * %s.x, -1) * %s;",
+                    fGradXY.gsOut(), fXYD.gsOut(), fCanonicalDerivatives.c_str());
+}
 
+void GrCCPRQuadraticHullProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
+                                                      const char* outputCoverage) const {
+    f->codeAppendf("highp float d = (%s.x * %s.x - %s.y) * inversesqrt(dot(%s, %s));",
+                   fXYD.fsIn(), fXYD.fsIn(), fXYD.fsIn(), fGradXY.fsIn(), fGradXY.fsIn());
+    f->codeAppendf("%s = clamp(0.5 - d, 0, 1);", outputCoverage);
+    f->codeAppendf("%s += min(%s.z, 0);", outputCoverage, fXYD.fsIn()); // Flat closing edge.
+}
+
+void GrCCPRQuadraticCornerProcessor::emitQuadraticGeometry(GrGLSLGeometryBuilder* g,
+                                                           const char* emitVertexFn,
+                                                           const char* rtAdjust) const {
     g->declareGlobal(fEdgeDistanceDerivatives);
-    g->codeAppendf("%s = edge_distance_equation.xy * %s.xz;",
-                   fEdgeDistanceDerivatives.c_str(), rtAdjust);
+    g->codeAppendf("%s = %s.xy * %s.xz;",
+                   fEdgeDistanceDerivatives.c_str(), fEdgeDistanceEquation.c_str(), rtAdjust);
 
-    int maxVertices = this->emitEdgeGeometry(g, emitVertexFn, "left", "right",
-                                             "edge_distance_equation");
+    g->codeAppendf("highp float2 corner = bezierpts[sk_InvocationID * 2];");
+    int numVertices = this->emitCornerGeometry(g, emitVertexFn, "corner");
 
     g->configure(GrGLSLGeometryBuilder::InputType::kTriangles,
-                 GrGLSLGeometryBuilder::OutputType::kTriangleStrip, maxVertices, 1);
+                 GrGLSLGeometryBuilder::OutputType::kTriangleStrip, numVertices, 2);
 }
 
-void GrCCPRQuadraticSharedEdgeProcessor::emitPerVertexGeometryCode(SkString* fnBody,
-                                                                   const char* position,
-                                                                   const char* coverage,
-                                                                   const char* wind) const {
-    this->INHERITED::emitPerVertexGeometryCode(fnBody, position, coverage, wind);
-    fnBody->appendf("%s = %s;", fFragCanonicalDerivatives.gsOut(), fCanonicalDerivatives.c_str());
-    fnBody->appendf("%s.x = %s + 0.5;", fEdgeDistance.gsOut(), coverage); // outer=-.5, inner=+.5.
-    fnBody->appendf("%s.yz = %s;", fEdgeDistance.gsOut(), fEdgeDistanceDerivatives.c_str());
+void GrCCPRQuadraticCornerProcessor::onEmitPerVertexGeometryCode(SkString* fnBody) const {
+    fnBody->appendf("%s = float3(%s[0].x, %s[0].y, %s.x);",
+                    fdXYDdx.gsOut(), fCanonicalDerivatives.c_str(), fCanonicalDerivatives.c_str(),
+                    fEdgeDistanceDerivatives.c_str());
+    fnBody->appendf("%s = float3(%s[1].x, %s[1].y, %s.y);",
+                    fdXYDdy.gsOut(), fCanonicalDerivatives.c_str(), fCanonicalDerivatives.c_str(),
+                    fEdgeDistanceDerivatives.c_str());
 }
 
-void GrCCPRQuadraticSharedEdgeProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
-                                                            const char* outputCoverage) const {
-    // Erase what the previous hull shader wrote and replace with edge coverage.
-    this->INHERITED::emitShaderCoverage(f, outputCoverage);
-    f->codeAppendf("%s = %s.x + 0.5 - %s;",
-                   outputCoverage, fEdgeDistance.fsIn(), outputCoverage);
+void GrCCPRQuadraticCornerProcessor::emitShaderCoverage(GrGLSLFragmentBuilder* f,
+                                                        const char* outputCoverage) const {
+    f->codeAppendf("highp float x = %s.x, y = %s.y, d = %s.z;",
+                   fXYD.fsIn(), fXYD.fsIn(), fXYD.fsIn());
+    f->codeAppendf("highp float2x3 grad_xyd = float2x3(%s, %s);", fdXYDdx.fsIn(), fdXYDdy.fsIn());
 
-    // Use software msaa to subtract out the remaining pixel coverage that is still inside the
-    // shared edge, but outside the curve.
+    // Erase what the previous hull shader wrote. We don't worry about the two corners falling on
+    // the same pixel because those cases should have been weeded out by this point.
+    f->codeAppend ("highp float f = x*x - y;");
+    f->codeAppend ("highp float2 grad_f = float2(2*x, -1) * float2x2(grad_xyd);");
+    f->codeAppendf("%s = -(0.5 - f * inversesqrt(dot(grad_f, grad_f)));", outputCoverage);
+    f->codeAppendf("%s -= d;", outputCoverage);
+
+    // Use software msaa to approximate coverage at the corner pixels.
     int sampleCount = this->defineSoftSampleLocations(f, "samples");
-
-    f->codeAppendf("highp float2x3 grad_xyd = float2x3(%s[0],%s.y, %s[1],%s.z);",
-                   fFragCanonicalDerivatives.fsIn(), fEdgeDistance.fsIn(),
-                   fFragCanonicalDerivatives.fsIn(), fEdgeDistance.fsIn());
-    f->codeAppendf("highp float3 center_xyd = float3(%s.xy, %s.x);",
-                   fCanonicalCoord.fsIn(), fEdgeDistance.fsIn());
-
+    f->codeAppendf("highp float3 xyd_center = float3(%s.xy, %s.z + 0.5);",
+                   fXYD.fsIn(), fXYD.fsIn());
     f->codeAppendf("for (int i = 0; i < %i; ++i) {", sampleCount);
-    f->codeAppend (    "highp float3 xyd = grad_xyd * samples[i] + center_xyd;");
-    f->codeAppend (    "lowp float f = xyd.x * xyd.x - xyd.y;"); // f > 0 -> outside curve.
-    f->codeAppend (    "bool2 outside_curve_inside_edge = greaterThan(float2(f, xyd.z), float2(0));");
-    f->codeAppendf(    "%s -= all(outside_curve_inside_edge) ? %f : 0;",
+    f->codeAppend (    "highp float3 xyd = grad_xyd * samples[i] + xyd_center;");
+    f->codeAppend (    "lowp float f = xyd.y - xyd.x * xyd.x;"); // f > 0 -> inside curve.
+    f->codeAppendf(    "%s += all(greaterThan(float2(f,xyd.z), float2(0))) ? %f : 0;",
                        outputCoverage, 1.0 / sampleCount);
     f->codeAppendf("}");
 }
diff --git a/src/gpu/ccpr/GrCCPRQuadraticProcessor.h b/src/gpu/ccpr/GrCCPRQuadraticProcessor.h
index c3e8d17..1eda255 100644
--- a/src/gpu/ccpr/GrCCPRQuadraticProcessor.h
+++ b/src/gpu/ccpr/GrCCPRQuadraticProcessor.h
@@ -17,12 +17,8 @@
  *
  * https://www.microsoft.com/en-us/research/wp-content/uploads/2005/01/p1000-loop.pdf
  *
- * The curves are rendered in two passes:
- *
- * Pass 1: Draw a conservative raster hull around the quadratic bezier points, and compute the
- *         curve's coverage using the gradient-based AA technique outlined in the Loop/Blinn paper.
- *
- * Pass 2: Touch up and antialias the flat edge from P2 back to P0.
+ * The provided curves must be monotonic with respect to the vector of their closing edge [P2 - P0].
+ * Use GrPathUtils::chopMonotonicQuads.
  */
 class GrCCPRQuadraticProcessor : public GrCCPRCoverageProcessor::PrimitiveProcessor {
 public:
@@ -32,10 +28,12 @@
                                kHigh_GrSLPrecision)
             , fCanonicalDerivatives("canonical_derivatives", kMat22f_GrSLType,
                                     GrShaderVar::kNonArray, kHigh_GrSLPrecision)
-            , fCanonicalCoord(kVec4f_GrSLType) {}
+            , fEdgeDistanceEquation("edge_distance_equation", kVec3f_GrSLType,
+                                    GrShaderVar::kNonArray, kHigh_GrSLPrecision)
+            , fXYD(kVec3f_GrSLType) {}
 
     void resetVaryings(GrGLSLVaryingHandler* varyingHandler) override {
-        varyingHandler->addVarying("canonical_coord", &fCanonicalCoord, kHigh_GrSLPrecision);
+        varyingHandler->addVarying("xyd", &fXYD, kHigh_GrSLPrecision);
     }
 
     void onEmitVertexShader(const GrCCPRCoverageProcessor&, GrGLSLVertexBuilder*,
@@ -45,65 +43,74 @@
     void onEmitGeometryShader(GrGLSLGeometryBuilder*, const char* emitVertexFn, const char* wind,
                               const char* rtAdjust) const final;
     void emitPerVertexGeometryCode(SkString* fnBody, const char* position, const char* coverage,
-                                   const char* wind) const override;
-    void emitShaderCoverage(GrGLSLFragmentBuilder* f, const char* outputCoverage) const override;
+                                   const char* wind) const final;
 
 protected:
     virtual void emitQuadraticGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn,
-                                       const char* wind, const char* rtAdjust) const = 0;
+                                       const char* rtAdjust) const = 0;
+    virtual void onEmitPerVertexGeometryCode(SkString* fnBody) const = 0;
 
     GrShaderVar       fCanonicalMatrix;
     GrShaderVar       fCanonicalDerivatives;
-    GrGLSLGeoToFrag   fCanonicalCoord;
+    GrShaderVar       fEdgeDistanceEquation;
+    GrGLSLGeoToFrag   fXYD;
 
     typedef GrCCPRCoverageProcessor::PrimitiveProcessor INHERITED;
 };
 
+/**
+ * This pass draws a conservative raster hull around the quadratic bezier curve, computes the
+ * curve's coverage using the gradient-based AA technique outlined in the Loop/Blinn paper, and
+ * uses simple distance-to-edge to subtract out coverage for the flat closing edge [P2 -> P0]. Since
+ * the provided curves are monotonic, this will get every pixel right except the two corners.
+ */
 class GrCCPRQuadraticHullProcessor : public GrCCPRQuadraticProcessor {
 public:
+    GrCCPRQuadraticHullProcessor()
+            : fGradXY(kVec2f_GrSLType) {}
+
+    void resetVaryings(GrGLSLVaryingHandler* varyingHandler) override {
+        this->INHERITED::resetVaryings(varyingHandler);
+        varyingHandler->addVarying("grad_xy", &fGradXY, kHigh_GrSLPrecision);
+    }
+
     void emitQuadraticGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn,
-                               const char* wind, const char* rtAdjust) const override;
+                               const char* rtAdjust) const override;
+    void onEmitPerVertexGeometryCode(SkString* fnBody) const override;
+    void emitShaderCoverage(GrGLSLFragmentBuilder* f, const char* outputCoverage) const override;
 
 private:
+    GrGLSLGeoToFrag   fGradXY;
+
     typedef GrCCPRQuadraticProcessor INHERITED;
 };
 
 /**
- * This pass touches up the flat edge (P2 -> P0) of a closed quadratic segment as follows:
- *
- *   1) Erase what the previous hull shader estimated for coverage.
- *   2) Replace coverage with distance to the curve's flat edge (this is necessary when the edge
- *      is shared and must create a "water-tight" seam).
- *   3) Use pseudo MSAA to subtract out the remaining pixel coverage that is still inside the flat
- *      edge, but outside the curve.
+ * This pass fixes the corners of a closed quadratic segment with soft MSAA.
  */
-class GrCCPRQuadraticSharedEdgeProcessor : public GrCCPRQuadraticProcessor {
+class GrCCPRQuadraticCornerProcessor : public GrCCPRQuadraticProcessor {
 public:
-    GrCCPRQuadraticSharedEdgeProcessor()
-            : fXYD("xyd", kMat33f_GrSLType, GrShaderVar::kNonArray, kHigh_GrSLPrecision)
-            , fEdgeDistanceDerivatives("edge_distance_derivatives", kVec2f_GrSLType,
+    GrCCPRQuadraticCornerProcessor()
+            : fEdgeDistanceDerivatives("edge_distance_derivatives", kVec2f_GrSLType,
                                        GrShaderVar::kNonArray, kHigh_GrSLPrecision)
-            , fFragCanonicalDerivatives(kMat22f_GrSLType)
-            , fEdgeDistance(kVec3f_GrSLType) {}
+            , fdXYDdx(kVec3f_GrSLType)
+            , fdXYDdy(kVec3f_GrSLType) {}
 
     void resetVaryings(GrGLSLVaryingHandler* varyingHandler) override {
         this->INHERITED::resetVaryings(varyingHandler);
-        varyingHandler->addFlatVarying("canonical_derivatives", &fFragCanonicalDerivatives,
-                                       kHigh_GrSLPrecision);
-        varyingHandler->addVarying("edge_distance", &fEdgeDistance, kHigh_GrSLPrecision);
+        varyingHandler->addFlatVarying("dXYDdx", &fdXYDdx, kHigh_GrSLPrecision);
+        varyingHandler->addFlatVarying("dXYDdy", &fdXYDdy, kHigh_GrSLPrecision);
     }
 
     void emitQuadraticGeometry(GrGLSLGeometryBuilder*, const char* emitVertexFn,
-                               const char* wind, const char* rtAdjust) const override;
-    void emitPerVertexGeometryCode(SkString* fnBody, const char* position, const char* coverage,
-                                   const char* wind) const override;
+                               const char* rtAdjust) const override;
+    void onEmitPerVertexGeometryCode(SkString* fnBody) const override;
     void emitShaderCoverage(GrGLSLFragmentBuilder*, const char* outputCoverage) const override;
 
 private:
-    GrShaderVar       fXYD;
     GrShaderVar       fEdgeDistanceDerivatives;
-    GrGLSLGeoToFrag   fFragCanonicalDerivatives;
-    GrGLSLGeoToFrag   fEdgeDistance;
+    GrGLSLGeoToFrag   fdXYDdx;
+    GrGLSLGeoToFrag   fdXYDdy;
 
     typedef GrCCPRQuadraticProcessor INHERITED;
 };
diff --git a/src/gpu/ccpr/GrCCPRTriangleProcessor.cpp b/src/gpu/ccpr/GrCCPRTriangleProcessor.cpp
index 77da2cf..bb2ad1b 100644
--- a/src/gpu/ccpr/GrCCPRTriangleProcessor.cpp
+++ b/src/gpu/ccpr/GrCCPRTriangleProcessor.cpp
@@ -99,15 +99,11 @@
     this->defineInputVertices(g);
 
     g->codeAppend ("highp float2 self = in_vertices[sk_InvocationID];");
-    g->codeAppendf("%s(self + float2(-bloat.x, -bloat.y), 1);", emitVertexFn);
-    g->codeAppendf("%s(self + float2(-bloat.x, +bloat.y), 1);", emitVertexFn);
-    g->codeAppendf("%s(self + float2(+bloat.x, -bloat.y), 1);", emitVertexFn);
-    g->codeAppendf("%s(self + float2(+bloat.x, +bloat.y), 1);", emitVertexFn);
-    g->codeAppend ("EndPrimitive();");
+    int numVertices = this->emitCornerGeometry(g, emitVertexFn, "self");
 
     g->configure(GrGLSLGeometryBuilder::InputType::kTriangles,
                  GrGLSLGeometryBuilder::OutputType::kTriangleStrip,
-                 4, 3);
+                 numVertices, 3);
 }
 
 void GrCCPRTriangleCornerProcessor::emitPerVertexGeometryCode(SkString* fnBody,