Handle arbitrary and degenerate quadrilaterals in GrQuadPerEdgeAA

Updates the handling of perspective quads to calculate the correct,
degenerate-safe projected quad and then derive from that the proper
perspective quad.

In 2D, updated to determine if the optimized outset/inset procedure
is valid, and if not goes through a more robust procedure that is
based on line equation intersections. In particular, the degenerate
inset/outset approach is used when the quad has a zero-length edge,
if insetting/outsetting normally would create a self-intersecting shape,
or if near parallel edge angles would create numerical instabilities.

Performance testing by forcing all rectangle draws through the degenerate
code path suggests that it is about 10% slower than the optimal approach,
at least on my workstation...

Bug: skia:
Change-Id: I2973da8d97949eacebb09a1b27c334d62c1c948e
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/194008
Reviewed-by: Chris Dalton <csmartdalton@google.com>
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
diff --git a/src/gpu/ops/GrQuadPerEdgeAA.cpp b/src/gpu/ops/GrQuadPerEdgeAA.cpp
index de2641f..f70f953 100644
--- a/src/gpu/ops/GrQuadPerEdgeAA.cpp
+++ b/src/gpu/ops/GrQuadPerEdgeAA.cpp
@@ -20,6 +20,35 @@
 
 namespace {
 
+// Helper data types since there is a lot of information that needs to be passed around to
+// avoid recalculation in the different procedures for tessellating an AA quad.
+
+struct Vertices {
+    // X, Y, and W coordinates in device space. If not perspective, w should be set to 1.f
+    Sk4f fX, fY, fW;
+    // U, V, and R coordinates representing local quad. Ignored depending on uvrCount (0, 1, 2).
+    Sk4f fU, fV, fR;
+    int fUVRCount;
+};
+
+struct QuadMetadata {
+    // Normalized edge vectors of the device space quad, ordered L, B, T, R (i.e. nextCCW(x) - x).
+    Sk4f fDX, fDY;
+    // 1 / edge length of the device space quad
+    Sk4f fInvLengths;
+    // Edge mask (set to all 1s if aa flags is kAll), otherwise 1.f if edge was AA, 0.f if non-AA.
+    Sk4f fMask;
+};
+
+struct Edges {
+    // a * x + b * y + c = 0; positive distance is inside the quad; ordered LBTR.
+    Sk4f fA, fB, fC;
+    // Whether or not the edge normals had to be flipped to preserve positive distance on the inside
+    bool fFlipped;
+};
+
+static constexpr float kTolerance = 1e-2f;
+
 static AI Sk4f fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {
     return SkNx_fma<4, float>(f, m, a);
 }
@@ -34,281 +63,544 @@
     return SkNx_shuffle<1, 3, 0, 2>(v);
 }
 
-// Fills Sk4f with 1f if edge bit is set, 0f otherwise. Edges are ordered LBTR to match CCW ordering
-// of vertices in the quad.
-static AI Sk4f compute_edge_mask(GrQuadAAFlags aaFlags) {
-    return Sk4f((GrQuadAAFlags::kLeft & aaFlags) ? 1.f : 0.f,
-                (GrQuadAAFlags::kBottom & aaFlags) ? 1.f : 0.f,
-                (GrQuadAAFlags::kTop & aaFlags) ? 1.f : 0.f,
-                (GrQuadAAFlags::kRight & aaFlags) ? 1.f : 0.f);
+// Replaces zero-length 'bad' edge vectors with the reversed opposite edge vector.
+// e3 may be null if only 2D edges need to be corrected for.
+static AI void correct_bad_edges(const Sk4f& bad, Sk4f* e1, Sk4f* e2, Sk4f* e3) {
+    if (bad.anyTrue()) {
+        // Want opposite edges, L B T R -> R T B L but with flipped sign to preserve winding
+        *e1 = bad.thenElse(-SkNx_shuffle<3, 2, 1, 0>(*e1), *e1);
+        *e2 = bad.thenElse(-SkNx_shuffle<3, 2, 1, 0>(*e2), *e2);
+        if (e3) {
+            *e3 = bad.thenElse(-SkNx_shuffle<3, 2, 1, 0>(*e3), *e3);
+        }
+    }
 }
 
-// Outputs normalized edge vectors in xdiff and ydiff, as well as the reciprocal of the original
-// edge lengths in invLengths
-static AI void compute_edge_vectors(const Sk4f& x, const Sk4f& y, const Sk4f& xnext,
-                                    const Sk4f& ynext, Sk4f* xdiff, Sk4f* ydiff, Sk4f* invLengths) {
-    *xdiff = xnext - x;
-    *ydiff = ynext - y;
-    *invLengths = fma(*xdiff, *xdiff, *ydiff * *ydiff).rsqrt();
-    *xdiff *= *invLengths;
-    *ydiff *= *invLengths;
+// Replace 'bad' coordinates by rotating CCW to get the next point. c3 may be null for 2D points.
+static AI void correct_bad_coords(const Sk4f& bad, Sk4f* c1, Sk4f* c2, Sk4f* c3) {
+    if (bad.anyTrue()) {
+        *c1 = bad.thenElse(nextCCW(*c1), *c1);
+        *c2 = bad.thenElse(nextCCW(*c2), *c2);
+        if (c3) {
+            *c3 = bad.thenElse(nextCCW(*c3), *c3);
+        }
+    }
 }
 
-// outset and outsetCW are provided separately to allow for different magnitude outsets for
-// with-edge and "perpendicular" edge shifts. This is needed when one axis cannot be inset the full
-// half pixel without crossing over the other side.
-static AI void outset_masked_vertices(const Sk4f& outset, const Sk4f& outsetCW, const Sk4f& xdiff,
-                                      const Sk4f& ydiff, const Sk4f& invLengths, const Sk4f& mask,
-                                      Sk4f* x, Sk4f* y, Sk4f* u, Sk4f* v, Sk4f* r, int uvrCount) {
+static AI QuadMetadata get_metadata(const Vertices& vertices, GrQuadAAFlags aaFlags) {
+    Sk4f dx = nextCCW(vertices.fX) - vertices.fX;
+    Sk4f dy = nextCCW(vertices.fY) - vertices.fY;
+    Sk4f invLengths = fma(dx, dx, dy * dy).rsqrt();
+
+    Sk4f mask = aaFlags == GrQuadAAFlags::kAll ? Sk4f(1.f) :
+            Sk4f((GrQuadAAFlags::kLeft & aaFlags) ? 1.f : 0.f,
+                 (GrQuadAAFlags::kBottom & aaFlags) ? 1.f : 0.f,
+                 (GrQuadAAFlags::kTop & aaFlags) ? 1.f : 0.f,
+                 (GrQuadAAFlags::kRight & aaFlags) ? 1.f : 0.f);
+    return { dx * invLengths, dy * invLengths, invLengths, mask };
+}
+
+static AI Edges get_edge_equations(const QuadMetadata& metadata, const Vertices& vertices) {
+    Sk4f dx = metadata.fDX;
+    Sk4f dy = metadata.fDY;
+    // Correct for bad edges by copying adjacent edge information into the bad component
+    correct_bad_edges(metadata.fInvLengths >= 1.f / kTolerance, &dx, &dy, nullptr);
+
+    Sk4f c = fma(dx, vertices.fY, -dy * vertices.fX);
+    // Make sure normals point into the shape
+    Sk4f test = fma(dy, nextCW(vertices.fX), fma(-dx, nextCW(vertices.fY), c));
+    if ((test < -kTolerance).anyTrue()) {
+        return {-dy, dx, -c, true};
+    } else {
+        return {dy, -dx, c, false};
+    }
+}
+
+// Sets 'outset' to the magnitude of outset/inset to adjust each corner of a quad given the
+// edge angles and lengths. If the quad is too small, has empty edges, or too sharp of angles,
+// false is returned and the degenerate slow-path should be used.
+static bool get_optimized_outset(const QuadMetadata& metadata, bool rectilinear, Sk4f* outset) {
+    if (rectilinear) {
+        *outset = 0.5f;
+        // Stay in the fast path as long as all edges are at least a pixel long (so 1/len <= 1)
+        return (metadata.fInvLengths <= 1.f).allTrue();
+    }
+
+    if ((metadata.fInvLengths >= 1.f / kTolerance).anyTrue()) {
+        // Have an empty edge from a degenerate quad, so there's no hope
+        return false;
+    }
+
+    // The distance the point needs to move is 1/2sin(theta), where theta is the angle between the
+    // two edges at that point. cos(theta) is equal to dot(dxy, nextCW(dxy))
+    Sk4f cosTheta = fma(metadata.fDX, nextCW(metadata.fDX), metadata.fDY * nextCW(metadata.fDY));
+    // If the angle is too shallow between edges, go through the degenerate path, otherwise adding
+    // and subtracting very large vectors in almost opposite directions leads to float errors
+    if ((cosTheta.abs() >= 0.9f).anyTrue()) {
+        return false;
+    }
+    *outset = 0.5f * (1.f - cosTheta * cosTheta).rsqrt(); // 1/2sin(theta)
+
+    // When outsetting or insetting, the current edge's AA adds to the length:
+    //   cos(pi - theta)/2sin(theta) + cos(pi-ccw(theta))/2sin(ccw(theta))
+    // Moving an adjacent edge updates the length by 1/2sin(theta|ccw(theta))
+    Sk4f halfTanTheta = -cosTheta * (*outset); // cos(pi - theta) = -cos(theta)
+    Sk4f edgeAdjust = metadata.fMask * (halfTanTheta + nextCCW(halfTanTheta)) +
+                      nextCCW(metadata.fMask) * nextCCW(*outset) +
+                      nextCW(metadata.fMask) * (*outset);
+    // If either outsetting (plus edgeAdjust) or insetting (minus edgeAdjust) make edgeLen negative
+    // then use the slow path
+    Sk4f threshold = 0.1f - metadata.fInvLengths.invert();
+    return (edgeAdjust > threshold).allTrue() && (edgeAdjust < -threshold).allTrue();
+}
+
+// Ignores the quad's fW, use outset_projected_vertices if it's known to need 3D.
+static AI void outset_vertices(const Sk4f& outset, const QuadMetadata& metadata, Vertices* quad) {
     // The mask is rotated compared to the outsets and edge vectors, since if the edge is "on"
     // both its points need to be moved along their other edge vectors.
-    auto maskedOutset = -outset * nextCW(mask);
-    auto maskedOutsetCW = outsetCW * mask;
-    // x = x + outsetCW * mask * nextCW(xdiff) - outset * nextCW(mask) * xdiff
-    *x += fma(maskedOutsetCW, nextCW(xdiff), maskedOutset * xdiff);
-    *y += fma(maskedOutsetCW, nextCW(ydiff), maskedOutset * ydiff);
-    if (uvrCount > 0) {
+    auto maskedOutset = -outset * nextCW(metadata.fMask);
+    auto maskedOutsetCW = outset * metadata.fMask;
+    // x = x + outset * mask * nextCW(xdiff) - outset * nextCW(mask) * xdiff
+    quad->fX += fma(maskedOutsetCW, nextCW(metadata.fDX), maskedOutset * metadata.fDX);
+    quad->fY += fma(maskedOutsetCW, nextCW(metadata.fDY), maskedOutset * metadata.fDY);
+    if (quad->fUVRCount > 0) {
         // We want to extend the texture coords by the same proportion as the positions.
-        maskedOutset *= invLengths;
-        maskedOutsetCW *= nextCW(invLengths);
-        Sk4f udiff = nextCCW(*u) - *u;
-        Sk4f vdiff = nextCCW(*v) - *v;
-        *u += fma(maskedOutsetCW, nextCW(udiff), maskedOutset * udiff);
-        *v += fma(maskedOutsetCW, nextCW(vdiff), maskedOutset * vdiff);
-        if (uvrCount == 3) {
-            Sk4f rdiff = nextCCW(*r) - *r;
-            *r += fma(maskedOutsetCW, nextCW(rdiff), maskedOutset * rdiff);
+        maskedOutset *= metadata.fInvLengths;
+        maskedOutsetCW *= nextCW(metadata.fInvLengths);
+        Sk4f du = nextCCW(quad->fU) - quad->fU;
+        Sk4f dv = nextCCW(quad->fV) - quad->fV;
+        quad->fU += fma(maskedOutsetCW, nextCW(du), maskedOutset * du);
+        quad->fV += fma(maskedOutsetCW, nextCW(dv), maskedOutset * dv);
+        if (quad->fUVRCount == 3) {
+            Sk4f dr = nextCCW(quad->fR) - quad->fR;
+            quad->fR += fma(maskedOutsetCW, nextCW(dr), maskedOutset * dr);
         }
     }
 }
 
-static AI void outset_vertices(const Sk4f& outset, const Sk4f& outsetCW, const Sk4f& xdiff,
-                               const Sk4f& ydiff, const Sk4f& invLengths,
-                               Sk4f* x, Sk4f* y, Sk4f* u, Sk4f* v, Sk4f* r, int uvrCount) {
-    // x = x + outsetCW * nextCW(xdiff) - outset * xdiff (as above, but where mask = (1,1,1,1))
-    *x += fma(outsetCW, nextCW(xdiff), -outset * xdiff);
-    *y += fma(outsetCW, nextCW(ydiff), -outset * ydiff);
-    if (uvrCount > 0) {
-        Sk4f t = -outset * invLengths; // Bake minus sign in here
-        Sk4f tCW = outsetCW * nextCW(invLengths);
-        Sk4f udiff = nextCCW(*u) - *u;
-        Sk4f vdiff = nextCCW(*v) - *v;
-        *u += fma(tCW, nextCW(udiff), t * udiff);
-        *v += fma(tCW, nextCW(vdiff), t * vdiff);
-        if (uvrCount == 3) {
-            Sk4f rdiff = nextCCW(*r) - *r;
-            *r += fma(tCW, nextCW(rdiff), t * rdiff);
+// Updates (x,y,w) to be at (x2d,y2d) once projected. Updates (u,v,r) to match if provided.
+// Gracefully handles 2D content if *w holds all 1s.
+static void outset_projected_vertices(const Sk4f& x2d, const Sk4f& y2d,
+                                      GrQuadAAFlags aaFlags, Vertices* quad) {
+    // Left to right, in device space, for each point
+    Sk4f e1x = SkNx_shuffle<2, 3, 2, 3>(quad->fX) - SkNx_shuffle<0, 1, 0, 1>(quad->fX);
+    Sk4f e1y = SkNx_shuffle<2, 3, 2, 3>(quad->fY) - SkNx_shuffle<0, 1, 0, 1>(quad->fY);
+    Sk4f e1w = SkNx_shuffle<2, 3, 2, 3>(quad->fW) - SkNx_shuffle<0, 1, 0, 1>(quad->fW);
+    correct_bad_edges(fma(e1x, e1x, e1y * e1y) < kTolerance * kTolerance, &e1x, &e1y, &e1w);
+
+    // // Top to bottom, in device space, for each point
+    Sk4f e2x = SkNx_shuffle<1, 1, 3, 3>(quad->fX) - SkNx_shuffle<0, 0, 2, 2>(quad->fX);
+    Sk4f e2y = SkNx_shuffle<1, 1, 3, 3>(quad->fY) - SkNx_shuffle<0, 0, 2, 2>(quad->fY);
+    Sk4f e2w = SkNx_shuffle<1, 1, 3, 3>(quad->fW) - SkNx_shuffle<0, 0, 2, 2>(quad->fW);
+    correct_bad_edges(fma(e2x, e2x, e2y * e2y) < kTolerance * kTolerance, &e2x, &e2y, &e2w);
+
+    // Can only move along e1 and e2 to reach the new 2D point, so we have
+    // x2d = (x + a*e1x + b*e2x) / (w + a*e1w + b*e2w) and
+    // y2d = (y + a*e1y + b*e2y) / (w + a*e1w + b*e2w) for some a, b
+    // This can be rewritten to a*c1x + b*c2x + c3x = 0; a * c1y + b*c2y + c3y = 0, where
+    // the cNx and cNy coefficients are:
+    Sk4f c1x = e1w * x2d - e1x;
+    Sk4f c1y = e1w * y2d - e1y;
+    Sk4f c2x = e2w * x2d - e2x;
+    Sk4f c2y = e2w * y2d - e2y;
+    Sk4f c3x = quad->fW * x2d - quad->fX;
+    Sk4f c3y = quad->fW * y2d - quad->fY;
+
+    // Solve for a and b
+    Sk4f a, b, denom;
+    if (aaFlags == GrQuadAAFlags::kAll) {
+        // When every edge is outset/inset, each corner can use both edge vectors
+        denom = c1x * c2y - c2x * c1y;
+        a = (c2x * c3y - c3x * c2y) / denom;
+        b = (c3x * c1y - c1x * c3y) / denom;
+    } else {
+        // Force a or b to be 0 if that edge cannot be used due to non-AA
+        // FIXME requires the extra > 0.f, since Sk4f's thenElse only works if true values have
+        // all their bits set to 1.
+        Sk4f aMask = Sk4f((aaFlags & GrQuadAAFlags::kLeft) ? 1.f : 0.f,
+                   (aaFlags & GrQuadAAFlags::kLeft) ? 1.f : 0.f,
+                   (aaFlags & GrQuadAAFlags::kRight) ? 1.f : 0.f,
+                   (aaFlags & GrQuadAAFlags::kRight) ? 1.f : 0.f) > 0.f;
+        Sk4f bMask = Sk4f((aaFlags & GrQuadAAFlags::kTop) ? 1.f : 0.f,
+                   (aaFlags & GrQuadAAFlags::kBottom) ? 1.f : 0.f,
+                   (aaFlags & GrQuadAAFlags::kTop) ? 1.f : 0.f,
+                   (aaFlags & GrQuadAAFlags::kBottom) ? 1.f : 0.f) > 0.f;
+
+        // When aMask[i]&bMask[i], then a[i], b[i], denom[i] match the kAll case.
+        // When aMask[i]&!bMask[i], then b[i] = 0, a[i] = -c3x/c1x or -c3y/c1y, using better denom
+        // When !aMask[i]&bMask[i], then a[i] = 0, b[i] = -c3x/c2x or -c3y/c2y, ""
+        // When !aMask[i]&!bMask[i], then both a[i] = 0 and b[i] = 0
+        Sk4f useC1x = c1x.abs() > c1y.abs();
+        Sk4f useC2x = c2x.abs() > c2y.abs();
+        //                                    -------- A & B ------      --------- A & !B ---------
+        denom = aMask.thenElse(bMask.thenElse(c1x * c2y - c2x * c1y,     useC1x.thenElse(c1x, c1y)),
+        //                                    ------- !A & B ----------  - !A & !B -
+                               bMask.thenElse(useC2x.thenElse(c2x, c2y), 1.0f));
+        //                                -------- A & B ------  ---------- A & !B ----------
+        a = aMask.thenElse(bMask.thenElse(c2x * c3y - c3x * c2y, useC1x.thenElse(-c3x, -c3y)),
+        //                 - !A -
+                           0.0f) / denom;
+        //                                -------- A & B ------  ---------- !A & B ----------
+        b = bMask.thenElse(aMask.thenElse(c3x * c1y - c1x * c3y, useC2x.thenElse(-c3x, -c3y)),
+        //                 - !B -
+                           0.0f) / denom;
+    }
+
+    quad->fX += a * e1x + b * e2x;
+    quad->fY += a * e1y + b * e2y;
+    quad->fW += a * e1w + b * e2w;
+    correct_bad_coords(denom.abs() < kTolerance, &quad->fX, &quad->fY, &quad->fW);
+
+    if (quad->fUVRCount > 0) {
+        // Calculate R here so it can be corrected with U and V in case it's needed later
+        Sk4f e1u = SkNx_shuffle<2, 3, 2, 3>(quad->fU) - SkNx_shuffle<0, 1, 0, 1>(quad->fU);
+        Sk4f e1v = SkNx_shuffle<2, 3, 2, 3>(quad->fV) - SkNx_shuffle<0, 1, 0, 1>(quad->fV);
+        Sk4f e1r = SkNx_shuffle<2, 3, 2, 3>(quad->fR) - SkNx_shuffle<0, 1, 0, 1>(quad->fR);
+        correct_bad_edges(fma(e1u, e1u, e1v * e1v) < kTolerance * kTolerance, &e1u, &e1v, &e1r);
+
+        Sk4f e2u = SkNx_shuffle<1, 1, 3, 3>(quad->fU) - SkNx_shuffle<0, 0, 2, 2>(quad->fU);
+        Sk4f e2v = SkNx_shuffle<1, 1, 3, 3>(quad->fV) - SkNx_shuffle<0, 0, 2, 2>(quad->fV);
+        Sk4f e2r = SkNx_shuffle<1, 1, 3, 3>(quad->fR) - SkNx_shuffle<0, 0, 2, 2>(quad->fR);
+        correct_bad_edges(fma(e2u, e2u, e2v * e2v) < kTolerance * kTolerance, &e2u, &e2v, &e2r);
+
+        quad->fU += a * e1u + b * e2u;
+        quad->fV += a * e1v + b * e2v;
+        if (quad->fUVRCount == 3) {
+            quad->fR += a * e1r + b * e2r;
+            correct_bad_coords(denom.abs() < kTolerance, &quad->fU, &quad->fV, &quad->fR);
+        } else {
+            correct_bad_coords(denom.abs() < kTolerance, &quad->fU, &quad->fV, nullptr);
         }
     }
 }
 
-// Updates outset in place to account for non-90 degree angles of the quad edges stored in
-// xdiff, ydiff (which are assumed to be normalized).
-static void adjust_non_rectilinear_outset(const Sk4f& xdiff, const Sk4f& ydiff, Sk4f* outset) {
-    // The distance the point needs to move is outset/sqrt(1-cos^2(theta)), where theta is the angle
-    // between the two edges at that point. cos(theta) is equal to dot(xydiff, nextCW(xydiff)),
-    Sk4f cosTheta = fma(xdiff, nextCW(xdiff), ydiff * nextCW(ydiff));
-    *outset *= (1.f - cosTheta * cosTheta).rsqrt();
-    // But clamp to make sure we don't expand by a giant amount if the sheer is really high
-    *outset = Sk4f::Max(-3.f, Sk4f::Min(*outset, 3.f));
+// Calculate area of intersection between quad (xs, ys) and a pixel at 'pixelCenter'.
+// a, b, c are edge equations of the quad, flipped is true if the line equations had their normals
+// reversed to correct for matrix transforms.
+static float get_exact_coverage(const SkPoint& pixelCenter, const Vertices& quad,
+                                const Edges& edges) {
+     // Ordering of vertices given default tri-strip that produces CCW points
+    static const int kCCW[] = {0, 1, 3, 2};
+    // Ordering of vertices given inverted tri-strip that produces CCW
+    static const int kFlippedCCW[] = {0, 2, 3, 1};
+
+    // Edge boundaries of the pixel
+    float left = pixelCenter.fX - 0.5f;
+    float right = pixelCenter.fX + 0.5f;
+    float top = pixelCenter.fY - 0.5f;
+    float bot = pixelCenter.fY + 0.5f;
+
+    // Whether or not the 4 corners of the pixel are inside the quad geometry. Variable names are
+    // intentional to work easily with the helper macros.
+    bool topleftInside = ((edges.fA * left + edges.fB * top + edges.fC) >= 0.f).allTrue();
+    bool botleftInside = ((edges.fA * left + edges.fB * bot + edges.fC) >= 0.f).allTrue();
+    bool botrightInside = ((edges.fA * right + edges.fB * bot + edges.fC) >= 0.f).allTrue();
+    bool toprightInside = ((edges.fA * right + edges.fB * top + edges.fC) >= 0.f).allTrue();
+    if (topleftInside && botleftInside && botrightInside && toprightInside) {
+        // Quad fully contains the pixel, so we know the area will be 1.f
+        return 1.f;
+    }
+
+    // Track whether or not the quad vertices in (xs, ys) are on the proper sides of l, t, r, and b
+    Sk4i leftValid = SkNx_cast<int32_t>(quad.fX >= left);
+    Sk4i rightValid = SkNx_cast<int32_t>(quad.fX <= right);
+    Sk4i topValid = SkNx_cast<int32_t>(quad.fY >= top);
+    Sk4i botValid = SkNx_cast<int32_t>(quad.fY <= bot);
+
+    // Intercepts of quad lines with the 4 pixel edges
+    Sk4f leftCross = -(edges.fC + edges.fA * left) / edges.fB;
+    Sk4f rightCross = -(edges.fC + edges.fA * right) / edges.fB;
+    Sk4f topCross = -(edges.fC + edges.fB * top) / edges.fA;
+    Sk4f botCross = -(edges.fC + edges.fB * bot) / edges.fA;
+
+    // State for implicitly tracking the intersection boundary and area
+    SkPoint firstPoint = {0.f, 0.f};
+    SkPoint lastPoint = {0.f, 0.f};
+    bool intersected = false;
+    float area = 0.f;
+
+    // Adds a point to the intersection hull, remembering first point (for closing) and the
+    // current point, and updates the running area total.
+    // See http://mathworld.wolfram.com/PolygonArea.html
+    auto accumulate = [&](const SkPoint& p) {
+        if (intersected) {
+            float da = lastPoint.fX * p.fY - p.fX * lastPoint.fY;
+            area += da;
+        } else {
+            firstPoint = p;
+            intersected = true;
+        }
+        lastPoint = p;
+    };
+
+    // Used during iteration over the quad points to check if edge intersections are valid and
+    // should be accumulated.
+#define ADD_EDGE_CROSSING_X(SIDE) \
+    do { \
+        if (SIDE##Cross[ei] >= top && SIDE##Cross[ei] <= bot) { \
+            accumulate({SIDE, SIDE##Cross[ei]}); \
+            addedIntersection = true; \
+        } \
+    } while(false)
+#define ADD_EDGE_CROSSING_Y(SIDE) \
+    do { \
+        if (SIDE##Cross[ei] >= left && SIDE##Cross[ei] <= right) { \
+            accumulate({SIDE##Cross[ei], SIDE}); \
+            addedIntersection = true; \
+        } \
+    } while(false)
+#define TEST_EDGES(SIDE, AXIS, I, NI) \
+    do { \
+        if (!SIDE##Valid[I] && SIDE##Valid[NI]) { \
+            ADD_EDGE_CROSSING_##AXIS(SIDE); \
+            crossedEdges = true; \
+        } \
+    } while(false)
+    // Used during iteration over the quad points to check if a pixel corner should be included
+    // in the intersection boundary
+#define ADD_CORNER(CHECK, SIDE_LR, SIDE_TB) \
+    if (!CHECK##Valid[i] || !CHECK##Valid[ni]) { \
+        if (SIDE_TB##SIDE_LR##Inside) { \
+            accumulate({SIDE_LR, SIDE_TB}); \
+        } \
+    }
+#define TEST_CORNER_X(SIDE, I, NI) \
+    do { \
+        if (!SIDE##Valid[I] && SIDE##Valid[NI]) { \
+            ADD_CORNER(top, SIDE, top) else ADD_CORNER(bot, SIDE, bot) \
+        } \
+    } while(false)
+#define TEST_CORNER_Y(SIDE, I, NI) \
+    do { \
+        if (!SIDE##Valid[I] && SIDE##Valid[NI]) { \
+            ADD_CORNER(left, left, SIDE) else ADD_CORNER(right, right, SIDE) \
+        } \
+    } while(false)
+
+    // Iterate over the 4 points of the quad, adding valid intersections with the pixel edges
+    // or adding interior pixel corners as it goes. This automatically keeps all accumulated points
+    // in CCW ordering so the area can be calculated on the fly and there's no need to store the
+    // list of hull points. This is somewhat inspired by the Sutherland-Hodgman algorithm but since
+    // there are only 4 points in each source polygon, there is no point list maintenance.
+    for (int j = 0; j < 4; ++j) {
+        // Current vertex
+        int i = edges.fFlipped ? kFlippedCCW[j] : kCCW[j];
+        // Moving to this vertex
+        int ni = edges.fFlipped ? kFlippedCCW[(j + 1) % 4] : kCCW[(j + 1) % 4];
+        // Index in edge vectors corresponding to move from i to ni
+        int ei = edges.fFlipped ? ni : i;
+
+        bool crossedEdges = false;
+        bool addedIntersection = false;
+
+        // First check if there are any outside -> inside edge crossings. There can be 0, 1, or 2.
+        // 2 can occur if one crossing is still outside the pixel, or if they both go through
+        // the corner (in which case a duplicate point is added, but that doesn't change area).
+
+        // Outside to inside crossing
+        TEST_EDGES(left, X, i, ni);
+        TEST_EDGES(right, X, i, ni);
+        TEST_EDGES(top, Y, i, ni);
+        TEST_EDGES(bot, Y, i, ni);
+        // Inside to outside crossing (swapping ni and i in the boolean test)
+        TEST_EDGES(left, X, ni, i);
+        TEST_EDGES(right, X, ni, i);
+        TEST_EDGES(top, Y, ni, i);
+        TEST_EDGES(bot, Y, ni, i);
+
+        // If we crossed edges but didn't add any intersections, check the corners of the pixel.
+        // If the pixel corners are inside the quad, include them in the boundary.
+        if (crossedEdges && !addedIntersection) {
+            // This can lead to repeated points, but those just accumulate zero area
+            TEST_CORNER_X(left, i, ni);
+            TEST_CORNER_X(right, i, ni);
+            TEST_CORNER_Y(top, i, ni);
+            TEST_CORNER_Y(bot, i, ni);
+
+            TEST_CORNER_X(left, ni, i);
+            TEST_CORNER_X(right, ni, i);
+            TEST_CORNER_Y(top, ni, i);
+            TEST_CORNER_Y(bot, ni, i);
+        }
+
+        // Lastly, if the next point is completely inside the pixel it gets included in the boundary
+        if (leftValid[ni] && rightValid[ni] && topValid[ni] && botValid[ni]) {
+            accumulate({quad.fX[ni], quad.fY[ni]});
+        }
+    }
+
+#undef TEST_CORNER_Y
+#undef TEST_CORNER_X
+#undef ADD_CORNER
+
+#undef TEST_EDGES
+#undef ADD_EDGE_CROSSING_Y
+#undef ADD_EDGE_CROSSING_X
+
+    // After all points have been considered, close the boundary to get final area. If we never
+    // added any points, it means the quad didn't intersect the pixel rectangle.
+    if (intersected) {
+        // Final equation for area of convex polygon is to multiply by -1/2 (minus since the points
+        // were in CCW order).
+        accumulate(firstPoint);
+        return -0.5f * area;
+    } else {
+        return 0.f;
+    }
+}
+
+// Outsets or insets xs/ys in place. To be used when the interior is very small, edges are near
+// parallel, or edges are very short/zero-length. Returns coverage for each vertex.
+// Requires (dx, dy) to already be fixed for empty edges.
+static Sk4f compute_degenerate_quad(GrQuadAAFlags aaFlags, const Sk4f& mask, const Edges& edges,
+                                    bool outset, Vertices* quad) {
+    // Move the edge 1/2 pixel in or out depending on 'outset'.
+    Sk4f oc = edges.fC + mask * (outset ? 0.5f : -0.5f);
+
+    // There are 6 points that we care about to determine the final shape of the polygon, which
+    // are the intersections between (e0,e2), (e1,e0), (e2,e3), (e3,e1) (corresponding to the
+    // 4 corners), and (e1, e2), (e0, e3) (representing the intersections of opposite edges).
+    Sk4f denom = edges.fA * nextCW(edges.fB) - edges.fB * nextCW(edges.fA);
+    Sk4f px = (edges.fB * nextCW(oc) - oc * nextCW(edges.fB)) / denom;
+    Sk4f py = (oc * nextCW(edges.fA) - edges.fA * nextCW(oc)) / denom;
+    correct_bad_coords(denom.abs() < kTolerance, &px, &py, nullptr);
+
+    // Calculate the signed distances from these 4 corners to the other two edges that did not
+    // define the intersection. So p(0) is compared to e3,e1, p(1) to e3,e2 , p(2) to e0,e1, and
+    // p(3) to e0,e2
+    Sk4f dists1 = px * SkNx_shuffle<3, 3, 0, 0>(edges.fA) +
+                  py * SkNx_shuffle<3, 3, 0, 0>(edges.fB) +
+                  SkNx_shuffle<3, 3, 0, 0>(oc);
+    Sk4f dists2 = px * SkNx_shuffle<1, 2, 1, 2>(edges.fA) +
+                  py * SkNx_shuffle<1, 2, 1, 2>(edges.fB) +
+                  SkNx_shuffle<1, 2, 1, 2>(oc);
+
+    // If all the distances are >= 0, the 4 corners form a valid quadrilateral, so use them as
+    // the 4 points. If any point is on the wrong side of both edges, the interior has collapsed
+    // and we need to use a central point to represent it. If all four points are only on the
+    // wrong side of 1 edge, one edge has crossed over another and we use a line to represent it.
+    // Otherwise, use a triangle that replaces the bad points with the intersections of
+    // (e1, e2) or (e0, e3) as needed.
+    Sk4f d1v0 = dists1 < kTolerance;
+    Sk4f d2v0 = dists2 < kTolerance;
+    // FIXME(michaelludwig): Sk4f has anyTrue() and allTrue(), but not & or |. Sk4i has & or | but
+    // not anyTrue() and allTrue(). Moving to SkVx from SkNx will clean this up.
+    Sk4i d1And2 = SkNx_cast<int32_t>(d1v0) & SkNx_cast<int32_t>(d2v0);
+    Sk4i d1Or2 = SkNx_cast<int32_t>(d1v0) | SkNx_cast<int32_t>(d2v0);
+
+    Sk4f coverage;
+    if (!d1Or2[0] && !d1Or2[1] && !d1Or2[2] && !d1Or2[3]) {
+        // Every dists1 and dists2 >= kTolerance so it's not degenerate, use all 4 corners as-is
+        // and use full coverage
+        coverage = 1.f;
+    } else if (d1And2[0] || d1And2[1] || d1And2[2] || d1And2[2]) {
+        // A point failed against two edges, so reduce the shape to a single point, which we take as
+        // the center of the original quad to ensure it is contained in the intended geometry. Since
+        // it has collapsed, we know the shape cannot cover a pixel so update the coverage.
+        SkPoint center = {0.25f * (quad->fX[0] + quad->fX[1] + quad->fX[2] + quad->fX[3]),
+                          0.25f * (quad->fY[0] + quad->fY[1] + quad->fY[2] + quad->fY[3])};
+        coverage = get_exact_coverage(center, *quad, edges);
+        px = center.fX;
+        py = center.fY;
+    } else if (d1Or2[0] && d1Or2[1] && d1Or2[2] && d1Or2[3]) {
+        // Degenerates to a line. Compare p[2] and p[3] to edge 0. If they are on the wrong side,
+        // that means edge 0 and 3 crossed, and otherwise edge 1 and 2 crossed.
+        if (dists1[2] < kTolerance && dists1[3] < kTolerance) {
+            // Edges 0 and 3 have crossed over, so make the line from average of (p0,p2) and (p1,p3)
+            px = 0.5f * (SkNx_shuffle<0, 1, 0, 1>(px) + SkNx_shuffle<2, 3, 2, 3>(px));
+            py = 0.5f * (SkNx_shuffle<0, 1, 0, 1>(py) + SkNx_shuffle<2, 3, 2, 3>(py));
+            float mc02 = get_exact_coverage({px[0], py[0]}, *quad, edges);
+            float mc13 = get_exact_coverage({px[1], py[1]}, *quad, edges);
+            coverage = Sk4f(mc02, mc13, mc02, mc13);
+        } else {
+            // Edges 1 and 2 have crossed over, so make the line from average of (p0,p1) and (p2,p3)
+            px = 0.5f * (SkNx_shuffle<0, 0, 2, 2>(px) + SkNx_shuffle<1, 1, 3, 3>(px));
+            py = 0.5f * (SkNx_shuffle<0, 0, 2, 2>(py) + SkNx_shuffle<1, 1, 3, 3>(py));
+            float mc01 = get_exact_coverage({px[0], py[0]}, *quad, edges);
+            float mc23 = get_exact_coverage({px[2], py[2]}, *quad, edges);
+            coverage = Sk4f(mc01, mc01, mc23, mc23);
+        }
+    } else {
+        // This turns into a triangle. Replace corners as needed with the intersections between
+        // (e0,e3) and (e1,e2), which must now be calculated
+        Sk2f eDenom = SkNx_shuffle<0, 1>(edges.fA) * SkNx_shuffle<3, 2>(edges.fB) -
+                      SkNx_shuffle<0, 1>(edges.fB) * SkNx_shuffle<3, 2>(edges.fA);
+        Sk2f ex = (SkNx_shuffle<0, 1>(edges.fB) * SkNx_shuffle<3, 2>(oc) -
+                   SkNx_shuffle<0, 1>(oc) * SkNx_shuffle<3, 2>(edges.fB)) / eDenom;
+        Sk2f ey = (SkNx_shuffle<0, 1>(oc) * SkNx_shuffle<3, 2>(edges.fA) -
+                   SkNx_shuffle<0, 1>(edges.fA) * SkNx_shuffle<3, 2>(oc)) / eDenom;
+
+        if (SkScalarAbs(eDenom[0]) > kTolerance) {
+            px = d1v0.thenElse(ex[0], px);
+            py = d1v0.thenElse(ey[0], py);
+        }
+        if (SkScalarAbs(eDenom[1]) > kTolerance) {
+            px = d2v0.thenElse(ex[1], px);
+            py = d2v0.thenElse(ey[1], py);
+        }
+
+        coverage = 1.f;
+    }
+
+    outset_projected_vertices(px, py, aaFlags, quad);
+    return coverage;
 }
 
 // Computes the vertices for the two nested quads used to create AA edges. The original single quad
-// should be duplicated as input in x1 and x2, y1 and y2, and possibly u1|u2, v1|v2, [r1|r2]
-// (controlled by uvrChannelCount).  While the values should be duplicated, they should be separate
-// pointers. The outset quad is written in-place back to x1, y1, etc. and the inset inner quad is
-// written to x2, y2, etc.
-static float compute_nested_quad_vertices(GrQuadAAFlags aaFlags, Sk4f* x1, Sk4f* y1,
-        Sk4f* u1, Sk4f* v1, Sk4f* r1, Sk4f* x2, Sk4f* y2, Sk4f* u2, Sk4f* v2, Sk4f* r2,
-        int uvrCount, bool rectilinear) {
-    SkASSERT(uvrCount == 0 || uvrCount == 2 || uvrCount == 3);
+// should be duplicated as input in 'inner' and 'outer', and the resulting quad frame will be
+// stored in-place on return. Returns per-vertex coverage for the inner vertices.
+static Sk4f compute_nested_quad_vertices(GrQuadAAFlags aaFlags, bool rectilinear,
+                                         Vertices* inner, Vertices* outer) {
+    SkASSERT(inner->fUVRCount == 0 || inner->fUVRCount == 2 || inner->fUVRCount == 3);
+    SkASSERT(outer->fUVRCount == inner->fUVRCount);
 
-    // Compute edge vectors for the quad.
-    auto xnext = nextCCW(*x1);
-    auto ynext = nextCCW(*y1);
-    // xdiff and ydiff will comprise the normalized vectors pointing along each quad edge.
-    Sk4f xdiff, ydiff, invLengths;
-    compute_edge_vectors(*x1, *y1, xnext, ynext, &xdiff, &ydiff, &invLengths);
+    QuadMetadata metadata = get_metadata(*inner, aaFlags);
 
     // When outsetting, we want the new edge to be .5px away from the old line, which means the
-    // corners may need to be adjusted by more than .5px if the matrix had sheer.
+    // corners may need to be adjusted by more than .5px if the matrix had sheer. This adjustment
+    // is only computed if there are no empty edges, and it may signal going through the slow path.
     Sk4f outset = 0.5f;
-    if (!rectilinear) {
-        adjust_non_rectilinear_outset(xdiff, ydiff, &outset);
+    if (get_optimized_outset(metadata, rectilinear, &outset)) {
+       // Since it's not subpixel, outsetting and insetting are trivial vector additions.
+        outset_vertices(outset, metadata, outer);
+        outset_vertices(-outset, metadata, inner);
+        return 1.f;
     }
 
-    // When insetting, cap the inset amount to be half of the edge length, except that each edge
-    // has to remain parallel, so we separately limit LR and TB to half of the smallest of the
-    // opposing edges.
-    Sk4f lengths = invLengths.invert();
-    Sk2f sides(SkMinScalar(lengths[0], lengths[3]), SkMinScalar(lengths[1], lengths[2]));
-    Sk4f edgeLimits = 0.5f * SkNx_shuffle<0, 1, 1, 0>(sides);
+    // Only compute edge equations once since they are the same for inner and outer quads
+    Edges edges = get_edge_equations(metadata, *inner);
 
-    if ((edgeLimits < 0.5f).anyTrue()) {
-        // Dealing with a subpixel rectangle, so must calculate clamped insets and padded outsets.
-        // The outsets are padded to ensure that the quad spans 2 pixels for improved interpolation.
-        Sk4f inset = -Sk4f::Min(outset, edgeLimits);
-        Sk4f insetCW = -Sk4f::Min(outset, nextCW(edgeLimits));
-
-        // The parallel distance shift caused by outset is currently 0.5, but need to scale it up to
-        // 0.5*(2 - side) so that (side + 2*shift) = 2px. Thus scale outsets for thin edges by
-        // (2 - side) since it already has the 1/2.
-        Sk4f outsetScale = 2.f - 2.f * Sk4f::Min(edgeLimits, 0.5f); // == 1 for non-thin edges
-        Sk4f outsetCW = outset * nextCW(outsetScale);
-        outset *= outsetScale;
-
-        if (aaFlags != GrQuadAAFlags::kAll) {
-            Sk4f mask = compute_edge_mask(aaFlags);
-            outset_masked_vertices(outset, outsetCW, xdiff, ydiff, invLengths, mask, x1, y1,
-                                   u1, v1, r1, uvrCount);
-            outset_masked_vertices(inset, insetCW, xdiff, ydiff, invLengths, mask, x2, y2,
-                                   u2, v2, r2, uvrCount);
-        } else {
-            outset_vertices(outset, outsetCW, xdiff, ydiff, invLengths, x1, y1, u1, v1, r1, uvrCount);
-            outset_vertices(inset, insetCW, xdiff, ydiff, invLengths, x2, y2, u2, v2, r2, uvrCount);
-        }
-    } else {
-        // Since it's not subpixel, the inset is just the opposite of the outset and there's no
-        // difference between CCW and CW behavior.
-        Sk4f inset = -outset;
-        if (aaFlags != GrQuadAAFlags::kAll) {
-            Sk4f mask = compute_edge_mask(aaFlags);
-            outset_masked_vertices(outset, outset, xdiff, ydiff, invLengths, mask, x1, y1,
-                                   u1, v1, r1, uvrCount);
-            outset_masked_vertices(inset, inset, xdiff, ydiff, invLengths, mask, x2, y2,
-                                   u2, v2, r2, uvrCount);
-        } else {
-            outset_vertices(outset, outset, xdiff, ydiff, invLengths, x1, y1, u1, v1, r1, uvrCount);
-            outset_vertices(inset, inset, xdiff, ydiff, invLengths, x2, y2, u2, v2, r2, uvrCount);
-        }
-    }
-
-    // An approximation of the pixel area covered by the quad
-    sides = Sk2f::Min(1.f, sides);
-    return sides[0] * sides[1];
+    // Calculate both outset and inset, returning the coverage reported for the inset, since the
+    // outset will always have 0.0f.
+    compute_degenerate_quad(aaFlags, metadata.fMask, edges, true, outer);
+    return compute_degenerate_quad(aaFlags, metadata.fMask, edges, false, inner);
 }
 
-// For each device space corner, devP, label its left/right or top/bottom opposite device space
-// point opDevPt. The new device space point is opDevPt + s (devPt - opDevPt) where s is
-// (length(devPt - opDevPt) + outset) / length(devPt - opDevPt); This returns the interpolant s,
-// adjusted for any subpixel corrections. If subpixel, it also updates the max coverage.
-static Sk4f get_projected_interpolant(const Sk4f& len, const Sk4f& outsets, float* maxCoverage) {
-    if ((len < 1.f).anyTrue()) {
-        *maxCoverage *= len.min();
+// Generalizes compute_nested_quad_vertices to extrapolate local coords such that after perspective
+// division of the device coordinates, the original local coordinate value is at the original
+// un-outset device position.
+static Sk4f compute_nested_persp_quad_vertices(const GrQuadAAFlags aaFlags, Vertices* inner,
+                                               Vertices* outer) {
+    SkASSERT(inner->fUVRCount == 0 || inner->fUVRCount == 2 || inner->fUVRCount == 3);
+    SkASSERT(outer->fUVRCount == inner->fUVRCount);
 
-        // When insetting, the amount is clamped to be half the minimum edge length to prevent
-        // overlap. When outsetting, the amount is padded to cover 2 pixels.
-        if ((outsets < 0.f).anyTrue()) {
-            return (len - 0.5f * len.min()) / len;
-        } else {
-            return (len + outsets * (2.f - len.min())) / len;
-        }
-    } else {
-        return (len + outsets) / len;
-    }
-}
+    // Calculate the projected 2D quad and use it to form projeccted inner/outer quads
+    // Don't use Sk4f.invert() here because it does not preserve 1/1 == 1, which creates rendering
+    // mismatches for 2D content that was batched into a 3D op, vs. 2D on its own.
+    Sk4f iw = 1.0f / inner->fW;
+    Sk4f x2d = inner->fX * iw;
+    Sk4f y2d = inner->fY * iw;
 
-// Generalizes compute_nested_quad_vertices to extrapolate local coords such that
-// after perspective division of the device coordinate, the original local coordinate value is at
-// the original un-outset device position. r is the local coordinate's w component. However, since
-// the projected edges will be different for inner and outer quads, there isn't much reuse between
-// the calculations, so it's easier to just have this operate on one quad a time.
-static float compute_quad_persp_vertices(GrQuadAAFlags aaFlags, Sk4f* x, Sk4f* y,
-        Sk4f* w, Sk4f* u, Sk4f* v, Sk4f* r, int uvrCount, bool inset) {
-    SkASSERT(uvrCount == 0 || uvrCount == 2 || uvrCount == 3);
+    Vertices inner2D = { x2d, y2d, /*w*/ 1.f, 0.f, 0.f, 0.f, 0 }; // No uvr outsetting in 2D
+    Vertices outer2D = inner2D;
 
-    auto iw = (*w).invert();
-    auto x2d = (*x) * iw;
-    auto y2d = (*y) * iw;
+    Sk4f coverage = compute_nested_quad_vertices(aaFlags, /* rect */ false, &inner2D, &outer2D);
 
-    // Must compute non-rectilinear outset quantity using the projected 2d edge vectors
-    Sk4f xdiff, ydiff, invLengths;
-    compute_edge_vectors(x2d, y2d, nextCCW(x2d), nextCCW(y2d), &xdiff, &ydiff, &invLengths);
-    Sk4f outset = inset ? -0.5f : 0.5f;
-    adjust_non_rectilinear_outset(xdiff, ydiff, &outset);
+    // Now map from the 2D inset/outset back to 3D and update the local coordinates as well
+    outset_projected_vertices(inner2D.fX, inner2D.fY, aaFlags, inner);
+    outset_projected_vertices(outer2D.fX, outer2D.fY, aaFlags, outer);
 
-    float maxProjectedCoverage = 1.f;
-
-    if ((GrQuadAAFlags::kLeft | GrQuadAAFlags::kRight) & aaFlags) {
-        // For each entry in x the equivalent entry in opX is the left/right opposite and so on.
-        Sk4f opX = SkNx_shuffle<2, 3, 0, 1>(*x);
-        Sk4f opW = SkNx_shuffle<2, 3, 0, 1>(*w);
-        Sk4f opY = SkNx_shuffle<2, 3, 0, 1>(*y);
-        // vx/vy holds the device space left-to-right vectors along top and bottom of the quad.
-        Sk2f vx = SkNx_shuffle<2, 3>(x2d) - SkNx_shuffle<0, 1>(x2d);
-        Sk2f vy = SkNx_shuffle<2, 3>(y2d) - SkNx_shuffle<0, 1>(y2d);
-        Sk4f len = SkNx_shuffle<0, 1, 0, 1>(SkNx_fma(vx, vx, vy * vy).sqrt());
-
-        // Compute t in homogeneous space from s using similar triangles so that we can produce
-        // homogeneous outset vertices for perspective-correct interpolation.
-        Sk4f s = get_projected_interpolant(len, outset, &maxProjectedCoverage);
-        Sk4f sOpW = s * opW;
-        Sk4f t = sOpW / (sOpW + (1.f - s) * (*w));
-        // mask is used to make the t values be 1 when the left/right side is not antialiased.
-        Sk4f mask(GrQuadAAFlags::kLeft & aaFlags  ? 1.f : 0.f,
-                  GrQuadAAFlags::kLeft & aaFlags  ? 1.f : 0.f,
-                  GrQuadAAFlags::kRight & aaFlags ? 1.f : 0.f,
-                  GrQuadAAFlags::kRight & aaFlags ? 1.f : 0.f);
-        t = t * mask + (1.f - mask);
-        *x = opX + t * (*x - opX);
-        *y = opY + t * (*y - opY);
-        *w = opW + t * (*w - opW);
-
-        if (uvrCount > 0) {
-            Sk4f opU = SkNx_shuffle<2, 3, 0, 1>(*u);
-            Sk4f opV = SkNx_shuffle<2, 3, 0, 1>(*v);
-            *u = opU + t * (*u - opU);
-            *v = opV + t * (*v - opV);
-            if (uvrCount == 3) {
-                Sk4f opR = SkNx_shuffle<2, 3, 0, 1>(*r);
-                *r = opR + t * (*r - opR);
-            }
-        }
-
-        if ((GrQuadAAFlags::kTop | GrQuadAAFlags::kBottom) & aaFlags) {
-            // Update the 2D points for the top/bottom calculation.
-            iw = (*w).invert();
-            x2d = (*x) * iw;
-            y2d = (*y) * iw;
-        }
-    }
-
-    if ((GrQuadAAFlags::kTop | GrQuadAAFlags::kBottom) & aaFlags) {
-        // This operates the same as above but for top/bottom rather than left/right.
-        Sk4f opX = SkNx_shuffle<1, 0, 3, 2>(*x);
-        Sk4f opW = SkNx_shuffle<1, 0, 3, 2>(*w);
-        Sk4f opY = SkNx_shuffle<1, 0, 3, 2>(*y);
-
-        Sk2f vx = SkNx_shuffle<1, 3>(x2d) - SkNx_shuffle<0, 2>(x2d);
-        Sk2f vy = SkNx_shuffle<1, 3>(y2d) - SkNx_shuffle<0, 2>(y2d);
-        Sk4f len = SkNx_shuffle<0, 0, 1, 1>(SkNx_fma(vx, vx, vy * vy).sqrt());
-
-        Sk4f s = get_projected_interpolant(len, outset, &maxProjectedCoverage);
-        Sk4f sOpW = s * opW;
-        Sk4f t = sOpW / (sOpW + (1.f - s) * (*w));
-
-        Sk4f mask(GrQuadAAFlags::kTop    & aaFlags ? 1.f : 0.f,
-                  GrQuadAAFlags::kBottom & aaFlags ? 1.f : 0.f,
-                  GrQuadAAFlags::kTop    & aaFlags ? 1.f : 0.f,
-                  GrQuadAAFlags::kBottom & aaFlags ? 1.f : 0.f);
-        t = t * mask + (1.f - mask);
-        *x = opX + t * (*x - opX);
-        *y = opY + t * (*y - opY);
-        *w = opW + t * (*w - opW);
-
-        if (uvrCount > 0) {
-            Sk4f opU = SkNx_shuffle<1, 0, 3, 2>(*u);
-            Sk4f opV = SkNx_shuffle<1, 0, 3, 2>(*v);
-            *u = opU + t * (*u - opU);
-            *v = opV + t * (*v - opV);
-            if (uvrCount == 3) {
-                Sk4f opR = SkNx_shuffle<1, 0, 3, 2>(*r);
-                *r = opR + t * (*r - opR);
-            }
-        }
-    }
-
-    return maxProjectedCoverage;
+    return coverage;
 }
 
 enum class CoverageMode {
@@ -332,34 +624,29 @@
 // Writes four vertices in triangle strip order, including the additional data for local
 // coordinates, domain, color, and coverage as needed to satisfy the vertex spec.
 static void write_quad(GrVertexWriter* vb, const GrQuadPerEdgeAA::VertexSpec& spec,
-                       CoverageMode mode, float coverage,
+                       CoverageMode mode, Sk4f coverage,
                        SkPMColor4f color4f, bool wideColor,
                        const SkRect& domain,
-                       const Sk4f& x, const Sk4f& y, const Sk4f& w,
-                       const Sk4f& u, const Sk4f& v, const Sk4f& r) {
+                       const Vertices& quad) {
     static constexpr auto If = GrVertexWriter::If<float>;
 
-    if (mode == CoverageMode::kWithColor) {
-        // Multiply the color by the coverage up front
-        SkASSERT(spec.hasVertexColors());
-        color4f = color4f * coverage;
-    }
-    GrVertexColor color(color4f, wideColor);
-
     for (int i = 0; i < 4; ++i) {
         // save position, this is a float2 or float3 or float4 depending on the combination of
         // perspective and coverage mode.
-        vb->write(x[i], y[i], If(spec.deviceQuadType() == GrQuadType::kPerspective, w[i]),
-                  If(mode == CoverageMode::kWithPosition, coverage));
+        vb->write(quad.fX[i], quad.fY[i],
+                  If(spec.deviceQuadType() == GrQuadType::kPerspective, quad.fW[i]),
+                  If(mode == CoverageMode::kWithPosition, coverage[i]));
 
         // save color
         if (spec.hasVertexColors()) {
-            vb->write(color);
+            vb->write(GrVertexColor(
+                    color4f * (mode == CoverageMode::kWithColor ? coverage[i] : 1.f), wideColor));
         }
 
         // save local position
         if (spec.hasLocalCoords()) {
-            vb->write(u[i], v[i], If(spec.localQuadType() == GrQuadType::kPerspective, r[i]));
+            vb->write(quad.fU[i], quad.fV[i],
+                      If(spec.localQuadType() == GrQuadType::kPerspective, quad.fR[i]));
         }
 
         // save the domain
@@ -406,58 +693,51 @@
     CoverageMode mode = get_mode_for_spec(spec);
 
     // Load position data into Sk4fs (always x, y, and load w to avoid branching down the road)
-    Sk4f oX = deviceQuad.x4f();
-    Sk4f oY = deviceQuad.y4f();
-    Sk4f oW = deviceQuad.w4f(); // Guaranteed to be 1f if it's not perspective
+    Vertices outer;
+    outer.fX = deviceQuad.x4f();
+    outer.fY = deviceQuad.y4f();
+    outer.fW = deviceQuad.w4f(); // Guaranteed to be 1f if it's not perspective
 
     // Load local position data into Sk4fs (either none, just u,v or all three)
-    Sk4f oU, oV, oR;
+    outer.fUVRCount = spec.localDimensionality();
     if (spec.hasLocalCoords()) {
-        oU = localQuad.x4f();
-        oV = localQuad.y4f();
-        oR = localQuad.w4f(); // Will be ignored if the local quad type isn't perspective
+        outer.fU = localQuad.x4f();
+        outer.fV = localQuad.y4f();
+        outer.fR = localQuad.w4f(); // Will be ignored if the local quad type isn't perspective
     }
 
     GrVertexWriter vb{vertices};
     if (spec.usesCoverageAA()) {
         SkASSERT(mode == CoverageMode::kWithPosition || mode == CoverageMode::kWithColor);
-
         // Must calculate two new quads, an outset and inset by .5 in projected device space, so
-        // duplicate the original quad into new Sk4fs for the inset.
-        Sk4f iX = oX, iY = oY, iW = oW;
-        Sk4f iU = oU, iV = oV, iR = oR;
+        // duplicate the original quad for the inner space
+        Vertices inner = outer;
 
-        float maxCoverage = 1.f;
-        if (aaFlags != GrQuadAAFlags::kNone) {
-            if (spec.deviceQuadType() == GrQuadType::kPerspective) {
-                // Outset and inset the quads independently because perspective makes each shift
-                // unique. Since iX copied pre-outset oX, this will compute the proper inset too.
-                compute_quad_persp_vertices(aaFlags, &oX, &oY, &oW, &oU, &oV, &oW,
-                                            spec.localDimensionality(), /* inset */ false);
-                // Save coverage limit when computing inset quad
-                maxCoverage = compute_quad_persp_vertices(aaFlags, &iX, &iY, &iW, &iU, &iV, &iW,
-                                                          spec.localDimensionality(), true);
-            } else {
-                // In the 2D case, insetting and outsetting can reuse the edge vectors, so the
-                // nested quads are computed together
-                maxCoverage = compute_nested_quad_vertices(aaFlags, &oX, &oY, &oU, &oV, &oR,
-                        &iX, &iY, &iU, &iV, &iR, spec.localDimensionality(),
-                        spec.deviceQuadType() <= GrQuadType::kRectilinear);
-            }
-            // NOTE: could provide an even more optimized tessellation function for axis-aligned
-            // rects since the positions can be outset by constants without doing vector math,
-            // except it must handle identifying the winding of the quad vertices if the transform
-            // applied a mirror, etc. The current 2D case is already adequately fast.
-        } // else don't adjust any positions, let the outer quad form degenerate triangles
+        Sk4f maxCoverage = 1.f;
+        if (spec.deviceQuadType() == GrQuadType::kPerspective) {
+            // For perspective, send quads with all edges non-AA through the tessellation to ensure
+            // their corners are processed the same as adjacent quads. This approach relies on
+            // solving edge equations to reconstruct corners, which can create seams if an inner
+            // fully non-AA quad is not similarly processed.
+            maxCoverage = compute_nested_persp_quad_vertices(aaFlags, &inner, &outer);
+        } else if (aaFlags != GrQuadAAFlags::kNone) {
+            // In 2D, the simpler corner math does not cause issues with seaming against non-AA
+            // inner quads.
+            maxCoverage = compute_nested_quad_vertices(
+                    aaFlags, spec.deviceQuadType() <= GrQuadType::kRectilinear, &inner, &outer);
+        }
+        // NOTE: could provide an even more optimized tessellation function for axis-aligned
+        // rects since the positions can be outset by constants without doing vector math,
+        // except it must handle identifying the winding of the quad vertices if the transform
+        // applied a mirror, etc. The current 2D case is already adequately fast.
 
         // Write two quads for inner and outer, inner will use the
-        write_quad(&vb, spec, mode, maxCoverage, color4f, wideColor, domain,
-                   iX, iY, iW, iU, iV, iR);
-        write_quad(&vb, spec, mode, 0.f, color4f, wideColor, domain, oX, oY, oW, oU, oV, oR);
+        write_quad(&vb, spec, mode, maxCoverage, color4f, wideColor, domain, inner);
+        write_quad(&vb, spec, mode, 0.f, color4f, wideColor, domain, outer);
     } else {
         // No outsetting needed, just write a single quad with full coverage
         SkASSERT(mode == CoverageMode::kNone);
-        write_quad(&vb, spec, mode, 1.f, color4f, wideColor, domain, oX, oY, oW, oU, oV, oR);
+        write_quad(&vb, spec, mode, 1.f, color4f, wideColor, domain, outer);
     }
 
     return vb.fPtr;