Implement per-edge aa as extra vertices instead of interpolated edge distances

It appears that using vertex interpolation to handle coverage, instead of
evaluating it per-pixel, helps significantly on Adreno GPUs. Once perf
results come in, we will see whether it is worth keeping both strategies
and switching between them depending on the platform.

Bug: chromium:914833
Bug: b/120946388
Change-Id: Ie33417938a72aa14eba4e22711e0abf97fcfbc7d
Reviewed-on: https://skia-review.googlesource.com/c/179255
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Reviewed-by: Brian Salomon <bsalomon@google.com>
diff --git a/src/gpu/ops/GrQuadPerEdgeAA.cpp b/src/gpu/ops/GrQuadPerEdgeAA.cpp
index f893a91..7d00101 100644
--- a/src/gpu/ops/GrQuadPerEdgeAA.cpp
+++ b/src/gpu/ops/GrQuadPerEdgeAA.cpp
@@ -54,190 +54,181 @@
     *ydiff *= *invLengths;
 }
 
-static AI void outset_masked_vertices(const Sk4f& xdiff, const Sk4f& ydiff, const Sk4f& invLengths,
-                                      const Sk4f& mask, Sk4f* x, Sk4f* y, Sk4f* u, Sk4f* v, Sk4f* r,
-                                      int uvrCount) {
-    auto halfMask = 0.5f * mask;
-    auto maskCW = nextCW(halfMask);
-    *x += maskCW * -xdiff + halfMask * nextCW(xdiff);
-    *y += maskCW * -ydiff + halfMask * nextCW(ydiff);
+// outset and outsetCW are provided separately to allow for different magnitude outsets for
+// with-edge and "perpendicular" edge shifts. This is needed when one axis cannot be inset the full
+// half pixel without crossing over the other side.
+static AI void outset_masked_vertices(const Sk4f& outset, const Sk4f& outsetCW, const Sk4f& xdiff,
+                                      const Sk4f& ydiff, const Sk4f& invLengths, const Sk4f& mask,
+                                      Sk4f* x, Sk4f* y, Sk4f* u, Sk4f* v, Sk4f* r, int uvrCount) {
+    // The mask is rotated compared to the outsets and edge vectors, since if the edge is "on"
+    // both its points need to be moved along their other edge vectors.
+    auto maskedOutset = -outset * nextCW(mask);
+    auto maskedOutsetCW = outsetCW * mask;
+    // x = x + outsetCW * mask * nextCW(xdiff) - outset * nextCW(mask) * xdiff
+    *x += fma(maskedOutsetCW, nextCW(xdiff), maskedOutset * xdiff);
+    *y += fma(maskedOutsetCW, nextCW(ydiff), maskedOutset * ydiff);
     if (uvrCount > 0) {
         // We want to extend the texture coords by the same proportion as the positions.
-        maskCW *= invLengths;
-        halfMask *= nextCW(invLengths);
+        maskedOutset *= invLengths;
+        maskedOutsetCW *= nextCW(invLengths);
         Sk4f udiff = nextCCW(*u) - *u;
         Sk4f vdiff = nextCCW(*v) - *v;
-        *u += maskCW * -udiff + halfMask * nextCW(udiff);
-        *v += maskCW * -vdiff + halfMask * nextCW(vdiff);
+        *u += fma(maskedOutsetCW, nextCW(udiff), maskedOutset * udiff);
+        *v += fma(maskedOutsetCW, nextCW(vdiff), maskedOutset * vdiff);
         if (uvrCount == 3) {
             Sk4f rdiff = nextCCW(*r) - *r;
-            *r += maskCW * -rdiff + halfMask * nextCW(rdiff);
+            *r += fma(maskedOutsetCW, nextCW(rdiff), maskedOutset * rdiff);
         }
     }
 }
 
-static AI void outset_vertices(const Sk4f& xdiff, const Sk4f& ydiff, const Sk4f& invLengths,
+static AI void outset_vertices(const Sk4f& outset, const Sk4f& outsetCW, const Sk4f& xdiff,
+                               const Sk4f& ydiff, const Sk4f& invLengths,
                                Sk4f* x, Sk4f* y, Sk4f* u, Sk4f* v, Sk4f* r, int uvrCount) {
-    *x += 0.5f * (-xdiff + nextCW(xdiff));
-    *y += 0.5f * (-ydiff + nextCW(ydiff));
+    // x = x + outsetCW * nextCW(xdiff) - outset * xdiff (as above, but where mask = (1,1,1,1))
+    *x += fma(outsetCW, nextCW(xdiff), -outset * xdiff);
+    *y += fma(outsetCW, nextCW(ydiff), -outset * ydiff);
     if (uvrCount > 0) {
-        Sk4f t = 0.5f * invLengths;
+        Sk4f t = -outset * invLengths; // Bake minus sign in here
+        Sk4f tCW = outsetCW * nextCW(invLengths);
         Sk4f udiff = nextCCW(*u) - *u;
         Sk4f vdiff = nextCCW(*v) - *v;
-        *u += t * -udiff + nextCW(t) * nextCW(udiff);
-        *v += t * -vdiff + nextCW(t) * nextCW(vdiff);
+        *u += fma(tCW, nextCW(udiff), t * udiff);
+        *v += fma(tCW, nextCW(vdiff), t * vdiff);
         if (uvrCount == 3) {
             Sk4f rdiff = nextCCW(*r) - *r;
-            *r += t * -rdiff + nextCW(t) * nextCW(rdiff);
+            *r += fma(tCW, nextCW(rdiff), t * rdiff);
         }
     }
 }
 
-static AI void compute_edge_distances(const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& x,
-                                      const Sk4f& y, const Sk4f& w, Sk4f edgeDistances[]) {
-    for (int i = 0; i < 4; ++i) {
-        edgeDistances[i] = a * x[i] + b * y[i] + c * w[i];
-    }
+// Updates outset in place to account for non-90 degree angles of the quad edges stored in
+// xdiff, ydiff (which are assumed to be normalized).
+static void adjust_non_rectilinear_outset(const Sk4f& xdiff, const Sk4f& ydiff, Sk4f* outset) {
+    // The distance the point needs to move is outset/sqrt(1-cos^2(theta)), where theta is the angle
+    // between the two edges at that point. cos(theta) is equal to dot(xydiff, nextCW(xydiff)).
+    Sk4f cosTheta = fma(xdiff, nextCW(xdiff), ydiff * nextCW(ydiff));
+    *outset *= (1.f - cosTheta * cosTheta).rsqrt();
+    // But clamp to make sure we don't expand by a giant amount if the shear is really high
+    *outset = Sk4f::Max(-3.f, Sk4f::Min(*outset, 3.f));
 }
 
-static AI float get_max_coverage(const Sk4f& lengths) {
-    float minWidth = SkMinScalar(lengths[0], lengths[3]);
-    float minHeight = SkMinScalar(lengths[1], lengths[2]);
-    // Calculate approximate area of the quad, pinning dimensions to 1 in case the quad is larger
-    // than a pixel. Sub-pixel quads that are rotated may in fact have a different true maximum
-    // coverage than this calculation, but this will be close and is stable.
-    return SkMinScalar(minWidth, 1.f) * SkMinScalar(minHeight, 1.f);
-}
-
-// This computes the four edge equations for a quad, then outsets them and optionally computes a new
-// quad as the intersection points of the outset edges. 'x' and 'y' contain the original points as
-// input and the outset points as output. In order to be used as a component of perspective edge
-// distance calculation, this exports edge equations in 'a', 'b', and 'c'. Use
-// compute_edge_distances to turn these equations into the distances needed by the shader. The
-// values in x, y, u, v, and r are possibly updated if outsetting is needed. r is the local
-// position's w component if it exists.
-//
-// Returns maximum coverage allowed for any given pixel.
-static float compute_quad_edges_and_outset_vertices(GrQuadAAFlags aaFlags, Sk4f* x, Sk4f* y,
-        Sk4f* a, Sk4f* b, Sk4f* c, Sk4f* u, Sk4f* v, Sk4f* r, int uvrChannelCount, bool outset) {
-    SkASSERT(uvrChannelCount == 0 || uvrChannelCount == 2 || uvrChannelCount == 3);
+// Computes the vertices for the two nested quads used to create AA edges. The original single quad
+// should be duplicated as input in x1 and x2, y1 and y2, and possibly u1|u2, v1|v2, [r1|r2]
+// (controlled by uvrChannelCount).  While the values should be duplicated, they should be separate
+// pointers. The outset quad is written in-place back to x1, y1, etc. and the inset inner quad is
+// written to x2, y2, etc.
+static float compute_nested_quad_vertices(GrQuadAAFlags aaFlags, Sk4f* x1, Sk4f* y1,
+        Sk4f* u1, Sk4f* v1, Sk4f* r1, Sk4f* x2, Sk4f* y2, Sk4f* u2, Sk4f* v2, Sk4f* r2,
+        int uvrCount, bool rectilinear) {
+    SkASSERT(uvrCount == 0 || uvrCount == 2 || uvrCount == 3);
 
     // Compute edge vectors for the quad.
-    auto xnext = nextCCW(*x);
-    auto ynext = nextCCW(*y);
+    auto xnext = nextCCW(*x1);
+    auto ynext = nextCCW(*y1);
     // xdiff and ydiff will comprise the normalized vectors pointing along each quad edge.
     Sk4f xdiff, ydiff, invLengths;
-    compute_edge_vectors(*x, *y, xnext, ynext, &xdiff, &ydiff, &invLengths);
+    compute_edge_vectors(*x1, *y1, xnext, ynext, &xdiff, &ydiff, &invLengths);
 
-    // Use above vectors to compute edge equations (importantly before we outset positions).
-    *c = fma(xnext, *y,  -ynext * *x) * invLengths;
-    // Make sure the edge equations have their normals facing into the quad in device space.
-    auto test = fma(ydiff, nextCW(*x), fma(-xdiff, nextCW(*y), *c));
-    if ((test < Sk4f(0)).anyTrue()) {
-        *a = -ydiff;
-        *b = xdiff;
-        *c = -*c;
-    } else {
-        *a = ydiff;
-        *b = -xdiff;
-    }
-    // Outset the edge equations so aa coverage evaluates to zero half a pixel away from the
-    // original quad edge.
-    *c += 0.5f;
-
-    if (aaFlags != GrQuadAAFlags::kAll) {
-        // This order is the same order the edges appear in xdiff/ydiff and therefore as the
-        // edges in a/b/c.
-        Sk4f mask = compute_edge_mask(aaFlags);
-
-        // Outset edge equations for masked out edges another pixel so that they always evaluate
-        // >= 1.
-        *c += (1.f - mask);
-        if (outset) {
-            outset_masked_vertices(xdiff, ydiff, invLengths, mask, x, y, u, v, r, uvrChannelCount);
-        }
-    } else if (outset) {
-        outset_vertices(xdiff, ydiff, invLengths, x, y, u, v, r, uvrChannelCount);
+    // When outsetting, we want the new edge to be .5px away from the old line, which means the
+    // corners may need to be adjusted by more than .5px if the matrix had shear.
+    Sk4f outset = 0.5f;
+    if (!rectilinear) {
+        adjust_non_rectilinear_outset(xdiff, ydiff, &outset);
     }
 
-    return get_max_coverage(invLengths.invert());
-}
-
-// A specialization of the above function that can compute edge distances very quickly when it knows
-// that the edges intersect at right angles, i.e. any transform other than skew and perspective
-// (GrQuadType::kRectilinear). Unlike the above function, this always outsets the corners since it
-// cannot be reused in the perspective case.
-static float compute_rectilinear_dists_and_outset_vertices(GrQuadAAFlags aaFlags, Sk4f* x,
-        Sk4f* y,  Sk4f edgeDistances[4], Sk4f* u, Sk4f* v, Sk4f* r, int uvrChannelCount) {
-    SkASSERT(uvrChannelCount == 0 || uvrChannelCount == 2 || uvrChannelCount == 3);
-    // xdiff and ydiff will comprise the normalized vectors pointing along each quad edge.
-    Sk4f xdiff, ydiff, invLengths;
-    compute_edge_vectors(*x, *y, nextCCW(*x), nextCCW(*y), &xdiff, &ydiff, &invLengths);
+    // When insetting, cap the inset amount to be half of the edge length, except that each edge
+    // has to remain parallel, so we separately limit LR and TB to half of the smallest of the
+    // opposing edges.
     Sk4f lengths = invLengths.invert();
+    Sk2f sides(SkMinScalar(lengths[0], lengths[3]), SkMinScalar(lengths[1], lengths[2]));
+    Sk4f edgeLimits = 0.5f * SkNx_shuffle<0, 1, 1, 0>(sides);
 
-    // Since the quad is rectilinear, the edge distances are predictable and independent of the
-    // actual orientation of the quad. The lengths vector stores |p1-p0|, |p3-p1|, |p0-p2|, |p2-p3|,
-    // matching the CCW order. For instance, edge distances for p0 are 0 for e0 and e2 since they
-    // intersect at p0. Distance to e1 is the same as p0 to p1. Distance to e3 is p0 to p2 since
-    // e3 goes through p2 and since the quad is rectilinear, we know that's the shortest distance.
-    edgeDistances[0] = Sk4f(0.f, lengths[0], 0.f, lengths[2]);
-    edgeDistances[1] = Sk4f(0.f, 0.f, lengths[0], lengths[1]);
-    edgeDistances[2] = Sk4f(lengths[2], lengths[3], 0.f, 0.f);
-    edgeDistances[3] = Sk4f(lengths[1], 0.f, lengths[3], 0.f);
+    if ((edgeLimits < 0.5f).anyTrue()) {
+        // Dealing with a subpixel rectangle, so must calculate clamped insets and padded outsets.
+        // The outsets are padded to ensure that the quad spans 2 pixels for improved interpolation.
+        Sk4f inset = -Sk4f::Min(outset, edgeLimits);
+        Sk4f insetCW = -Sk4f::Min(outset, nextCW(edgeLimits));
 
-    if (aaFlags != GrQuadAAFlags::kAll) {
-        // This order is the same order the edges appear in xdiff/ydiff and therefore as the
-        // edges in a/b/c.
-        Sk4f mask = compute_edge_mask(aaFlags);
+        // The parallel distance shift caused by outset is currently 0.5, but need to scale it up to
+        // 0.5*(2 - side) so that (side + 2*shift) = 2px. Thus scale outsets for thin edges by
+        // (2 - side) since it already has the 1/2.
+        Sk4f outsetScale = 2.f - 2.f * Sk4f::Min(edgeLimits, 0.5f); // == 1 for non-thin edges
+        Sk4f outsetCW = outset * nextCW(outsetScale);
+        outset *= outsetScale;
 
-        // Update opposite corner distances by 1 (when enabled by the mask). The distance
-        // calculations used in compute_quad_edges_... calculates the edge equations from original
-        // positions and then shifts the coefficient by 0.5. If the opposite edges are also outset
-        // then must add an additional 0.5 to account for its shift away from that edge.
-        Sk4f maskWithOpposites = mask + SkNx_shuffle<3, 2, 1, 0>(mask);
-        edgeDistances[0] += Sk4f(0.f, 0.5f, 0.f, 0.5f) * maskWithOpposites;
-        edgeDistances[1] += Sk4f(0.f, 0.f, 0.5f, 0.5f) * maskWithOpposites;
-        edgeDistances[2] += Sk4f(0.5f, 0.5f, 0.f, 0.f) * maskWithOpposites;
-        edgeDistances[3] += Sk4f(0.5f, 0.f, 0.5f, 0.f) * maskWithOpposites;
-
-        // Outset edge equations for masked out edges another pixel so that they always evaluate
-        // So add 1-mask to each point's edge distances vector so that coverage >= 1 on non-aa
-        for (int i = 0; i < 4; ++i) {
-            edgeDistances[i] += (1.f - mask);
+        if (aaFlags != GrQuadAAFlags::kAll) {
+            Sk4f mask = compute_edge_mask(aaFlags);
+            outset_masked_vertices(outset, outsetCW, xdiff, ydiff, invLengths, mask, x1, y1,
+                                   u1, v1, r1, uvrCount);
+            outset_masked_vertices(inset, insetCW, xdiff, ydiff, invLengths, mask, x2, y2,
+                                   u2, v2, r2, uvrCount);
+        } else {
+            outset_vertices(outset, outsetCW, xdiff, ydiff, invLengths, x1, y1, u1, v1, r1, uvrCount);
+            outset_vertices(inset, insetCW, xdiff, ydiff, invLengths, x2, y2, u2, v2, r2, uvrCount);
         }
-        outset_masked_vertices(xdiff, ydiff, invLengths, mask, x, y, u, v, r, uvrChannelCount);
     } else {
-        // Update opposite corner distances by 0.5 pixel and 0.5 edge shift, skipping the need for
-        // mask since that's 1s
-        edgeDistances[0] += Sk4f(0.f, 1.f, 0.f, 1.f);
-        edgeDistances[1] += Sk4f(0.f, 0.f, 1.f, 1.f);
-        edgeDistances[2] += Sk4f(1.f, 1.f, 0.f, 0.f);
-        edgeDistances[3] += Sk4f(1.f, 0.f, 1.f, 0.f);
-
-        outset_vertices(xdiff, ydiff, invLengths, x, y, u, v, r, uvrChannelCount);
+        // Since it's not subpixel, the inset is just the opposite of the outset and there's no
+        // difference between CCW and CW behavior.
+        Sk4f inset = -outset;
+        if (aaFlags != GrQuadAAFlags::kAll) {
+            Sk4f mask = compute_edge_mask(aaFlags);
+            outset_masked_vertices(outset, outset, xdiff, ydiff, invLengths, mask, x1, y1,
+                                   u1, v1, r1, uvrCount);
+            outset_masked_vertices(inset, inset, xdiff, ydiff, invLengths, mask, x2, y2,
+                                   u2, v2, r2, uvrCount);
+        } else {
+            outset_vertices(outset, outset, xdiff, ydiff, invLengths, x1, y1, u1, v1, r1, uvrCount);
+            outset_vertices(inset, inset, xdiff, ydiff, invLengths, x2, y2, u2, v2, r2, uvrCount);
+        }
     }
 
-    return get_max_coverage(lengths);
+    // An approximation of the pixel area covered by the quad
+    sides = Sk2f::Min(1.f, sides);
+    return sides[0] * sides[1];
 }
 
-// Generalizes compute_quad_edge_distances_and_outset_vertices to extrapolate local coords such that
+// For each device space corner, devPt, label its left/right or top/bottom opposite device space
+// point opDevPt. The new device space point is opDevPt + s (devPt - opDevPt) where s is
+// (length(devPt - opDevPt) + outset) / length(devPt - opDevPt). This returns the interpolant s,
+// adjusted for any subpixel corrections. If subpixel, it also updates the max coverage.
+static Sk4f get_projected_interpolant(const Sk4f& len, const Sk4f& outsets, float* maxCoverage) {
+    if ((len < 1.f).anyTrue()) {
+        *maxCoverage *= len.min();
+
+        // When insetting, the amount is clamped to be half the minimum edge length to prevent
+        // overlap. When outsetting, the amount is padded to cover 2 pixels.
+        if ((outsets < 0.f).anyTrue()) {
+            return (len - 0.5f * len.min()) / len;
+        } else {
+            return (len + outsets * (2.f - len.min())) / len;
+        }
+    } else {
+        return (len + outsets) / len;
+    }
+}
+
+// Generalizes compute_nested_quad_vertices to extrapolate local coords such that
 // after perspective division of the device coordinate, the original local coordinate value is at
-// the original un-outset device position. r is the local coordinate's w component.
-static float compute_quad_dists_and_outset_persp_vertices(GrQuadAAFlags aaFlags, Sk4f* x,
-        Sk4f* y, Sk4f* w, Sk4f edgeDistances[4], Sk4f* u, Sk4f* v, Sk4f* r, int uvrChannelCount) {
-    SkASSERT(uvrChannelCount == 0 || uvrChannelCount == 2 || uvrChannelCount == 3);
+// the original un-outset device position. r is the local coordinate's w component. However, since
+// the projected edges will be different for inner and outer quads, there isn't much reuse between
+// the calculations, so it's easier to just have this operate on one quad at a time.
+static float compute_quad_persp_vertices(GrQuadAAFlags aaFlags, Sk4f* x, Sk4f* y,
+        Sk4f* w, Sk4f* u, Sk4f* v, Sk4f* r, int uvrCount, bool inset) {
+    SkASSERT(uvrCount == 0 || uvrCount == 2 || uvrCount == 3);
 
     auto iw = (*w).invert();
     auto x2d = (*x) * iw;
     auto y2d = (*y) * iw;
-    Sk4f a, b, c;
-    // Don't compute outset corners in the normalized space, which means u, v, and r don't need
-    // to be provided here (outset separately below). Since this is computing distances for a
-    // projected quad, there is a very good chance it's not rectilinear so use the general 2D path.
-    float maxProjectedCoverage = compute_quad_edges_and_outset_vertices(aaFlags, &x2d, &y2d,
-            &a, &b, &c, nullptr, nullptr, nullptr, /* uvr ct */ 0, /* outsetCorners */ false);
 
-    static const float kOutset = 0.5f;
+    // Must compute non-rectilinear outset quantity using the projected 2d edge vectors
+    Sk4f xdiff, ydiff, invLengths;
+    compute_edge_vectors(x2d, y2d, nextCCW(x2d), nextCCW(y2d), &xdiff, &ydiff, &invLengths);
+    Sk4f outset = inset ? -0.5f : 0.5f;
+    adjust_non_rectilinear_outset(xdiff, ydiff, &outset);
+
+    float maxProjectedCoverage = 1.f;
+
     if ((GrQuadAAFlags::kLeft | GrQuadAAFlags::kRight) & aaFlags) {
         // For each entry in x the equivalent entry in opX is the left/right opposite and so on.
         Sk4f opX = SkNx_shuffle<2, 3, 0, 1>(*x);
@@ -246,13 +237,11 @@
         // vx/vy holds the device space left-to-right vectors along top and bottom of the quad.
         Sk2f vx = SkNx_shuffle<2, 3>(x2d) - SkNx_shuffle<0, 1>(x2d);
         Sk2f vy = SkNx_shuffle<2, 3>(y2d) - SkNx_shuffle<0, 1>(y2d);
-        Sk2f len = SkNx_fma(vx, vx, vy * vy).sqrt();
-        // For each device space corner, devP, label its left/right opposite device space point
-        // opDevPt. The new device space point is opDevPt + s (devPt - opDevPt) where s is
-        // (length(devPt - opDevPt) + 0.5) / length(devPt - opDevPt);
-        Sk4f s = SkNx_shuffle<0, 1, 0, 1>((len + kOutset) / len);
+        Sk4f len = SkNx_shuffle<0, 1, 0, 1>(SkNx_fma(vx, vx, vy * vy).sqrt());
+
         // Compute t in homogeneous space from s using similar triangles so that we can produce
         // homogeneous outset vertices for perspective-correct interpolation.
+        Sk4f s = get_projected_interpolant(len, outset, &maxProjectedCoverage);;
         Sk4f sOpW = s * opW;
         Sk4f t = sOpW / (sOpW + (1.f - s) * (*w));
         // mask is used to make the t values be 1 when the left/right side is not antialiased.
@@ -265,12 +254,12 @@
         *y = opY + t * (*y - opY);
         *w = opW + t * (*w - opW);
 
-        if (uvrChannelCount > 0) {
+        if (uvrCount > 0) {
             Sk4f opU = SkNx_shuffle<2, 3, 0, 1>(*u);
             Sk4f opV = SkNx_shuffle<2, 3, 0, 1>(*v);
             *u = opU + t * (*u - opU);
             *v = opV + t * (*v - opV);
-            if (uvrChannelCount == 3) {
+            if (uvrCount == 3) {
                 Sk4f opR = SkNx_shuffle<2, 3, 0, 1>(*r);
                 *r = opR + t * (*r - opR);
             }
@@ -292,10 +281,9 @@
 
         Sk2f vx = SkNx_shuffle<1, 3>(x2d) - SkNx_shuffle<0, 2>(x2d);
         Sk2f vy = SkNx_shuffle<1, 3>(y2d) - SkNx_shuffle<0, 2>(y2d);
-        Sk2f len = SkNx_fma(vx, vx, vy * vy).sqrt();
+        Sk4f len = SkNx_shuffle<0, 0, 1, 1>(SkNx_fma(vx, vx, vy * vy).sqrt());
 
-        Sk4f s = SkNx_shuffle<0, 0, 1, 1>((len + kOutset) / len);
-
+        Sk4f s = get_projected_interpolant(len, outset, &maxProjectedCoverage);;
         Sk4f sOpW = s * opW;
         Sk4f t = sOpW / (sOpW + (1.f - s) * (*w));
 
@@ -308,58 +296,101 @@
         *y = opY + t * (*y - opY);
         *w = opW + t * (*w - opW);
 
-        if (uvrChannelCount > 0) {
+        if (uvrCount > 0) {
             Sk4f opU = SkNx_shuffle<1, 0, 3, 2>(*u);
             Sk4f opV = SkNx_shuffle<1, 0, 3, 2>(*v);
             *u = opU + t * (*u - opU);
             *v = opV + t * (*v - opV);
-            if (uvrChannelCount == 3) {
+            if (uvrCount == 3) {
                 Sk4f opR = SkNx_shuffle<1, 0, 3, 2>(*r);
                 *r = opR + t * (*r - opR);
             }
         }
     }
 
-    // Use the original edge equations with the outset homogeneous coordinates to get the edge
-    // distance (technically multiplied by w, so that the fragment shader can do perspective
-    // interpolation when it multiplies by 1/w later).
-    compute_edge_distances(a, b, c, *x, *y, *w, edgeDistances);
-
     return maxProjectedCoverage;
 }
 
-// Calculate safe edge distances for non-aa quads that have been batched with aa quads. Since the
-// fragment shader multiples by 1/w, so the edge distance cannot just be set to 1. It cannot just
-// be set to w either due to interpolation across the triangle. If iA, iB, and iC are the
-// barycentric weights of the triangle, and we set the edge distance to w, the fragment shader
-// actually sees d = (iA*wA + iB*wB + iC*wC) * (iA/wA + iB/wB + iC/wC). Without perspective this
-// simplifies to 1 as necessary, but we must choose something other than w when there is perspective
-// to ensure that d >= 1 and the edge shows as non-aa.
-static float compute_nonaa_edge_distances(const Sk4f& w, bool hasPersp, Sk4f edgeDistances[4]) {
-    // Let n = min(w1,w2,w3,w4) and m = max(w1,w2,w3,w4) and rewrite
-    //   d = (iA*wA + iB*wB + iC*wC) * (iA*wB*wC + iB*wA*wC + iC*wA*wB) / (wA*wB*wC)
-    //       |   e=attr from VS    |   |         fragCoord.w = 1/w                 |
-    // Since the weights are the interior of the primitive then we have:
-    //   n <= (iA*wA + iB*wB + iC*wC) <= m and
-    //   n^2 <= (iA*wB*wC + iB*wA*wC + iC*wA*wB) <= m^2 and
-    //   n^3 <= wA*wB*wC <= m^3 regardless of the choice of A, B, and C verts in the quad
-    // Thus if we set e = m^3/n^3, it guarantees d >= 1 for any perspective.
-    float e;
-    if (hasPersp) {
-        float m = w.max();
-        float n = w.min();
-        e = (m * m * m) / (n * n * n);
+enum class CoverageMode {
+    kNone,
+    kWithPosition,
+    kWithColor
+};
+
+static CoverageMode get_mode_for_spec(const GrQuadPerEdgeAA::VertexSpec& spec) {
+    if (spec.usesCoverageAA()) {
+        if (spec.compatibleWithAlphaAsCoverage() && spec.hasVertexColors()) {
+            return CoverageMode::kWithColor;
+        } else {
+            return CoverageMode::kWithPosition;
+        }
     } else {
-        e = 1.f;
+        return CoverageMode::kNone;
     }
+}
 
-    // All edge distances set to the same
+// Writes four vertices in triangle strip order, including the additional data for local
+// coordinates, domain, color, and coverage as needed to satisfy the vertex spec.
+static void write_quad(GrVertexWriter* vb, const GrQuadPerEdgeAA::VertexSpec& spec,
+                       CoverageMode mode, float coverage,
+                       SkPMColor4f color4f, bool wideColor,
+                       const SkRect& domain,
+                       const Sk4f& x, const Sk4f& y, const Sk4f& w,
+                       const Sk4f& u, const Sk4f& v, const Sk4f& r) {
+    static constexpr auto If = GrVertexWriter::If<float>;
+
+    if (mode == CoverageMode::kWithColor) {
+        // Multiply the color by the coverage up front
+        SkASSERT(spec.hasVertexColors());
+        color4f = color4f * coverage;
+    }
+    GrVertexColor color(color4f, wideColor);
+
     for (int i = 0; i < 4; ++i) {
-        edgeDistances[i] = e;
-    }
+        // save position, this is a float2 or float3 or float4 depending on the combination of
+        // perspective and coverage mode.
+        vb->write(x[i], y[i], If(spec.deviceQuadType() == GrQuadType::kPerspective, w[i]),
+                  If(mode == CoverageMode::kWithPosition, coverage));
 
-    // Non-aa, so always use full coverage
-    return 1.f;
+        // save color
+        if (spec.hasVertexColors()) {
+            vb->write(color);
+        }
+
+        // save local position
+        if (spec.hasLocalCoords()) {
+            vb->write(u[i], v[i], If(spec.localQuadType() == GrQuadType::kPerspective, r[i]));
+        }
+
+        // save the domain
+        if (spec.hasDomain()) {
+            vb->write(domain);
+        }
+    }
+}
+
+GR_DECLARE_STATIC_UNIQUE_KEY(gAAFillRectIndexBufferKey);
+
+static const int kVertsPerAAFillRect = 8;
+static const int kIndicesPerAAFillRect = 30;
+
+static sk_sp<const GrBuffer> get_index_buffer(GrResourceProvider* resourceProvider) {
+    GR_DEFINE_STATIC_UNIQUE_KEY(gAAFillRectIndexBufferKey);
+
+    // clang-format off
+    static const uint16_t gFillAARectIdx[] = {
+        0, 1, 2, 1, 3, 2,
+        0, 4, 1, 4, 5, 1,
+        0, 6, 4, 0, 2, 6,
+        2, 3, 6, 3, 7, 6,
+        1, 5, 3, 3, 5, 7,
+    };
+    // clang-format on
+
+    GR_STATIC_ASSERT(SK_ARRAY_COUNT(gFillAARectIdx) == kIndicesPerAAFillRect);
+    return resourceProvider->findOrCreatePatternedIndexBuffer(
+            gFillAARectIdx, kIndicesPerAAFillRect, GrQuadPerEdgeAA::kNumAAQuadsInIndexBuffer,
+            kVertsPerAAFillRect, gAAFillRectIndexBufferKey);
 }
 
 } // anonymous namespace
@@ -371,86 +402,99 @@
 void* Tessellate(void* vertices, const VertexSpec& spec, const GrPerspQuad& deviceQuad,
                  const SkPMColor4f& color4f, const GrPerspQuad& localQuad, const SkRect& domain,
                  GrQuadAAFlags aaFlags) {
-    bool deviceHasPerspective = spec.deviceQuadType() == GrQuadType::kPerspective;
-    bool localHasPerspective = spec.localQuadType() == GrQuadType::kPerspective;
-    GrVertexColor color(color4f, GrQuadPerEdgeAA::ColorType::kHalf == spec.colorType());
+    bool wideColor = GrQuadPerEdgeAA::ColorType::kHalf == spec.colorType();
+    CoverageMode mode = get_mode_for_spec(spec);
 
     // Load position data into Sk4fs (always x, y, and load w to avoid branching down the road)
-    Sk4f x = deviceQuad.x4f();
-    Sk4f y = deviceQuad.y4f();
-    Sk4f w = deviceQuad.w4f(); // Guaranteed to be 1f if it's not perspective
+    Sk4f oX = deviceQuad.x4f();
+    Sk4f oY = deviceQuad.y4f();
+    Sk4f oW = deviceQuad.w4f(); // Guaranteed to be 1f if it's not perspective
 
     // Load local position data into Sk4fs (either none, just u,v or all three)
-    Sk4f u, v, r;
+    Sk4f oU, oV, oR;
     if (spec.hasLocalCoords()) {
-        u = localQuad.x4f();
-        v = localQuad.y4f();
-
-        if (localHasPerspective) {
-            r = localQuad.w4f();
-        }
+        oU = localQuad.x4f();
+        oV = localQuad.y4f();
+        oR = localQuad.w4f(); // Will be ignored if the local quad type isn't perspective
     }
 
-    // Index into array refers to vertex. Index into particular Sk4f refers to edge.
-    Sk4f edgeDistances[4];
-    float maxCoverage = 1.f;
-    if (spec.usesCoverageAA()) {
-        // Must calculate edges and possibly outside the positions
-        if (aaFlags == GrQuadAAFlags::kNone) {
-            // A non-AA quad that got batched into an AA group, so it should have full coverage
-            maxCoverage = compute_nonaa_edge_distances(w, deviceHasPerspective, edgeDistances);
-        } else if (deviceHasPerspective) {
-            // For simplicity, pointers to u, v, and r are always provided, but the local dim param
-            // ensures that only loaded Sk4fs are modified in the compute functions.
-            maxCoverage = compute_quad_dists_and_outset_persp_vertices(aaFlags, &x, &y, &w,
-                    edgeDistances, &u, &v, &r, spec.localDimensionality());
-        } else if (spec.deviceQuadType() <= GrQuadType::kRectilinear) {
-            maxCoverage = compute_rectilinear_dists_and_outset_vertices(aaFlags, &x, &y,
-                    edgeDistances, &u, &v, &r, spec.localDimensionality());
-        } else {
-            Sk4f a, b, c;
-            maxCoverage = compute_quad_edges_and_outset_vertices(aaFlags, &x, &y, &a, &b, &c,
-                    &u, &v, &r, spec.localDimensionality(), /*outset*/ true);
-            compute_edge_distances(a, b, c, x, y, w, edgeDistances); // w holds 1.f as desired
-        }
-    }
-
-    // Now rearrange the Sk4fs into the interleaved vertex layout:
-    //  i.e. x1x2x3x4 y1y2y3y4 -> x1y1 x2y2 x3y3 x4y
     GrVertexWriter vb{vertices};
-    for (int i = 0; i < 4; ++i) {
-        // save position, always send a vec4 because we embed max coverage in the last component.
-        // For 2D quads, we know w holds the correct 1.f, so just write it out without branching
-        vb.write(x[i], y[i], w[i], maxCoverage);
+    if (spec.usesCoverageAA()) {
+        SkASSERT(mode == CoverageMode::kWithPosition || mode == CoverageMode::kWithColor);
 
-        // save color
-        if (spec.hasVertexColors()) {
-            vb.write(color);
-        }
+        // Must calculate two new quads, an outset and inset by .5 in projected device space, so
+        // duplicate the original quad into new Sk4fs for the inset.
+        Sk4f iX = oX, iY = oY, iW = oW;
+        Sk4f iU = oU, iV = oV, iR = oR;
 
-        // save local position
-        if (spec.hasLocalCoords()) {
-            if (localHasPerspective) {
-                vb.write<SkPoint3>({u[i], v[i], r[i]});
+        float maxCoverage = 1.f;
+        if (aaFlags != GrQuadAAFlags::kNone) {
+            if (spec.deviceQuadType() == GrQuadType::kPerspective) {
+                // Outset and inset the quads independently because perspective makes each shift
+                // unique. Since iX copied pre-outset oX, this will compute the proper inset too.
+                compute_quad_persp_vertices(aaFlags, &oX, &oY, &oW, &oU, &oV, &oR,
+                                            spec.localDimensionality(), /* inset */ false);
+                // Save coverage limit when computing inset quad
+                maxCoverage = compute_quad_persp_vertices(aaFlags, &iX, &iY, &iW, &iU, &iV, &iR,
+                                                          spec.localDimensionality(), true);
             } else {
-                vb.write<SkPoint>({u[i], v[i]});
+                // In the 2D case, insetting and outsetting can reuse the edge vectors, so the
+                // nested quads are computed together
+                maxCoverage = compute_nested_quad_vertices(aaFlags, &oX, &oY, &oU, &oV, &oR,
+                        &iX, &iY, &iU, &iV, &iR, spec.localDimensionality(),
+                        spec.deviceQuadType() <= GrQuadType::kRectilinear);
             }
-        }
+            // NOTE: could provide an even more optimized tessellation function for axis-aligned
+            // rects since the positions can be outset by constants without doing vector math,
+            // except it must handle identifying the winding of the quad vertices if the transform
+            // applied a mirror, etc. The current 2D case is already adequately fast.
+        } // else don't adjust any positions, let the outer quad form degenerate triangles
 
-        // save the domain
-        if (spec.hasDomain()) {
-            vb.write(domain);
-        }
-
-        // save the edges
-        if (spec.usesCoverageAA()) {
-            vb.write(edgeDistances[i]);
-        }
+        // Write two quads for inner and outer; inner uses maxCoverage, outer uses 0 coverage
+        write_quad(&vb, spec, mode, maxCoverage, color4f, wideColor, domain,
+                   iX, iY, iW, iU, iV, iR);
+        write_quad(&vb, spec, mode, 0.f, color4f, wideColor, domain, oX, oY, oW, oU, oV, oR);
+    } else {
+        // No outsetting needed, just write a single quad with full coverage
+        SkASSERT(mode == CoverageMode::kNone);
+        write_quad(&vb, spec, mode, 1.f, color4f, wideColor, domain, oX, oY, oW, oU, oV, oR);
     }
 
     return vb.fPtr;
 }
 
+bool ConfigureMeshIndices(GrMeshDrawOp::Target* target, GrMesh* mesh, const VertexSpec& spec,
+                          int quadCount) {
+    if (spec.usesCoverageAA()) {
+        // AA quads use 8 vertices, basically nested rectangles
+        sk_sp<const GrBuffer> ibuffer = get_index_buffer(target->resourceProvider());
+        if (!ibuffer) {
+            return false;
+        }
+
+        mesh->setPrimitiveType(GrPrimitiveType::kTriangles);
+        mesh->setIndexedPatterned(ibuffer.get(), kIndicesPerAAFillRect, kVertsPerAAFillRect,
+                quadCount, kNumAAQuadsInIndexBuffer);
+    } else {
+        // Non-AA quads use 4 vertices, and regular triangle strip layout
+        if (quadCount > 1) {
+            sk_sp<const GrBuffer> ibuffer = target->resourceProvider()->refQuadIndexBuffer();
+            if (!ibuffer) {
+                return false;
+            }
+
+            mesh->setPrimitiveType(GrPrimitiveType::kTriangles);
+            mesh->setIndexedPatterned(ibuffer.get(), 6, 4, quadCount,
+                                      GrResourceProvider::QuadCountOfQuadBuffer());
+        } else {
+            mesh->setPrimitiveType(GrPrimitiveType::kTriangleStrip);
+            mesh->setNonIndexedNonInstanced(4);
+        }
+    }
+
+    return true;
+}
+
 ////////////////// VertexSpec Implementation
 
 int VertexSpec::deviceDimensionality() const {
@@ -483,19 +527,21 @@
     const char* name() const override { return "QuadPerEdgeAAGeometryProcessor"; }
 
     void getGLSLProcessorKey(const GrShaderCaps&, GrProcessorKeyBuilder* b) const override {
-        // aa, domain, texturing are single bit flags
-        uint32_t x = fAAEdgeDistances.isInitialized() ? 0 : 1;
-        x |= fDomain.isInitialized() ? 0 : 2;
-        x |= fSampler.isInitialized() ? 0 : 4;
-        // regular position has two options as well
-        x |= fNeedsPerspective ? 0 : 8;
+        // domain, texturing, device-dimensions are single bit flags
+        uint32_t x = fDomain.isInitialized() ? 0 : 1;
+        x |= fSampler.isInitialized() ? 0 : 2;
+        x |= fNeedsPerspective ? 0 : 4;
         // local coords require 2 bits (3 choices), 00 for none, 01 for 2d, 10 for 3d
         if (fLocalCoord.isInitialized()) {
-            x |= kFloat3_GrVertexAttribType == fLocalCoord.cpuType() ? 16 : 32;
+            x |= kFloat3_GrVertexAttribType == fLocalCoord.cpuType() ? 8 : 16;
         }
         // similar for colors, 00 for none, 01 for bytes, 10 for half-floats
-        if (this->fColor.isInitialized()) {
-            x |= kUByte4_norm_GrVertexAttribType == fColor.cpuType() ? 64 : 128;
+        if (fColor.isInitialized()) {
+            x |= kUByte4_norm_GrVertexAttribType == fColor.cpuType() ? 32 : 64;
+        }
+        // and coverage mode, 00 for none, 01 for withposition, 10 for withcolor
+        if (fCoverageMode != CoverageMode::kNone) {
+            x |= CoverageMode::kWithPosition == fCoverageMode ? 128 : 256;
         }
 
         b->add32(GrColorSpaceXform::XformKey(fTextureColorSpaceXform.get()));
@@ -524,17 +570,24 @@
 
                 args.fVaryingHandler->emitAttributes(gp);
 
-                // Extract effective position out of vec4 as a local variable in the vertex shader
-                if (gp.fNeedsPerspective) {
-                    args.fVertBuilder->codeAppendf("float3 position = %s.xyz;",
-                                                   gp.fPositionWithCoverage.name());
+                if (gp.fCoverageMode == CoverageMode::kWithPosition) {
+                    // Strip last channel from the vertex attribute to remove coverage and get the
+                    // actual position
+                    if (gp.fNeedsPerspective) {
+                        args.fVertBuilder->codeAppendf("float3 position = %s.xyz;",
+                                                       gp.fPosition.name());
+                    } else {
+                        args.fVertBuilder->codeAppendf("float2 position = %s.xy;",
+                                                       gp.fPosition.name());
+                    }
+                    gpArgs->fPositionVar = {"position",
+                                            gp.fNeedsPerspective ? kFloat3_GrSLType
+                                                                 : kFloat2_GrSLType,
+                                            GrShaderVar::kNone_TypeModifier};
                 } else {
-                    args.fVertBuilder->codeAppendf("float2 position = %s.xy;",
-                                                   gp.fPositionWithCoverage.name());
+                    // No coverage to eliminate
+                    gpArgs->fPositionVar = gp.fPosition.asShaderVar();
                 }
-                gpArgs->fPositionVar = {"position",
-                                        gp.fNeedsPerspective ? kFloat3_GrSLType : kFloat2_GrSLType,
-                                        GrShaderVar::kNone_TypeModifier};
 
                 // Handle local coordinates if they exist
                 if (gp.fLocalCoord.isInitialized()) {
@@ -550,8 +603,13 @@
 
                 // Solid color before any texturing gets modulated in
                 if (gp.fColor.isInitialized()) {
+                    // The color cannot be flat if the varying coverage has been modulated into it
                     args.fVaryingHandler->addPassThroughAttribute(gp.fColor, args.fOutputColor,
-                                                                  Interpolation::kCanBeFlat);
+                            gp.fCoverageMode == CoverageMode::kWithColor ?
+                            Interpolation::kInterpolated : Interpolation::kCanBeFlat);
+                } else {
+                    // Output color must be initialized to something
+                    args.fFragBuilder->codeAppendf("%s = half4(1);", args.fOutputColor);
                 }
 
                 // If there is a texture, must also handle texture coordinates and reading from
@@ -590,32 +648,22 @@
                 }
 
                 // And lastly, output the coverage calculation code
-                if (gp.fAAEdgeDistances.isInitialized()) {
-                    GrGLSLVarying maxCoverage(kFloat_GrSLType);
-                    args.fVaryingHandler->addVarying("maxCoverage", &maxCoverage);
-                    args.fVertBuilder->codeAppendf("%s = %s.w;",
-                                                   maxCoverage.vsOut(), gp.fPositionWithCoverage.name());
-
-                    args.fFragBuilder->codeAppend("float4 edgeDists;");
-                    args.fVaryingHandler->addPassThroughAttribute(gp.fAAEdgeDistances, "edgeDists");
-
-                    args.fFragBuilder->codeAppend(
-                            "float minDist = min(min(edgeDists.x, edgeDists.y),"
-                            " min(edgeDists.z, edgeDists.w));");
+                if (gp.fCoverageMode == CoverageMode::kWithPosition) {
+                    GrGLSLVarying coverage(kFloat_GrSLType);
+                    args.fVaryingHandler->addVarying("coverage", &coverage);
                     if (gp.fNeedsPerspective) {
-                        // The distance from edge equation e to homogeneous point p=sk_Position is
-                        // e.x*p.x/p.w + e.y*p.y/p.w + e.z. However, we want screen space
-                        // interpolation of this distance. We can do this by multiplying the vertex
-                        // attribute by p.w and then multiplying by sk_FragCoord.w in the FS. So we
-                        // output e.x*p.x + e.y*p.y + e.z * p.w
-                        args.fFragBuilder->codeAppend("minDist *= sk_FragCoord.w;");
+                        args.fVertBuilder->codeAppendf("%s = %s.w;",
+                                                       coverage.vsOut(), gp.fPosition.name());
+                    } else {
+                        args.fVertBuilder->codeAppendf("%s = %s.z;",
+                                                       coverage.vsOut(), gp.fPosition.name());
                     }
-                    // Clamp to max coverage after the perspective divide since perspective quads
-                    // calculated the max coverage in projected space.
-                    args.fFragBuilder->codeAppendf("%s = float4(clamp(minDist, 0.0, %s));",
-                                                   args.fOutputCoverage, maxCoverage.fsIn());
+
+                    args.fFragBuilder->codeAppendf("%s = float4(%s);",
+                                                   args.fOutputCoverage, coverage.fsIn());
                 } else {
-                    // Set coverage to 1
+                    // Set coverage to 1, since it's either non-AA or the coverage was already
+                    // folded into the output color
                     args.fFragBuilder->codeAppendf("%s = float4(1);", args.fOutputCoverage);
                 }
             }
@@ -628,7 +676,7 @@
     QuadPerEdgeAAGeometryProcessor(const VertexSpec& spec)
             : INHERITED(kQuadPerEdgeAAGeometryProcessor_ClassID)
             , fTextureColorSpaceXform(nullptr) {
-        SkASSERT(spec.hasVertexColors() && !spec.hasDomain());
+        SkASSERT(!spec.hasDomain());
         this->initializeAttrs(spec);
         this->setTextureSamplerCnt(0);
     }
@@ -641,14 +689,28 @@
             : INHERITED(kQuadPerEdgeAAGeometryProcessor_ClassID)
             , fTextureColorSpaceXform(std::move(textureColorSpaceXform))
             , fSampler(textureType, textureConfig, samplerState, extraSamplerKey) {
-        SkASSERT(spec.hasVertexColors() && spec.hasLocalCoords());
+        SkASSERT(spec.hasLocalCoords());
         this->initializeAttrs(spec);
         this->setTextureSamplerCnt(1);
     }
 
     void initializeAttrs(const VertexSpec& spec) {
         fNeedsPerspective = spec.deviceDimensionality() == 3;
-        fPositionWithCoverage = {"posAndCoverage", kFloat4_GrVertexAttribType, kFloat4_GrSLType};
+        fCoverageMode = get_mode_for_spec(spec);
+
+        if (fCoverageMode == CoverageMode::kWithPosition) {
+            if (fNeedsPerspective) {
+                fPosition = {"positionWithCoverage", kFloat4_GrVertexAttribType, kFloat4_GrSLType};
+            } else {
+                fPosition = {"positionWithCoverage", kFloat3_GrVertexAttribType, kFloat3_GrSLType};
+            }
+        } else {
+            if (fNeedsPerspective) {
+                fPosition = {"position", kFloat3_GrVertexAttribType, kFloat3_GrSLType};
+            } else {
+                fPosition = {"position", kFloat2_GrVertexAttribType, kFloat2_GrSLType};
+            }
+        }
 
         int localDim = spec.localDimensionality();
         if (localDim == 3) {
@@ -667,22 +729,20 @@
             fDomain = {"domain", kFloat4_GrVertexAttribType, kFloat4_GrSLType};
         }
 
-        if (spec.usesCoverageAA()) {
-            fAAEdgeDistances = {"aaEdgeDist", kFloat4_GrVertexAttribType, kFloat4_GrSLType};
-        }
-        this->setVertexAttributes(&fPositionWithCoverage, 5);
+        this->setVertexAttributes(&fPosition, 4);
     }
 
     const TextureSampler& onTextureSampler(int) const override { return fSampler; }
 
-    Attribute fPositionWithCoverage;
-    Attribute fColor;
+    Attribute fPosition; // May contain coverage as last channel
+    Attribute fColor; // May have coverage modulated in if the FPs support it
     Attribute fLocalCoord;
     Attribute fDomain;
-    Attribute fAAEdgeDistances;
 
-    // The positions attribute is always a vec4 and can't be used to encode perspectiveness
+    // The positions attribute may have coverage built into it, so float3 is an ambiguous type
+    // and may mean 2d with coverage, or 3d with no coverage
     bool fNeedsPerspective;
+    CoverageMode fCoverageMode;
 
     // Color space will be null and fSampler.isInitialized() returns false when the GP is configured
     // to skip texturing.