Reland "Try to avoid vertex colors in Texture/FillRect ops when possible."

This is a reland of e0b989e5e3ab64f3585fe1bf0228e964dad0678c

Original change's description:
> Try to avoid vertex colors in Texture/FillRect ops when possible.
> 
> Avoids unnecessary fragment shader color multiplication.
> 
> Change-Id: I353d3ca91824ce20c9e9af1c5c84ab9953ddd8ab
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/201004
> Commit-Queue: Brian Salomon <bsalomon@google.com>
> Reviewed-by: Michael Ludwig <michaelludwig@google.com>

Change-Id: I22125cb7058f528cb368ff30c3c26e3d55056e66
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/201222
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Brian Salomon <bsalomon@google.com>
diff --git a/src/gpu/ops/GrFillRectOp.cpp b/src/gpu/ops/GrFillRectOp.cpp
index d9a11c4..94f90b3 100644
--- a/src/gpu/ops/GrFillRectOp.cpp
+++ b/src/gpu/ops/GrFillRectOp.cpp
@@ -79,7 +79,7 @@
                const GrPerspQuad& localQuad, GrQuadType localQuadType)
             : INHERITED(ClassID())
             , fHelper(args, aaType, stencil)
-            , fWideColor(!SkPMColor4fFitsInBytes(paintColor)) {
+            , fColorType(GrQuadPerEdgeAA::MinColorType(paintColor)) {
         // The color stored with the quad is the clear color if a scissor-clear is decided upon
         // when executing the op.
         fDeviceQuads.push_back(deviceQuad, deviceQuadType, { paintColor, edgeFlags });
@@ -178,10 +178,9 @@
         using Domain = GrQuadPerEdgeAA::Domain;
         static constexpr SkRect kEmptyDomain = SkRect::MakeEmpty();
 
-        VertexSpec vertexSpec(fDeviceQuads.quadType(),
-                              fWideColor ? ColorType::kHalf : ColorType::kByte,
-                              fLocalQuads.quadType(), fHelper.usesLocalCoords(), Domain::kNo,
-                              fHelper.aaType(), fHelper.compatibleWithAlphaAsCoverage());
+        VertexSpec vertexSpec(fDeviceQuads.quadType(), fColorType, fLocalQuads.quadType(),
+                              fHelper.usesLocalCoords(), Domain::kNo, fHelper.aaType(),
+                              fHelper.compatibleWithAlphaAsCoverage());
         // Make sure that if the op thought it was a solid color, the vertex spec does not use
         // local coords.
         SkASSERT(!fHelper.isTrivial() || !fHelper.usesLocalCoords());
@@ -258,7 +257,7 @@
         // If the processor sets are compatible, the two ops are always compatible; it just needs to
         // adjust the state of the op to be the more general quad and aa types of the two ops and
         // then concatenate the per-quad data.
-        fWideColor |= that->fWideColor;
+        fColorType = SkTMax(fColorType, that->fColorType);
 
         // The helper stores the aa type, but isCompatible(with true arg) allows the two ops' aa
         // types to be none and coverage, in which case this op's aa type must be lifted to coverage
@@ -297,8 +296,8 @@
         }
 
         // clear compatible won't need to be updated, since device quad type and paint is the same,
-        // but this quad has a new color, so maybe update wide color
-        fWideColor |= !SkPMColor4fFitsInBytes(color);
+        // but this quad has a new color, so maybe update color type
+        fColorType = SkTMax(fColorType, GrQuadPerEdgeAA::MinColorType(color));
 
         // Update the bounds and add the quad to this op's storage
         SkRect newBounds = this->bounds();
@@ -328,7 +327,7 @@
     // No metadata attached to the local quads; this list is empty when local coords are not needed.
     GrQuadList fLocalQuads;
 
-    unsigned fWideColor: 1;
+    ColorType fColorType;
 
     typedef GrMeshDrawOp INHERITED;
 };
diff --git a/src/gpu/ops/GrQuadPerEdgeAA.cpp b/src/gpu/ops/GrQuadPerEdgeAA.cpp
index 4845b55..cdc49b3 100644
--- a/src/gpu/ops/GrQuadPerEdgeAA.cpp
+++ b/src/gpu/ops/GrQuadPerEdgeAA.cpp
@@ -624,9 +624,7 @@
 // Writes four vertices in triangle strip order, including the additional data for local
 // coordinates, domain, color, and coverage as needed to satisfy the vertex spec.
 static void write_quad(GrVertexWriter* vb, const GrQuadPerEdgeAA::VertexSpec& spec,
-                       CoverageMode mode, Sk4f coverage,
-                       SkPMColor4f color4f, bool wideColor,
-                       const SkRect& domain,
+                       CoverageMode mode, Sk4f coverage, SkPMColor4f color4f, const SkRect& domain,
                        const Vertices& quad) {
     static constexpr auto If = GrVertexWriter::If<float>;
 
@@ -639,8 +637,9 @@
 
         // save color
         if (spec.hasVertexColors()) {
+            bool wide = spec.colorType() == GrQuadPerEdgeAA::ColorType::kHalf;
             vb->write(GrVertexColor(
-                    color4f * (mode == CoverageMode::kWithColor ? coverage[i] : 1.f), wideColor));
+                    color4f * (mode == CoverageMode::kWithColor ? coverage[i] : 1.f), wide));
         }
 
         // save local position
@@ -684,12 +683,21 @@
 
 namespace GrQuadPerEdgeAA {
 
+ColorType MinColorType(SkPMColor4f color) {
+    if (color == SK_PMColor4fWHITE) {
+        return ColorType::kNone;
+    } else if (color.fitsInBytes()) {
+        return ColorType::kByte;
+    } else {
+        return ColorType::kHalf;
+    }
+}
+
 ////////////////// Tessellate Implementation
 
 void* Tessellate(void* vertices, const VertexSpec& spec, const GrPerspQuad& deviceQuad,
                  const SkPMColor4f& color4f, const GrPerspQuad& localQuad, const SkRect& domain,
                  GrQuadAAFlags aaFlags) {
-    bool wideColor = GrQuadPerEdgeAA::ColorType::kHalf == spec.colorType();
     CoverageMode mode = get_mode_for_spec(spec);
 
     // Load position data into Sk4fs (always x, y, and load w to avoid branching down the road)
@@ -732,12 +740,12 @@
         // applied a mirror, etc. The current 2D case is already adequately fast.
 
         // Write two quads for inner and outer, inner will use the
-        write_quad(&vb, spec, mode, maxCoverage, color4f, wideColor, domain, inner);
-        write_quad(&vb, spec, mode, 0.f, color4f, wideColor, domain, outer);
+        write_quad(&vb, spec, mode, maxCoverage, color4f, domain, inner);
+        write_quad(&vb, spec, mode, 0.f, color4f, domain, outer);
     } else {
         // No outsetting needed, just write a single quad with full coverage
         SkASSERT(mode == CoverageMode::kNone);
-        write_quad(&vb, spec, mode, 1.f, color4f, wideColor, domain, outer);
+        write_quad(&vb, spec, mode, 1.f, color4f, domain, outer);
     }
 
     return vb.fPtr;
diff --git a/src/gpu/ops/GrQuadPerEdgeAA.h b/src/gpu/ops/GrQuadPerEdgeAA.h
index 232a10d..a491f26 100644
--- a/src/gpu/ops/GrQuadPerEdgeAA.h
+++ b/src/gpu/ops/GrQuadPerEdgeAA.h
@@ -26,6 +26,9 @@
     enum class ColorType { kNone, kByte, kHalf, kLast = kHalf };
     static const int kColorTypeCount = static_cast<int>(ColorType::kLast) + 1;
 
+    // Gets the minimum ColorType that can represent a color.
+    ColorType MinColorType(SkPMColor4f);
+
     // Specifies the vertex configuration for an op that renders per-edge AA quads. The vertex
     // order (when enabled) is device position, color, local position, domain, aa edge equations.
     // This order matches the constructor argument order of VertexSpec and is the order that
diff --git a/src/gpu/ops/GrTextureOp.cpp b/src/gpu/ops/GrTextureOp.cpp
index 9db287a..5b4232c 100644
--- a/src/gpu/ops/GrTextureOp.cpp
+++ b/src/gpu/ops/GrTextureOp.cpp
@@ -291,7 +291,7 @@
         auto bounds = dstQuad.bounds(dstQuadType);
         this->setBounds(bounds, HasAABloat(aaType == GrAAType::kCoverage), IsZeroArea::kNo);
         fDomain = static_cast<unsigned>(domain);
-        fWideColor = !SkPMColor4fFitsInBytes(color);
+        fColorType = static_cast<unsigned>(GrQuadPerEdgeAA::MinColorType(color));
         fCanSkipAllocatorGather =
                 static_cast<unsigned>(fProxies[0].fProxy->canSkipResourceAllocator());
     }
@@ -311,7 +311,7 @@
         // identical, unless an entry provides a dstClip or additional transform that changes it.
         // The quad list will automatically adapt to that.
         fQuads.reserve(cnt, GrQuadTypeForTransformedRect(viewMatrix));
-
+        bool allOpaque = true;
         for (unsigned p = 0; p < fProxyCnt; ++p) {
             fProxies[p].fProxy = SkRef(set[p].fProxy.get());
             fProxies[p].fQuadCnt = 1;
@@ -351,6 +351,7 @@
                                                              set[p].fDstRect);
             }
             float alpha = SkTPin(set[p].fAlpha, 0.f, 1.f);
+            allOpaque &= (1.f == alpha);
             SkPMColor4f color{alpha, alpha, alpha, alpha};
             int srcQuadIndex = -1;
             if (set[p].fDstClipQuad) {
@@ -371,7 +372,7 @@
         }
         this->setBounds(bounds, HasAABloat(this->aaType() == GrAAType::kCoverage), IsZeroArea::kNo);
         fDomain = static_cast<unsigned>(false);
-        fWideColor = static_cast<unsigned>(false);
+        fColorType = static_cast<unsigned>(allOpaque ? ColorType::kNone : ColorType::kByte);
     }
 
     void tess(void* v, const VertexSpec& spec, const GrTextureProxy* proxy, int start,
@@ -408,7 +409,7 @@
         GrQuadType quadType = GrQuadType::kRect;
         GrQuadType srcQuadType = GrQuadType::kRect;
         Domain domain = Domain::kNo;
-        bool wideColor = false;
+        ColorType colorType = ColorType::kNone;
         int numProxies = 0;
         int numTotalQuads = 0;
         auto textureType = fProxies[0].fProxy->textureType();
@@ -426,7 +427,7 @@
             if (op.fDomain) {
                 domain = Domain::kYes;
             }
-            wideColor |= op.fWideColor;
+            colorType = SkTMax(colorType, static_cast<ColorType>(op.fColorType));
             numProxies += op.fProxyCnt;
             for (unsigned p = 0; p < op.fProxyCnt; ++p) {
                 numTotalQuads += op.fProxies[p].fQuadCnt;
@@ -443,8 +444,7 @@
             }
         }
 
-        VertexSpec vertexSpec(quadType, wideColor ? ColorType::kHalf : ColorType::kByte,
-                              srcQuadType, /* hasLocal */ true, domain, aaType,
+        VertexSpec vertexSpec(quadType, colorType, srcQuadType, /* hasLocal */ true, domain, aaType,
                               /* alpha as coverage */ true);
 
         GrSamplerState samplerState = GrSamplerState(GrSamplerState::WrapMode::kClamp,
@@ -562,7 +562,7 @@
         }
 
         fDomain |= that->fDomain;
-        fWideColor |= that->fWideColor;
+        fColorType = SkTMax(fColorType, that->fColorType);
         if (upgradeToCoverageAAOnMerge) {
             fAAType = static_cast<unsigned>(GrAAType::kCoverage);
         }
@@ -644,11 +644,12 @@
     unsigned fFilter : 2;
     unsigned fAAType : 2;
     unsigned fDomain : 1;
-    unsigned fWideColor : 1;
+    unsigned fColorType : 2;
+    GR_STATIC_ASSERT(GrQuadPerEdgeAA::kColorTypeCount <= 4);
     // Used to track whether fProxy is ref'ed or has a pending IO after finalize() is called.
     unsigned fFinalized : 1;
     unsigned fCanSkipAllocatorGather : 1;
-    unsigned fProxyCnt : 32 - 8;
+    unsigned fProxyCnt : 32 - 9;
     Proxy fProxies[1];
 
     static_assert(kGrQuadTypeCount <= 4, "GrQuadType does not fit in 2 bits");