Only clamp when we think our math requires it.

If we require that our inputs are sound, in-gamut, premul colors (a in [0,1], r,g,b in [0,a]), then we should only need to clamp when the math we perform requires it.  The safety clamps before each store are paranoia.

The main thing this pipeline handles right now that needs clamping is the plus transfermode.
This is either used to blend, where the clamp must come after the coverage lerp, or used via a mode color filter, where we have no choice but to clamp right at the end of the color filter.

This changes how the mode color filter draws with the plus transfermode.  Previously it did not clamp at all.  I think this is a bug fix.

BUG=skia:

GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=4034

Change-Id: I3cbaade2127cc88c8782596f45749c4fe4b0e953
Reviewed-on: https://skia-review.googlesource.com/4034
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/core/SkBlendModePriv.h b/src/core/SkBlendModePriv.h
index fc2d6b3..29df639 100644
--- a/src/core/SkBlendModePriv.h
+++ b/src/core/SkBlendModePriv.h
@@ -12,12 +12,13 @@
 #include "SkRasterPipeline.h"
 
 bool SkBlendMode_SupportsCoverageAsAlpha(SkBlendMode);
+bool SkBlendMode_CanOverflow(SkBlendMode);
+bool SkBlendMode_AppendStages(SkBlendMode, SkRasterPipeline* = nullptr);
 
 #if SK_SUPPORT_GPU
 #include "GrXferProcessor.h"
 sk_sp<GrXPFactory> SkBlendMode_AsXPFactory(SkBlendMode);
 #endif
 
-bool SkBlendMode_AppendStages(SkBlendMode, SkRasterPipeline* = nullptr);
 
 #endif
diff --git a/src/core/SkModeColorFilter.cpp b/src/core/SkModeColorFilter.cpp
index 98cb3b2..f000e7c 100644
--- a/src/core/SkModeColorFilter.cpp
+++ b/src/core/SkModeColorFilter.cpp
@@ -90,7 +90,12 @@
     // and applying the opposite xfermode, e.g. dst-in instead of src-in.
     p->append(SkRasterPipeline::swap_src_dst);
     p->append(SkRasterPipeline::constant_color, &fPM4f);
-    return SkBlendMode_AppendStages((SkBlendMode)fMode, p);
+    auto mode = (SkBlendMode)fMode;
+    if (!SkBlendMode_AppendStages(mode, p)) {
+        return false;
+    }
+    if (SkBlendMode_CanOverflow(mode)) { p->append(SkRasterPipeline::clamp_1); }
+    return true;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 548e503..ccfdeb5 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -55,7 +55,7 @@
 // the Stage*.  This mostly matters on 64-bit Windows where every register is precious.
 
 #define SK_RASTER_PIPELINE_STAGES(M)                            \
-    M(swap_src_dst) M(constant_color)                           \
+    M(swap_src_dst) M(constant_color) M(clamp_1)                \
     M(load_s_565)  M(load_d_565)  M(store_565)                  \
     M(load_s_srgb) M(load_d_srgb) M(store_srgb)                 \
     M(load_s_f16)  M(load_d_f16)  M(store_f16)                  \
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index daf34a2..9ad0482 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -42,6 +42,7 @@
     void append_load_d(SkRasterPipeline*) const;
     void append_store (SkRasterPipeline*) const;
     void append_blend (SkRasterPipeline*) const;
+    void maybe_clamp  (SkRasterPipeline*) const;
 
     SkPixmap         fDst;
     SkRasterPipeline fShader;
@@ -164,12 +165,17 @@
     SkAssertResult(SkBlendMode_AppendStages(fBlend, p));
 }
 
+void SkRasterPipelineBlitter::maybe_clamp(SkRasterPipeline* p) const {
+    if (SkBlendMode_CanOverflow(fBlend)) { p->append(SkRasterPipeline::clamp_1); }
+}
+
 void SkRasterPipelineBlitter::blitH(int x, int y, int w) {
     if (!fBlitH) {
         SkRasterPipeline p;
         p.extend(fShader);
         this->append_load_d(&p);
         this->append_blend(&p);
+        this->maybe_clamp(&p);
         this->append_store(&p);
         fBlitH = p.compile();
     }
@@ -185,6 +191,7 @@
         this->append_load_d(&p);
         this->append_blend(&p);
         p.append(SkRasterPipeline::lerp_constant_float, &fConstantCoverage);
+        this->maybe_clamp(&p);
         this->append_store(&p);
         fBlitAntiH = p.compile();
     }
@@ -212,6 +219,7 @@
         this->append_load_d(&p);
         this->append_blend(&p);
         p.append(SkRasterPipeline::lerp_u8, &fMaskPtr);
+        this->maybe_clamp(&p);
         this->append_store(&p);
         fBlitMaskA8 = p.compile();
     }
@@ -222,6 +230,7 @@
         this->append_load_d(&p);
         this->append_blend(&p);
         p.append(SkRasterPipeline::lerp_565, &fMaskPtr);
+        this->maybe_clamp(&p);
         this->append_store(&p);
         fBlitMaskLCD16 = p.compile();
     }
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 9b61726..7180824 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -1486,6 +1486,8 @@
 }
 #endif
 
+bool SkBlendMode_CanOverflow(SkBlendMode mode) { return mode == SkBlendMode::kPlus; }
+
 bool SkBlendMode_AppendStages(SkBlendMode mode, SkRasterPipeline* p) {
     auto stage = SkRasterPipeline::srcover;
     switch (mode) {
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 5b16bf3..b22c9fc 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -131,19 +131,6 @@
     static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,   \
                                                const SkNf& d, const SkNf& da)
 
-// Clamp colors into [0,1] premul (e.g. just before storing back to memory).
-SI void clamp_01_premul(SkNf& r, SkNf& g, SkNf& b, SkNf& a) {
-    a = SkNf::Max(a, 0.0f);
-    r = SkNf::Max(r, 0.0f);
-    g = SkNf::Max(g, 0.0f);
-    b = SkNf::Max(b, 0.0f);
-
-    a = SkNf::Min(a, 1.0f);
-    r = SkNf::Min(r, a);
-    g = SkNf::Min(g, a);
-    b = SkNf::Min(b, a);
-}
-
 SI SkNf inv(const SkNf& x) { return 1.0f - x; }
 
 SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {
@@ -205,6 +192,22 @@
 
 STAGE(just_return, false) { }
 
+/*  We don't seem to have a need for this yet.
+STAGE(clamp_0, true) {
+    a = SkNf::Max(a, 0.0f);
+    r = SkNf::Max(r, 0.0f);
+    g = SkNf::Max(g, 0.0f);
+    b = SkNf::Max(b, 0.0f);
+}
+*/
+
+STAGE(clamp_1, true) {
+    a = SkNf::Min(a, 1.0f);
+    r = SkNf::Min(r, a);
+    g = SkNf::Min(g, a);
+    b = SkNf::Min(b, a);
+}
+
 STAGE(swap_src_dst, true) {
     SkTSwap(r,dr);
     SkTSwap(g,dg);
@@ -278,7 +281,6 @@
 }
 
 STAGE(store_565, false) {
-    clamp_01_premul(r,g,b,a);
     auto ptr = *(uint16_t**)ctx + x;
     store<kIsTail>(tail, to_565(r,g,b), ptr);
 }
@@ -336,7 +338,6 @@
 }
 
 STAGE(store_f16, false) {
-    clamp_01_premul(r,g,b,a);
     auto ptr = *(uint64_t**)ctx + x;
 
     uint64_t buf[8];
@@ -382,7 +383,6 @@
 }
 
 STAGE(store_srgb, false) {
-    clamp_01_premul(r,g,b,a);
     auto ptr = *(uint32_t**)ctx + x;
     store<kIsTail>(tail, (      sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
                          |      sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT