Add clamp to sk_linear_to_srgb, reorder instructions

Improves performance of the toSRGB and to2Dot2 color xforms.  It
appears to be faster to defer clamping until the end: that way
we don't stall the mul pipeline with a min/max.
toSRGB:  371us -> 346us
to2Dot2: 404us -> 387us

Performance aside, it probably makes sense to clamp inside
sk_linear_to_srgb anyway.  If not, we should consider
providing two versions (one that clamps and one that
doesn't).

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2173803002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2173803002
diff --git a/src/core/SkSRGB.h b/src/core/SkSRGB.h
index d3baa74..08ba860 100644
--- a/src/core/SkSRGB.h
+++ b/src/core/SkSRGB.h
@@ -22,6 +22,12 @@
 
 extern const float sk_linear_from_srgb[256];
 
+static inline Sk4f sk_clamp_0_255(const Sk4f& x) {
+    // The order of the arguments is important here.  We want to make sure that NaN
+    // clamps to zero.  Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
+    return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
+}
+
 static inline Sk4i sk_linear_to_srgb(const Sk4f& x) {
     // Approximation of the sRGB gamma curve (within 1 when scaled to 8-bit pixels).
     //
@@ -40,7 +46,7 @@
             + (+0.687999f  * 255.0f) * sqrt
             + (+0.412999f  * 255.0f) * ftrt;
 
-    return SkNx_cast<int>( (x < 0.0048f).thenElse(lo, hi) );
+    return SkNx_cast<int>(sk_clamp_0_255((x < 0.0048f).thenElse(lo, hi)));
 }
 
 #endif//SkSRGB_DEFINED
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
index 0ab9000..b4eb9a2 100644
--- a/src/opts/SkColorXform_opts.h
+++ b/src/opts/SkColorXform_opts.h
@@ -16,12 +16,6 @@
 
 namespace SK_OPTS_NS {
 
-static Sk4f clamp_0_1(const Sk4f& x) {
-    // The order of the arguments is important here.  We want to make sure that NaN
-    // clamps to zero.  Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
-    return Sk4f::Min(Sk4f::Max(x, 0.0f), 1.0f);
-}
-
 static Sk4i linear_to_2dot2(const Sk4f& x) {
     // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
     auto x2  = x.rsqrt(),                            // x^(-1/2)
@@ -29,7 +23,7 @@
          x64 = x32.rsqrt();                          // x^(+1/64)
 
     // 29 = 32 - 2 - 1
-    return Sk4f_round(255.0f * x2.invert() * x32 * x64.invert());
+    return Sk4f_round(sk_clamp_0_255(255.0f * x2.invert() * x32 * x64.invert()));
 }
 
 enum DstGamma {
@@ -82,10 +76,9 @@
                 Sk4i (*linear_to_curve)(const Sk4f&) =
                         (kSRGB_DstGamma == kDstGamma) ? sk_linear_to_srgb : linear_to_2dot2;
 
-                auto reds   = linear_to_curve(clamp_0_1(dstReds));
-                auto greens = linear_to_curve(clamp_0_1(dstGreens));
-                auto blues  = linear_to_curve(clamp_0_1(dstBlues));
-
+                auto reds   = linear_to_curve(dstReds);
+                auto greens = linear_to_curve(dstGreens);
+                auto blues  = linear_to_curve(dstBlues);
 
                 auto rgba = (reds       << SK_R32_SHIFT)
                           | (greens     << SK_G32_SHIFT)
@@ -155,7 +148,7 @@
             Sk4i (*linear_to_curve)(const Sk4f&) =
                     (kSRGB_DstGamma == kDstGamma) ? sk_linear_to_srgb : linear_to_2dot2;
 
-            auto pixel = linear_to_curve(clamp_0_1(dstPixel));
+            auto pixel = linear_to_curve(dstPixel);
 
             uint32_t rgba;
             SkNx_cast<uint8_t>(pixel).store(&rgba);