Correct sRGB <-> linear everywhere.

This trims the SkPM4fPriv methods down to just foolproof methods.
(Anything trying to build these itself is probably wrong.)

Things like Sk4f srgb_to_linear(Sk4f) can't really exist anymore,
at least not efficiently, so this refactor is somewhat more invasive
than you might think.  Generally, code using to_4f() was also making
a misstep, so to_4f() is gone too.

It also no longer makes sense to play games with linear floats that
carry a 255 bias.  That hack can't work with real sRGB encoding.

Rather than update them, I've removed a couple of L32 xfermode fast
paths.  I'd even rather drop it entirely...

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2163683002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2163683002
diff --git a/src/core/SkColor.cpp b/src/core/SkColor.cpp
index a1404a2..39e9aaf 100644
--- a/src/core/SkColor.cpp
+++ b/src/core/SkColor.cpp
@@ -106,10 +106,7 @@
 #include "SkHalf.h"
 
 SkPM4f SkPM4f::FromPMColor(SkPMColor c) {
-    Sk4f value = to_4f_rgba(c);
-    SkPM4f c4;
-    (value * Sk4f(1.0f / 255)).store(&c4);
-    return c4;
+    return From4f(swizzle_rb_if_bgra(Sk4f_fromL32(c)));
 }
 
 SkColor4f SkPM4f::unpremul() const {
@@ -152,21 +149,14 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-SkColor4f SkColor4f::FromColor(SkColor c) {
-    Sk4f value = SkNx_shuffle<2,1,0,3>(SkNx_cast<float>(Sk4b::Load(&c)));
-    SkColor4f c4;
-    (value * Sk4f(1.0f / 255)).store(&c4);
-    c4.fR = srgb_to_linear(c4.fR);
-    c4.fG = srgb_to_linear(c4.fG);
-    c4.fB = srgb_to_linear(c4.fB);
-    return c4;
+SkColor4f SkColor4f::FromColor(SkColor bgra) {
+    SkColor4f rgba;
+    swizzle_rb(Sk4f_fromS32(bgra)).store(rgba.vec());
+    return rgba;
 }
 
 SkColor SkColor4f::toSkColor() const {
-    SkColor result;
-    Sk4f value = Sk4f(linear_to_srgb(fB), linear_to_srgb(fG), linear_to_srgb(fR), fA);
-    SkNx_cast<uint8_t>(value * Sk4f(255) + Sk4f(0.5f)).store(&result);
-    return result;
+    return Sk4f_toS32(swizzle_rb(Sk4f::Load(this->vec())));
 }
 
 SkColor4f SkColor4f::Pin(float r, float g, float b, float a) {
diff --git a/src/core/SkColorMatrixFilterRowMajor255.cpp b/src/core/SkColorMatrixFilterRowMajor255.cpp
index cdfd1df..29a3f10 100644
--- a/src/core/SkColorMatrixFilterRowMajor255.cpp
+++ b/src/core/SkColorMatrixFilterRowMajor255.cpp
@@ -126,7 +126,7 @@
         return round(swizzle_rb_if_bgra(c4));
     }
     static Sk4f To4f(SkPMColor c) {
-        return to_4f(c) * Sk4f(1.0f/255);
+        return Sk4f_fromL32(c);
     }
 };
 void SkColorMatrixFilterRowMajor255::filterSpan(const SkPMColor src[], int count, SkPMColor dst[]) const {
diff --git a/src/core/SkLinearBitmapPipeline_sample.h b/src/core/SkLinearBitmapPipeline_sample.h
index 86ad6e1..759075b 100644
--- a/src/core/SkLinearBitmapPipeline_sample.h
+++ b/src/core/SkLinearBitmapPipeline_sample.h
@@ -170,11 +170,10 @@
     PixelConverter(const SkPixmap& srcPixmap) { }
 
     Sk4f toSk4f(Element pixel) const {
-        float gray = pixel * (1.0f/255.0f);
-        Sk4f result = Sk4f{gray, gray, gray, 1.0f};
-        return gammaType == kSRGB_SkGammaType
-               ? srgb_to_linear(result)
-               : result;
+        float gray = (gammaType == kSRGB_SkGammaType)
+            ? sk_linear_from_srgb[pixel]
+            : pixel * (1/255.0f);
+        return {gray, gray, gray, 1.0f};
     }
 };
 
diff --git a/src/core/SkPM4fPriv.h b/src/core/SkPM4fPriv.h
index 57a44c1..89a0cae 100644
--- a/src/core/SkPM4fPriv.h
+++ b/src/core/SkPM4fPriv.h
@@ -10,142 +10,65 @@
 
 #include "SkColorPriv.h"
 #include "SkPM4f.h"
+#include "SkSRGB.h"
 
-static inline float get_alpha(const Sk4f& f4) {
-    return f4[SkPM4f::A];
+static inline Sk4f set_alpha(const Sk4f& px, float alpha) {
+    return { px[0], px[1], px[2], alpha };
 }
 
-static inline Sk4f set_alpha(const Sk4f& f4, float alpha) {
-    static_assert(3 == SkPM4f::A, "");
-    return Sk4f(f4[0], f4[1], f4[2], alpha);
+static inline float get_alpha(const Sk4f& px) {
+    return px[3];
 }
 
-static inline uint32_t to_4b(const Sk4f& f4) {
-    uint32_t b4;
-    SkNx_cast<uint8_t>(f4).store((uint8_t*)&b4);
-    return b4;
+
+static inline Sk4f Sk4f_fromL32(uint32_t px) {
+    return SkNx_cast<float>(Sk4b::Load(&px)) * (1/255.0f);
 }
 
-static inline Sk4f to_4f(uint32_t b4) {
-    return SkNx_cast<float>(Sk4b::Load((const uint8_t*)&b4));
+static inline Sk4f Sk4f_fromS32(uint32_t px) {
+    return { sk_linear_from_srgb[(px >>  0) & 0xff],
+             sk_linear_from_srgb[(px >>  8) & 0xff],
+             sk_linear_from_srgb[(px >> 16) & 0xff],
+                    (1/255.0f) * (px >> 24)          };
 }
 
-static inline Sk4f to_4f_rgba(uint32_t b4) {
-    return swizzle_rb_if_bgra(to_4f(b4));
+static inline uint32_t Sk4f_toL32(const Sk4f& px) {
+    uint32_t l32;
+    SkNx_cast<uint8_t>(Sk4f_round(px * 255.0f)).store(&l32);
+    return l32;
 }
 
-static inline Sk4f srgb_to_linear(const Sk4f& s4) {
-    return set_alpha(s4 * s4, get_alpha(s4));
+static inline uint32_t Sk4f_toS32(const Sk4f& px) {
+    Sk4i  rgb = sk_linear_to_srgb(px),
+         srgb = { rgb[0], rgb[1], rgb[2], (int)(255.0f * px[3] + 0.5f) };
+
+    uint32_t s32;
+    SkNx_cast<uint8_t>(srgb).store(&s32);
+    return s32;
 }
 
-static inline Sk4f linear_to_srgb(const Sk4f& l4) {
-    return set_alpha(l4.rsqrt().invert(), get_alpha(l4));
-}
 
-static inline float srgb_to_linear(float x) {
-    return x * x;
-}
-
-static inline float linear_to_srgb(float x) {
-    return sqrtf(x);
-}
-
-static void assert_unit(float x) {
-    SkASSERT(x >= 0 && x <= 1);
-}
-
-static inline float exact_srgb_to_linear(float x) {
-    assert_unit(x);
-    float linear;
-    if (x <= 0.04045) {
-        linear = x / 12.92f;
-    } else {
-        linear = powf((x + 0.055f) / 1.055f, 2.4f);
-    }
-    assert_unit(linear);
-    return linear;
-}
-
-static inline float exact_linear_to_srgb(float x) {
-    assert_unit(x);
-    float srgb;
-    if (x <= 0.0031308f) {
-        srgb = x * 12.92f;
-    } else {
-        srgb = 1.055f * powf(x, 0.41666667f) - 0.055f;
-    }
-    assert_unit(srgb);
-    return srgb;
-}
-
-static inline Sk4f exact_srgb_to_linear(const Sk4f& x) {
-    Sk4f linear(exact_srgb_to_linear(x[0]),
-                exact_srgb_to_linear(x[1]),
-                exact_srgb_to_linear(x[2]), 1);
-    return set_alpha(linear, get_alpha(x));
-}
-
-static inline Sk4f exact_linear_to_srgb(const Sk4f& x) {
-    Sk4f srgb(exact_linear_to_srgb(x[0]),
-              exact_linear_to_srgb(x[1]),
-              exact_linear_to_srgb(x[2]), 1);
-    return set_alpha(srgb, get_alpha(x));
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-static inline Sk4f Sk4f_fromL32(uint32_t src) {
-    return to_4f(src) * Sk4f(1.0f/255);
-}
-
-static inline Sk4f Sk4f_fromS32(uint32_t src) {
-    return srgb_to_linear(to_4f(src) * Sk4f(1.0f/255));
-}
-
-// Color handling:
+// SkColor handling:
 //   SkColor has an ordering of (b, g, r, a) if cast to an Sk4f, so the code swizzles r and b to
 // produce the needed (r, g, b, a) ordering.
 static inline Sk4f Sk4f_from_SkColor(SkColor color) {
     return swizzle_rb(Sk4f_fromS32(color));
 }
 
-static inline uint32_t Sk4f_toL32(const Sk4f& x4) {
-    return to_4b(x4 * Sk4f(255) + Sk4f(0.5f));
+static inline void assert_unit(float x) {
+    SkASSERT(0 <= x && x <= 1);
 }
 
-static inline uint32_t Sk4f_toS32(const Sk4f& x4) {
-    return to_4b(linear_to_srgb(x4) * Sk4f(255) + Sk4f(0.5f));
-}
-
-static inline Sk4f exact_Sk4f_fromS32(uint32_t src) {
-    return exact_srgb_to_linear(to_4f(src) * Sk4f(1.0f/255));
-}
-static inline uint32_t exact_Sk4f_toS32(const Sk4f& x4) {
-    return to_4b(exact_linear_to_srgb(x4) * Sk4f(255) + Sk4f(0.5f));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// An implementation of SrcOver from bytes to bytes in linear space that takes advantage of the
-// observation that the 255's cancel.
-//    invA = 1 - (As / 255);
-//
-//    R = 255 * sqrt((Rs/255)^2 + (Rd/255)^2 * invA)
-// => R = 255 * sqrt((Rs^2 + Rd^2 * invA)/255^2)
-// => R = sqrt(Rs^2 + Rd^2 * invA)
-// Note: src is assumed to be linear.
-static inline void srcover_blend_srgb8888_srgb_1(uint32_t* dst, const Sk4f& src) {
-    Sk4f d = srgb_to_linear(to_4f(*dst));
-    Sk4f invAlpha = 1.0f - Sk4f{src[SkPM4f::A]} * (1.0f / 255.0f);
-    Sk4f r = linear_to_srgb(src + d * invAlpha) + 0.5f;
-    *dst = to_4b(r);
-}
-
-static inline void srcover_srgb8888_srgb_1(uint32_t* dst, const uint32_t pixel) {
-    if ((~pixel & 0xFF000000) == 0) {
-        *dst = pixel;
-    } else if ((pixel & 0xFF000000) != 0) {
-        srcover_blend_srgb8888_srgb_1(dst, srgb_to_linear(to_4f(pixel)));
+static inline float exact_srgb_to_linear(float srgb) {
+    assert_unit(srgb);
+    float linear;
+    if (srgb <= 0.04045) {
+        linear = srgb / 12.92f;
+    } else {
+        linear = powf((srgb + 0.055f) / 1.055f, 2.4f);
     }
+    assert_unit(linear);
+    return linear;
 }
 
 #endif
diff --git a/src/core/SkSpanProcs.cpp b/src/core/SkSpanProcs.cpp
index 87dcbc0..b2e9914 100644
--- a/src/core/SkSpanProcs.cpp
+++ b/src/core/SkSpanProcs.cpp
@@ -22,7 +22,7 @@
     SkASSERT(src.addr32(x + count - 1, y));
 
     for (int i = 0; i < count; ++i) {
-        (to_4f_rgba(addr[i]) * Sk4f(1.0f/255)).store(span[i].fVec);
+        swizzle_rb_if_bgra(Sk4f_fromL32(addr[i])).store(span[i].fVec);
     }
 }
 
@@ -32,7 +32,7 @@
     SkASSERT(src.addr32(x + count - 1, y));
 
     for (int i = 0; i < count; ++i) {
-        srgb_to_linear(to_4f_rgba(addr[i]) * Sk4f(1.0f/255)).store(span[i].fVec);
+        swizzle_rb_if_bgra(Sk4f_fromS32(addr[i])).store(span[i].fVec);
     }
 }
 
diff --git a/src/core/SkXfermode4f.cpp b/src/core/SkXfermode4f.cpp
index d861973..87a12a7 100644
--- a/src/core/SkXfermode4f.cpp
+++ b/src/core/SkXfermode4f.cpp
@@ -35,44 +35,40 @@
     return (D == kSRGB_Dst) ? Sk4f_fromS32(dstC) : Sk4f_fromL32(dstC);
 }
 
-static Sk4f srgb_4b_to_linear_unit(SkPMColor dstC) {
-    return Sk4f_fromS32(dstC);
-}
-
 template <DstType D> uint32_t store_dst(const Sk4f& x4) {
     return (D == kSRGB_Dst) ? Sk4f_toS32(x4) : Sk4f_toL32(x4);
 }
 
-static Sk4f linear_unit_to_srgb_255f(const Sk4f& l4) {
-    return linear_to_srgb(l4) * Sk4f(255) + Sk4f(0.5f);
+static Sk4x4f load_4_srgb(const void* vptr) {
+    auto ptr = (const uint32_t*)vptr;
+
+    Sk4x4f rgba;
+
+    rgba.r = { sk_linear_from_srgb[(ptr[0] >>  0) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >>  0) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >>  0) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >>  0) & 0xff] };
+
+    rgba.g = { sk_linear_from_srgb[(ptr[0] >>  8) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >>  8) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >>  8) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >>  8) & 0xff] };
+
+    rgba.b = { sk_linear_from_srgb[(ptr[0] >> 16) & 0xff],
+               sk_linear_from_srgb[(ptr[1] >> 16) & 0xff],
+               sk_linear_from_srgb[(ptr[2] >> 16) & 0xff],
+               sk_linear_from_srgb[(ptr[3] >> 16) & 0xff] };
+
+    rgba.a = SkNx_cast<float>((Sk4i::Load(ptr) >> 24) & 0xff) * (1/255.0f);
+
+    return rgba;
 }
 
-// Load 4 interlaced 8888 sRGB pixels as an Sk4x4f, transposed and converted to float.
-static Sk4x4f load_4_srgb(const void* ptr) {
-    auto p = Sk4x4f::Transpose((const uint8_t*)ptr);
-
-    // Scale to [0,1].
-    p.r *= 1/255.0f;
-    p.g *= 1/255.0f;
-    p.b *= 1/255.0f;
-    p.a *= 1/255.0f;
-
-    // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2).
-    p.r *= p.r;
-    p.g *= p.g;
-    p.b *= p.b;
-
-    return p;
-}
-
-// Store an Sk4x4f back to 4 interlaced 8888 sRGB pixels.
 static void store_4_srgb(void* ptr, const Sk4x4f& p) {
-    // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2.
-    auto r = p.r.rsqrt().invert() * 255.0f + 0.5f,
-         g = p.g.rsqrt().invert() * 255.0f + 0.5f,
-         b = p.b.rsqrt().invert() * 255.0f + 0.5f,
-         a = p.a                  * 255.0f + 0.5f;
-    Sk4x4f{r,g,b,a}.transpose((uint8_t*)ptr);
+    ( sk_linear_to_srgb(p.r) <<  0
+    | sk_linear_to_srgb(p.g) <<  8
+    | sk_linear_to_srgb(p.b) << 16
+    | Sk4f_round(255.0f*p.a) << 24).store(ptr);
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -197,41 +193,24 @@
     const Sk4f s4 = src->to4f_pmorder();
 
     if (aa) {
-        if (D == kLinear_Dst) {
-            // operate in bias-255 space for src and dst
-            const Sk4f& s4_255 = s4 * Sk4f(255);
-            while (count >= 4) {
-                Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f);
-                Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f);
-                Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f);
-                Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f);
-                Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f);
-                Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3);
-
-                dst += 4;
-                aa += 4;
-                count -= 4;
-            }
-        } else {    // kSRGB
-            SkPMColor srcColor = store_dst<D>(s4);
-            while (count-- > 0) {
-                SkAlpha cover = *aa++;
-                switch (cover) {
-                    case 0xFF: {
-                        *dst++ = srcColor;
-                        break;
-                    }
-                    case 0x00: {
-                        dst++;
-                        break;
-                    }
-                    default: {
-                        Sk4f d4 = load_dst<D>(*dst);
-                        *dst++ = store_dst<D>(lerp(s4, d4, cover));
-                    }
+        SkPMColor srcColor = store_dst<D>(s4);
+        while (count-- > 0) {
+            SkAlpha cover = *aa++;
+            switch (cover) {
+                case 0xFF: {
+                    *dst++ = srcColor;
+                    break;
+                }
+                case 0x00: {
+                    dst++;
+                    break;
+                }
+                default: {
+                    Sk4f d4 = load_dst<D>(*dst);
+                    *dst++ = store_dst<D>(lerp(s4, d4, cover));
                 }
             }
-        }          // kSRGB
+        }
     } else {
         sk_memset32(dst, store_dst<D>(s4), count);
     }
@@ -274,18 +253,15 @@
     } else {
         while (count >= 4 && D == kSRGB_Dst) {
             auto d = load_4_srgb(dst);
-
             auto s = Sk4x4f::Transpose(src->fVec);
         #if defined(SK_PMCOLOR_IS_BGRA)
             SkTSwap(s.r, s.b);
         #endif
-
             auto invSA = 1.0f - s.a;
             auto r = s.r + d.r * invSA,
                  g = s.g + d.g * invSA,
                  b = s.b + d.b * invSA,
                  a = s.a + d.a * invSA;
-
             store_4_srgb(dst, Sk4x4f{r,g,b,a});
             count -= 4;
             dst += 4;
@@ -322,23 +298,9 @@
             dst[i] = Sk4f_toL32(r4);
         }
     } else {
-        const Sk4f s4_255 = s4 * Sk4f(255) + Sk4f(0.5f);   // +0.5 to pre-bias for rounding
-        while (count >= 4) {
-            Sk4f d0 = to_4f(dst[0]);
-            Sk4f d1 = to_4f(dst[1]);
-            Sk4f d2 = to_4f(dst[2]);
-            Sk4f d3 = to_4f(dst[3]);
-            Sk4f_ToBytes((uint8_t*)dst,
-                         s4_255 + d0 * dst_scale,
-                         s4_255 + d1 * dst_scale,
-                         s4_255 + d2 * dst_scale,
-                         s4_255 + d3 * dst_scale);
-            dst += 4;
-            count -= 4;
-        }
         for (int i = 0; i < count; ++i) {
-            Sk4f d4 = to_4f(dst[i]);
-            dst[i] = to_4b(s4_255 + d4 * dst_scale);
+            Sk4f d4 = Sk4f_fromL32(dst[i]);
+            dst[i] = Sk4f_toL32(s4 + d4 * dst_scale);
         }
     }
 }
@@ -354,7 +316,8 @@
             if (0 == a) {
                 continue;
             }
-            Sk4f d4 = srgb_4b_to_linear_unit(dst[i]);
+
+            Sk4f d4 = Sk4f_fromS32(dst[i]);
             Sk4f r4;
             if (a != 0xFF) {
                 const Sk4f s4_aa = scale_by_coverage(s4, a);
@@ -362,30 +325,27 @@
             } else {
                 r4 = s4 + d4 * dst_scale;
             }
-            dst[i] = to_4b(linear_unit_to_srgb_255f(r4));
+            dst[i] = Sk4f_toS32(r4);
         }
     } else {
         while (count >= 4) {
             auto d = load_4_srgb(dst);
-
             auto s = Sk4x4f{{ src->r() }, { src->g() }, { src->b() }, { src->a() }};
         #if defined(SK_PMCOLOR_IS_BGRA)
             SkTSwap(s.r, s.b);
         #endif
-
             auto invSA = 1.0f - s.a;
             auto r = s.r + d.r * invSA,
                  g = s.g + d.g * invSA,
                  b = s.b + d.b * invSA,
                  a = s.a + d.a * invSA;
-
             store_4_srgb(dst, Sk4x4f{r,g,b,a});
             count -= 4;
             dst += 4;
         }
         for (int i = 0; i < count; ++i) {
-            Sk4f d4 = srgb_4b_to_linear_unit(dst[i]);
-            dst[i] = to_4b(linear_unit_to_srgb_255f(s4 + d4 * dst_scale));
+            Sk4f d4 = Sk4f_fromS32(dst[i]);
+            dst[i] = Sk4f_toS32(s4 + d4 * dst_scale);
         }
     }
 }
@@ -443,26 +403,13 @@
 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) {
     const Sk4f s4 = src->to4f_pmorder();
 
-    if (D == kLinear_Dst) {
-        // operate in bias-255 space for src and dst
-        const Sk4f s4bias = s4 * Sk4f(255);
-        for (int i = 0; i < count; ++i) {
-            uint16_t rgb = lcd[i];
-            if (0 == rgb) {
-                continue;
-            }
-            Sk4f d4bias = to_4f(dst[i]);
-            dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT);
+    for (int i = 0; i < count; ++i) {
+        uint16_t rgb = lcd[i];
+        if (0 == rgb) {
+            continue;
         }
-    } else {    // kSRGB
-        for (int i = 0; i < count; ++i) {
-            uint16_t rgb = lcd[i];
-            if (0 == rgb) {
-                continue;
-            }
-            Sk4f d4 = load_dst<D>(dst[i]);
-            dst[i] = store_dst<D>(lerp(s4, d4, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT);
-        }
+        Sk4f d4 = load_dst<D>(dst[i]);
+        dst[i] = store_dst<D>(lerp(s4, d4, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT);
     }
 }
 
diff --git a/src/effects/gradients/Sk4fGradientPriv.h b/src/effects/gradients/Sk4fGradientPriv.h
index 9745119..6542683 100644
--- a/src/effects/gradients/Sk4fGradientPriv.h
+++ b/src/effects/gradients/Sk4fGradientPriv.h
@@ -109,18 +109,16 @@
     using Type = SkPMColor;
 
     static Sk4f load(const SkPM4f& c) {
-        // Prescaling by (255^2, 255^2, 255^2, 255) on load, to avoid a 255 multiply on
-        // each store (S32 conversion yields a uniform 255 factor).
-        return c.to4f_pmorder() * Sk4f(255 * 255, 255 * 255, 255 * 255, 255);
+        return c.to4f_pmorder();
     }
 
     static void store(const Sk4f& c, Type* dst) {
         // FIXME: this assumes opaque colors.  Handle unpremultiplication.
-        *dst = to_4b(linear_to_srgb(PM::apply(c)));
+        *dst = Sk4f_toS32(PM::apply(c));
     }
 
     static void store(const Sk4f& c, Type* dst, int n) {
-        sk_memset32(dst, to_4b(linear_to_srgb(PM::apply(c))), n);
+        sk_memset32(dst, Sk4f_toS32(PM::apply(c)), n);
     }
 
     static void store4x(const Sk4f& c0, const Sk4f& c1,
diff --git a/src/effects/gradients/Sk4fLinearGradient.cpp b/src/effects/gradients/Sk4fLinearGradient.cpp
index d22dbff..dc6e530 100644
--- a/src/effects/gradients/Sk4fLinearGradient.cpp
+++ b/src/effects/gradients/Sk4fLinearGradient.cpp
@@ -53,13 +53,10 @@
     Sk4x4f        c4x = Sk4x4f::Transpose(c, c + dc, c + dc * 2, c + dc * 3);
 
     while (n >= 4) {
-        const Sk4x4f cx4s32 = {
-            c4x.r.rsqrt().invert(),
-            c4x.g.rsqrt().invert(),
-            c4x.b.rsqrt().invert(),
-            c4x.a
-        };
-        cx4s32.transpose((uint8_t*)dst);
+        ( sk_linear_to_srgb(c4x.r) <<  0
+        | sk_linear_to_srgb(c4x.g) <<  8
+        | sk_linear_to_srgb(c4x.b) << 16
+        | Sk4f_round(255.0f*c4x.a) << 24).store(dst);
 
         c4x.r += dc4x.r;
         c4x.g += dc4x.g;
diff --git a/src/opts/SkBlend_opts.h b/src/opts/SkBlend_opts.h
index 4f6d1e9..1da4c4f 100644
--- a/src/opts/SkBlend_opts.h
+++ b/src/opts/SkBlend_opts.h
@@ -21,19 +21,21 @@
 
 namespace SK_OPTS_NS {
 
-static inline void srcover_srgb8888_srgb_1(uint32_t* dst, const uint32_t pixel) {
-    if ((~pixel & 0xFF000000) == 0) {
-        *dst = pixel;
-    } else if ((pixel & 0xFF000000) != 0) {
-        srcover_blend_srgb8888_srgb_1(dst, srgb_to_linear(to_4f(pixel)));
+static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
+    if (src >= 0xFF000000) {
+        *dst = src;
+        return;
     }
+    auto d = Sk4f_fromS32(*dst),
+         s = Sk4f_fromS32( src);
+    *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
 }
 
 static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
-    srcover_srgb8888_srgb_1(dst++, *src++);
-    srcover_srgb8888_srgb_1(dst++, *src++);
-    srcover_srgb8888_srgb_1(dst++, *src++);
-    srcover_srgb8888_srgb_1(dst, *src);
+    srcover_srgb_srgb_1(dst++, *src++);
+    srcover_srgb_srgb_1(dst++, *src++);
+    srcover_srgb_srgb_1(dst++, *src++);
+    srcover_srgb_srgb_1(dst  , *src  );
 }
 
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
@@ -87,7 +89,7 @@
 
                 count = count & 3;
                 while (count-- > 0) {
-                    srcover_srgb8888_srgb_1(dst++, *src++);
+                    srcover_srgb_srgb_1(dst++, *src++);
                 }
             }
         }
@@ -159,7 +161,7 @@
 
                 count = count & 3;
                 while (count-- > 0) {
-                    srcover_srgb8888_srgb_1(dst++, *src++);
+                    srcover_srgb_srgb_1(dst++, *src++);
                 }
             }
         }
@@ -172,7 +174,7 @@
             int n = SkTMin(ndst, nsrc);
 
             for (int i = 0; i < n; i++) {
-                srcover_srgb8888_srgb_1(dst++, src[i]);
+                srcover_srgb_srgb_1(dst++, src[i]);
             }
             ndst -= n;
         }