Enable legacy premuls in SkColorSpaceXform

***Will allow for simplified Android framework code, they typically
   want a color correct transform followed by a gamma encoded premul.
***Chrome does the same, so this will make it easier to replace their
   codecs.
***Will decrease code size.  Both types of premuls are moved off the
   fast path here - one is essentially unused in production and the
   other is not "encouraged".
***Will actually make the common case faster: sRGB->sRGB means no
   color xform, just premul in SkSwizzler.

BUG=skia:

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD

Change-Id: Ia4ec1d273b6f137151f951d37c0ebf975f6b9a3e
Reviewed-on: https://skia-review.googlesource.com/8848
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Matt Sarett <msarett@google.com>
diff --git a/src/codec/SkCodec.cpp b/src/codec/SkCodec.cpp
index c31d533..e9eeb45 100644
--- a/src/codec/SkCodec.cpp
+++ b/src/codec/SkCodec.cpp
@@ -474,17 +474,7 @@
     }
 }
 
-bool SkCodec::initializeColorXform(const SkImageInfo& info) {
-    // TODO (msarett):
-    // Handle equality checking and legacy behavior for flagged SkColorSpaces.
-    // Until this is implemented, remove any flags on output color spaces.  This
-    // will prevent strange behaviors.  Ex: sRGB != sRGB + flag, but we don't want
-    // this to trigger a color xform.
-    SkImageInfo dstInfo = info;
-    if (dstInfo.colorSpace()) {
-        dstInfo = info.makeColorSpace(as_CSB(dstInfo.colorSpace())->makeWithoutFlags());
-    }
-
+bool SkCodec::initializeColorXform(const SkImageInfo& dstInfo) {
     fColorXform = nullptr;
     bool needsPremul = needs_premul(dstInfo, fEncodedInfo);
     if (needs_color_xform(dstInfo, fSrcInfo, needsPremul)) {
diff --git a/src/codec/SkCodecPriv.h b/src/codec/SkCodecPriv.h
index 110cbdc..026120f 100644
--- a/src/codec/SkCodecPriv.h
+++ b/src/codec/SkCodecPriv.h
@@ -302,16 +302,23 @@
 
 static inline bool needs_color_xform(const SkImageInfo& dstInfo, const SkImageInfo& srcInfo,
                                      bool needsPremul) {
+    // We never perform a color xform in legacy mode.
+    if (!dstInfo.colorSpace()) {
+        return false;
+    }
+
     // F16 is by definition a linear space, so we always must perform a color xform.
     bool isF16 = kRGBA_F16_SkColorType == dstInfo.colorType();
 
     // Need a color xform when dst space does not match the src.
-    bool srcDstNotEqual = !SkColorSpace::Equals(srcInfo.colorSpace(), dstInfo.colorSpace());
+    bool srcDstNotEqual =
+            !SkColorSpace_Base::EqualsIgnoreFlags(srcInfo.colorSpace(), dstInfo.colorSpace());
 
-    // We never perform a color xform in legacy mode.
-    bool isLegacy = nullptr == dstInfo.colorSpace();
+    // We provide the option for both legacy premuls and color correct premuls.
+    bool needsColorCorrectPremul =
+            needsPremul && !as_CSB(dstInfo.colorSpace())->nonLinearBlending();
 
-    return !isLegacy && (needsPremul || isF16 || srcDstNotEqual);
+    return needsColorCorrectPremul || isF16 || srcDstNotEqual;
 }
 
 static inline SkAlphaType select_xform_alpha(SkAlphaType dstAlphaType, SkAlphaType srcAlphaType) {
diff --git a/src/core/SkColorSpace.cpp b/src/core/SkColorSpace.cpp
index b24a8d67..399a161 100644
--- a/src/core/SkColorSpace.cpp
+++ b/src/core/SkColorSpace.cpp
@@ -554,6 +554,11 @@
     }
 }
 
+bool SkColorSpace_Base::EqualsIgnoreFlags(SkColorSpace* src, SkColorSpace* dst) {
+    return SkColorSpace::Equals(as_CSB(src)->makeWithoutFlags().get(),
+                                as_CSB(dst)->makeWithoutFlags().get());
+}
+
 bool SkColorSpace::Equals(const SkColorSpace* src, const SkColorSpace* dst) {
     if (src == dst) {
         return true;
diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index a66d52e..067868d 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -312,7 +312,7 @@
 
     ColorSpaceMatch csm = kNone_ColorSpaceMatch;
     SkMatrix44 srcToDst(SkMatrix44::kUninitialized_Constructor);
-    if (SkColorSpace::Equals(srcSpace, dstSpace)) {
+    if (SkColorSpace_Base::EqualsIgnoreFlags(srcSpace, dstSpace)) {
         srcToDst.setIdentity();
         csm = kFull_ColorSpaceMatch;
     } else {
@@ -505,26 +505,6 @@
     rgba = rgba + rTgTbT;
 }
 
-static AI void premultiply(Sk4f& dr, Sk4f& dg, Sk4f& db, const Sk4f& da, bool kClamp) {
-    if (kClamp) {
-        dr = Sk4f::Min(dr, 1.0f);
-        dg = Sk4f::Min(dg, 1.0f);
-        db = Sk4f::Min(db, 1.0f);
-    }
-
-    dr = da * dr;
-    dg = da * dg;
-    db = da * db;
-}
-
-static AI void premultiply_1(const Sk4f& a, Sk4f& rgba, bool kClamp) {
-    if (kClamp) {
-        rgba = Sk4f::Min(rgba, 1.0f);
-    }
-
-    rgba = a * rgba;
-}
-
 template <Order kOrder>
 static AI void store_srgb(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
                           const uint8_t* const[3]) {
@@ -765,8 +745,7 @@
                              const uint8_t* const dstTables[3]) {
     LoadFn load;
     Load1Fn load_1;
-    const bool kLoadAlpha = (kPremul_SkAlphaType == kAlphaType) ||
-                            (kF16_Linear_DstFormat == kDst && kOpaque_SkAlphaType != kAlphaType);
+    const bool kLoadAlpha = kF16_Linear_DstFormat == kDst && kOpaque_SkAlphaType != kAlphaType;
     switch (kSrc) {
         case kRGBA_8888_Linear_SrcFormat:
             if (kLoadAlpha) {
@@ -859,14 +838,6 @@
             break;
     }
 
-    // We always clamp before converting to 8888 outputs (because we have to).
-    // In these cases, we also need a clamp before the premul step to make sure
-    // R, G, B are not logically greater than A.
-    const bool kClamp = kRGBA_8888_Linear_DstFormat == kDst ||
-                        kRGBA_8888_SRGB_DstFormat == kDst ||
-                        kRGBA_8888_2Dot2_DstFormat == kDst ||
-                        kRGBA_8888_Table_DstFormat == kDst;
-
     const uint32_t* src = (const uint32_t*) vsrc;
     Sk4f rXgXbX, rYgYbY, rZgZbZ, rTgTbT;
     load_matrix(matrix, rXgXbX, rYgYbY, rZgZbZ, rTgTbT);
@@ -891,10 +862,6 @@
                 da = a;
             }
 
-            if (kPremul_SkAlphaType == kAlphaType) {
-                premultiply(dr, dg, db, da, kClamp);
-            }
-
             load(src, r, g, b, a, srcTables);
 
             store(dst, src - 4, dr, dg, db, da, dstTables);
@@ -913,10 +880,6 @@
             da = a;
         }
 
-        if (kPremul_SkAlphaType == kAlphaType) {
-            premultiply(dr, dg, db, da, kClamp);
-        }
-
         store(dst, src - 4, dr, dg, db, da, dstTables);
         dst = SkTAddOffset<void>(dst, 4 * sizeOfDstPixel);
     }
@@ -933,10 +896,6 @@
             rgba = Sk4f(r[0], g[0], b[0], a[0]);
         }
 
-        if (kPremul_SkAlphaType == kAlphaType) {
-            premultiply_1(a, rgba, kClamp);
-        }
-
         store_1(dst, src, rgba, a, dstTables);
 
         src += 1;
@@ -973,6 +932,7 @@
 SkColorSpaceXform_XYZ<kCSM>
 ::SkColorSpaceXform_XYZ(SkColorSpace_XYZ* srcSpace, const SkMatrix44& srcToDst,
                         SkColorSpace_XYZ* dstSpace)
+    : fLinearBlending(!dstSpace->nonLinearBlending())
 {
     fSrcToDst[ 0] = srcToDst.get(0, 0);
     fSrcToDst[ 1] = srcToDst.get(1, 0);
@@ -1033,10 +993,6 @@
             color_xform_RGBA<kSrc, kDst, kOpaque_SkAlphaType, kCSM>
                     (dst, src, len, srcTables, matrix, dstTables);
             return true;
-        case kPremul_SkAlphaType:
-            color_xform_RGBA<kSrc, kDst, kPremul_SkAlphaType, kCSM>
-                    (dst, src, len, srcTables, matrix, dstTables);
-            return true;
         case kUnpremul_SkAlphaType:
             color_xform_RGBA<kSrc, kDst, kUnpremul_SkAlphaType, kCSM>
                     (dst, src, len, srcTables, matrix, dstTables);
@@ -1106,7 +1062,8 @@
 
     if (kRGBA_F32_ColorFormat == dstColorFormat ||
         kRGBA_U16_BE_ColorFormat == srcColorFormat ||
-        kRGB_U16_BE_ColorFormat == srcColorFormat)
+        kRGB_U16_BE_ColorFormat == srcColorFormat ||
+        kPremul_SkAlphaType == alphaType)
     {
         return this->applyPipeline(dstColorFormat, dst, srcColorFormat, src, len, alphaType);
     }
@@ -1258,11 +1215,11 @@
         }
     }
 
-    if (kPremul_SkAlphaType == alphaType) {
+    if (kPremul_SkAlphaType == alphaType && fLinearBlending) {
         pipeline.append(SkRasterPipeline::premul);
     }
 
-    StoreTablesContext storeTables;
+    TablesContext tables;
     switch (fDstGamma) {
         case kSRGB_DstGamma:
             pipeline.append(SkRasterPipeline::to_srgb);
@@ -1270,45 +1227,36 @@
         case k2Dot2_DstGamma:
             pipeline.append(SkRasterPipeline::to_2dot2);
             break;
+        case kTable_DstGamma:
+            tables.fR = fDstGammaTables[0];
+            tables.fG = fDstGammaTables[1];
+            tables.fB = fDstGammaTables[2];
+            tables.fCount = SkColorSpaceXform_Base::kDstGammaTableSize;
+            pipeline.append(SkRasterPipeline::byte_tables_rgb, &tables);
         default:
             break;
     }
 
+    if (kPremul_SkAlphaType == alphaType && !fLinearBlending) {
+        pipeline.append(SkRasterPipeline::premul);
+    }
+
     switch (dstColorFormat) {
         case kRGBA_8888_ColorFormat:
-            if (kTable_DstGamma == fDstGamma) {
-                storeTables.fDst = (uint32_t*) dst;
-                storeTables.fR = fDstGammaTables[0];
-                storeTables.fG = fDstGammaTables[1];
-                storeTables.fB = fDstGammaTables[2];
-                storeTables.fCount = SkColorSpaceXform_Base::kDstGammaTableSize;
-                pipeline.append(SkRasterPipeline::store_tables, &storeTables);
-            } else {
-                pipeline.append(SkRasterPipeline::store_8888, &dst);
-            }
+             pipeline.append(SkRasterPipeline::store_8888, &dst);
             break;
         case kBGRA_8888_ColorFormat:
-            if (kTable_DstGamma == fDstGamma) {
-                storeTables.fDst = (uint32_t*) dst;
-                storeTables.fR = fDstGammaTables[2];
-                storeTables.fG = fDstGammaTables[1];
-                storeTables.fB = fDstGammaTables[0];
-                storeTables.fCount = SkColorSpaceXform_Base::kDstGammaTableSize;
-                pipeline.append(SkRasterPipeline::swap_rb);
-                pipeline.append(SkRasterPipeline::store_tables, &storeTables);
-            } else {
-                pipeline.append(SkRasterPipeline::swap_rb);
-                pipeline.append(SkRasterPipeline::store_8888, &dst);
-            }
+            pipeline.append(SkRasterPipeline::swap_rb);
+            pipeline.append(SkRasterPipeline::store_8888, &dst);
             break;
         case kRGBA_F16_ColorFormat:
-            if (kLinear_DstGamma != fDstGamma) {
+            if (kLinear_DstGamma != fDstGamma || !fLinearBlending) {
                 return false;
             }
             pipeline.append(SkRasterPipeline::store_f16, &dst);
             break;
         case kRGBA_F32_ColorFormat:
-            if (kLinear_DstGamma != fDstGamma) {
+            if (kLinear_DstGamma != fDstGamma || !fLinearBlending) {
                 return false;
             }
             pipeline.append(SkRasterPipeline::store_f32, &dst);
diff --git a/src/core/SkColorSpaceXform_Base.h b/src/core/SkColorSpaceXform_Base.h
index 48035c5..cb98c68 100644
--- a/src/core/SkColorSpaceXform_Base.h
+++ b/src/core/SkColorSpaceXform_Base.h
@@ -74,6 +74,7 @@
 
     SrcGamma             fSrcGamma;
     DstGamma             fDstGamma;
+    bool                 fLinearBlending;
 
     friend class SkColorSpaceXform;
     friend std::unique_ptr<SkColorSpaceXform> SlowIdentityXform(SkColorSpace_XYZ* space);
@@ -86,8 +87,8 @@
     const float* fB;
 };
 
-struct StoreTablesContext {
-    uint32_t*      fDst;
+// Must be kept in sync with "Tables" struct in RasterPipeline_opts byte_tables_rgb.
+struct TablesContext {
     const uint8_t* fR;
     const uint8_t* fG;
     const uint8_t* fB;
diff --git a/src/core/SkColorSpace_Base.h b/src/core/SkColorSpace_Base.h
index 4cc4207..0060bb3 100644
--- a/src/core/SkColorSpace_Base.h
+++ b/src/core/SkColorSpace_Base.h
@@ -207,6 +207,8 @@
 
     static sk_sp<SkColorSpace> MakeNamed(Named);
 
+    static bool EqualsIgnoreFlags(SkColorSpace* src, SkColorSpace* dst);
+
 protected:
     SkColorSpace_Base(sk_sp<SkData> profileData, uint32_t flags);
 
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 6976906..0d2bfa4 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -72,7 +72,7 @@
     M(load_8888) M(store_8888)                                   \
     M(load_u16_be) M(load_rgb_u16_be) M(store_u16_be)            \
     M(load_tables_u16_be) M(load_tables_rgb_u16_be)              \
-    M(load_tables) M(store_tables)                               \
+    M(load_tables)                                               \
     M(scale_u8) M(scale_1_float)                                 \
     M(lerp_u8) M(lerp_565) M(lerp_1_float)                       \
     M(dstatop) M(dstin) M(dstout) M(dstover)                     \
@@ -96,7 +96,7 @@
     M(bicubic_n3y) M(bicubic_n1y) M(bicubic_p1y) M(bicubic_p3y)  \
     M(save_xy) M(accumulate)                                     \
     M(linear_gradient_2stops)                                    \
-    M(byte_tables)                                               \
+    M(byte_tables) M(byte_tables_rgb)                            \
     M(shader_adapter)                                            \
     M(rgb_to_hsl)                                                \
     M(hsl_to_rgb)
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 0a7441b..07f1e5a 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -704,20 +704,6 @@
     a = 1.0f;
 }
 
-STAGE_CTX(store_tables, const StoreTablesContext*) {
-    auto ptr = ctx->fDst + x;
-
-    float scale = ctx->fCount - 1;
-    SkNi ri = SkNf_round(scale, r);
-    SkNi gi = SkNf_round(scale, g);
-    SkNi bi = SkNf_round(scale, b);
-
-    store(tail, ( SkNx_cast<int>(gather(tail, ctx->fR, ri)) << 0
-                | SkNx_cast<int>(gather(tail, ctx->fG, gi)) << 8
-                | SkNx_cast<int>(gather(tail, ctx->fB, bi)) << 16
-                | SkNf_round(255.0f, a)                     << 24), (int*)ptr);
-}
-
 SI SkNf inv(const SkNf& x) { return 1.0f - x; }
 
 RGBA_XFERMODE(clear)    { return 0.0f; }
@@ -1145,6 +1131,16 @@
     a = SkNf_from_byte(gather(tail, tables->a, SkNf_round(255.0f, a)));
 }
 
+STAGE_CTX(byte_tables_rgb, const void*) {
+    struct Tables { const uint8_t *r, *g, *b; int n; };
+    auto tables = (const Tables*)ctx;
+
+    float scale = tables->n - 1;
+    r = SkNf_from_byte(gather(tail, tables->r, SkNf_round(scale, r)));
+    g = SkNf_from_byte(gather(tail, tables->g, SkNf_round(scale, g)));
+    b = SkNf_from_byte(gather(tail, tables->b, SkNf_round(scale, b)));
+}
+
 STAGE_CTX(shader_adapter, SkShader::Context*) {
     SkPM4f buf[N];
     static_assert(sizeof(buf) == sizeof(r) + sizeof(g) + sizeof(b) + sizeof(a), "");