Optimize color xforms with 2.2 gammas for SSE2

Because we recognize commonly used gamma tables and
parameters as 2.2f, about 98% of jpegs with color profiles
will pass through this xform (assuming the dst is also
2.2f).  Sample size is 10,322 jpegs.

I won't go crazy with performance numbers because this is
a work in progress, particularly in terms of correctness.

201295.jpg on HP z620
(300x280, most common form of sRGB profile)

Decode Time + QCMS Xform      1.28 ms
QCMS Xform Only               0.495 ms
Decode Time + Skia Opt Xform  1.01 ms
Skia Opt Xform Only           0.235 ms

Decode Time + Xform Speed-up  1.27x
Xform Only Speed-up           2.11x

FWIW, Skia xform time before these optimizations was
41.1 ms.  But we expected that code to be slow.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2046013002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2046013002
diff --git a/bench/ColorCodecBench.cpp b/bench/ColorCodecBench.cpp
index 85785d8..bfbea62 100644
--- a/bench/ColorCodecBench.cpp
+++ b/bench/ColorCodecBench.cpp
@@ -172,6 +172,10 @@
     if (FLAGS_qcms) {
         fDstSpaceQCMS.reset(qcms_profile_from_memory(dstData->data(), dstData->size()));
         SkASSERT(fDstSpaceQCMS);
+
+        // This call takes a non-trivial amount of time, but I think it's the most fair to
+        // treat it as overhead.  It only needs to happen once.
+        qcms_profile_precache_output_transform(fDstSpaceQCMS);
     } else
 #endif
     {
diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index 216e993..0faff88 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -8,6 +8,7 @@
 #include "SkColorPriv.h"
 #include "SkColorSpace_Base.h"
 #include "SkColorSpaceXform.h"
+#include "SkOpts.h"
 
 static inline bool compute_gamut_xform(SkMatrix44* srcToDst, const SkMatrix44& srcToXYZ,
                                        const SkMatrix44& dstToXYZ) {
@@ -36,18 +37,10 @@
         return nullptr;
     }
 
-    if (as_CSB(srcSpace)->gammas()->isValues() && as_CSB(dstSpace)->gammas()->isValues()) {
-        float srcGammas[3];
-        float dstGammas[3];
-        srcGammas[0] = as_CSB(srcSpace)->gammas()->fRed.fValue;
-        srcGammas[1] = as_CSB(srcSpace)->gammas()->fGreen.fValue;
-        srcGammas[2] = as_CSB(srcSpace)->gammas()->fBlue.fValue;
-        dstGammas[0] = 1.0f / as_CSB(dstSpace)->gammas()->fRed.fValue;
-        dstGammas[1] = 1.0f / as_CSB(dstSpace)->gammas()->fGreen.fValue;
-        dstGammas[2] = 1.0f / as_CSB(dstSpace)->gammas()->fBlue.fValue;
-
-        return std::unique_ptr<SkColorSpaceXform>(
-                new SkGammaByValueXform(srcGammas, srcToDst, dstGammas));
+    if (SkColorSpace::k2Dot2Curve_GammaNamed == srcSpace->gammaNamed() &&
+        SkColorSpace::k2Dot2Curve_GammaNamed == dstSpace->gammaNamed())
+    {
+        return std::unique_ptr<SkColorSpaceXform>(new Sk2Dot2Xform(srcToDst));
     }
 
     return std::unique_ptr<SkColorSpaceXform>(
@@ -56,72 +49,53 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+Sk2Dot2Xform::Sk2Dot2Xform(const SkMatrix44& srcToDst)
+{
+    // Flatten srcToDst row by row into a 4x4 float array, forcing the last column to zero:
+    //   rX gX bX 0
+    //   rY gY bY 0
+    //   rZ gZ bZ 0
+    //   rQ gQ bQ 0
+    fSrcToDst[0] = srcToDst.getFloat(0, 0);
+    fSrcToDst[1] = srcToDst.getFloat(0, 1);
+    fSrcToDst[2] = srcToDst.getFloat(0, 2);
+    fSrcToDst[3] = 0.0f;
+    fSrcToDst[4] = srcToDst.getFloat(1, 0);
+    fSrcToDst[5] = srcToDst.getFloat(1, 1);
+    fSrcToDst[6] = srcToDst.getFloat(1, 2);
+    fSrcToDst[7] = 0.0f;
+    fSrcToDst[8] = srcToDst.getFloat(2, 0);
+    fSrcToDst[9] = srcToDst.getFloat(2, 1);
+    fSrcToDst[10] = srcToDst.getFloat(2, 2);
+    fSrcToDst[11] = 0.0f;
+    fSrcToDst[12] = srcToDst.getFloat(3, 0);  // Row 3 holds the terms the xform adds.
+    fSrcToDst[13] = srcToDst.getFloat(3, 1);
+    fSrcToDst[14] = srcToDst.getFloat(3, 2);
+    fSrcToDst[15] = 0.0f;
+}
+
+void Sk2Dot2Xform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const {
+    SkOpts::color_xform_2Dot2_RGBA_to_8888(dst, src, len, fSrcToDst);  // len narrows to int
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 static inline float byte_to_float(uint8_t v) {
     return ((float) v) * (1.0f / 255.0f);
 }
 
-static inline uint8_t clamp_float_to_byte(float v) {
+// Expand range from 0-1 to 0-255, then round to the nearest byte with clamping.
+static inline uint8_t clamp_normalized_float_to_byte(float v) {
     v = v * 255.0f;
-    if (v > 255.0f) {
+    if (v >= 254.5f) {  // Anything that would round up to 255 or beyond.
         return 255;
-    } else if (v <= 0.0f) {
+    } else if (v < 0.5f) {  // Anything that would round down to 0 or below.
         return 0;
     } else {
         return (uint8_t) (v + 0.5f);
     }
 }
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-SkGammaByValueXform::SkGammaByValueXform(float srcGammas[3], const SkMatrix44& srcToDst,
-                                         float dstGammas[3])
-    : fSrcToDst(srcToDst)
-{
-    memcpy(fSrcGammas, srcGammas, 3 * sizeof(float));
-    memcpy(fDstGammas, dstGammas, 3 * sizeof(float));
-}
-
-void SkGammaByValueXform::xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const {
-    while (len-- > 0) {
-        float srcFloats[3];
-        srcFloats[0] = byte_to_float((*src >>  0) & 0xFF);
-        srcFloats[1] = byte_to_float((*src >>  8) & 0xFF);
-        srcFloats[2] = byte_to_float((*src >> 16) & 0xFF);
-
-        // Convert to linear.
-        srcFloats[0] = pow(srcFloats[0], fSrcGammas[0]);
-        srcFloats[1] = pow(srcFloats[1], fSrcGammas[1]);
-        srcFloats[2] = pow(srcFloats[2], fSrcGammas[2]);
-
-        // Convert to dst gamut.
-        float dstFloats[3];
-        dstFloats[0] = srcFloats[0] * fSrcToDst.getFloat(0, 0) +
-                       srcFloats[1] * fSrcToDst.getFloat(1, 0) +
-                       srcFloats[2] * fSrcToDst.getFloat(2, 0) + fSrcToDst.getFloat(3, 0);
-        dstFloats[1] = srcFloats[0] * fSrcToDst.getFloat(0, 1) +
-                       srcFloats[1] * fSrcToDst.getFloat(1, 1) +
-                       srcFloats[2] * fSrcToDst.getFloat(2, 1) + fSrcToDst.getFloat(3, 1);
-        dstFloats[2] = srcFloats[0] * fSrcToDst.getFloat(0, 2) +
-                       srcFloats[1] * fSrcToDst.getFloat(1, 2) +
-                       srcFloats[2] * fSrcToDst.getFloat(2, 2) + fSrcToDst.getFloat(3, 2);
-
-        // Convert to dst gamma.
-        dstFloats[0] = pow(dstFloats[0], fDstGammas[0]);
-        dstFloats[1] = pow(dstFloats[1], fDstGammas[1]);
-        dstFloats[2] = pow(dstFloats[2], fDstGammas[2]);
-
-        *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
-                                   clamp_float_to_byte(dstFloats[0]),
-                                   clamp_float_to_byte(dstFloats[1]),
-                                   clamp_float_to_byte(dstFloats[2]));
-
-        dst++;
-        src++;
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
 // Interpolating lookup in a variably sized table.
 static inline float interp_lut(uint8_t byte, float* table, size_t tableSize) {
     float index = byte_to_float(byte) * (tableSize - 1);
@@ -261,9 +235,9 @@
         }
 
         *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
-                                   clamp_float_to_byte(dstFloats[0]),
-                                   clamp_float_to_byte(dstFloats[1]),
-                                   clamp_float_to_byte(dstFloats[2]));
+                                   clamp_normalized_float_to_byte(dstFloats[0]),
+                                   clamp_normalized_float_to_byte(dstFloats[1]),
+                                   clamp_normalized_float_to_byte(dstFloats[2]));
 
         dst++;
         src++;
diff --git a/src/core/SkColorSpaceXform.h b/src/core/SkColorSpaceXform.h
index d54d1b8..3472643 100644
--- a/src/core/SkColorSpaceXform.h
+++ b/src/core/SkColorSpaceXform.h
@@ -34,17 +34,15 @@
     virtual ~SkColorSpaceXform() {}
 };
 
-class SkGammaByValueXform : public SkColorSpaceXform {
+class Sk2Dot2Xform : public SkColorSpaceXform {  // Used when src and dst gammas are both 2.2.
 public:
 
     void xform_RGBA_8888(uint32_t* dst, const uint32_t* src, uint32_t len) const override;
 
 private:
-    SkGammaByValueXform(float srcGammas[3], const SkMatrix44& srcToDst, float dstGammas[3]);
+    Sk2Dot2Xform(const SkMatrix44& srcToDst);
 
-    float            fSrcGammas[3];
-    const SkMatrix44 fSrcToDst;
-    float            fDstGammas[3];
+    float fSrcToDst[16];  // Row major 4x4 copy of srcToDst with the last column zeroed.
 
     friend class SkColorSpaceXform;
 };
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 6d3e6c9..8dec3fa 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -16,6 +16,7 @@
 #include "SkBlitRow_opts.h"
 #include "SkBlurImageFilter_opts.h"
 #include "SkColorCubeFilter_opts.h"
+#include "SkColorXform_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkSwizzler_opts.h"
 #include "SkTextureCompressor_opts.h"
@@ -76,6 +77,9 @@
 
     decltype(srcover_srgb_srgb) srcover_srgb_srgb = sk_default::srcover_srgb_srgb;
 
+    decltype(color_xform_2Dot2_RGBA_to_8888) color_xform_2Dot2_RGBA_to_8888 =
+            sk_default::color_xform_2Dot2_RGBA_to_8888;
+
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_ssse3();
     void Init_sse41();
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index b8aea4a..0711471 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -68,6 +68,11 @@
     // Blend ndst src pixels over dst, where both src and dst point to sRGB pixels (RGBA or BGRA).
     // If nsrc < ndst, we loop over src to create a pattern.
     extern void (*srcover_srgb_srgb)(uint32_t* dst, const uint32_t* src, int ndst, int nsrc);
+
+    // Color xform len RGBA pixels into SkPMColor ordered 8888 pixels, copying alpha through
+    // without premultiplying.  Assumes src and dst gamma curves are both 2.2f exponentials.
+    extern void (*color_xform_2Dot2_RGBA_to_8888)(uint32_t* dst, const uint32_t* src, int len,
+                                                  const float srcToDstMatrix[16]);
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h
new file mode 100644
index 0000000..3fc620f
--- /dev/null
+++ b/src/opts/SkColorXform_opts.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColorXform_opts_DEFINED
+#define SkColorXform_opts_DEFINED
+
+#include "SkColorPriv.h"
+
+namespace SK_OPTS_NS {
+
+// Rounds v (in 0-255 units) to the nearest byte, clamping.  NaN maps to 0, not a UB cast.
+static uint8_t clamp_float_to_byte(float v) {
+    if (v >= 254.5f) {
+        return 255;
+    } else if (!(v >= 0.5f)) {
+        return 0;
+    }
+    return (uint8_t) (v + 0.5f);
+}
+
+// Portable color xform between two 2.2 gamma spaces.  For now the 2.2 exponent is
+// approximated by 2 (square, then sqrtf).  matrix is the flattened 4x4 srcToDst whose
+// entries 12-14 are the translation.  Alpha is copied through without premultiplying.
+static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len,
+                                                    const float matrix[16]) {
+    while (len-- > 0) {
+        float srcFloats[3];
+        srcFloats[0] = (float) ((*src >>  0) & 0xFF);
+        srcFloats[1] = (float) ((*src >>  8) & 0xFF);
+        srcFloats[2] = (float) ((*src >> 16) & 0xFF);
+
+        // Convert to linear.  Inputs stay in 0-255 units, so linear values span 0-65025.
+        // TODO (msarett): We should use X^2.2 here instead of X^2.  What is the impact on
+        // correctness?  We should be able to get closer to 2.2 at a small performance cost.
+        srcFloats[0] = srcFloats[0] * srcFloats[0];
+        srcFloats[1] = srcFloats[1] * srcFloats[1];
+        srcFloats[2] = srcFloats[2] * srcFloats[2];
+
+        // Convert to dst gamut.
+        float dstFloats[3];
+        // The translation terms apply to 0-1 linear values, so scale them by 255^2 (65025).
+        // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.  Should
+        // we have another optimized path that avoids the extra work when they are zero?
+        dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
+                       srcFloats[2] * matrix[8] + 65025.0f * matrix[12];
+        dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
+                       srcFloats[2] * matrix[9] + 65025.0f * matrix[13];
+        dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
+                       srcFloats[2] * matrix[10] + 65025.0f * matrix[14];
+
+        // Convert to dst gamma, clamping negative (out of gamut) values to zero since sqrtf()
+        // of a negative is NaN.  TODO (msarett): We should use X^(1/2.2) here instead of
+        // X^(1/2).  What is the impact on correctness?  We should be able to get closer to
+        // (1/2.2) at a small performance cost.
+        dstFloats[0] = dstFloats[0] > 0.0f ? sqrtf(dstFloats[0]) : 0.0f;
+        dstFloats[1] = dstFloats[1] > 0.0f ? sqrtf(dstFloats[1]) : 0.0f;
+        dstFloats[2] = dstFloats[2] > 0.0f ? sqrtf(dstFloats[2]) : 0.0f;
+
+        *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
+                                   clamp_float_to_byte(dstFloats[0]),
+                                   clamp_float_to_byte(dstFloats[1]),
+                                   clamp_float_to_byte(dstFloats[2]));
+
+        dst++;
+        src++;
+    }
+}
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+// SSE2 version of the xform above: handles four pixels per iteration and hands any
+// remaining pixels to the portable routine.
+static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,
+                                           const float matrix[16]) {
+    // Load the matrix.  The translation row is scaled by 255^2 to match the squared inputs.
+    __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
+    __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
+    __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
+    __m128 rQgQbQ = _mm_mul_ps(_mm_loadu_ps(&matrix[12]), _mm_set1_ps(65025.0f));
+
+    while (len >= 4) {
+        // Load 4 pixels and convert them to floats.
+        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
+        __m128i byteMask = _mm_set1_epi32(0xFF);
+        __m128 reds   = _mm_cvtepi32_ps(_mm_and_si128(               rgba,      byteMask));
+        __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba,  8), byteMask));
+        __m128 blues  = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));
+
+        // Convert to linear, approximating X^2.2 with X^2.
+        // FIXME (msarett): Should we be more accurate?
+        reds   = _mm_mul_ps(reds, reds);
+        greens = _mm_mul_ps(greens, greens);
+        blues  = _mm_mul_ps(blues, blues);
+
+        // Apply the transformation matrix to dst gamut.
+        // FIXME (msarett): rQ, gQ, and bQ are almost always zero.  Can we save instructions?
+
+        // Splat rX, rY, rZ, and rQ each across a register.
+        __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);
+        __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);
+        __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);
+        __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00);
+
+        // dstReds = rX * reds + rY * greens + rZ * blues + rQ
+        __m128 dstReds =                     _mm_mul_ps(reds,   rX);
+               dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
+               dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues,  rZ));
+               dstReds = _mm_add_ps(dstReds,                    rQ);
+
+        // Splat gX, gY, gZ, and gQ each across a register.
+        __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
+        __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
+        __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
+        __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);
+
+        // dstGreens = gX * reds + gY * greens + gZ * blues + gQ
+        __m128 dstGreens =                       _mm_mul_ps(reds,   gX);
+               dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
+               dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues,  gZ));
+               dstGreens = _mm_add_ps(dstGreens,                    gQ);
+
+        // Splat bX, bY, bZ, and bQ each across a register.
+        __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
+        __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
+        __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
+        __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);
+
+        // dstBlues = bX * reds + bY * greens + bZ * blues + bQ
+        __m128 dstBlues =                      _mm_mul_ps(reds,   bX);
+               dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
+               dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues,  bZ));
+               dstBlues = _mm_add_ps(dstBlues,                    bQ);
+
+        // Convert to dst gamma.  Clamp negative (out of gamut) values to zero first, since rsqrt
+        // of a negative is NaN.  The reciprocal of the reciprocal sqrt is then a fast sqrt.
+        // FIXME (msarett): Should we be more accurate?
+        dstReds   = _mm_rcp_ps(_mm_rsqrt_ps(_mm_max_ps(dstReds,   _mm_setzero_ps())));
+        dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(_mm_max_ps(dstGreens, _mm_setzero_ps())));
+        dstBlues  = _mm_rcp_ps(_mm_rsqrt_ps(_mm_max_ps(dstBlues,  _mm_setzero_ps())));
+
+        // Clamp floats to 0-255 range.
+        dstReds   = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds,   _mm_set1_ps(255.0f)));
+        dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ps(255.0f)));
+        dstBlues  = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues,  _mm_set1_ps(255.0f)));
+
+        // Convert to bytes and store to memory.
+        rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba);
+#ifdef SK_PMCOLOR_IS_RGBA
+        rgba = _mm_or_si128(rgba,                _mm_cvtps_epi32(dstReds)       );
+        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8));
+        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues),  16));
+#else
+        rgba = _mm_or_si128(rgba,                _mm_cvtps_epi32(dstBlues)      );
+        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8));
+        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds),   16));
+#endif
+        _mm_storeu_si128((__m128i*) dst, rgba);
+
+        dst += 4;
+        src += 4;
+        len -= 4;
+    }
+
+    color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);
+}
+
+#else
+// Without SSE2, use the portable routine for every pixel.
+static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,
+                                           const float matrix[16]) {
+    color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);
+}
+
+#endif
+
+}
+
+#endif // SkColorXform_opts_DEFINED