Initial implementation of a SkColorSpace_A2B xform

There is support for all features of SkColorSpace_A2B.

Tests for this functionality were adapted from
the XYZ xform tests, and a CLUT-specific test was added.

Shared functions used by both SkColorSpaceXform_XYZ and SkColorSpaceXform_A2B
have been moved into a shared header.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2449243003
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2449243003
diff --git a/gm/labpcsdemo.cpp b/gm/labpcsdemo.cpp
index d2a4ba9..26e48a8 100644
--- a/gm/labpcsdemo.cpp
+++ b/gm/labpcsdemo.cpp
@@ -21,99 +21,6 @@
 #include "SkSurface.h"
 #include "SkTypes.h"
 
-static inline void interp_3d_clut(float dst[3], float src[3], const SkColorLookUpTable* colorLUT) {
-    // Call the src components x, y, and z.
-    uint8_t maxX = colorLUT->fGridPoints[0] - 1;
-    uint8_t maxY = colorLUT->fGridPoints[1] - 1;
-    uint8_t maxZ = colorLUT->fGridPoints[2] - 1;
-
-    // An approximate index into each of the three dimensions of the table.
-    float x = src[0] * maxX;
-    float y = src[1] * maxY;
-    float z = src[2] * maxZ;
-
-    // This gives us the low index for our interpolation.
-    int ix = sk_float_floor2int(x);
-    int iy = sk_float_floor2int(y);
-    int iz = sk_float_floor2int(z);
-
-    // Make sure the low index is not also the max index.
-    ix = (maxX == ix) ? ix - 1 : ix;
-    iy = (maxY == iy) ? iy - 1 : iy;
-    iz = (maxZ == iz) ? iz - 1 : iz;
-
-    // Weighting factors for the interpolation.
-    float diffX = x - ix;
-    float diffY = y - iy;
-    float diffZ = z - iz;
-
-    // Constants to help us navigate the 3D table.
-    // Ex: Assume x = a, y = b, z = c.
-    //     table[a * n001 + b * n010 + c * n100] logically equals table[a][b][c].
-    const int n000 = 0;
-    const int n001 = 3 * colorLUT->fGridPoints[1] * colorLUT->fGridPoints[2];
-    const int n010 = 3 * colorLUT->fGridPoints[2];
-    const int n011 = n001 + n010;
-    const int n100 = 3;
-    const int n101 = n100 + n001;
-    const int n110 = n100 + n010;
-    const int n111 = n110 + n001;
-
-    // Base ptr into the table.
-    const float* ptr = &(colorLUT->table()[ix*n001 + iy*n010 + iz*n100]);
-
-    // The code below performs a tetrahedral interpolation for each of the three
-    // dst components.  Once the tetrahedron containing the interpolation point is
-    // identified, the interpolation is a weighted sum of grid values at the
-    // vertices of the tetrahedron.  The claim is that tetrahedral interpolation
-    // provides a more accurate color conversion.
-    // blogs.mathworks.com/steve/2006/11/24/tetrahedral-interpolation-for-colorspace-conversion/
-    //
-    // I have one test image, and visually I can't tell the difference between
-    // tetrahedral and trilinear interpolation.  In terms of computation, the
-    // tetrahedral code requires more branches but less computation.  The
-    // SampleICC library provides an option for the client to choose either
-    // tetrahedral or trilinear.
-    for (int i = 0; i < 3; i++) {
-        if (diffZ < diffY) {
-            if (diffZ < diffX) {
-                dst[i] = (ptr[n000] + diffZ * (ptr[n110] - ptr[n010]) +
-                                      diffY * (ptr[n010] - ptr[n000]) +
-                                      diffX * (ptr[n111] - ptr[n110]));
-            } else if (diffY < diffX) {
-                dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +
-                                      diffY * (ptr[n011] - ptr[n001]) +
-                                      diffX * (ptr[n001] - ptr[n000]));
-            } else {
-                dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +
-                                      diffY * (ptr[n010] - ptr[n000]) +
-                                      diffX * (ptr[n011] - ptr[n010]));
-            }
-        } else {
-            if (diffZ < diffX) {
-                dst[i] = (ptr[n000] + diffZ * (ptr[n101] - ptr[n001]) +
-                                      diffY * (ptr[n111] - ptr[n101]) +
-                                      diffX * (ptr[n001] - ptr[n000]));
-            } else if (diffY < diffX) {
-                dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +
-                                      diffY * (ptr[n111] - ptr[n101]) +
-                                      diffX * (ptr[n101] - ptr[n100]));
-            } else {
-                dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +
-                                      diffY * (ptr[n110] - ptr[n100]) +
-                                      diffX * (ptr[n111] - ptr[n110]));
-            }
-        }
-
-        // Increment the table ptr in order to handle the next component.
-        // Note that this is the how table is designed: all of nXXX
-        // variables are multiples of 3 because there are 3 output
-        // components.
-        ptr++;
-    }
-}
-
-
 /**
  *  This tests decoding from a Lab source image and displays on the left
  *  the image as raw RGB values, and on the right a Lab PCS.
@@ -152,7 +59,7 @@
             return;
         }
         std::unique_ptr<SkCodec> codec(SkCodec::NewFromStream(stream));
-        
+
 
         // srgb_lab_pcs.icc is an elaborate way to specify sRGB but uses
         // Lab as the PCS, so we can take any arbitrary image that should
@@ -179,7 +86,7 @@
             bool printConversions = false;
             // We're skipping evaluating the TRCs and the matrix here since they aren't
             // in the ICC profile initially used here.
-            for (size_t e = 0; e < cs.count(); ++e) {
+            for (int e = 0; e < cs.count(); ++e) {
                 switch (cs.element(e).type()) {
                     case SkColorSpace_A2B::Element::Type::kGammaNamed:
                         SkASSERT(kLinear_SkGammaNamed == cs.element(e).gammaNamed());
@@ -207,9 +114,9 @@
                     }
 
                     float lab[4] = { r * (1.f/255.f), g * (1.f/255.f), b * (1.f/255.f), 1.f };
-                    
-                    interp_3d_clut(lab, lab, colorLUT);
-                    
+
+                    colorLUT->interp3D(lab, lab);
+
                     // Lab has ranges [0,100] for L and [-128,127] for a and b
                     // but the ICC profile loader stores as [0,1]. The ICC
                     // specifies an offset of -128 to convert.
diff --git a/gn/core.gni b/gn/core.gni
index 80514c4..60c7761 100644
--- a/gn/core.gni
+++ b/gn/core.gni
@@ -69,6 +69,8 @@
   "$_src/core/SkColor.cpp",
   "$_src/core/SkColorFilter.cpp",
   "$_src/core/SkColorFilterShader.cpp",
+  "$_src/core/SkColorLookUpTable.cpp",
+  "$_src/core/SkColorLookUpTable.h",
   "$_src/core/SkColorMatrixFilterRowMajor255.cpp",
   "$_src/core/SkColorMatrixFilterRowMajor255.h",
   "$_src/core/SkColorShader.cpp",
@@ -80,6 +82,8 @@
   "$_src/core/SkColorSpace_XYZ.h",
   "$_src/core/SkColorSpace_ICC.cpp",
   "$_src/core/SkColorSpaceXform.cpp",
+  "$_src/core/SkColorSpaceXform_A2B.cpp",
+  "$_src/core/SkColorSpaceXform_A2B.h",
   "$_src/core/SkColorTable.cpp",
   "$_src/core/SkComposeShader.cpp",
   "$_src/core/SkConfig8888.cpp",
diff --git a/src/core/SkColorLookUpTable.cpp b/src/core/SkColorLookUpTable.cpp
new file mode 100644
index 0000000..73f3e88
--- /dev/null
+++ b/src/core/SkColorLookUpTable.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkColorLookUpTable.h"
+#include "SkFloatingPoint.h"
+
+void SkColorLookUpTable::interp3D(float dst[3], float src[3]) const {
+    // Call the src components x, y, and z.
+    const uint8_t maxX = fGridPoints[0] - 1;
+    const uint8_t maxY = fGridPoints[1] - 1;
+    const uint8_t maxZ = fGridPoints[2] - 1;
+
+    // An approximate index into each of the three dimensions of the table.
+    const float x = src[0] * maxX;
+    const float y = src[1] * maxY;
+    const float z = src[2] * maxZ;
+
+    // This gives us the low index for our interpolation.
+    int ix = sk_float_floor2int(x);
+    int iy = sk_float_floor2int(y);
+    int iz = sk_float_floor2int(z);
+
+    // Make sure the low index is not also the max index.
+    ix = (maxX == ix) ? ix - 1 : ix;
+    iy = (maxY == iy) ? iy - 1 : iy;
+    iz = (maxZ == iz) ? iz - 1 : iz;
+
+    // Weighting factors for the interpolation.
+    const float diffX = x - ix;
+    const float diffY = y - iy;
+    const float diffZ = z - iz;
+
+    // Constants to help us navigate the 3D table.
+    // Ex: Assume x = a, y = b, z = c.
+    //     table[a * n001 + b * n010 + c * n100] logically equals table[a][b][c].
+    const int n000 = 0;
+    const int n001 = 3 * fGridPoints[1] * fGridPoints[2];
+    const int n010 = 3 * fGridPoints[2];
+    const int n011 = n001 + n010;
+    const int n100 = 3;
+    const int n101 = n100 + n001;
+    const int n110 = n100 + n010;
+    const int n111 = n110 + n001;
+
+    // Base ptr into the table.
+    const float* ptr = &(table()[ix*n001 + iy*n010 + iz*n100]);
+
+    // The code below performs a tetrahedral interpolation for each of the three
+    // dst components.  Once the tetrahedron containing the interpolation point is
+    // identified, the interpolation is a weighted sum of grid values at the
+    // vertices of the tetrahedron.  The claim is that tetrahedral interpolation
+    // provides a more accurate color conversion.
+    // blogs.mathworks.com/steve/2006/11/24/tetrahedral-interpolation-for-colorspace-conversion/
+    //
+    // I have one test image, and visually I can't tell the difference between
+    // tetrahedral and trilinear interpolation.  In terms of computation, the
+    // tetrahedral code requires more branches but less computation.  The
+    // SampleICC library provides an option for the client to choose either
+    // tetrahedral or trilinear.
+    for (int i = 0; i < 3; i++) {
+        if (diffZ < diffY) {
+            if (diffZ < diffX) {
+                dst[i] = (ptr[n000] + diffZ * (ptr[n110] - ptr[n010]) +
+                                      diffY * (ptr[n010] - ptr[n000]) +
+                                      diffX * (ptr[n111] - ptr[n110]));
+            } else if (diffY < diffX) {
+                dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +
+                                      diffY * (ptr[n011] - ptr[n001]) +
+                                      diffX * (ptr[n001] - ptr[n000]));
+            } else {
+                dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +
+                                      diffY * (ptr[n010] - ptr[n000]) +
+                                      diffX * (ptr[n011] - ptr[n010]));
+            }
+        } else {
+            if (diffZ < diffX) {
+                dst[i] = (ptr[n000] + diffZ * (ptr[n101] - ptr[n001]) +
+                                      diffY * (ptr[n111] - ptr[n101]) +
+                                      diffX * (ptr[n001] - ptr[n000]));
+            } else if (diffY < diffX) {
+                dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +
+                                      diffY * (ptr[n111] - ptr[n101]) +
+                                      diffX * (ptr[n101] - ptr[n100]));
+            } else {
+                dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +
+                                      diffY * (ptr[n110] - ptr[n100]) +
+                                      diffX * (ptr[n111] - ptr[n110]));
+            }
+        }
+
+        // Increment the table ptr in order to handle the next component.
+        // Note that this is how the table is designed: all of the nXXX
+        // variables are multiples of 3 because there are 3 output
+        // components.
+        ptr++;
+    }
+}
diff --git a/src/core/SkColorLookUpTable.h b/src/core/SkColorLookUpTable.h
new file mode 100644
index 0000000..b9eb81a
--- /dev/null
+++ b/src/core/SkColorLookUpTable.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColorLookUpTable_DEFINED
+#define SkColorLookUpTable_DEFINED
+
+#include "SkRefCnt.h"
+#include "SkTemplates.h"
+
+class SkColorLookUpTable : public SkRefCnt {
+public:
+    static constexpr uint8_t kOutputChannels = 3;
+
+    SkColorLookUpTable(uint8_t inputChannels, const uint8_t gridPoints[3]) {
+        SkASSERT(3 == inputChannels);
+        memcpy(fGridPoints, gridPoints, 3 * sizeof(uint8_t));   
+    }
+
+    void interp3D(float dst[3], float src[3]) const;
+
+private:
+    const float* table() const {
+        return SkTAddOffset<const float>(this, sizeof(SkColorLookUpTable));
+    }
+
+    uint8_t fGridPoints[3];
+
+public:
+    // Objects of this type are created in a custom fashion using sk_malloc_throw
+    // and therefore must be sk_freed.
+    void* operator new(size_t size) = delete;
+    void* operator new(size_t, void* p) { return p; }
+    void operator delete(void* p) { sk_free(p); }
+};
+
+#endif
diff --git a/src/core/SkColorSpaceXform.cpp b/src/core/SkColorSpaceXform.cpp
index 9de8bf7..450a643 100644
--- a/src/core/SkColorSpaceXform.cpp
+++ b/src/core/SkColorSpaceXform.cpp
@@ -10,7 +10,9 @@
 #include "SkColorSpace_Base.h"
 #include "SkColorSpace_XYZ.h"
 #include "SkColorSpacePriv.h"
+#include "SkColorSpaceXform_A2B.h"
 #include "SkColorSpaceXform_Base.h"
+#include "SkColorSpaceXformPriv.h"
 #include "SkHalf.h"
 #include "SkOpts.h"
 #include "SkSRGB.h"
@@ -90,14 +92,6 @@
     }
 }
 
-// Interpolating lookup in a variably sized table.
-static float interp_lut(float input, const float* table, int tableSize) {
-    float index = input * (tableSize - 1);
-    float diff = index - sk_float_floor2int(index);
-    return table[(int) sk_float_floor2int(index)] * (1.0f - diff) +
-            table[(int) sk_float_ceil2int(index)] * diff;
-}
-
 // outTable is always 256 entries, inTable may be larger or smaller.
 static void build_table_linear_from_gamma(float* outTable, const float* inTable,
                                           int inTableSize) {
@@ -161,32 +155,6 @@
     }
 }
 
-// Inverse table lookup.  Ex: what index corresponds to the input value?  This will
-// have strange results when the table is non-increasing.  But any sane gamma
-// function will be increasing.
-static float inverse_interp_lut(float input, const float* table, int tableSize) {
-    if (input <= table[0]) {
-        return table[0];
-    } else if (input >= table[tableSize - 1]) {
-        return 1.0f;
-    }
-
-    for (int i = 1; i < tableSize; i++) {
-        if (table[i] >= input) {
-            // We are guaranteed that input is greater than table[i - 1].
-            float diff = input - table[i - 1];
-            float distance = table[i] - table[i - 1];
-            float index = (i - 1) + diff / distance;
-            return index / (tableSize - 1);
-        }
-    }
-
-    // Should be unreachable, since we'll return before the loop if input is
-    // larger than the last entry.
-    SkASSERT(false);
-    return 0.0f;
-}
-
 static void build_table_linear_to_gamma(uint8_t* outTable, const float* inTable,
                                         int inTableSize) {
     for (int i = 0; i < kDstGammaTableSize; i++) {
@@ -355,18 +323,14 @@
     }
 
     if (SkColorSpace_Base::Type::kA2B == as_CSB(dstSpace)->type()) {
-        SkColorSpacePrintf("A2B destinations not supported\n");
+        SkCSXformPrintf("A2B destinations not supported\n");
         return nullptr;
     }
 
     if (SkColorSpace_Base::Type::kA2B == as_CSB(srcSpace)->type()) {
-        // TODO (raftias): return an A2B-supporting SkColorSpaceXform here once the xform.
-        // is implemented. SkColorSpaceXform_Base only supports XYZ+TRC based SkColorSpaces
-        //SkColorSpace_A2B* src = static_cast<SkColorSpace_A2B*>(srcSpace);
-        //SkColorSpace_XYZ* dst = static_cast<SkColorSpace_XYZ*>(dstSpace);
-        //return std::unique_ptr<SkColorSpaceXform>(new SkColorSpaceXform_A2B(src, dst));
-        SkColorSpacePrintf("A2B sources not supported (yet)\n");
-        return nullptr;
+        SkColorSpace_A2B* src = static_cast<SkColorSpace_A2B*>(srcSpace);
+        SkColorSpace_XYZ* dst = static_cast<SkColorSpace_XYZ*>(dstSpace);
+        return std::unique_ptr<SkColorSpaceXform>(new SkColorSpaceXform_A2B(src, dst));
     }
     SkColorSpace_XYZ* srcSpaceXYZ = static_cast<SkColorSpace_XYZ*>(srcSpace);
     SkColorSpace_XYZ* dstSpaceXYZ = static_cast<SkColorSpace_XYZ*>(dstSpace);
@@ -568,8 +532,7 @@
 }
 
 template <Order kOrder>
-static AI void load_rgb_linear(const uint32_t* src,
-                               Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a,
+static AI void load_rgb_linear(const uint32_t* src, Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a,
                                const float* const[3]) {
     int kRShift, kGShift = 8, kBShift;
     set_rb_shifts(kOrder, &kRShift, &kBShift);
@@ -580,8 +543,7 @@
 }
 
 template <Order kOrder>
-static AI void load_rgba_linear(const uint32_t* src,
-                                Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a,
+static AI void load_rgba_linear(const uint32_t* src, Sk4f& r, Sk4f& g, Sk4f& b, Sk4f& a,
                                 const float* const[3]) {
     int kRShift, kGShift = 8, kBShift;
     set_rb_shifts(kOrder, &kRShift, &kBShift);
@@ -675,8 +637,7 @@
 }
 
 template <Order kOrder>
-static AI void store_srgb(void* dst, const uint32_t* src,
-                          Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
+static AI void store_srgb(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
                           const uint8_t* const[3]) {
     int kRShift, kGShift = 8, kBShift;
     set_rb_shifts(kOrder, &kRShift, &kBShift);
@@ -724,8 +685,7 @@
 }
 
 template <Order kOrder>
-static AI void store_2dot2(void* dst, const uint32_t* src,
-                           Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
+static AI void store_2dot2(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
                            const uint8_t* const[3]) {
     int kRShift, kGShift = 8, kBShift;
     set_rb_shifts(kOrder, &kRShift, &kBShift);
@@ -763,8 +723,7 @@
 }
 
 template <Order kOrder>
-static AI void store_linear(void* dst, const uint32_t* src,
-                            Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
+static AI void store_linear(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
                             const uint8_t* const[3]) {
     int kRShift, kGShift = 8, kBShift;
     set_rb_shifts(kOrder, &kRShift, &kBShift);
@@ -798,9 +757,8 @@
 }
 
 template <Order kOrder>
-static AI void store_f16(void* dst, const uint32_t* src,
-                             Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
-                             const uint8_t* const[3]) {
+static AI void store_f16(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
+                         const uint8_t* const[3]) {
     Sk4h::Store4(dst, SkFloatToHalf_finite_ftz(dr),
                       SkFloatToHalf_finite_ftz(dg),
                       SkFloatToHalf_finite_ftz(db),
@@ -816,8 +774,7 @@
 }
 
 template <Order kOrder>
-static AI void store_f32(void* dst, const uint32_t* src,
-                         Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
+static AI void store_f32(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f& da,
                          const uint8_t* const[3]) {
     Sk4f::Store4(dst, dr, dg, db, da);
 }
@@ -831,9 +788,8 @@
 }
 
 template <Order kOrder>
-static AI void store_f16_opaque(void* dst, const uint32_t* src,
-                                Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
-                                const uint8_t* const[3]) {
+static AI void store_f16_opaque(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db,
+                                Sk4f&, const uint8_t* const[3]) {
     Sk4h::Store4(dst, SkFloatToHalf_finite_ftz(dr),
                       SkFloatToHalf_finite_ftz(dg),
                       SkFloatToHalf_finite_ftz(db),
@@ -851,9 +807,8 @@
 }
 
 template <Order kOrder>
-static AI void store_generic(void* dst, const uint32_t* src,
-                                 Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
-                                 const uint8_t* const dstTables[3]) {
+static AI void store_generic(void* dst, const uint32_t* src, Sk4f& dr, Sk4f& dg, Sk4f& db, Sk4f&,
+                             const uint8_t* const dstTables[3]) {
     int kRShift, kGShift = 8, kBShift;
     set_rb_shifts(kOrder, &kRShift, &kBShift);
     dr = Sk4f::Min(Sk4f::Max(1023.0f * dr, 0.0f), 1023.0f);
@@ -1115,7 +1070,7 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static inline int num_tables(SkColorSpace_XYZ* space) {
+static AI int num_tables(SkColorSpace_XYZ* space) {
     switch (space->gammaNamed()) {
         case kSRGB_SkGammaNamed:
         case k2Dot2Curve_SkGammaNamed:
diff --git a/src/core/SkColorSpaceXformPriv.h b/src/core/SkColorSpaceXformPriv.h
new file mode 100644
index 0000000..c2418a9
--- /dev/null
+++ b/src/core/SkColorSpaceXformPriv.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColorSpaceXformPriv_DEFINED
+#define SkColorSpaceXformPriv_DEFINED
+
+#include "SkColorSpace_Base.h"
+#include "SkHalf.h"
+#include "SkSRGB.h"
+
+#define AI SK_ALWAYS_INLINE
+
+#define SkCSXformPrintfDefined 0
+#define SkCSXformPrintf(...)
+
+// Interpolating lookup in a variably sized table.
+static AI float interp_lut(float input, const float* table, int tableSize) {
+    float index = input * (tableSize - 1);
+    float diff = index - sk_float_floor2int(index);
+    return table[(int) sk_float_floor2int(index)] * (1.0f - diff) +
+           table[(int) sk_float_ceil2int(index)] * diff;
+}
+
+// Inverse table lookup.  Ex: what index corresponds to the input value?  This will
+// have strange results when the table is non-increasing.  But any sane gamma
+// function will be increasing.
+static float inverse_interp_lut(float input, const float* table, int tableSize) {
+    if (input <= table[0]) {
+        return table[0];
+    } else if (input >= table[tableSize - 1]) {
+        return 1.0f;
+    }
+
+    for (int i = 1; i < tableSize; i++) {
+        if (table[i] >= input) {
+            // We are guaranteed that input is greater than table[i - 1].
+            float diff = input - table[i - 1];
+            float distance = table[i] - table[i - 1];
+            float index = (i - 1) + diff / distance;
+            return index / (tableSize - 1);
+        }
+    }
+
+    // Should be unreachable, since we'll return before the loop if input is
+    // larger than the last entry.
+    SkASSERT(false);
+    return 0.0f;
+}
+
+#undef AI
+
+#endif
diff --git a/src/core/SkColorSpaceXform_A2B.cpp b/src/core/SkColorSpaceXform_A2B.cpp
new file mode 100644
index 0000000..04ecf77
--- /dev/null
+++ b/src/core/SkColorSpaceXform_A2B.cpp
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkColorSpaceXform_A2B.h"
+
+#include "SkColorPriv.h"
+#include "SkColorSpace_A2B.h"
+#include "SkColorSpace_XYZ.h"
+#include "SkColorSpacePriv.h"
+#include "SkColorSpaceXformPriv.h"
+#include "SkMakeUnique.h"
+#include "SkNx.h"
+#include "SkSRGB.h"
+#include "SkTypes.h"
+
+#include "SkRasterPipeline_opts.h"
+
+#define AI SK_ALWAYS_INLINE
+
+namespace {
+
+class ApplyParametric {
+public:
+    ApplyParametric(const SkColorSpaceTransferFn& fn)
+        : fFn(fn)
+    {}
+
+    float operator()(float x) const {
+        float y;
+        if (x >= fFn.fD) {
+            y = ::powf(fFn.fA * x + fFn.fB, fFn.fG) + fFn.fC;
+        } else {
+            y = fFn.fE * x + fFn.fF;
+        }
+        if (y >= 1.f) {
+            return 1.f;
+        } else if (y >= 0.f) {
+            return y;
+        }
+        return 0.f;
+    }
+
+private:
+    SkColorSpaceTransferFn fFn;
+};
+
+class ApplyTable {
+public:
+    ApplyTable(const float* table, int size)
+        : fTable(table)
+        , fSize(size)
+    {}
+
+    float operator()(float x) const {
+        return interp_lut(x, fTable, fSize);
+    }
+
+private:
+    const float* fTable;
+    int          fSize;
+};
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+bool SkColorSpaceXform_A2B::onApply(ColorFormat dstFormat, void* dst, ColorFormat srcFormat,
+                                    const void* src, int count, SkAlphaType alphaType) const {
+    SkRasterPipeline pipeline;
+    switch (srcFormat) {
+        case kBGRA_8888_ColorFormat:
+            pipeline.append(SkRasterPipeline::load_s_8888, &src);
+            pipeline.append(SkRasterPipeline::swap_rb);
+            break;
+        case kRGBA_8888_ColorFormat:
+            pipeline.append(SkRasterPipeline::load_s_8888, &src);
+            break;
+        default:
+            SkCSXformPrintf("F16/F32 source color format not supported\n");
+            return false;
+    }
+
+    pipeline.extend(fElementsPipeline);
+
+    if (kPremul_SkAlphaType == alphaType) {
+        pipeline.append(SkRasterPipeline::premul);
+    }
+
+    switch (dstFormat) {
+        case kBGRA_8888_ColorFormat:
+            pipeline.append(SkRasterPipeline::swap_rb);
+            pipeline.append(SkRasterPipeline::store_8888, &dst);
+            break;
+        case kRGBA_8888_ColorFormat:
+            pipeline.append(SkRasterPipeline::store_8888, &dst);
+            break;
+        case kRGBA_F16_ColorFormat:
+            if (!fLinearDstGamma) {
+                return false;
+            }
+            pipeline.append(SkRasterPipeline::store_f16, &dst);
+            break;
+        case kRGBA_F32_ColorFormat:
+            if (!fLinearDstGamma) {
+                return false;
+            }
+            pipeline.append(SkRasterPipeline::store_f32, &dst);
+            break;
+    }
+
+    auto p = pipeline.compile();
+
+    p(0, count);
+
+    return true;
+}
+
+static inline SkColorSpaceTransferFn value_to_parametric(float exp) {
+    return {exp, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+}
+
+static inline SkColorSpaceTransferFn gammanamed_to_parametric(SkGammaNamed gammaNamed) {
+    switch (gammaNamed) {
+        case kLinear_SkGammaNamed:
+            return value_to_parametric(1.f);
+        case kSRGB_SkGammaNamed:
+            return {2.4f, (1.f / 1.055f), (0.055f / 1.055f), 0.f, 0.04045f, (1.f / 12.92f), 0.f};
+        case k2Dot2Curve_SkGammaNamed:
+            return value_to_parametric(2.2f);
+        default:
+            SkASSERT(false);
+            return {-1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f};
+    }
+}
+
+static inline SkColorSpaceTransferFn gamma_to_parametric(const SkGammas& gammas, int channel) {
+    switch (gammas.type(channel)) {
+        case SkGammas::Type::kNamed_Type:
+            return gammanamed_to_parametric(gammas.data(channel).fNamed);
+        case SkGammas::Type::kValue_Type:
+            return value_to_parametric(gammas.data(channel).fValue);
+        case SkGammas::Type::kParam_Type:
+            return gammas.params(channel);
+        default:
+            SkASSERT(false);
+            return {-1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f};
+    }
+}
+static inline SkColorSpaceTransferFn invert_parametric(const SkColorSpaceTransferFn& fn) {
+    // Original equation is:       y = (ax + b)^g + c   for x >= d
+    //                             y = ex + f           otherwise
+    //
+    // so 1st inverse is:          (y - c)^(1/g) = ax + b
+    //                             x = ((y - c)^(1/g) - b) / a
+    //
+    // which can be re-written as: x = (1/a)(y - c)^(1/g) - b/a
+    //                             x = ((1/a)^g)^(1/g) * (y - c)^(1/g) - b/a
+    //                             x = ([(1/a)^g]y + [-((1/a)^g)c]) ^ [1/g] + [-b/a]
+    //
+    // and 2nd inverse is:         x = (y - f) / e
+    // which can be re-written as: x = [1/e]y + [-f/e]
+    //
+    // and now both can be expressed in terms of the same parametric form as the
+    // original - parameters are enclosed in square brackets.
+
+    // find inverse for linear segment (if possible)
+    float e, f;
+    if (0.f == fn.fE) {
+        // y = f is a constant function and cannot be inverted, so assume the
+        // inverse should be 0 as this is the lower segment
+        e = 0.f;
+        f = 0.f;
+    } else {
+        e = 1.f / fn.fE;
+        f = -fn.fF / fn.fE;
+    }
+    // find inverse for the other segment (if possible)
+    float g, a, b, c;
+    if (0.f == fn.fA || 0.f == fn.fG) {
+        // the constant functions y = b^g + c and y = 1 + c cannot be inverted,
+        // so assume the inverse should be 1 as this is the top segment
+        g = 1.f;
+        a = 0.f;
+        b = 0.f;
+        c = 1.f;
+    } else {
+        g = 1.f / fn.fG;
+        a = powf(1.f / fn.fA, fn.fG);
+        b = -a * fn.fC;
+        c = -fn.fB / fn.fA;
+    }
+    const float d = fn.fE * fn.fD + fn.fF;
+    return {g, a, b, c, d, e, f};
+}
+
+static std::vector<float> build_inverse_table(const float* inTable, int inTableSize) {
+    static constexpr int kInvTableSize = 256;
+    std::vector<float> outTable(kInvTableSize);
+    for (int i = 0; i < kInvTableSize; ++i) {
+        const float x = ((float) i) * (1.f / ((float) (kInvTableSize - 1)));
+        const float y = inverse_interp_lut(x, inTable, inTableSize);
+        outTable[i] = y;
+    }
+    return outTable;
+}
+
+SkColorSpaceXform_A2B::SkColorSpaceXform_A2B(SkColorSpace_A2B* srcSpace,
+                                             SkColorSpace_XYZ* dstSpace)
+    : fLinearDstGamma(kLinear_SkGammaNamed == dstSpace->gammaNamed()) {
+#if (SkCSXformPrintfDefined)
+    static const char* debugGammaNamed[4] = {
+        "Linear", "SRGB", "2.2", "NonStandard"
+    };
+    static const char* debugGammas[5] = {
+        "None", "Named", "Value", "Table", "Param"
+    };
+#endif
+    // add in all input color space -> PCS xforms
+    for (int i = 0; i < srcSpace->count(); ++i) {
+        const SkColorSpace_A2B::Element& e = srcSpace->element(i);
+        switch (e.type()) {
+            case SkColorSpace_A2B::Element::Type::kGammaNamed:
+                if (kLinear_SkGammaNamed != e.gammaNamed()) {
+                    SkCSXformPrintf("Gamma stage added: %s\n",
+                                    debugGammaNamed[(int)e.gammaNamed()]);
+                    addGamma(ApplyParametric(gammanamed_to_parametric(e.gammaNamed())),
+                             kRGB_Channels);
+                }
+                break;
+            case SkColorSpace_A2B::Element::Type::kGammas: {
+                    const SkGammas& gammas = e.gammas();
+                    SkCSXformPrintf("Gamma stage added:");
+                    for (int channel = 0; channel < 3; ++channel) {
+                        SkCSXformPrintf("  %s", debugGammas[(int)gammas.type(channel)]);
+                    }
+                    SkCSXformPrintf("\n");
+                    bool gammaNeedsRef = false;
+                    for (int channel = 0; channel < 3; ++channel) {
+                        if (SkGammas::Type::kTable_Type == gammas.type(channel)) {
+                            addGamma(ApplyTable(gammas.table(channel),
+                                                gammas.data(channel).fTable.fSize),
+                                                static_cast<Channels>(channel));
+                            gammaNeedsRef = true;
+                        } else {
+                            addGamma(ApplyParametric(gamma_to_parametric(gammas, channel)),
+                                     static_cast<Channels>(channel));
+                        }
+                    }
+                    if (gammaNeedsRef) {
+                        fGammaRefs.push_back(sk_ref_sp(&gammas));
+                    }
+                }
+                break;
+            case SkColorSpace_A2B::Element::Type::kCLUT:
+                SkCSXformPrintf("CLUT stage added [%d][%d][%d]\n", e.colorLUT().fGridPoints[0],
+                                e.colorLUT().fGridPoints[1], e.colorLUT().fGridPoints[2]);
+                fCLUTs.push_back(sk_ref_sp(&e.colorLUT()));
+                fElementsPipeline.append(SkRasterPipeline::color_lookup_table,
+                                         fCLUTs.back().get());
+                break;
+            case SkColorSpace_A2B::Element::Type::kMatrix:
+                if (!e.matrix().isIdentity()) {
+                    SkCSXformPrintf("Matrix stage added\n");
+                    addMatrix(e.matrix());
+                }
+                break;
+        }
+    }
+
+    // Lab PCS -> XYZ PCS
+    if (SkColorSpace_A2B::PCS::kLAB == srcSpace->pcs()) {
+        SkCSXformPrintf("Lab -> XYZ element added\n");
+        fElementsPipeline.append(SkRasterPipeline::lab_to_xyz);
+    }
+
+    // and XYZ PCS -> output color space xforms
+    if (!dstSpace->fromXYZD50()->isIdentity()) {
+        addMatrix(*dstSpace->fromXYZD50());
+    }
+
+    if (kNonStandard_SkGammaNamed != dstSpace->gammaNamed()) {
+        if (!fLinearDstGamma) {
+            addGamma(ApplyParametric(
+                            invert_parametric(gammanamed_to_parametric(dstSpace->gammaNamed()))),
+                     kRGB_Channels);
+        }
+    } else {
+        for (int channel = 0; channel < 3; ++channel) {
+            const SkGammas& gammas = *dstSpace->gammas();
+            if (SkGammas::Type::kTable_Type == gammas.type(channel)) {
+                fGammaTables.push_front(build_inverse_table(gammas.table(channel),
+                                                            gammas.data(channel).fTable.fSize));
+                addGamma(ApplyTable(fGammaTables.front().data(), fGammaTables.front().size()),
+                         static_cast<Channels>(channel));
+            } else {
+                addGamma(ApplyParametric(invert_parametric(gamma_to_parametric(gammas, channel))),
+                         static_cast<Channels>(channel));
+            }
+        }
+    }
+}
+
+void SkColorSpaceXform_A2B::addGamma(std::function<float(float)> fn, Channels channels) {
+    fGammaFunctions.push_front(std::move(fn));
+    switch (channels) {
+        case kRGB_Channels:
+            fElementsPipeline.append(SkRasterPipeline::fn_1_r, &fGammaFunctions.front());
+            fElementsPipeline.append(SkRasterPipeline::fn_1_g, &fGammaFunctions.front());
+            fElementsPipeline.append(SkRasterPipeline::fn_1_b, &fGammaFunctions.front());
+            break;
+        case kR_Channels:
+            fElementsPipeline.append(SkRasterPipeline::fn_1_r, &fGammaFunctions.front());
+            break;
+        case kG_Channels:
+            fElementsPipeline.append(SkRasterPipeline::fn_1_g, &fGammaFunctions.front());
+            break;
+        case kB_Channels:
+            fElementsPipeline.append(SkRasterPipeline::fn_1_b, &fGammaFunctions.front());
+            break;
+        default:
+            SkASSERT(false);
+    }
+}
+
+void SkColorSpaceXform_A2B::addMatrix(const SkMatrix44& matrix) {
+    fMatrices.push_front(std::vector<float>(12));
+    auto& m = fMatrices.front();
+    m[ 0] = matrix.get(0, 0);
+    m[ 1] = matrix.get(1, 0);
+    m[ 2] = matrix.get(2, 0);
+    m[ 3] = matrix.get(0, 1);
+    m[ 4] = matrix.get(1, 1);
+    m[ 5] = matrix.get(2, 1);
+    m[ 6] = matrix.get(0, 2);
+    m[ 7] = matrix.get(1, 2);
+    m[ 8] = matrix.get(2, 2);
+    m[ 9] = matrix.get(0, 3);
+    m[10] = matrix.get(1, 3);
+    m[11] = matrix.get(2, 3);
+    SkASSERT(matrix.get(3, 0) == 0.f);
+    SkASSERT(matrix.get(3, 1) == 0.f);
+    SkASSERT(matrix.get(3, 2) == 0.f);
+    SkASSERT(matrix.get(3, 3) == 1.f);
+    fElementsPipeline.append(SkRasterPipeline::matrix_3x4, m.data());
+    fElementsPipeline.append(SkRasterPipeline::clamp_0);
+    fElementsPipeline.append(SkRasterPipeline::clamp_a);
+}
+
+
diff --git a/src/core/SkColorSpaceXform_A2B.h b/src/core/SkColorSpaceXform_A2B.h
new file mode 100644
index 0000000..681261a
--- /dev/null
+++ b/src/core/SkColorSpaceXform_A2B.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkColorSpaceXform_A2B_DEFINED
+#define SkColorSpaceXform_A2B_DEFINED
+
+#include "SkColorSpace_Base.h"
+#include "SkColorSpaceXform_Base.h"
+#include "SkRasterPipeline.h"
+
+#include <forward_list>
+#include <functional>
+#include <vector>
+
+class SkColorSpace_A2B;
+class SkColorSpace_XYZ;
+
+
+class SkColorSpaceXform_A2B : public SkColorSpaceXform_Base {
+public:
+    bool onApply(ColorFormat dstFormat, void* dst, ColorFormat srcFormat, const void* src,
+                 int count, SkAlphaType alphaType) const override;
+
+private:
+    SkColorSpaceXform_A2B(SkColorSpace_A2B* srcSpace, SkColorSpace_XYZ* dstSpace);
+
+    enum Channels {
+        kRGB_Channels = -1,
+        kR_Channels   =  0,
+        kG_Channels   =  1,
+        kB_Channels   =  2
+    };
+    void addGamma(std::function<float(float)> fn, Channels channels);
+
+    void addMatrix(const SkMatrix44& matrix);
+
+    SkRasterPipeline                               fElementsPipeline;
+    bool                                           fLinearDstGamma;
+    // storage used by the pipeline
+    std::forward_list<std::function<float(float)>> fGammaFunctions;
+    std::forward_list<std::vector<float>>          fMatrices;
+    std::forward_list<std::vector<float>>          fGammaTables;
+    std::vector<sk_sp<const SkColorLookUpTable>>   fCLUTs;
+    // these are here to maintain ownership of tables used in the pipeline
+    std::vector<sk_sp<const SkGammas>>             fGammaRefs;
+
+    friend class SkColorSpaceXform;
+};
+
+#endif
diff --git a/src/core/SkColorSpace_A2B.h b/src/core/SkColorSpace_A2B.h
index 726e3d7..2fb7a83 100644
--- a/src/core/SkColorSpace_A2B.h
+++ b/src/core/SkColorSpace_A2B.h
@@ -132,7 +132,7 @@
     };
     const Element& element(size_t i) const { return fElements[i]; }
     
-    size_t count() const { return fElements.size(); }
+    int count() const { return (int)fElements.size(); }
 
     // the intermediate profile connection space that this color space
     // represents the transformation to
@@ -150,6 +150,7 @@
     std::vector<Element> fElements;
 
     friend class SkColorSpace;
+    friend class ColorSpaceXformTest;
     typedef SkColorSpace_Base INHERITED;
 };
 
diff --git a/src/core/SkColorSpace_Base.h b/src/core/SkColorSpace_Base.h
index 3dbf351..480febd 100644
--- a/src/core/SkColorSpace_Base.h
+++ b/src/core/SkColorSpace_Base.h
@@ -8,6 +8,7 @@
 #ifndef SkColorSpace_Base_DEFINED
 #define SkColorSpace_Base_DEFINED
 
+#include "SkColorLookUpTable.h"
 #include "SkColorSpace.h"
 #include "SkData.h"
 #include "SkOnce.h"
@@ -143,30 +144,6 @@
     void operator delete(void* p) { sk_free(p); }
 };
 
-struct SkColorLookUpTable : public SkRefCnt {
-    static constexpr uint8_t kOutputChannels = 3;
-
-    uint8_t                  fInputChannels;
-    uint8_t                  fGridPoints[3];
-
-    const float* table() const {
-        return SkTAddOffset<const float>(this, sizeof(SkColorLookUpTable));
-    }
-
-    SkColorLookUpTable(uint8_t inputChannels, uint8_t gridPoints[3])
-        : fInputChannels(inputChannels)
-    {
-        SkASSERT(3 == inputChannels);
-        memcpy(fGridPoints, gridPoints, 3 * sizeof(uint8_t));
-    }
-
-    // Objects of this type are created in a custom fashion using sk_malloc_throw
-    // and therefore must be sk_freed.
-    void* operator new(size_t size) = delete;
-    void* operator new(size_t, void* p) { return p; }
-    void operator delete(void* p) { sk_free(p); }
-};
-
 class SkColorSpace_Base : public SkColorSpace {
 public:
 
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 9193de5..361793b 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -60,6 +60,7 @@
     M(load_s_565)  M(load_d_565)  M(store_565)                   \
     M(load_s_srgb) M(load_d_srgb) M(store_srgb)                  \
     M(load_s_f16)  M(load_d_f16)  M(store_f16)                   \
+    M(load_s_8888) M(store_8888)                                 \
     M(scale_u8) M(scale_constant_float)                          \
     M(lerp_u8) M(lerp_565) M(lerp_constant_float)                \
     M(dst)                                                       \
@@ -68,7 +69,9 @@
     M(clear) M(modulate) M(multiply) M(plus_) M(screen) M(xor_)  \
     M(colorburn) M(colordodge) M(darken) M(difference)           \
     M(exclusion) M(hardlight) M(lighten) M(overlay) M(softlight) \
-    M(luminance_to_alpha) M(matrix_4x5)
+    M(luminance_to_alpha) M(matrix_3x4) M(matrix_4x5)            \
+    M(fn_1_r) M(fn_1_g) M(fn_1_b)                                \
+    M(color_lookup_table) M(lab_to_xyz) M(swap_rb)
 
 class SkRasterPipeline {
 public:
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 5c5418b..a687627 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -9,6 +9,7 @@
 #define SkRasterPipeline_opts_DEFINED
 
 #include "SkColorPriv.h"
+#include "SkColorLookUpTable.h"
 #include "SkHalf.h"
 #include "SkPM4f.h"
 #include "SkPM4fPriv.h"
@@ -431,6 +432,25 @@
                          | SkNx_cast<int>(0.5f + 255.0f * a) << SK_A32_SHIFT), (int*)ptr);
 }
 
+STAGE(load_s_8888, true) {
+    auto ptr = *(const uint32_t**)ctx + x;
+
+    auto px = load<kIsTail>(tail, ptr);
+    auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); };
+    r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xff));
+    g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xff));
+    b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xff));
+    a = (1/255.0f)*SkNx_cast<float>(to_int(px >> 24));
+}
+
+STAGE(store_8888, false) {
+    auto ptr = *(uint32_t**)ctx + x;
+    store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 0
+                         | SkNx_cast<int>(255.0f * g + 0.5f) << 8
+                         | SkNx_cast<int>(255.0f * b + 0.5f) << 16
+                         | SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr);
+}
+
 RGBA_XFERMODE(clear)    { return 0.0f; }
 //RGBA_XFERMODE(src)      { return s; }   // This would be a no-op stage, so we just omit it.
 RGBA_XFERMODE(dst)      { return d; }
@@ -490,6 +510,18 @@
     r = g = b = 0;
 }
 
+STAGE(matrix_3x4, true) {
+    auto m = (const float*)ctx;
+
+    auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); };
+    auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
+         G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
+         B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
+    r = R;
+    g = G;
+    b = B;
+}
+
 STAGE(matrix_4x5, true) {
     auto m = (const float*)ctx;
 
@@ -504,6 +536,80 @@
     a = A;
 }
 
+STAGE(fn_1_r, true) {
+    auto fn = (const std::function<float(float)>*)ctx;
+    float result[N];
+    for (int i = 0; i < N; ++i) {
+        result[i] = (*fn)(r[i]);
+    }
+    r = SkNf::Load(result);
+}
+
+STAGE(fn_1_g, true) {
+    auto fn = (const std::function<float(float)>*)ctx;
+    float result[N];
+    for (int i = 0; i < N; ++i) {
+        result[i] = (*fn)(g[i]);
+    }
+    g = SkNf::Load(result);
+}
+
+STAGE(fn_1_b, true) {
+    auto fn = (const std::function<float(float)>*)ctx;
+    float result[N];
+    for (int i = 0; i < N; ++i) {
+        result[i] = (*fn)(b[i]);
+    }
+    b = SkNf::Load(result);
+}
+
+STAGE(color_lookup_table, true) {
+    const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx;
+    float rgb[3];
+    float result[3][N];
+    for (int i = 0; i < N; ++i) {
+        rgb[0] = r[i];
+        rgb[1] = g[i];
+        rgb[2] = b[i];
+        colorLUT->interp3D(rgb, rgb);
+        result[0][i] = rgb[0];
+        result[1][i] = rgb[1];
+        result[2][i] = rgb[2];
+    }
+    r = SkNf::Load(result[0]);
+    g = SkNf::Load(result[1]);
+    b = SkNf::Load(result[2]);
+}
+
+STAGE(lab_to_xyz, true) {
+    const auto lab_l = r * 100.0f;
+    const auto lab_a = g * 255.0f - 128.0f;
+    const auto lab_b = b * 255.0f - 128.0f;
+    auto Y = (lab_l + 16.0f) * (1/116.0f);
+    auto X = lab_a * (1/500.0f) + Y;
+    auto Z = Y - (lab_b * (1/200.0f));
+
+    const auto X3 = X*X*X;
+    X = (X3 > 0.008856f).thenElse(X3, (X - (16/116.0f)) * (1/7.787f));
+    const auto Y3 = Y*Y*Y;
+    Y = (Y3 > 0.008856f).thenElse(Y3, (Y - (16/116.0f)) * (1/7.787f));
+    const auto Z3 = Z*Z*Z;
+    Z = (Z3 > 0.008856f).thenElse(Z3, (Z - (16/116.0f)) * (1/7.787f));
+
+    // adjust to D50 illuminant
+    X *= 0.96422f;
+    Y *= 1.00000f;
+    Z *= 0.82521f;
+
+    r = X;
+    g = Y;
+    b = Z;
+}
+
+STAGE(swap_rb, true) {
+    SkTSwap(r, b);
+}
+
 template <typename Fn>
 SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
     switch (st) {
diff --git a/tests/ColorSpaceXformTest.cpp b/tests/ColorSpaceXformTest.cpp
index 477e61a..0e67fe4 100644
--- a/tests/ColorSpaceXformTest.cpp
+++ b/tests/ColorSpaceXformTest.cpp
@@ -10,6 +10,7 @@
 #include "SkCodecPriv.h"
 #include "SkColorPriv.h"
 #include "SkColorSpace.h"
+#include "SkColorSpace_A2B.h"
 #include "SkColorSpace_Base.h"
 #include "SkColorSpace_XYZ.h"
 #include "SkColorSpaceXform_Base.h"
@@ -25,10 +26,42 @@
         // Use special testing entry point, so we don't skip the xform, even though src == dst.
         return SlowIdentityXform(static_cast<SkColorSpace_XYZ*>(space.get()));
     }
+
+    static std::unique_ptr<SkColorSpaceXform> CreateIdentityXform_A2B(
+            SkGammaNamed gammaNamed, const sk_sp<SkGammas>& gammas) {
+        std::vector<SkColorSpace_A2B::Element> srcElements;
+        // sRGB
+        const float values[16] = {
+            0.4358f, 0.3853f, 0.1430f, 0.0f,
+            0.2224f, 0.7170f, 0.0606f, 0.0f,
+            0.0139f, 0.0971f, 0.7139f, 0.0f,
+            0.0000f, 0.0000f, 0.0000f, 1.0f
+        };
+        SkMatrix44 arbitraryMatrix{SkMatrix44::kUninitialized_Constructor};
+        arbitraryMatrix.setRowMajorf(values);
+        if (kNonStandard_SkGammaNamed == gammaNamed) {
+            srcElements.push_back(SkColorSpace_A2B::Element(gammas));
+        } else {
+            srcElements.push_back(SkColorSpace_A2B::Element(gammaNamed));
+        }
+        srcElements.push_back(SkColorSpace_A2B::Element(arbitraryMatrix));
+        auto srcSpace = ColorSpaceXformTest::CreateA2BSpace(SkColorSpace_A2B::PCS::kXYZ,
+                                                            std::move(srcElements));
+        sk_sp<SkColorSpace> dstSpace(new SkColorSpace_XYZ(gammaNamed, gammas, arbitraryMatrix,
+                                                          nullptr));
+
+        return SkColorSpaceXform::New(static_cast<SkColorSpace_A2B*>(srcSpace.get()),
+                                      static_cast<SkColorSpace_XYZ*>(dstSpace.get()));
+    }
+
+    static sk_sp<SkColorSpace> CreateA2BSpace(SkColorSpace_A2B::PCS pcs,
+                                              std::vector<SkColorSpace_A2B::Element> elements) {
+        return sk_sp<SkColorSpace>(new SkColorSpace_A2B(pcs, nullptr, std::move(elements)));
+    }
 };
 
 static bool almost_equal(int x, int y) {
-    return SkTAbs(x - y) <= 1;
+    return SkTAbs(x - y) <= 1 ;
 }
 
 static void test_identity_xform(skiatest::Reporter* r, const sk_sp<SkGammas>& gammas,
@@ -67,6 +100,42 @@
     }
 }
 
+static void test_identity_xform_A2B(skiatest::Reporter* r, SkGammaNamed gammaNamed,
+                                    const sk_sp<SkGammas>& gammas, bool repeat) {
+    // Arbitrary set of 10 pixels
+    constexpr int width = 10;
+    constexpr uint32_t srcPixels[width] = {
+            0xFFABCDEF, 0xFF146829, 0xFF382759, 0xFF184968, 0xFFDE8271,
+            0xFF32AB52, 0xFF0383BC, 0xFF000102, 0xFFFFFFFF, 0xFFDDEEFF, };
+    uint32_t dstPixels[width];
+
+    // Create and perform an identity xform.
+    auto xform = ColorSpaceXformTest::CreateIdentityXform_A2B(gammaNamed, gammas);
+    bool result = xform->apply(select_xform_format(kN32_SkColorType), dstPixels,
+                               SkColorSpaceXform::kBGRA_8888_ColorFormat, srcPixels, width,
+                               kOpaque_SkAlphaType);
+    REPORTER_ASSERT(r, result);
+
+    // Since the src->dst matrix is the identity, and the gamma curves match,
+    // the pixels should be unchanged.
+    for (int i = 0; i < width; i++) {
+        REPORTER_ASSERT(r, almost_equal(((srcPixels[i] >>  0) & 0xFF),
+                                        SkGetPackedB32(dstPixels[i])));
+        REPORTER_ASSERT(r, almost_equal(((srcPixels[i] >>  8) & 0xFF),
+                                        SkGetPackedG32(dstPixels[i])));
+        REPORTER_ASSERT(r, almost_equal(((srcPixels[i] >> 16) & 0xFF),
+                                        SkGetPackedR32(dstPixels[i])));
+        REPORTER_ASSERT(r, almost_equal(((srcPixels[i] >> 24) & 0xFF),
+                                        SkGetPackedA32(dstPixels[i])));
+    }
+
+    if (repeat) {
+        // We should cache part of the transform after the run.  So it is interesting
+        // to make sure it still runs correctly the second time.
+        test_identity_xform_A2B(r, gammaNamed, gammas, false);
+    }
+}
+
 DEF_TEST(ColorSpaceXform_TableGamma, r) {
     // Lookup-table based gamma curves
     constexpr size_t tableSize = 10;
@@ -90,6 +159,7 @@
     table[8] = 0.75f;
     table[9] = 1.00f;
     test_identity_xform(r, gammas, true);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, true);
 }
 
 DEF_TEST(ColorSpaceXform_ParametricGamma, r) {
@@ -116,6 +186,7 @@
     params->fC = 0.0f;
     params->fG = 2.4f;
     test_identity_xform(r, gammas, true);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, true);
 }
 
 DEF_TEST(ColorSpaceXform_ExponentialGamma, r) {
@@ -124,6 +195,7 @@
     gammas->fRedType = gammas->fGreenType = gammas->fBlueType = SkGammas::Type::kValue_Type;
     gammas->fRedData.fValue = gammas->fGreenData.fValue = gammas->fBlueData.fValue = 1.4f;
     test_identity_xform(r, gammas, true);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, true);
 }
 
 DEF_TEST(ColorSpaceXform_NamedGamma, r) {
@@ -133,6 +205,10 @@
     gammas->fGreenData.fNamed = k2Dot2Curve_SkGammaNamed;
     gammas->fBlueData.fNamed = kLinear_SkGammaNamed;
     test_identity_xform(r, gammas, true);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, true);
+    test_identity_xform_A2B(r, kSRGB_SkGammaNamed, nullptr, true);
+    test_identity_xform_A2B(r, k2Dot2Curve_SkGammaNamed, nullptr, true);
+    test_identity_xform_A2B(r, kLinear_SkGammaNamed, nullptr, true);
 }
 
 DEF_TEST(ColorSpaceXform_NonMatchingGamma, r) {
@@ -174,5 +250,68 @@
     gammas->fBlueData.fParamOffset = sizeof(float) * tableSize;
 
     test_identity_xform(r, gammas, true);
+    test_identity_xform_A2B(r, kNonStandard_SkGammaNamed, gammas, true);
+}
+
+DEF_TEST(ColorSpaceXform_A2BCLUT, r) {
+    constexpr int inputChannels = 3;
+    constexpr int gp            = 4; // # grid points
+
+    constexpr int numEntries    = gp*gp*gp*3;
+    uint8_t gridPoints[3] = {gp, gp, gp};
+    void* memory = sk_malloc_throw(sizeof(SkColorLookUpTable) + sizeof(float) * numEntries);
+    sk_sp<SkColorLookUpTable> colorLUT(new (memory) SkColorLookUpTable(inputChannels, gridPoints));
+    // make a CLUT that rotates R, G, and B, i.e. R->G, G->B, B->R
+    float* table = SkTAddOffset<float>(memory, sizeof(SkColorLookUpTable));
+    for (int r = 0; r < gp; ++r) {
+        for (int g = 0; g < gp; ++g) {
+            for (int b = 0; b < gp; ++b) {
+                table[3*(gp*gp*r + gp*g + b) + 0] = g * (1.f / (gp - 1.f));
+                table[3*(gp*gp*r + gp*g + b) + 1] = b * (1.f / (gp - 1.f));
+                table[3*(gp*gp*r + gp*g + b) + 2] = r * (1.f / (gp - 1.f));
+            }
+        }
+    }
+
+    // build an even distribution of pixels, 7 samples per channel
+    // (a step of 255 / 6 between samples), to test the xform on
+    constexpr int pixelgp   = 7;
+    constexpr int numPixels = pixelgp*pixelgp*pixelgp;
+    SkAutoTMalloc<uint32_t> srcPixels(numPixels);
+    int srcIndex = 0;
+    for (int r = 0; r < pixelgp; ++r) {
+        for (int g = 0; g < pixelgp; ++g) {
+            for (int b = 0; b < pixelgp; ++b) {
+                const int red   = (int) (r * (255.f / (pixelgp - 1.f)));
+                const int green = (int) (g * (255.f / (pixelgp - 1.f)));
+                const int blue  = (int) (b * (255.f / (pixelgp - 1.f)));
+                srcPixels[srcIndex] = SkColorSetRGB(red, green, blue);
+                ++srcIndex;
+            }
+        }
+    }
+    SkAutoTMalloc<uint32_t> dstPixels(numPixels);
+
+    // src space is identity besides CLUT
+    std::vector<SkColorSpace_A2B::Element> srcElements;
+    srcElements.push_back(SkColorSpace_A2B::Element(std::move(colorLUT)));
+    auto srcSpace = ColorSpaceXformTest::CreateA2BSpace(SkColorSpace_A2B::PCS::kXYZ,
+                                                        std::move(srcElements));
+    // dst space is entirely identity
+    auto dstSpace = SkColorSpace::MakeRGB(SkColorSpace::kLinear_RenderTargetGamma, SkMatrix44::I());
+    auto xform = SkColorSpaceXform::New(srcSpace.get(), dstSpace.get());
+    bool result = xform->apply(SkColorSpaceXform::kRGBA_8888_ColorFormat, dstPixels.get(),
+                               SkColorSpaceXform::kRGBA_8888_ColorFormat, srcPixels.get(),
+                               numPixels, kOpaque_SkAlphaType);
+    REPORTER_ASSERT(r, result);
+
+    for (int i = 0; i < numPixels; ++i) {
+        REPORTER_ASSERT(r, almost_equal(SkColorGetR(srcPixels[i]),
+                                        SkColorGetG(dstPixels[i])));
+        REPORTER_ASSERT(r, almost_equal(SkColorGetG(srcPixels[i]),
+                                        SkColorGetB(dstPixels[i])));
+        REPORTER_ASSERT(r, almost_equal(SkColorGetB(srcPixels[i]),
+                                        SkColorGetR(dstPixels[i])));
+    }
 }