Patches on top of Radu's latest.

    patch from issue 1273033005 at patchset 120001 (http://crrev.com/1273033005#ps120001)

BUG=skia:

Review URL: https://codereview.chromium.org/1288323004
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 890d9c7..d5308e7 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -11,6 +11,7 @@
 #define SK_OPTS_NS portable
 #include "SkBlitMask_opts.h"
 #include "SkBlurImageFilter_opts.h"
+#include "SkColorCubeFilter_opts.h"
 #include "SkFloatingPoint_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkTextureCompressor_opts.h"
@@ -38,6 +39,7 @@
     decltype(memset16)               memset16 = portable::memset16;
     decltype(memset32)               memset32 = portable::memset32;
     decltype(create_xfermode) create_xfermode = SkCreate4pxXfermode;
+    decltype(color_cube_filter_span) color_cube_filter_span = portable::color_cube_filter_span;
 
     decltype(box_blur_xx) box_blur_xx = portable::box_blur_xx;
     decltype(box_blur_xy) box_blur_xy = portable::box_blur_xy;
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index e0ef7dc..04aa9b1 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -44,6 +44,15 @@
     extern bool (*fill_block_dimensions)(SkTextureCompressor::Format, int* x, int* y);
 
     extern void (*blit_mask_d32_a8)(SkPMColor*, size_t, const SkAlpha*, size_t, SkColor, int, int);
+
+    // Optimized version of SkColorCubeFilter::filterSpan; defined in SkColorCubeFilter_opts.h.
+    extern void (*color_cube_filter_span)(const SkPMColor[],     // src
+                                          int,                   // count
+                                          SkPMColor[],           // dst
+                                          const int * [2],       // colorToIndex
+                                          const SkScalar * [2],  // colorToFactors
+                                          int,                   // dim
+                                          const SkColor*);       // colorCube
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index f97f25c..1fc80f5 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -26,6 +26,7 @@
 public:
     static SkPMFloat FromPMColor(SkPMColor c) { return SkPMFloat(c); }
     static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); }
+    static SkPMFloat FromBGRx(SkColor c);  // Ignores c's alpha, instead forcing it to 1.
 
     Sk4f alphas() const;  // argb -> aaaa, generally faster than the equivalent Sk4f(this->a()).
 
diff --git a/src/effects/SkColorCubeFilter.cpp b/src/effects/SkColorCubeFilter.cpp
index 3b7c75f..f37b695 100644
--- a/src/effects/SkColorCubeFilter.cpp
+++ b/src/effects/SkColorCubeFilter.cpp
@@ -8,6 +8,7 @@
 #include "SkColorCubeFilter.h"
 #include "SkColorPriv.h"
 #include "SkOnce.h"
+#include "SkOpts.h"
 #include "SkReadBuffer.h"
 #include "SkUnPreMultiply.h"
 #include "SkWriteBuffer.h"
@@ -128,36 +129,9 @@
     const SkScalar* colorToScalar;
     fCache.getProcessingLuts(&colorToIndex, &colorToFactors, &colorToScalar);
 
-    const int dim = fCache.cubeDimension();
-    SkColor* colorCube = (SkColor*)fCubeData->data();
-    for (int i = 0; i < count; ++i) {
-        SkColor inputColor = SkUnPreMultiply::PMColorToColor(src[i]);
-        uint8_t r = SkColorGetR(inputColor);
-        uint8_t g = SkColorGetG(inputColor);
-        uint8_t b = SkColorGetB(inputColor);
-        uint8_t a = SkColorGetA(inputColor);
-        SkScalar rOut(0), gOut(0), bOut(0);
-        for (int x = 0; x < 2; ++x) {
-            for (int y = 0; y < 2; ++y) {
-                for (int z = 0; z < 2; ++z) {
-                    SkColor lutColor = colorCube[colorToIndex[x][r] +
-                                                (colorToIndex[y][g] +
-                                                 colorToIndex[z][b] * dim) * dim];
-                    SkScalar factor = colorToFactors[x][r] *
-                                      colorToFactors[y][g] *
-                                      colorToFactors[z][b];
-                    rOut += colorToScalar[SkColorGetR(lutColor)] * factor;
-                    gOut += colorToScalar[SkColorGetG(lutColor)] * factor;
-                    bOut += colorToScalar[SkColorGetB(lutColor)] * factor;
-                }
-            }
-        }
-        const SkScalar aOut = SkIntToScalar(a);
-        dst[i] = SkPackARGB32(a,
-            SkScalarRoundToInt(rOut * aOut),
-            SkScalarRoundToInt(gOut * aOut),
-            SkScalarRoundToInt(bOut * aOut));
-    }
+    SkOpts::color_cube_filter_span(src, count, dst, colorToIndex,
+                                   colorToFactors, fCache.cubeDimension(),
+                                   (SkColor*)fCubeData->data());
 }
 
 SkFlattenable* SkColorCubeFilter::CreateProc(SkReadBuffer& buffer) {
diff --git a/src/opts/SkColorCubeFilter_opts.h b/src/opts/SkColorCubeFilter_opts.h
new file mode 100644
index 0000000..325d7aa
--- /dev/null
+++ b/src/opts/SkColorCubeFilter_opts.h
@@ -0,0 +1,85 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SkColorCubeFilter_opts_DEFINED
+#define SkColorCubeFilter_opts_DEFINED
+
+#include "SkColor.h"
+#include "SkPMFloat.h"
+#include "SkUnPreMultiply.h"
+
+namespace SK_OPTS_NS {
+
+void color_cube_filter_span(const SkPMColor src[],
+                            int count,
+                            SkPMColor dst[],
+                            const int* colorToIndex[2],
+                            const SkScalar* colorToFactors[2],
+                            int dim,
+                            const SkColor* colorCube) {
+    uint8_t* ptr_dst = reinterpret_cast<uint8_t*>(dst);  // Byte view of dst, for patching alpha.
+    uint8_t r, g, b, a;
+    // Maps each of the count pixels in src through colorCube and writes the result to dst.
+    for (int i = 0; i < count; ++i) {
+        const SkPMColor input = src[i];
+        a = input >> SK_A32_SHIFT;
+        // The tables are indexed by unpremultiplied channels; opaque pixels can skip that step.
+        if (a != 255) {
+            const SkColor source = SkUnPreMultiply::PMColorToColor(input);
+            r = SkColorGetR(source);
+            g = SkColorGetG(source);
+            b = SkColorGetB(source);
+        } else {
+            r = SkGetPackedR32(input);
+            g = SkGetPackedG32(input);
+            b = SkGetPackedB32(input);
+        }
+        // Weights of the [0] and [1] table entries for the green and blue channels.
+        const SkScalar g0 = colorToFactors[0][g],
+                       g1 = colorToFactors[1][g],
+                       b0 = colorToFactors[0][b],
+                       b1 = colorToFactors[1][b];
+        // The four green*blue weight products, computed once per pixel.
+        const Sk4f g0b0(g0*b0),
+                   g0b1(g0*b1),
+                   g1b0(g1*b0),
+                   g1b1(g1*b1);
+        // Cube offsets of the four (green, blue) combinations; the red index is added below.
+        const int i00 = (colorToIndex[0][g] + colorToIndex[0][b] * dim) * dim;
+        const int i01 = (colorToIndex[0][g] + colorToIndex[1][b] * dim) * dim;
+        const int i10 = (colorToIndex[1][g] + colorToIndex[0][b] * dim) * dim;
+        const int i11 = (colorToIndex[1][g] + colorToIndex[1][b] * dim) * dim;
+
+        SkPMFloat color(0);
+        // Accumulate the weighted sum of the eight cube entries: four per red table entry.
+        for (int x = 0; x < 2; ++x) {
+            const int ix = colorToIndex[x][r];
+
+            const SkColor lutColor00 = colorCube[ix + i00];
+            const SkColor lutColor01 = colorCube[ix + i01];
+            const SkColor lutColor10 = colorCube[ix + i10];
+            const SkColor lutColor11 = colorCube[ix + i11];
+
+            Sk4f  sum = SkPMFloat::FromBGRx(lutColor00) * g0b0;
+            sum = sum + SkPMFloat::FromBGRx(lutColor01) * g0b1;
+            sum = sum + SkPMFloat::FromBGRx(lutColor10) * g1b0;
+            sum = sum + SkPMFloat::FromBGRx(lutColor11) * g1b1;
+
+            color = color + sum * Sk4f((float)colorToFactors[x][r]);
+        }
+        // Non-opaque input was unpremultiplied above, so scale the result by a/255.
+        if (a != 255) {
+            color = color * Sk4f(((float)a) / 255);
+        }
+
+        dst[i] = color.round();
+        // Overwrite the alpha byte with the source alpha; this byte index assumes little-endian.
+        ptr_dst[SK_A32_SHIFT / 8] = a;
+        ptr_dst += 4;
+    }
+}
+
+}  // namespace SK_OPTS_NS
+
+#endif  // SkColorCubeFilter_opts_DEFINED
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index 789a977..aa184d8 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -10,6 +10,7 @@
 #define SK_OPTS_NS neon
 #include "SkBlitMask_opts.h"
 #include "SkBlurImageFilter_opts.h"
+#include "SkColorCubeFilter_opts.h"
 #include "SkFloatingPoint_opts.h"
 #include "SkMorphologyImageFilter_opts.h"
 #include "SkTextureCompressor_opts.h"
@@ -36,5 +37,7 @@
         fill_block_dimensions = neon::fill_block_dimensions;
 
         blit_mask_d32_a8 = neon::blit_mask_d32_a8;
+
+        color_cube_filter_span = neon::color_cube_filter_span;
     }
 }
diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp
index 7e056a0..c65f0cb 100644
--- a/src/opts/SkOpts_ssse3.cpp
+++ b/src/opts/SkOpts_ssse3.cpp
@@ -8,11 +8,13 @@
 #include "SkOpts.h"
 #define SK_OPTS_NS ssse3
 #include "SkBlitMask_opts.h"
+#include "SkColorCubeFilter_opts.h"
 #include "SkXfermode_opts.h"
 
 namespace SkOpts {
     void Init_ssse3() {
         create_xfermode = SkCreate4pxXfermode;
         blit_mask_d32_a8 = ssse3::blit_mask_d32_a8;
+        color_cube_filter_span = ssse3::color_cube_filter_span;  // From SkColorCubeFilter_opts.h.
     }
 }
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index 8bee5b5..ecb151f 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -7,6 +7,8 @@
 
 namespace { // See SkPMFloat.h
 
+static_assert(SK_A32_SHIFT == 24, "This file assumes little-endian.");
+
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
     SkPMColorAssert(c);
     uint8x8_t   fix8    = (uint8x8_t)vdup_n_u32(c);
@@ -28,8 +30,21 @@
 }
 
 inline Sk4f SkPMFloat::alphas() const {
-    static_assert(SK_A32_SHIFT == 24, "Assuming little-endian.");
     return vdupq_lane_f32(vget_high_f32(fVec), 1);  // Duplicate high lane of high half i.e. lane 3.
 }
 
+inline SkPMFloat SkPMFloat::FromBGRx(SkColor c) {
+    uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c);  // c's four bytes, repeated twice.
+#if defined(SK_PMCOLOR_IS_RGBA)
+    fix8 = vtbl1_u8(fix8, vcreate_u8(0x0300010203000102ULL));  // 03 00 01 02, 2x, i.e. swap R&B.
+#endif
+    uint16x8_t fix8_16 = vmovl_u8(fix8);                     // Widen the bytes to 16 bits...
+    uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));   // ...then the low four to 32 bits.
+    fix8_32 = vsetq_lane_u32(0xFF, fix8_32, 3);  // Force alpha to 1.
+    // Convert the four lanes to float and scale them from [0,255] to [0,1].
+    SkPMFloat pmf = Sk4f(vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255)));
+    SkASSERT(pmf.isValid());
+    return pmf;
+}
+
 }  // namespace
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index 518ad15..42446e6 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -34,4 +34,14 @@
     return Sk4f(this->a());
 }
 
+inline SkPMFloat SkPMFloat::FromBGRx(SkColor c) {
+    // c is an SkColor, not an SkPMColor: SkGetPacked*32() would swap R and B when
+    // SK_PMCOLOR_IS_RGBA, so read it with SkColorGet*().  c's alpha is ignored and forced to 1.
+    const float inv255 = 1.0f / 255;
+    SkPMFloat pmf = SkPMFloat::FromARGB(1.0f, SkColorGetR(c) * inv255,
+                                        SkColorGetG(c) * inv255, SkColorGetB(c) * inv255);
+    SkASSERT(pmf.isValid());
+    return pmf;
+}
+
 }  // namespace
diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h
index 8550388..6cfee1d 100644
--- a/src/opts/SkPMFloat_sse.h
+++ b/src/opts/SkPMFloat_sse.h
@@ -38,4 +38,29 @@
     return _mm_shuffle_ps(fVec, fVec, 0xff);  // Read as 11 11 11 11, copying lane 3 to all lanes.
 }
 
+inline SkPMFloat SkPMFloat::FromBGRx(SkColor c) {
+    __m128i fix8 = _mm_cvtsi32_si128((int)c);  // c in the low 32 bits, the rest zero.
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+    const char _ = ~0;  // Zero these bytes.
+    __m128i fix8_32 = _mm_shuffle_epi8(fix8,  // One color byte per 32-bit lane, zero-extended.
+    #if defined(SK_PMCOLOR_IS_BGRA)
+            _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, _,_,_,_)  // Keep c's B,G,R byte order.
+    #else
+            _mm_setr_epi8(2,_,_,_, 1,_,_,_, 0,_,_,_, _,_,_,_)  // Swap R and B.
+    #endif
+    );
+#else
+    __m128i fix8_16 = _mm_unpacklo_epi8 (fix8   , _mm_setzero_si128()),  // Bytes to 16 bits...
+            fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());  // ...then to 32 bits.
+    #if defined(SK_PMCOLOR_IS_RGBA)
+        fix8_32 = _mm_shuffle_epi32(fix8_32, 0xC6);  // C6 == 11 00 01 10, i.e. swap lanes 0 and 2.
+    #endif
+#endif
+    fix8_32 = _mm_or_si128(fix8_32, _mm_set_epi32(0xFF,0,0,0));  // Force alpha to 1.
+    // Convert the four lanes to float and scale them from [0,255] to [0,1].
+    SkPMFloat pmf = Sk4f(_mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f/255)));
+    SkASSERT(pmf.isValid());
+    return pmf;
+}
+
 }  // namespace