Turn on Sk4px xfermodes when we have NEON too.

For SSE, Sk4px is better than Sk4f is better than SkXfermodes_opts_SSE2 (where implemented).
For NEON, Sk4px is better than SkXfermodes_opts_arm_neon is better than Sk4f (where implemented).

This is a 1.6-1.9x speedup for Plus,Modulate, and Screen for NEON.

BUG=skia:

Review URL: https://codereview.chromium.org/1128053004
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index ee00023..a2ab65b 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -19,19 +19,17 @@
 #include "SkUtilsArm.h"
 #include "SkWriteBuffer.h"
 
-#ifndef SK_SUPPORT_LEGACY_SCALAR_XFERMODES
+// When implemented, the Sk4f and Sk4px xfermodes beat src/opts/SkXfermodes_opts_SSE2's.
+// When implemented, the Sk4px, but not Sk4f, xfermodes beat src/opts/SkXfermodes_arm_neon's.
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
-    /*
-     * To be conservative, we only enable the new code path (using SkPMFloat) when we
-     * "know" we're faster, which at the moment is only when we have SSE2 or better.
-     */
-#else
-    #define SK_SUPPORT_LEGACY_SCALAR_XFERMODES
-#endif
+    #define SK_4F_XFERMODES_ARE_FAST
+    #define SK_4PX_XFERMODES_ARE_FAST
+#elif defined(SK_ARM_HAS_NEON)
+    #define SK_4PX_XFERMODES_ARE_FAST
 #endif
 
 #if !SK_ARM_NEON_IS_NONE
-#include "SkXfermode_opts_arm_neon.h"
+    #include "SkXfermode_opts_arm_neon.h"
 #endif
 
 #define SkAlphaMulAlpha(a, b)   SkMulDiv255Round(a, b)
@@ -1196,7 +1194,6 @@
 { screen_modeproc,  SkXfermode::kOne_Coeff,     SkXfermode::kISC_Coeff },
 */
 
-#ifndef SK_SUPPORT_LEGACY_SCALAR_XFERMODES
 static const float gInv255 = 0.0039215683f; //  (1.0f / 255) - ULP == SkBits2Float(0x3B808080)
 
 static Sk4f ramp(const Sk4f& v0, const Sk4f& v1, const Sk4f& t) {
@@ -1412,7 +1409,6 @@
 
     typedef SkProcCoeffXfermode INHERITED;
 };
-#endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -1474,57 +1470,32 @@
         rec.fProc = pp;
     }
 
-    SkXfermode* xfer = NULL;
-
-#ifndef SK_SUPPORT_LEGACY_SCALAR_XFERMODES
+#if defined(SK_4PX_XFERMODES_ARE_FAST) && !defined(SK_PREFER_LEGACY_FLOAT_XFERMODES)
     switch (mode) {
-        case SkXfermode::kSrcATop_Mode:
-            xfer = SkT4fXfermode<SrcATop4f>::Create(rec);
-            break;
-        case SkXfermode::kDstATop_Mode:
-            xfer = SkT4fXfermode<DstATop4f>::Create(rec);
-            break;
-        case SkXfermode::kXor_Mode:
-            xfer = SkT4fXfermode<Xor4f>::Create(rec);
-            break;
-    #ifdef SK_PREFER_LEGACY_FLOAT_XFERMODES
-        case SkXfermode::kPlus_Mode:
-            xfer = SkT4fXfermode<Plus4f>::Create(rec);
-            break;
-        case SkXfermode::kModulate_Mode:
-            xfer = SkT4fXfermode<Modulate4f>::Create(rec);
-            break;
-        case SkXfermode::kScreen_Mode:
-            xfer = SkT4fXfermode<Screen4f>::Create(rec);
-            break;
-    #else
-        case SkXfermode::kPlus_Mode:
-            xfer = SkT4pxXfermode<Plus4f>::Create(rec);
-            break;
-        case SkXfermode::kModulate_Mode:
-            xfer = SkT4pxXfermode<Modulate4f>::Create(rec);
-            break;
-        case SkXfermode::kScreen_Mode:
-            xfer = SkT4pxXfermode<Screen4f>::Create(rec);
-            break;
-    #endif
-        case SkXfermode::kMultiply_Mode:
-            xfer = SkT4fXfermode<Multiply4f>::Create(rec);
-            break;
-        case SkXfermode::kDifference_Mode:
-            xfer = SkT4fXfermode<Difference4f>::Create(rec);
-            break;
-        case SkXfermode::kExclusion_Mode:
-            xfer = SkT4fXfermode<Exclusion4f>::Create(rec);
-            break;
-        default:
-            break;
-    }
-    if (xfer) {
-        return xfer;
+        case SkXfermode::kPlus_Mode:     return SkT4pxXfermode<Plus4f>::Create(rec);
+        case SkXfermode::kModulate_Mode: return SkT4pxXfermode<Modulate4f>::Create(rec);
+        case SkXfermode::kScreen_Mode:   return SkT4pxXfermode<Screen4f>::Create(rec);
+        default: break;
     }
 #endif
 
+#if defined(SK_4F_XFERMODES_ARE_FAST)
+    switch (mode) {
+        case SkXfermode::kSrcATop_Mode:    return SkT4fXfermode<SrcATop4f>::Create(rec);
+        case SkXfermode::kDstATop_Mode:    return SkT4fXfermode<DstATop4f>::Create(rec);
+        case SkXfermode::kXor_Mode:        return SkT4fXfermode<Xor4f>::Create(rec);
+        case SkXfermode::kPlus_Mode:       return SkT4fXfermode<Plus4f>::Create(rec);
+        case SkXfermode::kModulate_Mode:   return SkT4fXfermode<Modulate4f>::Create(rec);
+        case SkXfermode::kScreen_Mode:     return SkT4fXfermode<Screen4f>::Create(rec);
+        case SkXfermode::kMultiply_Mode:   return SkT4fXfermode<Multiply4f>::Create(rec);
+        case SkXfermode::kDifference_Mode: return SkT4fXfermode<Difference4f>::Create(rec);
+        case SkXfermode::kExclusion_Mode:  return SkT4fXfermode<Exclusion4f>::Create(rec);
+        default: break;
+    }
+#endif
+
+    SkXfermode* xfer = NULL;
+
     // check if we have a platform optim for that
     SkProcCoeffXfermode* xfm = SkPlatformXfermodeFactory(rec, mode);
     if (xfm != NULL) {
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index b9d4357..e4dbec9 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -355,6 +355,8 @@
 
     void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); }
 
+    SkNi saturatedAdd(const SkNi& o) const { return vqaddq_u8(fVec, o.fVec); }
+
     SkNi operator + (const SkNi& o) const { return vaddq_u8(fVec, o.fVec); }
     SkNi operator - (const SkNi& o) const { return vsubq_u8(fVec, o.fVec); }
     SkNi operator * (const SkNi& o) const { return vmulq_u8(fVec, o.fVec); }