Turn on Sk4px xfermodes when we have NEON too.
For SSE, Sk4px beats Sk4f, which in turn beats SkXfermode_opts_SSE2 (where implemented).
For NEON, Sk4px beats SkXfermode_opts_arm_neon, which in turn beats Sk4f (where implemented).
This is a 1.6-1.9x speedup for Plus, Modulate, and Screen on NEON.
BUG=skia:
Review URL: https://codereview.chromium.org/1128053004
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index ee00023..a2ab65b 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -19,19 +19,17 @@
#include "SkUtilsArm.h"
#include "SkWriteBuffer.h"
-#ifndef SK_SUPPORT_LEGACY_SCALAR_XFERMODES
+// When implemented, the Sk4f and Sk4px xfermodes beat src/opts/SkXfermode_opts_SSE2's.
+// When implemented, the Sk4px, but not Sk4f, xfermodes beat src/opts/SkXfermode_opts_arm_neon's.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
- /*
- * To be conservative, we only enable the new code path (using SkPMFloat) when we
- * "know" we're faster, which at the moment is only when we have SSE2 or better.
- */
-#else
- #define SK_SUPPORT_LEGACY_SCALAR_XFERMODES
-#endif
+ #define SK_4F_XFERMODES_ARE_FAST
+ #define SK_4PX_XFERMODES_ARE_FAST
+#elif defined(SK_ARM_HAS_NEON)
+ #define SK_4PX_XFERMODES_ARE_FAST
#endif
#if !SK_ARM_NEON_IS_NONE
-#include "SkXfermode_opts_arm_neon.h"
+ #include "SkXfermode_opts_arm_neon.h"
#endif
#define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)
@@ -1196,7 +1194,6 @@
{ screen_modeproc, SkXfermode::kOne_Coeff, SkXfermode::kISC_Coeff },
*/
-#ifndef SK_SUPPORT_LEGACY_SCALAR_XFERMODES
static const float gInv255 = 0.0039215683f; // (1.0f / 255) - ULP == SkBits2Float(0x3B808080)
static Sk4f ramp(const Sk4f& v0, const Sk4f& v1, const Sk4f& t) {
@@ -1412,7 +1409,6 @@
typedef SkProcCoeffXfermode INHERITED;
};
-#endif
///////////////////////////////////////////////////////////////////////////////
@@ -1474,57 +1470,32 @@
rec.fProc = pp;
}
- SkXfermode* xfer = NULL;
-
-#ifndef SK_SUPPORT_LEGACY_SCALAR_XFERMODES
+#if defined(SK_4PX_XFERMODES_ARE_FAST) && !defined(SK_PREFER_LEGACY_FLOAT_XFERMODES)
switch (mode) {
- case SkXfermode::kSrcATop_Mode:
- xfer = SkT4fXfermode<SrcATop4f>::Create(rec);
- break;
- case SkXfermode::kDstATop_Mode:
- xfer = SkT4fXfermode<DstATop4f>::Create(rec);
- break;
- case SkXfermode::kXor_Mode:
- xfer = SkT4fXfermode<Xor4f>::Create(rec);
- break;
- #ifdef SK_PREFER_LEGACY_FLOAT_XFERMODES
- case SkXfermode::kPlus_Mode:
- xfer = SkT4fXfermode<Plus4f>::Create(rec);
- break;
- case SkXfermode::kModulate_Mode:
- xfer = SkT4fXfermode<Modulate4f>::Create(rec);
- break;
- case SkXfermode::kScreen_Mode:
- xfer = SkT4fXfermode<Screen4f>::Create(rec);
- break;
- #else
- case SkXfermode::kPlus_Mode:
- xfer = SkT4pxXfermode<Plus4f>::Create(rec);
- break;
- case SkXfermode::kModulate_Mode:
- xfer = SkT4pxXfermode<Modulate4f>::Create(rec);
- break;
- case SkXfermode::kScreen_Mode:
- xfer = SkT4pxXfermode<Screen4f>::Create(rec);
- break;
- #endif
- case SkXfermode::kMultiply_Mode:
- xfer = SkT4fXfermode<Multiply4f>::Create(rec);
- break;
- case SkXfermode::kDifference_Mode:
- xfer = SkT4fXfermode<Difference4f>::Create(rec);
- break;
- case SkXfermode::kExclusion_Mode:
- xfer = SkT4fXfermode<Exclusion4f>::Create(rec);
- break;
- default:
- break;
- }
- if (xfer) {
- return xfer;
+ case SkXfermode::kPlus_Mode: return SkT4pxXfermode<Plus4f>::Create(rec);
+ case SkXfermode::kModulate_Mode: return SkT4pxXfermode<Modulate4f>::Create(rec);
+ case SkXfermode::kScreen_Mode: return SkT4pxXfermode<Screen4f>::Create(rec);
+ default: break;
}
#endif
+#if defined(SK_4F_XFERMODES_ARE_FAST)
+ switch (mode) {
+ case SkXfermode::kSrcATop_Mode: return SkT4fXfermode<SrcATop4f>::Create(rec);
+ case SkXfermode::kDstATop_Mode: return SkT4fXfermode<DstATop4f>::Create(rec);
+ case SkXfermode::kXor_Mode: return SkT4fXfermode<Xor4f>::Create(rec);
+ case SkXfermode::kPlus_Mode: return SkT4fXfermode<Plus4f>::Create(rec);
+ case SkXfermode::kModulate_Mode: return SkT4fXfermode<Modulate4f>::Create(rec);
+ case SkXfermode::kScreen_Mode: return SkT4fXfermode<Screen4f>::Create(rec);
+ case SkXfermode::kMultiply_Mode: return SkT4fXfermode<Multiply4f>::Create(rec);
+ case SkXfermode::kDifference_Mode: return SkT4fXfermode<Difference4f>::Create(rec);
+ case SkXfermode::kExclusion_Mode: return SkT4fXfermode<Exclusion4f>::Create(rec);
+ default: break;
+ }
+#endif
+
+ SkXfermode* xfer = NULL;
+
// check if we have a platform optim for that
SkProcCoeffXfermode* xfm = SkPlatformXfermodeFactory(rec, mode);
if (xfm != NULL) {
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index b9d4357..e4dbec9 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -355,6 +355,8 @@
void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); }
+ SkNi saturatedAdd(const SkNi& o) const { return vqaddq_u8(fVec, o.fVec); }
+
SkNi operator + (const SkNi& o) const { return vaddq_u8(fVec, o.fVec); }
SkNi operator - (const SkNi& o) const { return vsubq_u8(fVec, o.fVec); }
SkNi operator * (const SkNi& o) const { return vmulq_u8(fVec, o.fVec); }