Clean up remaining users of SkPMFloat

This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f,
and converts the SkPMFloat benches to Sk4f benches.

No pixel output should change here, and nothing other than the Sk4f_ benches should change speed.
The new Sk4f_ benches run faster than the SkPMFloat benches they replace.

BUG=skia:4117

Review URL: https://codereview.chromium.org/1324743002
diff --git a/bench/PMFloatBench.cpp b/bench/PMFloatBench.cpp
deleted file mode 100644
index 540fdb7..0000000
--- a/bench/PMFloatBench.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2015 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "Benchmark.h"
-#include "SkPMFloat.h"
-
-// Used to prevent the compiler from optimizing away the whole loop.
-volatile uint32_t blackhole = 0;
-
-// Not a great random number generator, but it's very fast.
-// The code we're measuring is quite fast, so low overhead is essential.
-static uint32_t lcg_rand(uint32_t* seed) {
-    *seed *= 1664525;
-    *seed += 1013904223;
-    return *seed;
-}
-
-// I'm having better luck getting these to constant-propagate away as template parameters.
-struct PMFloatRoundtripBench : public Benchmark {
-    PMFloatRoundtripBench() {}
-
-    const char* onGetName() override { return "SkPMFloat_roundtrip"; }
-    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
-
-    void onDraw(const int loops, SkCanvas* canvas) override {
-        // Unlike blackhole, junk can and probably will be a register.
-        uint32_t junk = 0;
-        uint32_t seed = 0;
-        for (int i = 0; i < loops; i++) {
-            SkPMColor color;
-        #ifdef SK_DEBUG
-            // Our SkASSERTs will remind us that it's technically required that we premultiply.
-            color = SkPreMultiplyColor(lcg_rand(&seed));
-        #else
-            // But it's a lot faster not to, and this code won't really mind the non-PM colors.
-            color = lcg_rand(&seed);
-        #endif
-
-            auto f = SkPMFloat::FromPMColor(color);
-            SkPMColor back = f.round();
-            junk ^= back;
-        }
-        blackhole ^= junk;
-    }
-};
-DEF_BENCH(return new PMFloatRoundtripBench;)
-
-struct PMFloatGradientBench : public Benchmark {
-    const char* onGetName() override { return "PMFloat_gradient"; }
-    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
-
-    SkPMColor fDevice[100];
-    void onDraw(const int loops, SkCanvas*) override {
-        Sk4f c0 = SkPMFloat::FromARGB(1, 1, 0, 0),
-             c1 = SkPMFloat::FromARGB(1, 0, 0, 1),
-             dc = c1 - c0,
-             fx(0.1f),
-             dx(0.002f),
-             dcdx(dc*dx),
-             dcdx4(dcdx+dcdx+dcdx+dcdx);
-
-        for (int n = 0; n < loops; n++) {
-            Sk4f a = c0 + dc*fx,
-                 b = a + dcdx,
-                 c = b + dcdx,
-                 d = c + dcdx;
-            for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
-                fDevice[i+0] = SkPMFloat(a).round();
-                fDevice[i+1] = SkPMFloat(b).round();
-                fDevice[i+2] = SkPMFloat(c).round();
-                fDevice[i+3] = SkPMFloat(d).round();
-                a = a + dcdx4;
-                b = b + dcdx4;
-                c = c + dcdx4;
-                d = d + dcdx4;
-            }
-        }
-    }
-};
-
-DEF_BENCH(return new PMFloatGradientBench;)
diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp
new file mode 100644
index 0000000..5397863
--- /dev/null
+++ b/bench/Sk4fBench.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkColor.h"
+#include "SkNx.h"
+
+// Volatile sink: results are XORed into this so the compiler can't optimize away the whole loop.
+volatile uint32_t blackhole = 0;
+
+// Not a great random number generator, but it's very fast.
+// The code we're measuring is quite fast, so low overhead is essential.
+static uint32_t lcg_rand(uint32_t* seed) {
+    *seed *= 1664525;
+    *seed += 1013904223;
+    return *seed;
+}
+
+struct Sk4fBytesRoundtripBench : public Benchmark {
+    Sk4fBytesRoundtripBench() {}
+
+    const char* onGetName() override { return "Sk4f_roundtrip"; }
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+
+    void onDraw(const int loops, SkCanvas* canvas) override {
+        // Unlike the volatile blackhole, junk can (and probably will) live in a register.
+        uint32_t junk = 0;
+        uint32_t seed = 0;
+        for (int i = 0; i < loops; i++) {
+            uint32_t color = lcg_rand(&seed),
+                     back;
+            auto f = Sk4f::FromBytes((const uint8_t*)&color);
+            f.toBytes((uint8_t*)&back);
+            junk ^= back;
+        }
+        blackhole ^= junk;
+    }
+};
+DEF_BENCH(return new Sk4fBytesRoundtripBench;)
+
+struct Sk4fGradientBench : public Benchmark {
+    const char* onGetName() override { return "Sk4f_gradient"; }
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+
+    SkPMColor fDevice[100];
+    void onDraw(const int loops, SkCanvas*) override {
+        Sk4f c0(0,0,255,255),
+             c1(255,0,0,255),
+             dc = c1 - c0,
+             fx(0.1f),
+             dx(0.002f),
+             dcdx(dc*dx),
+             dcdx4(dcdx+dcdx+dcdx+dcdx);
+
+        for (int n = 0; n < loops; n++) {
+            Sk4f a = c0 + dc*fx + Sk4f(0.5f),  // add an extra 0.5f to get rounding for free.
+                 b = a + dcdx,
+                 c = b + dcdx,
+                 d = c + dcdx;
+            for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
+                a.toBytes((uint8_t*)(fDevice+i+0));
+                b.toBytes((uint8_t*)(fDevice+i+1));
+                c.toBytes((uint8_t*)(fDevice+i+2));
+                d.toBytes((uint8_t*)(fDevice+i+3));
+                a = a + dcdx4;
+                b = b + dcdx4;
+                c = c + dcdx4;
+                d = d + dcdx4;
+            }
+        }
+    }
+};
+DEF_BENCH(return new Sk4fGradientBench;)