Clean up remaining users of SkPMFloat

This switches over SkXfermodes_opts.h and SkColorMatrixFilter to use Sk4f,
and converts the SkPMFloat benches to Sk4f benches.

No pixels should change here, and no code beyond the Sk4f_ benches should change speed.
The benches are faster than the old versions.

BUG=skia:4117

Review URL: https://codereview.chromium.org/1324743002
diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp
new file mode 100644
index 0000000..5397863
--- /dev/null
+++ b/bench/Sk4fBench.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkColor.h"
+#include "SkNx.h"
+
+// Volatile sink: Sk4f_roundtrip XORs its result in here so its loop can't be optimized away.
+volatile uint32_t blackhole = 0;
+
+// Linear congruential step: *seed = *seed * 1664525 + 1013904223, in wrapping uint32_t math.
+// Not a great generator, but very cheap; the code being measured is fast, so overhead matters.
+static uint32_t lcg_rand(uint32_t* seed) {
+    *seed *= 1664525;
+    *seed += 1013904223;
+    return *seed;  // The updated state is also the value handed back.
+}
+
+struct Sk4fBytesRoundtripBench : public Benchmark {  // Times bytes -> Sk4f -> bytes.
+    Sk4fBytesRoundtripBench() {}
+
+    const char* onGetName() override { return "Sk4f_roundtrip"; }
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+
+    void onDraw(const int loops, SkCanvas* canvas) override {  // canvas is not used.
+        // Unlike volatile blackhole, local junk can and probably will be kept in a register.
+        uint32_t junk = 0;
+        uint32_t seed = 0;  // Fixed seed, so every run sees the same color sequence.
+        for (int i = 0; i < loops; i++) {
+            uint32_t color = lcg_rand(&seed),
+                     back;  // Filled in by toBytes() below.
+            auto f = Sk4f::FromBytes((const uint8_t*)&color);  // color's bytes -> f.
+            f.toBytes((uint8_t*)&back);  // f -> back's bytes.
+            junk ^= back;  // Fold each result in so every iteration's output is used.
+        }
+        blackhole ^= junk;  // One volatile update per call makes the loop's result observable.
+    }
+};
+DEF_BENCH(return new Sk4fBytesRoundtripBench;)
+
+struct Sk4fGradientBench : public Benchmark {  // Times writing a color ramp into fDevice.
+    const char* onGetName() override { return "Sk4f_gradient"; }
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+
+    SkPMColor fDevice[100];  // 100 is a multiple of 4, which the 4-at-a-time loop relies on.
+    void onDraw(const int loops, SkCanvas*) override {
+        Sk4f c0(0,0,255,255),  // Color at ramp position 0.
+             c1(255,0,0,255),  // Color at ramp position 1.
+             dc = c1 - c0,  // Total color change from c0 to c1.
+             fx(0.1f),  // Ramp position of the first pixel written.
+             dx(0.002f),  // Ramp position step between adjacent pixels.
+             dcdx(dc*dx),  // Color step between adjacent pixels.
+             dcdx4(dcdx+dcdx+dcdx+dcdx);  // Color step across four pixels.
+
+        for (int n = 0; n < loops; n++) {  // Each pass rewrites all of fDevice from the start.
+            Sk4f a = c0 + dc*fx + Sk4f(0.5f),  // add an extra 0.5f to get rounding for free.
+                 b = a + dcdx,  // b, c, d are the next three pixels after a.
+                 c = b + dcdx,
+                 d = c + dcdx;
+            for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {  // 4 pixels per pass.
+                a.toBytes((uint8_t*)(fDevice+i+0));
+                b.toBytes((uint8_t*)(fDevice+i+1));
+                c.toBytes((uint8_t*)(fDevice+i+2));
+                d.toBytes((uint8_t*)(fDevice+i+3));
+                a = a + dcdx4;  // Step all four colors ahead by four pixels.
+                b = b + dcdx4;
+                c = c + dcdx4;
+                d = d + dcdx4;
+            }
+        }
+    }
+};
+DEF_BENCH(return new Sk4fGradientBench;)