Unify some SkNx code

 - one base case and one N=1 case instead of two each (or three with doubles)
 - use SkNx_cast instead of FromBytes/toBytes
 - 4-at-a-time Sk4f::ToBytes becomes a special standalone Sk4f_ToBytes

If I did everything right, this'll be perf- and pixel-neutral.

https://gold.skia.org/search2?issue=1526523003&unt=true&query=source_type%3Dgm&master=false

BUG=skia:
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1526523003
diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp
index b2f2b4a..1daa3f2 100644
--- a/bench/Sk4fBench.cpp
+++ b/bench/Sk4fBench.cpp
@@ -33,8 +33,8 @@
         for (int i = 0; i < loops; i++) {
             uint32_t color = lcg_rand(&seed),
                      back;
-            auto f = Sk4f::FromBytes((const uint8_t*)&color);
-            f.toBytes((uint8_t*)&back);
+            auto f = SkNx_cast<float>(Sk4b::Load((const uint8_t*)&color));
+            SkNx_cast<uint8_t>(f).store((uint8_t*)&back);
             junk ^= back;
         }
         blackhole ^= junk;
@@ -62,7 +62,7 @@
                  c = b + dcdx,
                  d = c + dcdx;
             for (size_t i = 0; i < SK_ARRAY_COUNT(fDevice); i += 4) {
-                Sk4f::ToBytes((uint8_t*)(fDevice+i), a, b, c, d);
+                Sk4f_ToBytes((uint8_t*)(fDevice+i), a, b, c, d);
                 a = a + dcdx4;
                 b = b + dcdx4;
                 c = c + dcdx4;