Improve naive SkColorXform to half floats

This should give us a good baseline for exploring the use of SkRasterPipeline.

One particular color xform to half float drops from 425us to 282us on my desktop (HP z620); the intermediate steps are broken down below.

Color Xform to Half Float (HP z620)
Original                              425us
Trans16 (not 32)                      355us
Vector Trans16                        378us
Trans16 + Keep Halfs in Vector        335us
Vector Trans16 + Keep Halfs in Vector 282us
Final                                 282us

Color Xform to Half Float (Nexus 5X)
Original                              556us
Final                                 472us

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2159993003
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2159993003
diff --git a/src/core/SkXfermodeF16.cpp b/src/core/SkXfermodeF16.cpp
index 63058f9..219e911 100644
--- a/src/core/SkXfermodeF16.cpp
+++ b/src/core/SkXfermodeF16.cpp
@@ -25,13 +25,13 @@
             Sk4f d4 = SkHalfToFloat_finite(dst[i]);
             d4.store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(*src, d).fVec);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]);
         }
     } else {
         for (int i = 0; i < count; ++i) {
             SkHalfToFloat_finite(dst[i]).store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(*src, d).fVec);
-            dst[i] = SkFloatToHalf_finite(r4);
+            SkFloatToHalf_finite(r4).store(&dst[i]);
         }
     }
 }
@@ -45,13 +45,13 @@
             Sk4f d4 = SkHalfToFloat_finite(dst[i]);
             d4.store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(src[i], d).fVec);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]);
         }
     } else {
         for (int i = 0; i < count; ++i) {
             SkHalfToFloat_finite(dst[i]).store(d.fVec);
             Sk4f r4 = Sk4f::Load(proc(src[i], d).fVec);
-            dst[i] = SkFloatToHalf_finite(r4);
+            SkFloatToHalf_finite(r4).store(&dst[i]);
         }
     }
 }
@@ -65,7 +65,7 @@
         for (int i = 0; i < count; ++i) {
             if (aa[i]) {
                 const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
-                dst[i] = SkFloatToHalf_finite(d4 * Sk4f((255 - aa[i]) * 1.0f/255));
+                SkFloatToHalf_finite(d4 * Sk4f((255 - aa[i]) * 1.0f/255)).store(&dst[i]);
             }
         }
     } else {
@@ -83,10 +83,12 @@
     if (aa) {
         for (int i = 0; i < count; ++i) {
             const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])).store(&dst[i]);
         }
     } else {
-        sk_memset64(dst, SkFloatToHalf_finite(s4), count);
+        uint64_t s4h;
+        SkFloatToHalf_finite(s4).store(&s4h);
+        sk_memset64(dst, s4h, count);
     }
 }
 
@@ -96,12 +98,12 @@
         for (int i = 0; i < count; ++i) {
             const Sk4f s4 = Sk4f::Load(src[i].fVec);
             const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(s4, d4, aa[i])).store(&dst[i]);
         }
     } else {
         for (int i = 0; i < count; ++i) {
             const Sk4f s4 = Sk4f::Load(src[i].fVec);
-            dst[i] = SkFloatToHalf_finite(s4);
+            SkFloatToHalf_finite(s4).store(&dst[i]);
         }
     }
 }
@@ -124,9 +126,9 @@
         const Sk4f d4 = SkHalfToFloat_finite(dst[i]);
         const Sk4f r4 = s4 + d4 * dst_scale;
         if (aa) {
-            dst[i] = SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i]));
+            SkFloatToHalf_finite(lerp_by_coverage(r4, d4, aa[i])).store(&dst[i]);
         } else {
-            dst[i] = SkFloatToHalf_finite(r4);
+            SkFloatToHalf_finite(r4).store(&dst[i]);
         }
     }
 }
@@ -140,7 +142,7 @@
         if (aa) {
             r = lerp_by_coverage(r, d, aa[i]);
         }
-        dst[i] = SkFloatToHalf_finite(r);
+        SkFloatToHalf_finite(r).store(&dst[i]);
     }
 }