SoftLight with SkPMFloat

SSE speeds up about 4.5x over existing integer SSE,
NEON speeds up about 3x over serial integer code.

We expect 1-2 bit component diffs in the usual GMs.

Still guarded by SK_SUPPORT_LEGACY_XFERMODES,
which I'll now try to lift in Chrome.

BUG=skia:

Review URL: https://codereview.chromium.org/1221493002
diff --git a/src/core/Sk4pxXfermode.h b/src/core/Sk4pxXfermode.h
index b587183..28fd9f1 100644
--- a/src/core/Sk4pxXfermode.h
+++ b/src/core/Sk4pxXfermode.h
@@ -141,6 +141,31 @@
                                           otherwise));
     return srcover * SkPMFloat(1,0,0,0) + colors * SkPMFloat(0,1,1,1);
 }
+XFERMODE(SoftLight) {  // s, d: presumably the src and dst pixels declared by XFERMODE.
+    auto sa = s.alphas(),
+         da = d.alphas(),
+         isa = Sk4f(1)-sa,
+         ida = Sk4f(1)-da;
+
+    // Some common terms.  m is d/da where da > 0, and 0 elsewhere.
+    auto m  = (da > Sk4f(0)).thenElse(d / da, Sk4f(0)),
+         s2 = Sk4f(2)*s,
+         m4 = Sk4f(4)*m;
+
+    // The logic forks three ways (all three are computed, then selected with thenElse):
+    //    1. dark src?               (s2 < sa)
+    //    2. light src, dark dst?    (otherwise, and 4*d < da)
+    //    3. light src, light dst?   (otherwise)
+    auto darkSrc = d*(sa + (s2 - sa)*(Sk4f(1) - m)),        // Case 1.
+         darkDst = (m4*m4 + m4)*(m - Sk4f(1)) + Sk4f(7)*m,  // Case 2: 16m^3 - 12m^2 + 3m.
+         liteDst = m.sqrt() - m,                            // Case 3.
+         liteSrc = d*sa + da*(s2-sa)*(Sk4f(4)*d < da).thenElse(darkDst, liteDst); // Case 2 or 3?
+
+    auto alpha  = s + d*isa;
+    auto colors = s*ida + d*isa + (s2 < sa).thenElse(darkSrc, liteSrc);           // Case 1 or 2/3?
+    // (1,0,0,0) keeps alpha's first component; (0,1,1,1) keeps the other three from colors.
+    return alpha * SkPMFloat(1,0,0,0) + colors * SkPMFloat(0,1,1,1);
+}
 #undef XFERMODE
 
 // A reasonable fallback mode for doing AA is to simply apply the transfermode first,
@@ -244,6 +269,7 @@
 
         case SkXfermode::kColorDodge_Mode: return SkTPMFloatXfermode<ColorDodge>::Create(rec);
         case SkXfermode::kColorBurn_Mode:  return SkTPMFloatXfermode<ColorBurn>::Create(rec);
+        case SkXfermode::kSoftLight_Mode:  return SkTPMFloatXfermode<SoftLight>::Create(rec);
 #endif
         default: break;
     }
diff --git a/src/opts/SkXfermode_opts_SSE2.cpp b/src/opts/SkXfermode_opts_SSE2.cpp
index ca26263..2024a17 100644
--- a/src/opts/SkXfermode_opts_SSE2.cpp
+++ b/src/opts/SkXfermode_opts_SSE2.cpp
@@ -516,15 +516,13 @@
                                                          SkXfermode::Mode mode) {
     SkXfermodeProcSIMD proc = nullptr;
     switch (mode) {
-        // TODO(mtklein): Sk4pxXfermode has these now.  Clean up.
+        // TODO(mtklein): Sk4pxXfermode has these now.  Clean up the whole file!
         case SkProcCoeffXfermode::kOverlay_Mode:    proc =    overlay_modeproc_SSE2; break;
         case SkProcCoeffXfermode::kDarken_Mode:     proc =     darken_modeproc_SSE2; break;
         case SkProcCoeffXfermode::kLighten_Mode:    proc =    lighten_modeproc_SSE2; break;
         case SkProcCoeffXfermode::kHardLight_Mode:  proc =  hardlight_modeproc_SSE2; break;
         case SkProcCoeffXfermode::kColorDodge_Mode: proc = colordodge_modeproc_SSE2; break;
         case SkProcCoeffXfermode::kColorBurn_Mode:  proc =  colorburn_modeproc_SSE2; break;
-
-        // TODO(mtklein): implement this with SkPMFloat.
         case SkProcCoeffXfermode::kSoftLight_Mode:  proc =  softlight_modeproc_SSE2; break;
         default: break;
     }