3-15% speedup to HardLight / Overlay xfermodes.

While investigating my bug (skia:4052) I saw this TODO and figured
it'd make me feel better about an otherwise unsuccessful investigation.

This speeds up HardLight and Overlay (same code) by about 15% with SSE, mostly
by rewriting the logic from 1 cheap comparison and 2 expensive div255() calls
to 2 cheap comparisons and 1 expensive div255().

NEON speeds up by a more modest ~3%.

BUG=skia:

Review URL: https://codereview.chromium.org/1230663005
diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h
index e046e26..e1d4dc1 100644
--- a/src/core/Sk4px.h
+++ b/src/core/Sk4px.h
@@ -70,6 +70,7 @@
         Wide operator >> (int bits) const { return INHERITED::operator>>(bits); }
         Wide operator << (int bits) const { return INHERITED::operator<<(bits); }
         static Wide Min(const Wide& a, const Wide& b) { return INHERITED::Min(a,b); }
+        Wide thenElse(const Wide& t, const Wide& e) const { return INHERITED::thenElse(t,e); }
 
     private:
         typedef Sk16h INHERITED;
@@ -77,6 +78,7 @@
 
     Wide widenLo() const;               // ARGB -> 0A 0R 0G 0B
     Wide widenHi() const;               // ARGB -> A0 R0 G0 B0
+    Wide widenLoHi() const;             // ARGB -> AA RR GG BB
     Wide mulWiden(const Sk16b&) const;  // 8-bit x 8-bit -> 16-bit components.
 
     // The only 8-bit multiply we use is 8-bit x 8-bit -> 16-bit.  Might as well make it pithy.
diff --git a/src/core/Sk4pxXfermode.h b/src/core/Sk4pxXfermode.h
index 98b0bd9..97321b7 100644
--- a/src/core/Sk4pxXfermode.h
+++ b/src/core/Sk4pxXfermode.h
@@ -68,15 +68,13 @@
     auto sa = s.alphas(),
          da = d.alphas();
 
-    auto isLite = (sa-s) < s;
+    auto isLite = ((sa-s) < s).widenLoHi();
 
     auto dark = s*d << 1,
          lite = sa*da - ((da-d)*(sa-s) << 1),
          both = s*da.inv() + d*sa.inv();
 
-    // TODO: do isLite in 16-bit so we only have to div255() once.
-    auto colors = isLite.thenElse((lite + both).div255(),
-                                  (dark + both).div255());
+    auto colors = (both + isLite.thenElse(lite, dark)).div255();
     return alphas.zeroColors() + colors.zeroAlphas();
 }
 XFERMODE(Overlay) { return HardLight::Xfer(d,s); }
diff --git a/src/opts/Sk4px_NEON.h b/src/opts/Sk4px_NEON.h
index 9401864..cd6dea9 100644
--- a/src/opts/Sk4px_NEON.h
+++ b/src/opts/Sk4px_NEON.h
@@ -40,6 +40,12 @@
                  vshll_n_u8(vget_high_u8(this->fVec), 8));
 }
 
+inline Sk4px::Wide Sk4px::widenLoHi() const {
+    auto zipped = vzipq_u8(this->fVec, this->fVec);  // zip with self: every byte appears twice
+    return Sk16h(vreinterpretq_u16_u8(zipped.val[0]),   // reinterpret byte pairs as 16-bit lanes
+                 vreinterpretq_u16_u8(zipped.val[1]));  // (no C-style vector casts needed)
+}
+
 inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
     return Sk16h(vmull_u8(vget_low_u8 (this->fVec), vget_low_u8 (other.fVec)),
                  vmull_u8(vget_high_u8(this->fVec), vget_high_u8(other.fVec)));
diff --git a/src/opts/Sk4px_SSE2.h b/src/opts/Sk4px_SSE2.h
index 74ccffc..3809c5e 100644
--- a/src/opts/Sk4px_SSE2.h
+++ b/src/opts/Sk4px_SSE2.h
@@ -31,6 +31,11 @@
                  _mm_unpackhi_epi8(_mm_setzero_si128(), this->fVec));
 }
 
+inline Sk4px::Wide Sk4px::widenLoHi() const {
+    const auto px = this->fVec;  // unpacking a register with itself doubles up every byte
+    return Sk16h(_mm_unpacklo_epi8(px, px), _mm_unpackhi_epi8(px, px));
+}
+
 inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
     return this->widenLo() * Sk4px(other).widenLo();
 }
diff --git a/src/opts/Sk4px_none.h b/src/opts/Sk4px_none.h
index ce2f845..ba13e58 100644
--- a/src/opts/Sk4px_none.h
+++ b/src/opts/Sk4px_none.h
@@ -48,6 +48,8 @@
 
 inline Sk4px::Wide Sk4px::widenHi() const { return this->widenLo() << 8; }
 
+inline Sk4px::Wide Sk4px::widenLoHi() const { return this->widenLo() + this->widenHi(); }  // x + (x << 8)
+
 inline Sk4px::Wide Sk4px::mulWiden(const Sk16b& other) const {
     return this->widenLo() * Sk4px(other).widenLo();
 }
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index ccba163..1cae223 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -337,6 +337,11 @@
         return vgetq_lane_u16(fVec, k&7);
     }
 
+    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+        // Bitwise select in one instruction: bits of t where fVec is set, bits of e where clear.
+        return vbslq_u16(fVec, t.fVec, e.fVec);
+    }
+
     uint16x8_t fVec;
 };
 
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 2d3acbe..e165f58 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -257,6 +257,11 @@
                                                   _mm_sub_epi8(b.fVec, top_8x)));
     }
 
+    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),   // bits of t where fVec is set
+                            _mm_andnot_si128(fVec, e.fVec));  // bits of e where fVec is clear
+    }
+
     template <int k> uint16_t kth() const {
         SkASSERT(0 <= k && k < 8);
         return _mm_extract_epi16(fVec, k);
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index 5893214..4005d25 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -192,3 +192,19 @@
     }
     }
 }
+
+DEF_TEST(Sk4px_widening, r) {
+    // widenLoHi() must match widenLo() + widenHi() byte for byte.
+    SkPMColor pixels[] = {
+        SkPreMultiplyColor(0xff00ff00),
+        SkPreMultiplyColor(0x40008000),
+        SkPreMultiplyColor(0x7f020406),
+        SkPreMultiplyColor(0x00000000),
+    };
+    auto px = Sk4px::Load4(pixels);
+    auto lo = px.widenLo();
+    auto hi = px.widenHi();
+    auto actual = px.widenLoHi();
+    auto expected = lo + hi;
+    REPORTER_ASSERT(r, 0 == memcmp(&actual, &expected, sizeof(actual)));
+}