Flush denormals to zero when loading f16 with sse2/sse4.1.

The multiply by the 0x77800000 constant is quite slow when the input is a denormal.
We don't mind flushing those values (all smaller than about 6e-5) to zero.

Implement portable load_f16() / store_f16() too.
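
For reference, the two existing constants are just powers of two written as
float bit patterns: 0x77800000 is 2^112 (it rescales the exponent from the
half's bias of 15 up to the float's bias of 127) and 0x07800000 is 2^-112
(the inverse, used on store).  The new 0x0400 threshold is the smallest
normal half, 2^-14.  A minimal standalone sketch of the scalar trick, not the
SkJumper stage code (half_to_float/float_to_half here are illustrative
helpers only):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // 0x77800000 is the float 2^112: it moves the exponent from half bias (15)
    // to float bias (127).  0x0400 is the smallest normal half, 2^-14; anything
    // below it is a denormal -- and with a signed compare, anything with the
    // sign bit set is negative -- so those inputs get flushed to zero.
    static float half_to_float(uint16_t h) {
        if ((int16_t)h < 0x0400) { h = 0; }       // flush denorm and negative to zero
        uint32_t bits = (uint32_t)h << 13;        // line up the mantissa,
        float f, scale;
        uint32_t k = 0x77800000;                  // 2^112 as float bits
        std::memcpy(&f,     &bits, sizeof f);
        std::memcpy(&scale, &k,    sizeof scale);
        return f * scale;                         // then fix up the exponent
    }

    // The store direction runs the same two steps backwards with 2^-112.
    static uint16_t float_to_half(float f) {
        float scale;
        uint32_t k = 0x07800000;                  // 2^-112 as float bits
        std::memcpy(&scale, &k, sizeof scale);
        float scaled = f * scale;                 // fix up the exponent,
        uint32_t bits;
        std::memcpy(&bits, &scaled, sizeof bits);
        return (uint16_t)(bits >> 13);            // then line up the mantissa
    }

    int main() {
        std::printf("%g\n",     half_to_float(0x3C00));   // 1
        std::printf("%g\n",     half_to_float(0x4170));   // 2.71875
        std::printf("%g\n",     half_to_float(0x0001));   // denorm, flushed to 0
        std::printf("0x%04x\n", float_to_half(1.0f));     // 0x3c00
    }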

Change-Id: I125cff1c79ca71d9abe22ac7877136d86707cb56
Reviewed-on: https://skia-review.googlesource.com/8467
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
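
The new constant 0x04000400 is just 0x0400 packed into both halves of a 32-bit
word: the generated code below loads it with movd and broadcasts it with
pshufd $0x0, filling every 16-bit lane with 0x0400 for the pcmpgtw/pandn pair.
In intrinsics, a standalone sketch of only the flush step, not the stage
itself (flush_denorm_halfs is a hypothetical helper name):

    #include <emmintrin.h>   // SSE2
    #include <cstdint>

    // Zero out any 16-bit lane holding a denormal or negative half float.
    // A signed compare against 0x0400 (the smallest normal half) catches both
    // cases at once, matching the pcmpgtw/pandn sequence in the diff below.
    static inline __m128i flush_denorm_halfs(__m128i halfs) {
        __m128i limit = _mm_set1_epi16(0x0400);
        __m128i small = _mm_cmplt_epi16(halfs, limit);  // 0xffff where lane < 0x0400
        return _mm_andnot_si128(small, halfs);          // zero flagged lanes, pass the rest
    }
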
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index a58a7d7..3b27ffe 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -18,7 +18,7 @@
     {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
     0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,       // from_srgb
     12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
-    0x77800000, 0x07800000,                            // fp16 <-> fp32
+    0x77800000, 0x07800000, 0x04000400,                // fp16 <-> fp32
 };
 
 using JumperStage = void(size_t, void**, const SkJumper_constants*);
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index f6088dd..7a42a52 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -46,6 +46,7 @@
     // fp16 <-> fp32
     uint32_t _0x77800000;
     uint32_t _0x07800000;
+    uint32_t _0x04000400;
 };
 
 #endif//SkJumper_DEFINED
diff --git a/src/jumper/SkJumper_generated.h b/src/jumper/SkJumper_generated.h
index 9641b5f..591a3c7 100644
--- a/src/jumper/SkJumper_generated.h
+++ b/src/jumper/SkJumper_generated.h
@@ -1953,12 +1953,19 @@
     0x48,0x8b,0x00,                             //  mov           (%rax),%rax
     0xf3,0x0f,0x6f,0x04,0xf8,                   //  movdqu        (%rax,%rdi,8),%xmm0
     0xf3,0x0f,0x6f,0x4c,0xf8,0x10,              //  movdqu        0x10(%rax,%rdi,8),%xmm1
-    0x66,0x0f,0x6f,0xd8,                        //  movdqa        %xmm0,%xmm3
-    0x66,0x0f,0x61,0xd9,                        //  punpcklwd     %xmm1,%xmm3
+    0x66,0x0f,0x6f,0xd0,                        //  movdqa        %xmm0,%xmm2
+    0x66,0x0f,0x61,0xd1,                        //  punpcklwd     %xmm1,%xmm2
     0x66,0x0f,0x69,0xc1,                        //  punpckhwd     %xmm1,%xmm0
+    0x66,0x44,0x0f,0x6f,0xc2,                   //  movdqa        %xmm2,%xmm8
+    0x66,0x44,0x0f,0x61,0xc0,                   //  punpcklwd     %xmm0,%xmm8
+    0x66,0x0f,0x69,0xd0,                        //  punpckhwd     %xmm0,%xmm2
+    0x66,0x0f,0x6e,0x42,0x64,                   //  movd          0x64(%rdx),%xmm0
+    0x66,0x0f,0x70,0xd8,0x00,                   //  pshufd        $0x0,%xmm0,%xmm3
     0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
-    0x66,0x0f,0x61,0xc8,                        //  punpcklwd     %xmm0,%xmm1
-    0x66,0x0f,0x69,0xd8,                        //  punpckhwd     %xmm0,%xmm3
+    0x66,0x41,0x0f,0x65,0xc8,                   //  pcmpgtw       %xmm8,%xmm1
+    0x66,0x41,0x0f,0xdf,0xc8,                   //  pandn         %xmm8,%xmm1
+    0x66,0x0f,0x65,0xda,                        //  pcmpgtw       %xmm2,%xmm3
+    0x66,0x0f,0xdf,0xda,                        //  pandn         %xmm2,%xmm3
     0x66,0x0f,0x38,0x33,0xc1,                   //  pmovzxwd      %xmm1,%xmm0
     0x66,0x0f,0x72,0xf0,0x0d,                   //  pslld         $0xd,%xmm0
     0x66,0x0f,0x6e,0x52,0x5c,                   //  movd          0x5c(%rdx),%xmm2
@@ -2586,12 +2593,19 @@
     0x48,0x8b,0x00,                             //  mov           (%rax),%rax
     0xf3,0x0f,0x6f,0x04,0xf8,                   //  movdqu        (%rax,%rdi,8),%xmm0
     0xf3,0x0f,0x6f,0x4c,0xf8,0x10,              //  movdqu        0x10(%rax,%rdi,8),%xmm1
-    0x66,0x0f,0x6f,0xd8,                        //  movdqa        %xmm0,%xmm3
-    0x66,0x0f,0x61,0xd9,                        //  punpcklwd     %xmm1,%xmm3
+    0x66,0x0f,0x6f,0xd0,                        //  movdqa        %xmm0,%xmm2
+    0x66,0x0f,0x61,0xd1,                        //  punpcklwd     %xmm1,%xmm2
     0x66,0x0f,0x69,0xc1,                        //  punpckhwd     %xmm1,%xmm0
+    0x66,0x44,0x0f,0x6f,0xc2,                   //  movdqa        %xmm2,%xmm8
+    0x66,0x44,0x0f,0x61,0xc0,                   //  punpcklwd     %xmm0,%xmm8
+    0x66,0x0f,0x69,0xd0,                        //  punpckhwd     %xmm0,%xmm2
+    0x66,0x0f,0x6e,0x42,0x64,                   //  movd          0x64(%rdx),%xmm0
+    0x66,0x0f,0x70,0xd8,0x00,                   //  pshufd        $0x0,%xmm0,%xmm3
     0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
-    0x66,0x0f,0x61,0xc8,                        //  punpcklwd     %xmm0,%xmm1
-    0x66,0x0f,0x69,0xd8,                        //  punpckhwd     %xmm0,%xmm3
+    0x66,0x41,0x0f,0x65,0xc8,                   //  pcmpgtw       %xmm8,%xmm1
+    0x66,0x41,0x0f,0xdf,0xc8,                   //  pandn         %xmm8,%xmm1
+    0x66,0x0f,0x65,0xda,                        //  pcmpgtw       %xmm2,%xmm3
+    0x66,0x0f,0xdf,0xda,                        //  pandn         %xmm2,%xmm3
     0x66,0x45,0x0f,0xef,0xc0,                   //  pxor          %xmm8,%xmm8
     0x66,0x0f,0x6f,0xc1,                        //  movdqa        %xmm1,%xmm0
     0x66,0x41,0x0f,0x61,0xc0,                   //  punpcklwd     %xmm8,%xmm0
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 6c106c3..20ea719 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -402,8 +402,16 @@
     auto ptr = *(const uint64_t**)ctx + x;
 
 #if !defined(JUMPER)
-    // TODO:
-    (void)ptr;
+    auto half_to_float = [&](int16_t h) {
+        if (h < 0x0400) { h = 0; }                // Flush denorm and negative to zero.
+        return bit_cast<F>(h << 13)               // Line up the mantissa,
+             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
+    };
+    auto rgba = (const int16_t*)ptr;
+    r = half_to_float(rgba[0]);
+    g = half_to_float(rgba[1]);
+    b = half_to_float(rgba[2]);
+    a = half_to_float(rgba[3]);
 #elif defined(__aarch64__)
     auto halfs = vld4_f16((const float16_t*)ptr);
     r = vcvt_f32_f16(halfs.val[0]);
@@ -448,6 +456,11 @@
     auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
          ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
 
+    // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
+    // With a signed comparison this conveniently also flushes negative half floats to zero.
+    rg = _mm_andnot_si128(_mm_cmplt_epi16(rg, U32(k->_0x04000400)), rg);
+    ba = _mm_andnot_si128(_mm_cmplt_epi16(ba, U32(k->_0x04000400)), ba);
+
     auto half_to_float = [&](U32 h) {
         return bit_cast<F>(h << 13)               // Line up the mantissa,
              * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
@@ -464,8 +477,15 @@
     auto ptr = *(uint64_t**)ctx + x;
 
 #if !defined(JUMPER)
-    // TODO:
-    (void)ptr;
+    auto float_to_half = [&](F f) {
+        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
+            >> 13;                                                  // then line up the mantissa.
+    };
+    auto rgba = (int16_t*)ptr;
+    rgba[0] = float_to_half(r);
+    rgba[1] = float_to_half(g);
+    rgba[2] = float_to_half(b);
+    rgba[3] = float_to_half(a);
 #elif defined(__aarch64__)
     float16x4x4_t halfs = {{
         vcvt_f16_f32(r),