SkHalfToFloat_01 / SkFloatToHalf_01

These are basically inlined, 4-at-a-time versions of our existing functions,
but cut down to avoid any work that's only necessary outside [0,1].

Both f16 and f32 denorms should work fine, modulo the usual ARMv7 NEON caveat that denorms are treated as zero.

In exchange for a little speed, f32->f16 does not round properly.
Instead it truncates, so it's never off by more than 1 bit: the result either
matches SkFloatToHalf or is 1 below it.

Support for finite values >1 or <0 is straightforward to add back.
>1 might already work as-is.

Getting close to _u16 performance:
    micros   	bench
    261.13  	xferu64_bw_1_opaque_u16
   1833.51  	xferu64_bw_1_alpha_u16
   2762.32 ?	xferu64_aa_1_opaque_u16
   3334.29  	xferu64_aa_1_alpha_u16
    249.78  	xferu64_bw_1_opaque_f16
   3383.18  	xferu64_bw_1_alpha_f16
   4214.72  	xferu64_aa_1_opaque_f16
   4701.19  	xferu64_aa_1_alpha_f16

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1685133005

Committed: https://skia.googlesource.com/skia/+/9ea11a4235b3e3521cc8bf914a27c2d0dc062db9

CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1685133005
diff --git a/tests/Float16Test.cpp b/tests/Float16Test.cpp
index f437268..f96d904 100644
--- a/tests/Float16Test.cpp
+++ b/tests/Float16Test.cpp
@@ -10,6 +10,7 @@
 #include "SkHalf.h"
 #include "SkOpts.h"
 #include "SkPixmap.h"
+#include "SkRandom.h"
 
 static bool eq_within_half_float(float a, float b) {
     const float kTolerance = 1.0f / (1 << (8 + 10));
@@ -64,3 +65,37 @@
     SkOpts::half_to_float(fscratch, hs, 7);
     REPORTER_ASSERT(reporter, 0 == memcmp(fscratch, fs, sizeof(fs)));
 }
+
+DEF_TEST(HalfToFloat_01, r) {
+    for (uint16_t h = 0; h < 0x8000; h++) {
+        float f = SkHalfToFloat(h);
+        if (f >= 0 && f <= 1) {
+            REPORTER_ASSERT(r, SkHalfToFloat_01(h)[0] == f);
+            REPORTER_ASSERT(r, SkFloatToHalf_01(SkHalfToFloat_01(h)) == h);
+        }
+    }
+}
+
+DEF_TEST(FloatToHalf_01, r) {
+#if 0
+    for (uint32_t bits = 0; bits < 0x80000000; bits++) {
+#else
+    SkRandom rand;
+    for (int i = 0; i < 1000000; i++) {
+        uint32_t bits = rand.nextU();
+#endif
+        float f;
+        memcpy(&f, &bits, 4);
+        if (f >= 0 && f <= 1) {
+            uint16_t h1 = (uint16_t)SkFloatToHalf_01(Sk4f(f,0,0,0)),
+                     h2 = SkFloatToHalf(f);
+            bool ok = (h1 == h2 || h1 == h2-1);
+            REPORTER_ASSERT(r, ok);
+            if (!ok) {
+                SkDebugf("%08x (%d) -> %04x (%d), want %04x (%d)\n",
+                         bits, bits>>23, h1, h1>>10, h2, h2>>10);
+                break;
+            }
+        }
+    }
+}