Add mulHi to SkNx

Add mulHi to the base SkNx, with specialized Sk4u implementations for
NEON and SSE.

Add casts for converting 4 x uint8_t lanes to 4 x uint32_t lanes.

Cq-Include-Trybots: skia.primary:Test-Debian9-Clang-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD
Change-Id: I29a32e2ad9812a47fff841ceca334e562362836f
Reviewed-on: https://skia-review.googlesource.com/57960
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Herb Derby <herb@google.com>
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index 240d7e0..afa6750 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -165,6 +165,20 @@
     }
 }
 
+DEF_TEST(SkNi_mulHi, r) {  // Checks a.mulHi(b) on Sk4u, one lane at a time, against known products.
+    // First 8 primes, each shifted left by 16, so every full lane product is (p*q) << 32.
+    Sk4u a{ 0x00020000, 0x00030000, 0x00050000, 0x00070000 };
+    Sk4u b{ 0x000b0000, 0x000d0000, 0x00110000, 0x00130000 };
+
+    Sk4u q{22, 39, 85, 133};  // 2*11, 3*13, 5*17, 7*19: the upper 32 bits of each product.
+
+    Sk4u c = a.mulHi(b);
+    REPORTER_ASSERT(r, c[0] == q[0]);
+    REPORTER_ASSERT(r, c[1] == q[1]);
+    REPORTER_ASSERT(r, c[2] == q[2]);
+    REPORTER_ASSERT(r, c[3] == q[3]);
+}
+
 DEF_TEST(Sk4px_muldiv255round, r) {
     for (int a = 0; a < (1<<8); a++) {
     for (int b = 0; b < (1<<8); b++) {