explicitly vectorize sk_memset{16,32,64}

This ought to help clients who don't enable autovectorization.

With autovectorization enabled, this new version ends up far more
heavily vectorized than the old autovectorized loop.
Instead of handling at most 128 bytes per loop iteration, it now
handles up to 512 bytes per iteration.  Pretty exciting.

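Locally the perf effects are mixed (in the numbers below, counts of
1 and 100 get slower while the other sizes are flat or faster), but
we'd expect this to help Chrome unambiguously if they've turned off
autovectorization.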

  $ out/ok bench:samples=100 sw filter:match=memset32_\\d\* serial

  Before:
    [memset32_100000]   16ms    @0  20.1ms  @99 20.2ms  @100
    [memset32_10000]    1.07ms  @0  1.26ms  @99 1.31ms  @100
    [memset32_1000]     73.9µs  @0  89.4µs  @99 90.1µs  @100
    [memset32_100]      8.59µs  @0  9.74µs  @99 9.96µs  @100
    [memset32_10]       7.45µs  @0  8.96µs  @99 8.99µs  @100
    [memset32_1]        2.29µs  @0  2.81µs  @99 2.92µs  @100

  After:
    [memset32_100000]   16.2ms  @0  17.3ms  @99 17.3ms  @100
    [memset32_10000]    1.06ms  @0  1.18ms  @99 1.23ms  @100
    [memset32_1000]     72µs    @0  75.6µs  @99 84.7µs  @100
    [memset32_100]      9.14µs  @0  10.6µs  @99 10.7µs  @100
    [memset32_10]       5.43µs  @0  5.88µs  @99 5.99µs  @100
    [memset32_1]        3.43µs  @0  3.65µs  @99 3.83µs  @100

BUG=chromium:755391

Change-Id: If9059a30ca7a345f1f7c37bd51473c29e8bb8922
Reviewed-on: https://skia-review.googlesource.com/34746
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/opts/SkUtils_opts.h b/src/opts/SkUtils_opts.h
index ba93305..d67a777 100644
--- a/src/opts/SkUtils_opts.h
+++ b/src/opts/SkUtils_opts.h
@@ -8,24 +8,36 @@
 #ifndef SkUtils_opts_DEFINED
 #define SkUtils_opts_DEFINED
 
-#include "stdint.h"
+#include <stdint.h>
+#include "SkNx.h"
 
 namespace SK_OPTS_NS {
 
-    static void memset16(uint16_t buffer[], uint16_t value, int count) {
-        for (int i = 0; i < count; i++) {
-            buffer[i] = value;
+    template <typename T>
+    static void memsetT(T buffer[], T value, int count) {
+    #if defined(__AVX__)
+        static const int N = 32 / sizeof(T);
+    #else
+        static const int N = 16 / sizeof(T);
+    #endif
+        while (count >= N) {
+            SkNx<N,T>(value).store(buffer);
+            buffer += N;
+            count  -= N;
         }
+        while (count --> 0) {
+            *buffer++ = value;
+        }
+    }
+
+    static void memset16(uint16_t buffer[], uint16_t value, int count) {
+        memsetT(buffer, value, count);
     }
     static void memset32(uint32_t buffer[], uint32_t value, int count) {
-        for (int i = 0; i < count; i++) {
-            buffer[i] = value;
-        }
+        memsetT(buffer, value, count);
     }
     static void memset64(uint64_t buffer[], uint64_t value, int count) {
-        for (int i = 0; i < count; i++) {
-            buffer[i] = value;
-        }
+        memsetT(buffer, value, count);
     }
 
 }