explicitly vectorize sk_memset{16,32,64}
This ought to help clients who don't enable autovectorization.
With autovectorization enabled, this new version ends up even more
heavily vectorized than the old autovectorized loops: instead of
handling at most 128 bytes per loop iteration, it now handles up to
512 bytes per iteration. Pretty exciting.
Locally, the perf effects are mixed, but we'd expect this to
unambiguously help Chrome if they've turned off autovectorization.
$ out/ok bench:samples=100 sw filter:match=memset32_\\d\* serial
Before:
[memset32_100000] 16ms @0 20.1ms @99 20.2ms @100
[memset32_10000] 1.07ms @0 1.26ms @99 1.31ms @100
[memset32_1000] 73.9µs @0 89.4µs @99 90.1µs @100
[memset32_100] 8.59µs @0 9.74µs @99 9.96µs @100
[memset32_10] 7.45µs @0 8.96µs @99 8.99µs @100
[memset32_1] 2.29µs @0 2.81µs @99 2.92µs @100
After:
[memset32_100000] 16.2ms @0 17.3ms @99 17.3ms @100
[memset32_10000] 1.06ms @0 1.18ms @99 1.23ms @100
[memset32_1000] 72µs @0 75.6µs @99 84.7µs @100
[memset32_100] 9.14µs @0 10.6µs @99 10.7µs @100
[memset32_10] 5.43µs @0 5.88µs @99 5.99µs @100
[memset32_1] 3.43µs @0 3.65µs @99 3.83µs @100
BUG=chromium:755391
Change-Id: If9059a30ca7a345f1f7c37bd51473c29e8bb8922
Reviewed-on: https://skia-review.googlesource.com/34746
Reviewed-by: Florin Malita <fmalita@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/opts/SkUtils_opts.h b/src/opts/SkUtils_opts.h
index ba93305..d67a777 100644
--- a/src/opts/SkUtils_opts.h
+++ b/src/opts/SkUtils_opts.h
@@ -8,24 +8,36 @@
#ifndef SkUtils_opts_DEFINED
#define SkUtils_opts_DEFINED
-#include "stdint.h"
+#include <stdint.h>
+#include "SkNx.h"
namespace SK_OPTS_NS {
- static void memset16(uint16_t buffer[], uint16_t value, int count) {
- for (int i = 0; i < count; i++) {
- buffer[i] = value;
+ template <typename T>
+ static void memsetT(T buffer[], T value, int count) {
+ #if defined(__AVX__)
+ static const int N = 32 / sizeof(T);
+ #else
+ static const int N = 16 / sizeof(T);
+ #endif
+ while (count >= N) {
+ SkNx<N,T>(value).store(buffer);
+ buffer += N;
+ count -= N;
}
+ while (count --> 0) {
+ *buffer++ = value;
+ }
+ }
+
+ static void memset16(uint16_t buffer[], uint16_t value, int count) {
+ memsetT(buffer, value, count);
}
static void memset32(uint32_t buffer[], uint32_t value, int count) {
- for (int i = 0; i < count; i++) {
- buffer[i] = value;
- }
+ memsetT(buffer, value, count);
}
static void memset64(uint64_t buffer[], uint64_t value, int count) {
- for (int i = 0; i < count; i++) {
- buffer[i] = value;
- }
+ memsetT(buffer, value, count);
}
}