Add sk_memcpy32 to Memcpy bench.

The bench predates the sk_memcpy32 implementation in SkUtils, but now that we have it, we of course want to measure our actual implementation.

BUG=skia:
R=reed@google.com, mtklein@google.com

Author: mtklein@chromium.org

Review URL: https://codereview.chromium.org/302763006

git-svn-id: http://skia.googlecode.com/svn/trunk@14942 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/bench/MemcpyBench.cpp b/bench/MemcpyBench.cpp
index aec2a47..3fd6dcb 100644
--- a/bench/MemcpyBench.cpp
+++ b/bench/MemcpyBench.cpp
@@ -8,6 +8,7 @@
 #include "SkBenchmark.h"
 #include "SkRandom.h"
 #include "SkTemplates.h"
+#include "SkUtils.h"
 
 template <typename Memcpy32>
 class Memcpy32Bench : public SkBenchmark {
@@ -147,6 +148,13 @@
 BENCH(memcpy32_sse2_unalign, 10000)
 BENCH(memcpy32_sse2_unalign, 100000)
 
+// Benchmark sk_memcpy32, our chosen best implementation, from SkUtils.h
+BENCH(sk_memcpy32, 10)
+BENCH(sk_memcpy32, 100)
+BENCH(sk_memcpy32, 1000)
+BENCH(sk_memcpy32, 10000)
+BENCH(sk_memcpy32, 100000)
+
 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
 
 #undef BENCH