Make GrResourceCache perf less sensitive to key length change

Make GrResourceCache performance less sensitive to key length changes.
The memcmp in GrResourceKey is called when SkTDynamicHash jumps between
slots to find the hashed entry by an index. Avoid most of the memcmps by
comparing the hash first.

This is important because small changes in key data length can cause
big performance regressions. The theory is that a key length change
produces different hash values. These hash values might trigger memcmps
that originally weren't there, causing the regression.

Adds a few specialized benches to grresourcecache_add and
grresourcecache_find to test different key lengths. The tests are run
only in release builds, because in debug builds the SkTDynamicHash
validation takes too long, and adding many such delays to development
test runs would be unproductive. In release builds the tests are quite
fast.

Effect of this patch on the added tests on amd64:
grresourcecache_find_10  738us ->  768us        1.04x
 grresourcecache_find_2  472us ->  476us        1.01x
grresourcecache_find_25  841us ->  845us        1x
 grresourcecache_find_4  565us ->  531us        0.94x
grresourcecache_find_54 1.18ms ->  1.1ms        0.93x
 grresourcecache_find_5  834us ->  749us        0.9x
 grresourcecache_find_3  620us ->  542us        0.87x
 grresourcecache_add_25 2.74ms -> 2.24ms        0.82x
 grresourcecache_add_56 3.23ms -> 2.56ms        0.79x
 grresourcecache_add_54 3.34ms -> 2.62ms        0.78x
  grresourcecache_add_5 2.68ms ->  2.1ms        0.78x
 grresourcecache_add_10  2.7ms -> 2.11ms        0.78x
  grresourcecache_add_2 1.85ms -> 1.41ms        0.76x
    grresourcecache_add 1.84ms ->  1.4ms        0.76x
  grresourcecache_add_4 1.99ms -> 1.49ms        0.75x
  grresourcecache_add_3 2.11ms -> 1.55ms        0.73x
 grresourcecache_add_55   39ms -> 13.9ms        0.36x
grresourcecache_find_55 23.2ms -> 6.21ms        0.27x

On arm64 the results are similar.

On arm_v7_neon, the results lack the discontinuity at 55:
    grresourcecache_add 4.06ms -> 4.26ms        1.05x
  grresourcecache_add_2 4.05ms -> 4.23ms        1.05x
   grresourcecache_find 1.28ms ->  1.3ms        1.02x
grresourcecache_find_56 3.35ms -> 3.32ms        0.99x
 grresourcecache_find_2 1.31ms -> 1.29ms        0.99x
grresourcecache_find_54 3.28ms -> 3.24ms        0.99x
  grresourcecache_add_5 6.38ms -> 6.26ms        0.98x
 grresourcecache_add_55 8.44ms -> 8.24ms        0.98x
 grresourcecache_add_25 7.03ms -> 6.86ms        0.98x
grresourcecache_find_25  2.7ms -> 2.59ms        0.96x
 grresourcecache_find_4 1.45ms -> 1.38ms        0.95x
grresourcecache_find_10 2.52ms -> 2.39ms        0.95x
grresourcecache_find_55 3.54ms -> 3.33ms        0.94x
 grresourcecache_find_5  2.5ms -> 2.32ms        0.93x
 grresourcecache_find_3 1.57ms -> 1.43ms        0.91x

The extremely slow case, 55, is postulated to be due to index jump
collisions triggering the memcmp. This is not visible on arm_v7_neon,
probably because the hash function produces different results on 32-bit
architectures.

This change is needed for extending the path cache key in the Gr
NV_path_rendering codepath. The extension is needed in order to add
dashed paths to the path cache.

Review URL: https://codereview.chromium.org/1132723003
diff --git a/bench/GrResourceCacheBench.cpp b/bench/GrResourceCacheBench.cpp
index 5807654..58b0ee8 100644
--- a/bench/GrResourceCacheBench.cpp
+++ b/bench/GrResourceCacheBench.cpp
@@ -29,22 +29,23 @@
         this->registerWithCache();
     }
 
-    static void ComputeKey(int i, GrUniqueKey* key) {
+    static void ComputeKey(int i, int keyData32Count, GrUniqueKey* key) {
         static GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
-        GrUniqueKey::Builder builder(key, kDomain, 1);
-        builder[0] = i;
+        GrUniqueKey::Builder builder(key, kDomain, keyData32Count);
+        for (int j = 0; j < keyData32Count; ++j) {
+            builder[j] = i + j;
+        }
     }
 
 private:
     size_t onGpuMemorySize() const override { return 100; }
-
     typedef GrGpuResource INHERITED;
 };
 
-static void populate_cache(GrGpu* gpu, int resourceCount) {
+static void populate_cache(GrGpu* gpu, int resourceCount, int keyData32Count) {
     for (int i = 0; i < resourceCount; ++i) {
         GrUniqueKey key;
-        BenchResource::ComputeKey(i, &key);
+        BenchResource::ComputeKey(i, keyData32Count, &key);
         GrGpuResource* resource = SkNEW_ARGS(BenchResource, (gpu));
         resource->resourcePriv().setUniqueKey(key);
         resource->unref();
@@ -53,13 +54,20 @@
 
 class GrResourceCacheBenchAdd : public Benchmark {
 public:
+    GrResourceCacheBenchAdd(int keyData32Count)
+        : fFullName("grresourcecache_add")
+        , fKeyData32Count(keyData32Count) {
+        if (keyData32Count > 1) {
+            fFullName.appendf("_%d", fKeyData32Count);
+        }
+    }
+
     bool isSuitableFor(Backend backend) override {
         return backend == kNonRendering_Backend;
     }
-
 protected:
     const char* onGetName() override {
-        return "grresourcecache_add";
+        return fFullName.c_str();
     }
 
     void onDraw(const int loops, SkCanvas* canvas) override {
@@ -79,24 +87,33 @@
         GrGpu* gpu = context->getGpu();
 
         for (int i = 0; i < loops; ++i) {
-            populate_cache(gpu, CACHE_SIZE_COUNT);
+            populate_cache(gpu, CACHE_SIZE_COUNT, fKeyData32Count);
             SkASSERT(CACHE_SIZE_COUNT == cache->getResourceCount());
         }
     }
 
 private:
+    SkString fFullName;
+    int fKeyData32Count;
     typedef Benchmark INHERITED;
 };
 
 class GrResourceCacheBenchFind : public Benchmark {
 public:
+    GrResourceCacheBenchFind(int keyData32Count)
+        : fFullName("grresourcecache_find")
+        , fKeyData32Count(keyData32Count) {
+        if (keyData32Count > 1) {
+            fFullName.appendf("_%d", fKeyData32Count);
+        }
+    }
+
     bool isSuitableFor(Backend backend) override {
         return backend == kNonRendering_Backend;
     }
-
 protected:
     const char* onGetName() override {
-        return "grresourcecache_find";
+        return fFullName.c_str();
     }
 
     void onPreDraw() override {
@@ -115,7 +132,7 @@
 
         GrGpu* gpu = fContext->getGpu();
 
-        populate_cache(gpu, CACHE_SIZE_COUNT);
+        populate_cache(gpu, CACHE_SIZE_COUNT, fKeyData32Count);
     }
 
     void onDraw(const int loops, SkCanvas* canvas) override {
@@ -127,7 +144,7 @@
         for (int i = 0; i < loops; ++i) {
             for (int k = 0; k < CACHE_SIZE_COUNT; ++k) {
                 GrUniqueKey key;
-                BenchResource::ComputeKey(k, &key);
+                BenchResource::ComputeKey(k, fKeyData32Count, &key);
                 SkAutoTUnref<GrGpuResource> resource(cache->findAndRefUniqueResource(key));
                 SkASSERT(resource);
             }
@@ -136,10 +153,36 @@
 
 private:
     SkAutoTUnref<GrContext> fContext;
+    SkString fFullName;
+    int fKeyData32Count;
     typedef Benchmark INHERITED;
 };
 
-DEF_BENCH( return new GrResourceCacheBenchAdd(); )
-DEF_BENCH( return new GrResourceCacheBenchFind(); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(1); )
+#ifdef SK_RELEASE
+// Only on release because on debug the SkTDynamicHash validation is too slow.
+DEF_BENCH( return new GrResourceCacheBenchAdd(2); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(3); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(4); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(5); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(10); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(25); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(54); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(55); )
+DEF_BENCH( return new GrResourceCacheBenchAdd(56); )
+#endif
+
+DEF_BENCH( return new GrResourceCacheBenchFind(1); )
+#ifdef SK_RELEASE
+DEF_BENCH( return new GrResourceCacheBenchFind(2); )
+DEF_BENCH( return new GrResourceCacheBenchFind(3); )
+DEF_BENCH( return new GrResourceCacheBenchFind(4); )
+DEF_BENCH( return new GrResourceCacheBenchFind(5); )
+DEF_BENCH( return new GrResourceCacheBenchFind(10); )
+DEF_BENCH( return new GrResourceCacheBenchFind(25); )
+DEF_BENCH( return new GrResourceCacheBenchFind(54); )
+DEF_BENCH( return new GrResourceCacheBenchFind(55); )
+DEF_BENCH( return new GrResourceCacheBenchFind(56); )
+#endif
 
 #endif