Implement a persistent uniqueID-based cache for SkImageFilter.

Add a unique ID to SkImageFilter, and use it as part
of a persistent cache of image-filtered results. This is used for
caching frame-to-frame coherent filters.

We also keep track of which filter subtrees do not reference the
src input, and use a GenID of zero for the src input in that case.
That way, subtrees which are not dependent on the filter input can be
cached independently of it.

This gives approximately a 4X speedup on
letmespellitoutforyou.com/samples/svg/filter_terrain.svg on Z620
and Nexus10. The cache key consists of the uniqueID of the filter, the
clip bounds, the CTM and the genID of the input bitmap.

Since this does not yet handle the case where the input primitives
(and part of the resulting filter tree) are unchanged, we have
to keep around the external cache for that painting case.
When the work to cache unchanging input primitives is done, the
old cache can be removed, and the new UniqueIDCache will be renamed
to Cache.

R=bsalomon@google.com, mtklein@google.com

Author: senorblanco@chromium.org

Review URL: https://codereview.chromium.org/414483003
diff --git a/include/core/SkBitmapDevice.h b/include/core/SkBitmapDevice.h
index e1765e5..5dde9e0 100644
--- a/include/core/SkBitmapDevice.h
+++ b/include/core/SkBitmapDevice.h
@@ -156,6 +156,8 @@
     virtual SkSurface* newSurface(const SkImageInfo&) SK_OVERRIDE;
     virtual const void* peekPixels(SkImageInfo*, size_t* rowBytes) SK_OVERRIDE;
 
+    virtual SkImageFilter::UniqueIDCache* getImageFilterCache() SK_OVERRIDE;
+
     SkBitmap    fBitmap;
 
     typedef SkBaseDevice INHERITED;
diff --git a/include/core/SkDevice.h b/include/core/SkDevice.h
index 4a9edee..504138d 100644
--- a/include/core/SkDevice.h
+++ b/include/core/SkDevice.h
@@ -378,6 +378,8 @@
      */
     virtual void flush() {}
 
+    virtual SkImageFilter::UniqueIDCache* getImageFilterCache() { return NULL; }
+
     SkIPoint    fOrigin;
     SkMetaData* fMetaData;
 
diff --git a/include/core/SkImageFilter.h b/include/core/SkImageFilter.h
index e7ba5e6..d2a4d3e 100644
--- a/include/core/SkImageFilter.h
+++ b/include/core/SkImageFilter.h
@@ -61,18 +61,30 @@
         virtual void remove(const SkImageFilter* key) = 0;
     };
 
+    // This cache maps from (filter's unique ID + CTM + clipBounds + src bitmap generation ID) to
+    // (result, offset).
+    class UniqueIDCache : public SkRefCnt {
+    public:
+        struct Key;
+        virtual ~UniqueIDCache() {}
+        static UniqueIDCache* Create(size_t maxBytes);
+        static UniqueIDCache* Get();
+        virtual bool get(const Key& key, SkBitmap* result, SkIPoint* offset) const = 0;
+        virtual void set(const Key& key, const SkBitmap& result, const SkIPoint& offset) = 0;
+    };
+
     class Context {
     public:
-        Context(const SkMatrix& ctm, const SkIRect& clipBounds, Cache* cache) :
+        Context(const SkMatrix& ctm, const SkIRect& clipBounds, UniqueIDCache* cache) :
             fCTM(ctm), fClipBounds(clipBounds), fCache(cache) {
         }
         const SkMatrix& ctm() const { return fCTM; }
         const SkIRect& clipBounds() const { return fClipBounds; }
-        Cache* cache() const { return fCache; }
+        UniqueIDCache* cache() const { return fCache; }
     private:
         SkMatrix fCTM;
         SkIRect  fClipBounds;
-        Cache*   fCache;
+        UniqueIDCache* fCache;
     };
 
     class Proxy {
@@ -210,6 +222,7 @@
         CropRect        cropRect() const { return fCropRect; }
         int             inputCount() const { return fInputs.count(); }
         SkImageFilter** inputs() const { return fInputs.get(); }
+        uint32_t        uniqueID() const { return fUniqueID; }
 
         // If the caller wants a copy of the inputs, call this and it will transfer ownership
         // of the unflattened input filters to the caller. This is just a short-cut for copying
@@ -221,6 +234,7 @@
         CropRect fCropRect;
         // most filters accept at most 2 input-filters
         SkAutoSTArray<2, SkImageFilter*> fInputs;
+        uint32_t fUniqueID;
 
         void allocInputs(int count);
     };
@@ -308,10 +322,14 @@
                              const SkIRect& bounds) const;
 
 private:
+    bool usesSrcInput() const { return fUsesSrcInput; }
+
     typedef SkFlattenable INHERITED;
     int fInputCount;
     SkImageFilter** fInputs;
+    bool fUsesSrcInput;
     CropRect fCropRect;
+    uint32_t fUniqueID; // Globally unique
 };
 
 #endif
diff --git a/include/core/SkPicture.h b/include/core/SkPicture.h
index 21ebef3..c733e53 100644
--- a/include/core/SkPicture.h
+++ b/include/core/SkPicture.h
@@ -230,13 +230,14 @@
     // V28: No longer call bitmap::flatten inside SkWriteBuffer::writeBitmap.
     // V29: Removed SaveFlags parameter from save().
     // V30: Remove redundant SkMatrix from SkLocalMatrixShader.
+    // V31: Add a serialized UniqueID to SkImageFilter.
 
     // Note: If the picture version needs to be increased then please follow the
     // steps to generate new SKPs in (only accessible to Googlers): http://goo.gl/qATVcw
 
     // Only SKPs within the min/current picture version range (inclusive) can be read.
     static const uint32_t MIN_PICTURE_VERSION = 19;
-    static const uint32_t CURRENT_PICTURE_VERSION = 30;
+    static const uint32_t CURRENT_PICTURE_VERSION = 31;
 
     mutable uint32_t      fUniqueID;
 
diff --git a/include/core/SkReadBuffer.h b/include/core/SkReadBuffer.h
index faf7eb8..2beb7ac 100644
--- a/include/core/SkReadBuffer.h
+++ b/include/core/SkReadBuffer.h
@@ -46,6 +46,7 @@
         kNoUnitMappers_Version             = 27,
         kNoMoreBitmapFlatten_Version       = 28,
         kSimplifyLocalMatrix_Version       = 30,
+        kImageFilterUniqueID_Version       = 31,
     };
 
     /**
diff --git a/include/gpu/SkGpuDevice.h b/include/gpu/SkGpuDevice.h
index 9a5a92e..b43213a 100644
--- a/include/gpu/SkGpuDevice.h
+++ b/include/gpu/SkGpuDevice.h
@@ -169,6 +169,8 @@
 
     virtual SkSurface* newSurface(const SkImageInfo&) SK_OVERRIDE;
 
+    virtual SkImageFilter::UniqueIDCache* getImageFilterCache() SK_OVERRIDE;
+
     // sets the render target, clip, and matrix on GrContext. Use forceIdenity to override
     // SkDraw's matrix and draw in device coords.
     void prepareDraw(const SkDraw&, bool forceIdentity);
diff --git a/src/core/SkBitmapDevice.cpp b/src/core/SkBitmapDevice.cpp
index 09b3b60..8c92f71 100644
--- a/src/core/SkBitmapDevice.cpp
+++ b/src/core/SkBitmapDevice.cpp
@@ -367,6 +367,12 @@
     return NULL;
 }
 
+SkImageFilter::UniqueIDCache* SkBitmapDevice::getImageFilterCache() {
+    SkImageFilter::UniqueIDCache* cache = SkImageFilter::UniqueIDCache::Get();
+    cache->ref();
+    return cache;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 bool SkBitmapDevice::filterTextFlags(const SkPaint& paint, TextFlags* flags) {
diff --git a/src/core/SkCanvas.cpp b/src/core/SkCanvas.cpp
index 504c990..a561316 100644
--- a/src/core/SkCanvas.cpp
+++ b/src/core/SkCanvas.cpp
@@ -1134,13 +1134,8 @@
             SkMatrix matrix = *iter.fMatrix;
             matrix.postTranslate(SkIntToScalar(-pos.x()), SkIntToScalar(-pos.y()));
             SkIRect clipBounds = SkIRect::MakeWH(srcDev->width(), srcDev->height());
-            SkImageFilter::Cache* cache = SkImageFilter::GetExternalCache();
-            SkAutoUnref aur(NULL);
-            if (!cache) {
-                cache = SkImageFilter::Cache::Create();
-                aur.reset(cache);
-            }
-            SkImageFilter::Context ctx(matrix, clipBounds, cache);
+            SkAutoTUnref<SkImageFilter::UniqueIDCache> cache(dstDev->getImageFilterCache());
+            SkImageFilter::Context ctx(matrix, clipBounds, cache.get());
             if (filter->filterImage(&proxy, src, ctx, &dst, &offset)) {
                 SkPaint tmpUnfiltered(*paint);
                 tmpUnfiltered.setImageFilter(NULL);
@@ -1179,13 +1174,8 @@
             SkMatrix matrix = *iter.fMatrix;
             matrix.postTranslate(SkIntToScalar(-pos.x()), SkIntToScalar(-pos.y()));
             SkIRect clipBounds = SkIRect::MakeWH(bitmap.width(), bitmap.height());
-            SkImageFilter::Cache* cache = SkImageFilter::GetExternalCache();
-            SkAutoUnref aur(NULL);
-            if (!cache) {
-                cache = SkImageFilter::Cache::Create();
-                aur.reset(cache);
-            }
-            SkImageFilter::Context ctx(matrix, clipBounds, cache);
+            SkAutoTUnref<SkImageFilter::UniqueIDCache> cache(iter.fDevice->getImageFilterCache());
+            SkImageFilter::Context ctx(matrix, clipBounds, cache.get());
             if (filter->filterImage(&proxy, bitmap, ctx, &dst, &offset)) {
                 SkPaint tmpUnfiltered(*paint);
                 tmpUnfiltered.setImageFilter(NULL);
diff --git a/src/core/SkImageFilter.cpp b/src/core/SkImageFilter.cpp
index 11a1420..5fa6855 100644
--- a/src/core/SkImageFilter.cpp
+++ b/src/core/SkImageFilter.cpp
@@ -10,10 +10,12 @@
 #include "SkBitmap.h"
 #include "SkChecksum.h"
 #include "SkDevice.h"
+#include "SkLazyPtr.h"
 #include "SkReadBuffer.h"
 #include "SkWriteBuffer.h"
 #include "SkRect.h"
 #include "SkTDynamicHash.h"
+#include "SkTInternalLList.h"
 #include "SkValidationUtils.h"
 #if SK_SUPPORT_GPU
 #include "GrContext.h"
@@ -21,6 +23,39 @@
 #include "SkGr.h"
 #endif
 
+enum { kDefaultCacheSize = 128 * 1024 * 1024 };
+
+static int32_t next_image_filter_unique_id() {
+    static int32_t gImageFilterUniqueID;
+
+    // Never return 0.
+    int32_t id;
+    do {
+        id = sk_atomic_inc(&gImageFilterUniqueID) + 1;
+    } while (0 == id);
+    return id;
+}
+
+struct SkImageFilter::UniqueIDCache::Key {
+    Key(const uint32_t uniqueID, const SkMatrix& matrix, const SkIRect& clipBounds, uint32_t srcGenID)
+      : fUniqueID(uniqueID), fMatrix(matrix), fClipBounds(clipBounds), fSrcGenID(srcGenID) {
+        // Assert that Key is tightly-packed, since it is hashed.
+        SK_COMPILE_ASSERT(sizeof(Key) == sizeof(uint32_t) + sizeof(SkMatrix) + sizeof(SkIRect) +
+                                         sizeof(uint32_t), image_filter_key_tight_packing);
+        fMatrix.getType();  // force initialization of type, so hashes match
+    }
+    uint32_t fUniqueID;
+    SkMatrix fMatrix;
+    SkIRect fClipBounds;
+    uint32_t fSrcGenID;
+    bool operator==(const Key& other) const {
+        return fUniqueID == other.fUniqueID
+            && fMatrix == other.fMatrix
+            && fClipBounds == other.fClipBounds
+            && fSrcGenID == other.fSrcGenID;
+    }
+};
+
 SkImageFilter::Common::~Common() {
     for (int i = 0; i < fInputs.count(); ++i) {
         SkSafeUnref(fInputs[i]);
@@ -65,6 +100,11 @@
     
     uint32_t flags = buffer.readUInt();
     fCropRect = CropRect(rect, flags);
+    if (buffer.isVersionLT(SkReadBuffer::kImageFilterUniqueID_Version)) {
+        fUniqueID = next_image_filter_unique_id();
+    } else {
+        fUniqueID = buffer.readUInt();
+    }
     return buffer.isValid();
 }
 
@@ -75,8 +115,13 @@
 SkImageFilter::SkImageFilter(int inputCount, SkImageFilter** inputs, const CropRect* cropRect)
   : fInputCount(inputCount),
     fInputs(new SkImageFilter*[inputCount]),
-    fCropRect(cropRect ? *cropRect : CropRect(SkRect(), 0x0)) {
+    fUsesSrcInput(false),
+    fCropRect(cropRect ? *cropRect : CropRect(SkRect(), 0x0)),
+    fUniqueID(next_image_filter_unique_id()) {
     for (int i = 0; i < inputCount; ++i) {
+        if (NULL == inputs[i] || inputs[i]->usesSrcInput()) {
+            fUsesSrcInput = true;
+        }
         fInputs[i] = inputs[i];
         SkSafeRef(fInputs[i]);
     }
@@ -89,13 +134,20 @@
     delete[] fInputs;
 }
 
-SkImageFilter::SkImageFilter(int inputCount, SkReadBuffer& buffer) {
+SkImageFilter::SkImageFilter(int inputCount, SkReadBuffer& buffer)
+  : fUsesSrcInput(false) {
     Common common;
     if (common.unflatten(buffer, inputCount)) {
         fCropRect = common.cropRect();
         fInputCount = common.inputCount();
         fInputs = SkNEW_ARRAY(SkImageFilter*, fInputCount);
         common.detachInputs(fInputs);
+        for (int i = 0; i < fInputCount; ++i) {
+            if (NULL == fInputs[i] || fInputs[i]->usesSrcInput()) {
+                fUsesSrcInput = true;
+            }
+        }
+        fUniqueID = buffer.isCrossProcess() ? next_image_filter_unique_id() : common.uniqueID();
     } else {
         fInputCount = 0;
         fInputs = NULL;
@@ -113,17 +165,25 @@
     }
     buffer.writeRect(fCropRect.rect());
     buffer.writeUInt(fCropRect.flags());
+    buffer.writeUInt(fUniqueID);
 }
 
 bool SkImageFilter::filterImage(Proxy* proxy, const SkBitmap& src,
                                 const Context& context,
                                 SkBitmap* result, SkIPoint* offset) const {
-    Cache* cache = context.cache();
     SkASSERT(result);
     SkASSERT(offset);
-    SkASSERT(cache);
-    if (cache->get(this, result, offset)) {
-        return true;
+    uint32_t srcGenID = fUsesSrcInput ? src.getGenerationID() : 0;
+    Cache* externalCache = GetExternalCache();
+    UniqueIDCache::Key key(fUniqueID, context.ctm(), context.clipBounds(), srcGenID);
+    if (NULL != externalCache) {
+        if (externalCache->get(this, result, offset)) {
+            return true;
+        }
+    } else if (context.cache()) {
+        if (context.cache()->get(key, result, offset)) {
+            return true;
+        }
     }
     /*
      *  Give the proxy first shot at the filter. If it returns false, ask
@@ -131,7 +191,11 @@
      */
     if ((proxy && proxy->filterImage(this, src, context, result, offset)) ||
         this->onFilterImage(proxy, src, context, result, offset)) {
-        cache->set(this, *result, *offset);
+        if (externalCache) {
+            externalCache->set(this, *result, *offset);
+        } else if (context.cache()) {
+            context.cache()->set(key, *result, *offset);
+        }
         return true;
     }
     return false;
@@ -439,3 +503,93 @@
         delete v;
     }
 }
+
+namespace {
+
+class UniqueIDCacheImpl : public SkImageFilter::UniqueIDCache {
+public:
+    UniqueIDCacheImpl(size_t maxBytes) : fMaxBytes(maxBytes), fCurrentBytes(0) {
+    }
+    virtual ~UniqueIDCacheImpl() {
+        SkTDynamicHash<Value, Key>::Iter iter(&fLookup);
+
+        while (!iter.done()) {
+            Value* v = &*iter;
+            ++iter;
+            delete v;
+        }
+    }
+    struct Value {
+        Value(const Key& key, const SkBitmap& bitmap, const SkIPoint& offset)
+            : fKey(key), fBitmap(bitmap), fOffset(offset) {}
+        Key fKey;
+        SkBitmap fBitmap;
+        SkIPoint fOffset;
+        static const Key& GetKey(const Value& v) {
+            return v.fKey;
+        }
+        static uint32_t Hash(const Key& key) {
+            return SkChecksum::Murmur3(reinterpret_cast<const uint32_t*>(&key), sizeof(Key));
+        }
+        SK_DECLARE_INTERNAL_LLIST_INTERFACE(Value);
+    };
+    virtual bool get(const Key& key, SkBitmap* result, SkIPoint* offset) const {
+        SkAutoMutexAcquire mutex(fMutex);
+        if (Value* v = fLookup.find(key)) {
+            *result = v->fBitmap;
+            *offset = v->fOffset;
+            if (v != fLRU.head()) {
+                fLRU.remove(v);
+                fLRU.addToHead(v);
+            }
+            return true;
+        }
+        return false;
+    }
+    virtual void set(const Key& key, const SkBitmap& result, const SkIPoint& offset) {
+        SkAutoMutexAcquire mutex(fMutex);
+        if (Value* v = fLookup.find(key)) {
+            removeInternal(v);
+        }
+        Value* v = new Value(key, result, offset);
+        fLookup.add(v);
+        fLRU.addToHead(v);
+        fCurrentBytes += result.getSize();
+        while (fCurrentBytes > fMaxBytes) {
+            Value* tail = fLRU.tail();
+            SkASSERT(tail);
+            if (tail == v) {
+                break;
+            }
+            removeInternal(tail);
+        }
+    }
+private:
+    void removeInternal(Value* v) {
+        fCurrentBytes -= v->fBitmap.getSize();
+        fLRU.remove(v);
+        fLookup.remove(v->fKey);
+        delete v;
+    }
+private:
+    SkTDynamicHash<Value, Key>         fLookup;
+    mutable SkTInternalLList<Value>    fLRU;
+    size_t                             fMaxBytes;
+    size_t                             fCurrentBytes;
+    mutable SkMutex                    fMutex;
+};
+
+SkImageFilter::UniqueIDCache* CreateCache() {
+    return SkImageFilter::UniqueIDCache::Create(kDefaultCacheSize);
+}
+
+} // namespace
+
+SkImageFilter::UniqueIDCache* SkImageFilter::UniqueIDCache::Create(size_t maxBytes) {
+    return SkNEW_ARGS(UniqueIDCacheImpl, (maxBytes));
+}
+
+SkImageFilter::UniqueIDCache* SkImageFilter::UniqueIDCache::Get() {
+    SK_DECLARE_STATIC_LAZY_PTR(SkImageFilter::UniqueIDCache, cache, CreateCache);
+    return cache.get();
+}
diff --git a/src/gpu/SkGpuDevice.cpp b/src/gpu/SkGpuDevice.cpp
index 0abf9d8..2853805 100644
--- a/src/gpu/SkGpuDevice.cpp
+++ b/src/gpu/SkGpuDevice.cpp
@@ -41,6 +41,8 @@
 #include "SkXfermode.h"
 #include "SkErrorInternals.h"
 
+enum { kDefaultImageFilterCacheSize = 32 * 1024 * 1024 };
+
 #define CACHE_COMPATIBLE_DEVICE_TEXTURES 1
 
 #if 0
@@ -1425,8 +1427,9 @@
         SkMatrix matrix(*draw.fMatrix);
         matrix.postTranslate(SkIntToScalar(-left), SkIntToScalar(-top));
         SkIRect clipBounds = SkIRect::MakeWH(bitmap.width(), bitmap.height());
-        SkImageFilter::Cache* cache = SkImageFilter::Cache::Create();
-        SkAutoUnref aur(cache);
+        SkAutoTUnref<SkImageFilter::UniqueIDCache> cache(getImageFilterCache());
+        // This cache is transient, and is freed (along with all its contained
+        // textures) when it goes out of scope.
         SkImageFilter::Context ctx(matrix, clipBounds, cache);
         if (filter_texture(this, fContext, texture, filter, w, h, ctx, &filteredBitmap,
                            &offset)) {
@@ -1535,8 +1538,9 @@
         SkMatrix matrix(*draw.fMatrix);
         matrix.postTranslate(SkIntToScalar(-x), SkIntToScalar(-y));
         SkIRect clipBounds = SkIRect::MakeWH(devTex->width(), devTex->height());
-        SkImageFilter::Cache* cache = SkImageFilter::Cache::Create();
-        SkAutoUnref aur(cache);
+        // This cache is transient, and is freed (along with all its contained
+        // textures) when it goes out of scope.
+        SkAutoTUnref<SkImageFilter::UniqueIDCache> cache(getImageFilterCache());
         SkImageFilter::Context ctx(matrix, clipBounds, cache);
         if (filter_texture(this, fContext, devTex, filter, w, h, ctx, &filteredBitmap,
                            &offset)) {
@@ -2060,3 +2064,9 @@
 
     return true;
 }
+
+SkImageFilter::UniqueIDCache* SkGpuDevice::getImageFilterCache() {
+    // We always return a transient cache, so it is freed after each
+    // filter traversal.
+    return SkImageFilter::UniqueIDCache::Create(kDefaultImageFilterCacheSize);
+}
diff --git a/tests/ImageFilterTest.cpp b/tests/ImageFilterTest.cpp
index 7da4a91..0766c15 100644
--- a/tests/ImageFilterTest.cpp
+++ b/tests/ImageFilterTest.cpp
@@ -262,8 +262,7 @@
         SkIPoint offset;
         SkString str;
         str.printf("filter %d", static_cast<int>(i));
-        SkAutoTUnref<SkImageFilter::Cache> cache(SkImageFilter::Cache::Create(2));
-        SkImageFilter::Context ctx(SkMatrix::I(), SkIRect::MakeLargest(), cache.get());
+        SkImageFilter::Context ctx(SkMatrix::I(), SkIRect::MakeLargest(), NULL);
         REPORTER_ASSERT_MESSAGE(reporter, filter->filterImage(&proxy, bitmap, ctx,
                                 &result, &offset), str.c_str());
         REPORTER_ASSERT_MESSAGE(reporter, offset.fX == 20 && offset.fY == 30, str.c_str());