Start from scratch on a faster SkFlatDictionary.

This is like codereview.chromium.org/19276003, except it fits in better with the existing code, doesn't leak memory, and because it's back using SkChunkFlatController it's a little faster too, now a win across the board:

Slowdown	bench
-1.59%	desk_youtubetvbrowse.skp
-2.56%	desk_googlehome.skp
-6.40%	tabl_androidpolice.skp
-6.45%	desk_youtubetvvideo.skp
-6.91%	tabl_googlecalendar.skp
...
-29.70%	desk_yahoogames.skp
-32.17%	desk_googlespreadsheet.skp
-32.23%	mobi_wikipedia.skp
-37.16%	desk_chalkboard.skp
-41.57%	desk_pokemonwiki.skp
Overall slowdown: -22.74%

running bench [640 480] picture_record_recurring_paint_dictionary  NONRENDERING: cmsecs =   9.92
running bench [640 480] picture_record_unique_paint_dictionary  NONRENDERING: cmsecs =  22.16
running bench [640 480]  picture_record_dictionaries  NONRENDERING: cmsecs =   9.18

BUG=

Committed: http://code.google.com/p/skia/source/detail?r=10328

R=tomhudson@google.com, reed@google.com, scroggo@google.com

Author: mtklein@google.com

Review URL: https://chromiumcodereview.appspot.com/19564007

git-svn-id: http://skia.googlecode.com/svn/trunk@10336 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/include/core/SkWriter32.h b/include/core/SkWriter32.h
index 51044ab..82cf346 100644
--- a/include/core/SkWriter32.h
+++ b/include/core/SkWriter32.h
@@ -46,10 +46,17 @@
 
     // return the current offset (will always be a multiple of 4)
     uint32_t bytesWritten() const { return fSize; }
-    // DEPRECATED: use byetsWritten instead
+    // DEPRECATED: use bytesWritten instead  TODO(mtklein): clean up
     uint32_t  size() const { return this->bytesWritten(); }
 
-    void      reset();
+    // Returns true if we've written only into the storage passed into constructor or reset.
+    // (You may be able to use this to avoid a call to flatten.)
+    bool wroteOnlyToStorage() const {
+        return fHead == &fExternalBlock && this->bytesWritten() <= fExternalBlock.fSizeOfBlock;
+    }
+
+    void reset();
+    void reset(void* storage, size_t size);
 
     // size MUST be multiple of 4
     uint32_t* reserve(size_t size) {
@@ -63,8 +70,6 @@
         return block->alloc(size);
     }
 
-    void reset(void* storage, size_t size);
-
     bool writeBool(bool value) {
         this->writeInt(value);
         return value;
diff --git a/src/core/SkOrderedWriteBuffer.h b/src/core/SkOrderedWriteBuffer.h
index ebec7d1..180f9a4 100644
--- a/src/core/SkOrderedWriteBuffer.h
+++ b/src/core/SkOrderedWriteBuffer.h
@@ -26,18 +26,25 @@
 class SkOrderedWriteBuffer : public SkFlattenableWriteBuffer {
 public:
     SkOrderedWriteBuffer(size_t minSize);
-    SkOrderedWriteBuffer(size_t minSize, void* initialStorage,
-                         size_t storageSize);
+    SkOrderedWriteBuffer(size_t minSize, void* initialStorage, size_t storageSize);
     virtual ~SkOrderedWriteBuffer();
 
     virtual bool isOrderedBinaryBuffer() SK_OVERRIDE { return true; }
     virtual SkOrderedWriteBuffer* getOrderedBinaryBuffer() SK_OVERRIDE { return this; }
 
     SkWriter32* getWriter32() { return &fWriter; }
+    void reset(void* storage, size_t storageSize) { fWriter.reset(storage, storageSize); }
+
+    // Returns true if we've written only into the storage passed into constructor or reset.
+    // (You may be able to use this to avoid a call to writeToMemory.)
+    bool wroteOnlyToStorage() const { return fWriter.wroteOnlyToStorage(); }
 
     void writeToMemory(void* dst) { fWriter.flatten(dst); }
     uint32_t* reserve(size_t size) { return fWriter.reserve(size); }
-    uint32_t size() { return fWriter.size(); }
+
+    uint32_t bytesWritten() const { return fWriter.bytesWritten(); }
+    // Deprecated.  Please call bytesWritten instead.  TODO(mtklein): clean up
+    uint32_t size() const { return this->bytesWritten(); }
 
     virtual void writeByteArray(const void* data, size_t size) SK_OVERRIDE;
     virtual void writeBool(bool value) SK_OVERRIDE;
diff --git a/src/core/SkPictureFlat.cpp b/src/core/SkPictureFlat.cpp
index e9eec0f..2c6efa2 100644
--- a/src/core/SkPictureFlat.cpp
+++ b/src/core/SkPictureFlat.cpp
@@ -94,6 +94,14 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+void SkFlatData::stampHeaderAndSentinel(int index, int32_t size) {
+    fIndex    = index;
+    fFlatSize = size;
+    fChecksum = SkChecksum::Compute(this->data32(), size);
+    this->setTopBotUnwritten();
+    this->setSentinelAsCandidate();
+}
+
 SkFlatData* SkFlatData::Create(SkFlatController* controller, const void* obj,
         int index, void (*flattenProc)(SkOrderedWriteBuffer&, const void*)) {
     // a buffer of 256 bytes should be sufficient for most paints, regions,
@@ -119,14 +127,9 @@
     size_t allocSize = sizeof(SkFlatData) + size + sizeof(uint32_t);
     SkFlatData* result = (SkFlatData*) controller->allocThrow(allocSize);
 
-    result->setIndex(index);
-    result->setTopBotUnwritten();
-    result->fFlatSize = size;
-
     // put the serialized contents into the data section of the new allocation
     buffer.writeToMemory(result->data());
-    result->fChecksum = SkChecksum::Compute(result->data32(), size);
-    result->setSentinelAsCandidate();
+    result->stampHeaderAndSentinel(index, size);
     return result;
 }
 
diff --git a/src/core/SkPictureFlat.h b/src/core/SkPictureFlat.h
index 0a147de..e4f1492 100644
--- a/src/core/SkPictureFlat.h
+++ b/src/core/SkPictureFlat.h
@@ -151,9 +151,9 @@
 //                   also responsible for flattening/unflattening objects but
 //                   details of that operation are hidden in the provided procs
 // SkFlatDictionary: is an abstract templated dictionary that maintains a
-//                   searchable set of SkFlataData objects of type T.
+//                   searchable set of SkFlatData objects of type T.
 // SkFlatController: is an interface provided to SkFlatDictionary which handles
-//                   allocation and unallocation in some cases. It also holds
+//                   allocation (and unallocation in some cases). It also holds
 //                   ref count recorders and the like.
 //
 // NOTE: any class that wishes to be used in conjunction with SkFlatDictionary
@@ -175,16 +175,15 @@
     SkFlatController();
     virtual ~SkFlatController();
     /**
-     * Provide a new block of memory for the SkFlatDictionary to use.
+     * Return a new block of memory for the SkFlatDictionary to use.
+     * This memory is owned by the controller and has the same lifetime unless you
+     * call unalloc(), in which case it may be freed early.
      */
     virtual void* allocThrow(size_t bytes) = 0;
 
     /**
-     * Unallocate a previously allocated block, returned by allocThrow.
-     * Implementation should at least perform an unallocation if passed the last
-     * pointer returned by allocThrow. If findAndReplace() is intended to be
-     * used, unalloc should also be able to unallocate the SkFlatData that is
-     * provided.
+     * Hint that this block, which was allocated with allocThrow, is no longer needed.
+     * The implementation may choose to free this memory any time beteween now and destruction.
      */
     virtual void unalloc(void* ptr) = 0;
 
@@ -400,26 +399,33 @@
         SkASSERT(SkIsAlign4(fFlatSize));
         this->data32()[fFlatSize >> 2] = value;
     }
+
+    // This does not modify the payload flat data, in case it's already been written.
+    void stampHeaderAndSentinel(int index, int32_t size);
+    template <class T> friend class SkFlatDictionary;  // For stampHeaderAndSentinel().
 };
 
 template <class T>
 class SkFlatDictionary {
+    static const size_t kWriteBufferGrowthBytes = 1024;
+
 public:
-    SkFlatDictionary(SkFlatController* controller)
-    : fController(controller) {
-        fFlattenProc = NULL;
-        fUnflattenProc = NULL;
-        SkASSERT(controller);
-        fController->ref();
-        // set to 1 since returning a zero from find() indicates failure
-        fNextIndex = 1;
+    SkFlatDictionary(SkFlatController* controller, size_t scratchSizeGuess = 0)
+    : fFlattenProc(NULL)
+    , fUnflattenProc(NULL)
+    , fController(SkRef(controller))
+    , fScratchSize(scratchSizeGuess)
+    , fScratch(AllocScratch(fScratchSize))
+    , fWriteBuffer(kWriteBufferGrowthBytes)
+    , fWriteBufferReady(false)
+    , fNextIndex(1)  { // set to 1 since returning a zero from find() indicates failure
         sk_bzero(fHash, sizeof(fHash));
         // index 0 is always empty since it is used as a signal that find failed
         fIndexedData.push(NULL);
     }
 
-    virtual ~SkFlatDictionary() {
-        fController->unref();
+    ~SkFlatDictionary() {
+        sk_free(fScratch);
     }
 
     int count() const {
@@ -532,33 +538,40 @@
     }
 
     const SkFlatData* findAndReturnFlat(const T& element) {
-        SkFlatData* flat = SkFlatData::Create(fController, &element, fNextIndex, fFlattenProc);
+        // Only valid until the next call to resetScratch().
+        const SkFlatData& scratch = this->resetScratch(element, fNextIndex);
 
-        int hashIndex = ChecksumToHashIndex(flat->checksum());
+        // See if we have it in the hash?
+        const int hashIndex = ChecksumToHashIndex(scratch.checksum());
         const SkFlatData* candidate = fHash[hashIndex];
-        if (candidate && !SkFlatData::Compare(*flat, *candidate)) {
-            fController->unalloc(flat);
+        if (candidate != NULL && SkFlatData::Compare(scratch, *candidate) == 0) {
             return candidate;
         }
 
-        int index = SkTSearch<const SkFlatData,
-                              SkFlatData::Less>((const SkFlatData**) fSortedData.begin(),
-                                                fSortedData.count(), flat, sizeof(flat));
+        // See if we have it at all?
+        const int index = SkTSearch<const SkFlatData, SkFlatData::Less>(fSortedData.begin(),
+                                                                        fSortedData.count(),
+                                                                        &scratch,
+                                                                        sizeof(&scratch));
         if (index >= 0) {
-            fController->unalloc(flat);
+            // Found.  Update hash before we return.
             fHash[hashIndex] = fSortedData[index];
             return fSortedData[index];
         }
 
-        index = ~index;
-        *fSortedData.insert(index) = flat;
-        *fIndexedData.insert(flat->index()) = flat;
+        // We don't have it.  Add it.
+        SkFlatData* detached = this->detachScratch();
+        // detached will live beyond the next call to resetScratch(), but is owned by fController.
+        *fSortedData.insert(~index) = detached;  // SkTSearch returned bit-not of where to insert.
+        *fIndexedData.insert(detached->index()) = detached;
+        fHash[hashIndex] = detached;
+
+        SkASSERT(detached->index() == fNextIndex);
         SkASSERT(fSortedData.count() == fNextIndex);
+        SkASSERT(fIndexedData.count() == fNextIndex+1);
         fNextIndex++;
-        flat->setSentinelInCache();
-        fHash[hashIndex] = flat;
-        SkASSERT(fIndexedData.count() == fSortedData.count()+1);
-        return flat;
+
+        return detached;
     }
 
 protected:
@@ -566,6 +579,76 @@
     void (*fUnflattenProc)(SkOrderedReadBuffer&, void*);
 
 private:
+    // Layout: [ SkFlatData header, 20 bytes ] [ data ..., 4-byte aligned ] [ sentinel, 4 bytes]
+    static size_t SizeWithPadding(size_t flatDataSize) {
+        SkASSERT(SkIsAlign4(flatDataSize));
+        return sizeof(SkFlatData) + flatDataSize + sizeof(uint32_t);
+    }
+
+    // Allocate a new scratch SkFlatData.  Must be sk_freed.
+    static SkFlatData* AllocScratch(size_t scratchSize) {
+        return (SkFlatData*) sk_malloc_throw(SizeWithPadding(scratchSize));
+    }
+
+    // We have to delay fWriteBuffer's initialization until its first use; fController might not
+    // be fully set up by the time we get it in the constructor.
+    void lazyWriteBufferInit() {
+        if (fWriteBufferReady) {
+            return;
+        }
+        // Without a bitmap heap, we'll flatten bitmaps into paints.  That's never what you want.
+        SkASSERT(fController->getBitmapHeap() != NULL);
+        fWriteBuffer.setBitmapHeap(fController->getBitmapHeap());
+        fWriteBuffer.setTypefaceRecorder(fController->getTypefaceSet());
+        fWriteBuffer.setNamedFactoryRecorder(fController->getNamedFactorySet());
+        fWriteBuffer.setFlags(fController->getWriteBufferFlags());
+        fWriteBufferReady = true;
+    }
+
+    // This reference is valid only until the next call to resetScratch() or detachScratch().
+    const SkFlatData& resetScratch(const T& element, int index) {
+        this->lazyWriteBufferInit();
+
+        // Flatten element into fWriteBuffer (using fScratch as storage).
+        fWriteBuffer.reset(fScratch->data(), fScratchSize);
+        fFlattenProc(fWriteBuffer, &element);
+        const size_t bytesWritten = fWriteBuffer.bytesWritten();
+
+        // If all the flattened bytes fit into fScratch, we can skip a call to writeToMemory.
+        if (!fWriteBuffer.wroteOnlyToStorage()) {
+            SkASSERT(bytesWritten > fScratchSize);
+            // It didn't all fit.  Copy into a larger replacement SkFlatData.
+            // We can't just realloc because it might move the pointer and confuse writeToMemory.
+            SkFlatData* larger = AllocScratch(bytesWritten);
+            fWriteBuffer.writeToMemory(larger->data());
+
+            // Carry on with this larger scratch to minimize the likelihood of future resizing.
+            sk_free(fScratch);
+            fScratchSize = bytesWritten;
+            fScratch = larger;
+        }
+
+        // The data is in fScratch now, but we need to stamp its header and trailing sentinel.
+        fScratch->stampHeaderAndSentinel(index, bytesWritten);
+        return *fScratch;
+    }
+
+    // This result is owned by fController and lives as long as it does (unless unalloc'd).
+    SkFlatData* detachScratch() {
+        // Allocate a new SkFlatData exactly big enough to hold our current scratch.
+        // We use the controller for this allocation to extend the allocation's lifetime and allow
+        // the controller to do whatever memory management it wants.
+        const size_t paddedSize = SizeWithPadding(fScratch->flatSize());
+        SkFlatData* detached = (SkFlatData*)fController->allocThrow(paddedSize);
+
+        // Copy scratch into the new SkFlatData, setting the sentinel for cache storage.
+        memcpy(detached, fScratch, paddedSize);
+        detached->setSentinelInCache();
+
+        // We can now reuse fScratch, and detached will live until fController dies.
+        return detached;
+    }
+
     void unflatten(T* dst, const SkFlatData* element) const {
         element->unflatten(dst, fUnflattenProc,
                            fController->getBitmapHeap(),
@@ -584,14 +667,18 @@
         }
     }
 
-    SkFlatController * const     fController;
-    int                          fNextIndex;
+    SkAutoTUnref<SkFlatController> fController;
+    size_t fScratchSize;  // How many bytes fScratch has allocated for data itself.
+    SkFlatData* fScratch;  // Owned, must be freed with sk_free.
+    SkOrderedWriteBuffer fWriteBuffer;
+    bool fWriteBufferReady;
 
     // SkFlatDictionary has two copies of the data one indexed by the
     // SkFlatData's index and the other sorted. The sorted data is used
     // for finding and uniquification while the indexed copy is used
     // for standard array-style lookups based on the SkFlatData's index
     // (as in 'unflatten').
+    int fNextIndex;
     SkTDArray<const SkFlatData*> fIndexedData;
     // fSortedData is sorted by checksum/size/data.
     SkTDArray<const SkFlatData*> fSortedData;
@@ -645,20 +732,19 @@
         this->setTypefacePlayback(&fTypefacePlayback);
     }
 
-    ~SkChunkFlatController() {
-        fTypefaceSet->unref();
-    }
-
     virtual void* allocThrow(size_t bytes) SK_OVERRIDE {
-        return fHeap.allocThrow(bytes);
+        fLastAllocated = fHeap.allocThrow(bytes);
+        return fLastAllocated;
     }
 
     virtual void unalloc(void* ptr) SK_OVERRIDE {
-        (void) fHeap.unalloc(ptr);
+        // fHeap can only free a pointer if it was the last one allocated.  Otherwise, we'll just
+        // have to wait until fHeap is destroyed.
+        if (ptr == fLastAllocated) (void)fHeap.unalloc(ptr);
     }
 
     void setupPlaybacks() const {
-        fTypefacePlayback.reset(fTypefaceSet);
+        fTypefacePlayback.reset(fTypefaceSet.get());
     }
 
     void setBitmapStorage(SkBitmapHeap* heap) {
@@ -667,23 +753,16 @@
 
 private:
     SkChunkAlloc               fHeap;
-    SkRefCntSet*               fTypefaceSet;
+    SkAutoTUnref<SkRefCntSet>  fTypefaceSet;
+    void*                      fLastAllocated;
     mutable SkTypefacePlayback fTypefacePlayback;
 };
 
-class SkBitmapDictionary : public SkFlatDictionary<SkBitmap> {
-public:
-    SkBitmapDictionary(SkFlatController* controller)
-    : SkFlatDictionary<SkBitmap>(controller) {
-        fFlattenProc = &SkFlattenObjectProc<SkBitmap>;
-        fUnflattenProc = &SkUnflattenObjectProc<SkBitmap>;
-    }
-};
-
 class SkMatrixDictionary : public SkFlatDictionary<SkMatrix> {
  public:
+    // All matrices fit in 36 bytes.
     SkMatrixDictionary(SkFlatController* controller)
-    : SkFlatDictionary<SkMatrix>(controller) {
+    : SkFlatDictionary<SkMatrix>(controller, 36) {
         fFlattenProc = &flattenMatrix;
         fUnflattenProc = &unflattenMatrix;
     }
@@ -699,8 +778,9 @@
 
 class SkPaintDictionary : public SkFlatDictionary<SkPaint> {
  public:
+    // The largest paint across ~60 .skps was 500 bytes.
     SkPaintDictionary(SkFlatController* controller)
-    : SkFlatDictionary<SkPaint>(controller) {
+    : SkFlatDictionary<SkPaint>(controller, 512) {
         fFlattenProc = &SkFlattenObjectProc<SkPaint>;
         fUnflattenProc = &SkUnflattenObjectProc<SkPaint>;
     }
diff --git a/tests/Writer32Test.cpp b/tests/Writer32Test.cpp
index 498be9c..e5b9363 100644
--- a/tests/Writer32Test.cpp
+++ b/tests/Writer32Test.cpp
@@ -187,18 +187,23 @@
         SkWriter32 writer(0);
         uint32_t storage[256];
         writer.reset(storage, sizeof(storage));
+        // These three writes are small enough to fit in storage.
         test1(reporter, &writer);
+        REPORTER_ASSERT(reporter, writer.wroteOnlyToStorage());
 
         writer.reset(storage, sizeof(storage));
         test2(reporter, &writer);
+        REPORTER_ASSERT(reporter, writer.wroteOnlyToStorage());
 
         writer.reset(storage, sizeof(storage));
         testWritePad(reporter, &writer);
+        REPORTER_ASSERT(reporter, writer.wroteOnlyToStorage());
 
-        // try overflowing the storage-block
+        // Try overflowing the storage-block.
         uint32_t smallStorage[8];
         writer.reset(smallStorage, sizeof(smallStorage));
         test2(reporter, &writer);
+        REPORTER_ASSERT(reporter, !writer.wroteOnlyToStorage());
     }
 
     // small storage
diff --git a/tools/render_pictures_main.cpp b/tools/render_pictures_main.cpp
index a2dff67..bf82831 100644
--- a/tools/render_pictures_main.cpp
+++ b/tools/render_pictures_main.cpp
@@ -39,6 +39,8 @@
             "the picture rendered in simple mode. When used in conjunction with --bbh, results "
             "are validated against the picture rendered in the same mode, but without the bbh.");
 
+DEFINE_bool(bench_record, false, "If true, drop into an infinite loop of recording the picture.");
+
 static void make_output_filepath(SkString* path, const SkString& dir,
                                  const SkString& name) {
     sk_tools::make_filepath(path, dir, name);
@@ -163,6 +165,13 @@
         return false;
     }
 
+    while (FLAGS_bench_record) {
+        const int kRecordFlags = 0;
+        SkPicture other;
+        picture->draw(other.beginRecording(picture->width(), picture->height(), kRecordFlags));
+        other.endRecording();
+    }
+
     for (int i = 0; i < FLAGS_clone; ++i) {
         SkPicture* clone = picture->clone();
         SkDELETE(picture);