Cut down SkBBH API more.
  - The expected case is now a single bulk-load insert() call instead of N;
  - reserve() and flushDeferredInserts() can fold into insert() now;
  - SkBBH subclasses may take ownership of the bounds

This appears to be a performance no-op on both my Mac and N5.  I guess
even the simplest indirect branch predictor ("same as last time") can predict
the repeated virtual calls to SkBBH::insert() perfectly.

BUG=skia:

Review URL: https://codereview.chromium.org/670213002
diff --git a/src/core/SkBBoxHierarchy.h b/src/core/SkBBoxHierarchy.h
index 7246787..de8ea0b 100644
--- a/src/core/SkBBoxHierarchy.h
+++ b/src/core/SkBBoxHierarchy.h
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2012 Google Inc.
  *
@@ -10,45 +9,31 @@
 #define SkBBoxHierarchy_DEFINED
 
 #include "SkRect.h"
-#include "SkTDArray.h"
 #include "SkRefCnt.h"
+#include "SkTDArray.h"
+#include "SkTemplates.h"
 
 /**
- * Interface for a spatial data structure that associates user data with axis-aligned
- * bounding boxes, and allows efficient retrieval of intersections with query rectangles.
+ * Interface for a spatial data structure that stores axis-aligned bounding
+ * boxes and allows efficient retrieval of intersections with query rectangles.
  */
 class SkBBoxHierarchy : public SkRefCnt {
 public:
-    SK_DECLARE_INST_COUNT(SkBBoxHierarchy)
-
     SkBBoxHierarchy() {}
+    virtual ~SkBBoxHierarchy() {}
 
     /**
-     * Hint that <= opCount calls to insert() will be made.
+     * Insert N bounding boxes into the hierarchy.
+     * The SkBBoxHierarchy may take ownership of boundsArray by calling detach().
      */
-    virtual void reserve(unsigned opCount) {}
+    virtual void insert(SkAutoTMalloc<SkRect>* boundsArray, int N) = 0;
 
     /**
-     * Insert opIndex and corresponding bounding box.
-     * @param opIndex Any value, will be returned in order.
-     * @param bounds The bounding box, should not be empty.
-     * @param defer Whether or not it is acceptable to delay insertion of this element (building up
-     *        an entire spatial data structure at once is often faster and produces better
-     *        structures than repeated inserts) until flushDeferredInserts is called or the first
-     *        search.
-     */
-    virtual void insert(unsigned opIndex, const SkRect& bounds, bool defer = false) = 0;
-
-    /**
-     * If any insertions have been deferred, force them to be inserted.
-     */
-    virtual void flushDeferredInserts() {}
-
-    /**
-     * Populate results with sorted opIndex corresponding to bounding boxes that intersect query.
+     * Populate results with the indices of bounding boxes intersecting the query.
      */
     virtual void search(const SkRect& query, SkTDArray<unsigned>* results) const = 0;
 
+    SK_DECLARE_INST_COUNT(SkBBoxHierarchy)
 private:
     typedef SkRefCnt INHERITED;
 };
diff --git a/src/core/SkRTree.cpp b/src/core/SkRTree.cpp
index 4a081db..93f9142 100644
--- a/src/core/SkRTree.cpp
+++ b/src/core/SkRTree.cpp
@@ -44,68 +44,39 @@
     this->clear();
 }
 
-void SkRTree::insert(unsigned opIndex, const SkRect& fbounds, bool defer) {
-    SkIRect bounds;
-    if (fbounds.isLargest()) {
-        bounds.setLargest();
-    } else {
-        fbounds.roundOut(&bounds);
-    }
-
+void SkRTree::insert(SkAutoTMalloc<SkRect>* boundsArray, int N) {
+    SkASSERT(this->isEmpty());
     this->validate();
-    if (bounds.isEmpty()) {
-        SkASSERT(false);
-        return;
-    }
-    Branch newBranch;
-    newBranch.fBounds = bounds;
-    newBranch.fChild.opIndex = opIndex;
-    if (this->isEmpty()) {
-        // since a bulk-load into an existing tree is as of yet unimplemented (and arguably not
-        // of vital importance right now), we only batch up inserts if the tree is empty.
-        if (defer) {
-            fDeferredInserts.push(newBranch);
-            return;
-        } else {
-            fRoot.fChild.subtree = allocateNode(0);
-            fRoot.fChild.subtree->fNumChildren = 0;
+
+    SkTDArray<Branch> deferred;
+    deferred.setReserve(N);
+
+    for (int i = 0; i < N; i++) {
+        SkIRect bounds;
+        (*boundsArray)[i].roundOut(&bounds);
+        if (bounds.isEmpty()) {
+            continue;
         }
+
+        Branch newBranch;
+        newBranch.fBounds = bounds;
+        newBranch.fChild.opIndex = i;
+
+        deferred.push(newBranch);
     }
 
-    Branch* newSibling = insert(fRoot.fChild.subtree, &newBranch);
-    fRoot.fBounds = this->computeBounds(fRoot.fChild.subtree);
-
-    if (newSibling) {
-        Node* oldRoot = fRoot.fChild.subtree;
-        Node* newRoot = this->allocateNode(oldRoot->fLevel + 1);
-        newRoot->fNumChildren = 2;
-        *newRoot->child(0) = fRoot;
-        *newRoot->child(1) = *newSibling;
-        fRoot.fChild.subtree = newRoot;
-        fRoot.fBounds = this->computeBounds(fRoot.fChild.subtree);
-    }
-
-    ++fCount;
-    this->validate();
-}
-
-void SkRTree::flushDeferredInserts() {
-    this->validate();
-    if (this->isEmpty() && fDeferredInserts.count() > 0) {
-        fCount = fDeferredInserts.count();
+    fCount = deferred.count();
+    if (fCount) {
         if (1 == fCount) {
-            fRoot.fChild.subtree = allocateNode(0);
+            fRoot.fChild.subtree = this->allocateNode(0);
             fRoot.fChild.subtree->fNumChildren = 0;
-            this->insert(fRoot.fChild.subtree, &fDeferredInserts[0]);
-            fRoot.fBounds = fDeferredInserts[0].fBounds;
+            this->insert(fRoot.fChild.subtree, &deferred[0]);
+            fRoot.fBounds = deferred[0].fBounds;
         } else {
-            fRoot = this->bulkLoad(&fDeferredInserts);
+            fRoot = this->bulkLoad(&deferred);
         }
-    } else {
-        // TODO: some algorithm for bulk loading into an already populated tree
-        SkASSERT(0 == fDeferredInserts.count());
     }
-    fDeferredInserts.rewind();
+
     this->validate();
 }
 
@@ -113,7 +84,6 @@
     SkIRect query;
     fquery.roundOut(&query);
     this->validate();
-    SkASSERT(0 == fDeferredInserts.count());  // If this fails, you should have flushed.
     if (!this->isEmpty() && SkIRect::IntersectsNoEmptyCheck(fRoot.fBounds, query)) {
         this->search(fRoot.fChild.subtree, query, results);
     }
@@ -123,7 +93,6 @@
 void SkRTree::clear() {
     this->validate();
     fNodes.reset();
-    fDeferredInserts.rewind();
     fCount = 0;
     this->validate();
 }
diff --git a/src/core/SkRTree.h b/src/core/SkRTree.h
index 0d88804..00c6c89 100644
--- a/src/core/SkRTree.h
+++ b/src/core/SkRTree.h
@@ -59,24 +59,7 @@
             bool orderWhenBulkLoading = true);
     virtual ~SkRTree();
 
-    /**
-     * Insert a node, consisting of bounds and a data value into the tree, if we don't immediately
-     * need to use the tree; we may allow the insert to be deferred (this can allow us to bulk-load
-     * a large batch of nodes at once, which tends to be faster and produce a better tree).
-     *  @param opIndex The data value
-     *  @param bounds The corresponding bounding box
-     *  @param defer Can this insert be deferred? (this may be ignored)
-     */
-    virtual void insert(unsigned opIndex, const SkRect& bounds, bool defer = false) SK_OVERRIDE;
-
-    /**
-     * If any inserts have been deferred, this will add them into the tree
-     */
-    virtual void flushDeferredInserts() SK_OVERRIDE;
-
-    /**
-     * Given a query rectangle, populates the passed-in array with the elements it intersects
-     */
+    virtual void insert(SkAutoTMalloc<SkRect>* boundsArray, int N) SK_OVERRIDE;
     virtual void search(const SkRect& query, SkTDArray<unsigned>* results) const SK_OVERRIDE;
 
     void clear();
@@ -179,7 +162,6 @@
 
     Branch fRoot;
     SkChunkAlloc fNodes;
-    SkTDArray<Branch> fDeferredInserts;
     SkScalar fAspectRatio;
     bool fSortWhenBulkLoading;
 
diff --git a/src/core/SkRecordDraw.cpp b/src/core/SkRecordDraw.cpp
index 12579e9..5981245 100644
--- a/src/core/SkRecordDraw.cpp
+++ b/src/core/SkRecordDraw.cpp
@@ -164,13 +164,7 @@
 
         // Finally feed all stored bounds into the BBH.  They'll be returned in this order.
         SkASSERT(bbh);
-        bbh->reserve(record.count());
-        for (unsigned i = 0; i < record.count(); i++) {
-            if (!fBounds[i].isEmpty()) {
-                bbh->insert(i, fBounds[i], true/*ok to defer*/);
-            }
-        }
-        bbh->flushDeferredInserts();
+        bbh->insert(&fBounds, record.count());
     }
 
     template <typename T> void operator()(const T& op) {
diff --git a/src/core/SkTileGrid.cpp b/src/core/SkTileGrid.cpp
index 10782c4..e285ccc 100644
--- a/src/core/SkTileGrid.cpp
+++ b/src/core/SkTileGrid.cpp
@@ -23,7 +23,7 @@
     SkDELETE_ARRAY(fTiles);
 }
 
-void SkTileGrid::reserve(unsigned opCount) {
+void SkTileGrid::reserve(int opCount) {
     if (fXTiles * fYTiles == 0) {
         return;  // A tileless tile grid is nonsensical, but happens in at least cc_unittests.
     }
@@ -44,7 +44,7 @@
     // than if we made no setReserve() calls, but time spent in insert() drops by about 50%.
 }
 
-void SkTileGrid::flushDeferredInserts() {
+void SkTileGrid::shrinkToFit() {
     for (SkTDArray<unsigned>* tile = fTiles; tile != fTiles + (fXTiles * fYTiles); tile++) {
         tile->shrinkToFit();
     }
@@ -70,30 +70,35 @@
     grid->fBottom = SkPin32(user.bottom() * fInvHeight, 0, fYTiles - 1);
 }
 
-void SkTileGrid::insert(unsigned opIndex, const SkRect& originalBounds, bool) {
-    SkRect bounds = originalBounds;
-    bounds.outset(fMarginWidth, fMarginHeight);
-    this->commonAdjust(&bounds);
+void SkTileGrid::insert(SkAutoTMalloc<SkRect>* boundsArray, int N) {
+    this->reserve(N);
 
-    // TODO(mtklein): can we assert this instead to save an intersection in Release mode,
-    // or just allow out-of-bound insertions to insert anyway (clamped to nearest tile)?
-    if (!SkRect::Intersects(bounds, fGridBounds)) {
-        return;
-    }
+    for (int i = 0; i < N; i++) {
+        SkRect bounds = (*boundsArray)[i];
+        bounds.outset(fMarginWidth, fMarginHeight);
+        this->commonAdjust(&bounds);
 
-    SkIRect grid;
-    this->userToGrid(bounds, &grid);
-
-    // This is just a loop over y then x.  This compiles to a slightly faster and
-    // more compact loop than if we just did fTiles[y * fXTiles + x].push(opIndex).
-    SkTDArray<unsigned>* row = &fTiles[grid.fTop * fXTiles + grid.fLeft];
-    for (int y = 0; y <= grid.fBottom - grid.fTop; y++) {
-        SkTDArray<unsigned>* tile = row;
-        for (int x = 0; x <= grid.fRight - grid.fLeft; x++) {
-            (tile++)->push(opIndex);
+        // TODO(mtklein): can we assert this instead to save an intersection in Release mode,
+        // or just allow out-of-bound insertions to insert anyway (clamped to nearest tile)?
+        if (!SkRect::Intersects(bounds, fGridBounds)) {
+            continue;
         }
-        row += fXTiles;
+
+        SkIRect grid;
+        this->userToGrid(bounds, &grid);
+
+        // This is just a loop over y then x.  This compiles to a slightly faster and
+        // more compact loop than if we just did fTiles[y * fXTiles + x].push(i).
+        SkTDArray<unsigned>* row = &fTiles[grid.fTop * fXTiles + grid.fLeft];
+        for (int y = 0; y <= grid.fBottom - grid.fTop; y++) {
+            SkTDArray<unsigned>* tile = row;
+            for (int x = 0; x <= grid.fRight - grid.fLeft; x++) {
+                (tile++)->push(i);
+            }
+            row += fXTiles;
+        }
     }
+    this->shrinkToFit();
 }
 
 // Number of tiles for which data is allocated on the stack in
diff --git a/src/core/SkTileGrid.h b/src/core/SkTileGrid.h
index fd7584f..99218c7 100644
--- a/src/core/SkTileGrid.h
+++ b/src/core/SkTileGrid.h
@@ -19,30 +19,18 @@
 class SkTileGrid : public SkBBoxHierarchy {
 public:
     SkTileGrid(int xTiles, int yTiles, const SkTileGridFactory::TileGridInfo& info);
-
     virtual ~SkTileGrid();
 
-    /**
-     * Insert a opIndex value and corresponding bounding box
-     * @param opIndex
-     * @param bounds The bounding box, should not be empty.
-     * @param defer  Ignored; SkTileGrid does not defer insertions.
-     */
-    virtual void insert(unsigned opIndex, const SkRect& bounds, bool) SK_OVERRIDE;
-
-    /**
-     * Populate 'results' with opIndexes corresponding to bounding boxes that intersect 'query'.
-     * This will be fastest if the query is an exact match to a single grid tile.
-     */
+    virtual void insert(SkAutoTMalloc<SkRect>* boundsArray, int N) SK_OVERRIDE;
     virtual void search(const SkRect& query, SkTDArray<unsigned>* results) const SK_OVERRIDE;
 
     // For testing.
     int tileCount(int x, int y) { return fTiles[y * fXTiles + x].count(); }
 
-    virtual void reserve(unsigned opCount) SK_OVERRIDE;
-    virtual void flushDeferredInserts() SK_OVERRIDE;
-
 private:
+    void reserve(int);
+    void shrinkToFit();
+
     void commonAdjust(SkRect*) const;
     void userToGrid(const SkRect&, SkIRect* grid) const;