Compiler: replace DOM traversal computation

The old trace JIT used a few recursive graph-walking algorithms,
which was perfectly reasonable given that the graph size was capped
at a few dozen nodes at most.  These were replaced with iterative
walk-order computations - or at least I thought they all were.  I
missed one (the dominator-tree post-order traversal), which caused a
stack overflow on a pathologically large method compilation.
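
For reference, the shape of the rewrite as a minimal standalone
sketch (not the ART types - BasicBlock, the ArenaBitVector iterator
and the arena-backed growable array are replaced with plain STL
containers here): the recursive dominator post-order walk becomes a
loop over an explicit work stack of (node, next-child cursor) pairs,
so traversal depth is bounded by the heap rather than the thread
stack.

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct Node {
      int id = 0;
      std::vector<Node*> i_dominated;  // children in the dominator tree
      bool visited = false;
    };

    // Iterative post-order over the dominator tree; mirrors the pattern
    // used in ComputeDomPostOrderTraversal, with an explicit stack
    // instead of recursion.
    static void DomPostOrder(Node* root, std::vector<int>* order) {
      std::vector<std::pair<Node*, size_t> > work_stack;  // (node, next child index)
      root->visited = true;
      work_stack.push_back(std::make_pair(root, static_cast<size_t>(0)));
      while (!work_stack.empty()) {
        Node* curr = work_stack.back().first;
        size_t idx = work_stack.back().second;
        // Skip children that were already visited.
        while (idx < curr->i_dominated.size() && curr->i_dominated[idx]->visited) {
          ++idx;
        }
        if (idx < curr->i_dominated.size()) {
          work_stack.back().second = idx + 1;   // resume after this child next time
          Node* child = curr->i_dominated[idx];
          child->visited = true;
          work_stack.push_back(std::make_pair(child, static_cast<size_t>(0)));
        } else {
          order->push_back(curr->id);           // all dominated blocks done
          work_stack.pop_back();
        }
      }
    }

    int main() {
      Node a, b, c;
      a.id = 0; b.id = 1; c.id = 2;
      a.i_dominated.push_back(&b);
      a.i_dominated.push_back(&c);
      std::vector<int> order;
      DomPostOrder(&a, &order);
      for (size_t i = 0; i < order.size(); ++i) {
        printf("%d ", order[i]);                // prints: 1 2 0
      }
      printf("\n");
      return 0;
    }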

Renamed some arena_allocator members for consistency and clarity.
Added more detailed memory usage logging.  Reworked the allocator to
waste less space when an allocation doesn't fit in the current block
and a new block must be allocated.
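
The new block-selection policy, as a rough sketch (the names below -
kDefaultPayload, kHighWater, PlanNewBlock - are illustrative
stand-ins for block_size_, ARENA_HIGH_WATER and the inline logic in
NewMem): an oversized request, or a request arriving while the
current block still has a useful amount of free space, is served
from an exact-fit block parked at the head of the list so the
current block keeps serving small allocations; only when little
space remains is the remainder written off (counted in lost_bytes_)
and a fresh standard block made current.

    #include <cstddef>

    // Illustrative stand-ins for the default payload size and ARENA_HIGH_WATER.
    static const size_t kDefaultPayload = 256 * 1024;
    static const size_t kHighWater = 16 * 1024;

    struct NewBlockPlan {
      size_t payload_size;      // usable bytes to request for the new block
      bool keep_current_block;  // true: park the exact-fit block at the list head
    };

    // Decide how to satisfy a request that does not fit in the current block.
    static NewBlockPlan PlanNewBlock(size_t request, size_t remaining_in_current) {
      NewBlockPlan plan;
      if (remaining_in_current >= kHighWater || request > kDefaultPayload) {
        // Large request, or the current block still has plenty of room: give
        // this one allocation an exact-fit block and keep allocating from the
        // current block afterwards, so its free space isn't hidden.
        plan.payload_size = request;
        plan.keep_current_block = true;
      } else {
        // Not much left in the current block: accept the small loss (this is
        // what lost_bytes_ tracks) and switch to a fresh default-sized block.
        plan.payload_size = kDefaultPayload;
        plan.keep_current_block = false;
      }
      return plan;
    }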

Change-Id: I4d84dded3c47819eefa0de90ebb821dd12eb8be8
diff --git a/src/compiler/dex/arena_allocator.cc b/src/compiler/dex/arena_allocator.cc
index 7f1bfb3..3a3e385 100644
--- a/src/compiler/dex/arena_allocator.cc
+++ b/src/compiler/dex/arena_allocator.cc
@@ -41,12 +41,14 @@
   : default_size_(default_size),
     block_size_(default_size - sizeof(ArenaMemBlock)),
     arena_head_(NULL),
-    current_arena_(NULL),
+    current_block_(NULL),
     num_arena_blocks_(0),
-    malloc_bytes_(0) {
+    malloc_bytes_(0),
+    lost_bytes_(0),
+    num_allocations_(0) {
   memset(&alloc_stats_[0], 0, sizeof(alloc_stats_));
   // Start with an empty arena.
-  arena_head_ = current_arena_ = EmptyArena();
+  arena_head_ = current_block_ = EmptyArenaBlock();
   num_arena_blocks_++;
 }
 
@@ -58,12 +60,12 @@
     head = head->next;
     free(p);
   }
-  arena_head_ = current_arena_ = NULL;
+  arena_head_ = NULL;
   num_arena_blocks_ = 0;
 }
 
 // Return an arena with no storage for use as a sentinal.
-ArenaAllocator::ArenaMemBlock* ArenaAllocator::EmptyArena() {
+ArenaAllocator::ArenaMemBlock* ArenaAllocator::EmptyArenaBlock() {
   ArenaMemBlock* res = static_cast<ArenaMemBlock*>(malloc(sizeof(ArenaMemBlock)));
   malloc_bytes_ += sizeof(ArenaMemBlock);
   res->block_size = 0;
@@ -74,32 +76,56 @@
 
 // Arena-based malloc for compilation tasks.
 void* ArenaAllocator::NewMem(size_t size, bool zero, ArenaAllocKind kind) {
-  DCHECK(current_arena_ != NULL);
+  DCHECK(current_block_ != NULL);
   DCHECK(arena_head_ != NULL);
   size = (size + 3) & ~3;
   alloc_stats_[kind] += size;
-  if (size + current_arena_->bytes_allocated > current_arena_->block_size) {
-    size_t block_size = (size <= block_size_) ?  block_size_ : size;
-    /* Time to allocate a new block */
-    size_t allocation_size = sizeof(ArenaMemBlock) + block_size;
-    ArenaMemBlock *new_arena =
-        static_cast<ArenaMemBlock*>(malloc(allocation_size));
-    if (new_arena == NULL) {
+  ArenaMemBlock* allocation_block = current_block_;  // Assume we'll fit.
+  size_t remaining_space = current_block_->block_size - current_block_->bytes_allocated;
+  if (remaining_space < size) {
+    /*
+     * Time to allocate a new block.  If this is a large allocation or we have
+     * significant space remaining in the current block then fulfill the allocation
+     * request with a custom-sized malloc() - otherwise grab a new standard block.
+     */
+    size_t allocation_size = sizeof(ArenaMemBlock);
+    if ((remaining_space >= ARENA_HIGH_WATER) || (size > block_size_)) {
+      allocation_size += size;
+    } else {
+      allocation_size += block_size_;
+    }
+    ArenaMemBlock *new_block = static_cast<ArenaMemBlock*>(malloc(allocation_size));
+    if (new_block == NULL) {
       LOG(FATAL) << "Arena allocation failure";
     }
     malloc_bytes_ += allocation_size;
-    new_arena->block_size = block_size;
-    new_arena->bytes_allocated = 0;
-    new_arena->next = NULL;
-    current_arena_->next = new_arena;
-    current_arena_ = new_arena;
+    new_block->block_size = allocation_size - sizeof(ArenaMemBlock);
+    new_block->bytes_allocated = 0;
+    new_block->next = NULL;
     num_arena_blocks_++;
+    /*
+     * If the new block is completely full, insert it into the head of the list so we don't
+     * bother trying to fit more and won't hide the potentially allocatable space on the
+     * last (current_block_) block.  TUNING: if we move to a mark scheme, revisit
+     * this code to keep allocation order intact.
+     */
+    if (new_block->block_size == size) {
+      new_block->next = arena_head_;
+      arena_head_ = new_block;
+    } else {
+      int lost = (current_block_->block_size - current_block_->bytes_allocated);
+      lost_bytes_ += lost;
+      current_block_->next = new_block;
+      current_block_ = new_block;
+    }
+    allocation_block = new_block;
   }
-  void* ptr = &current_arena_->ptr[current_arena_->bytes_allocated];
-  current_arena_->bytes_allocated += size;
+  void* ptr = &allocation_block->ptr[allocation_block->bytes_allocated];
+  allocation_block->bytes_allocated += size;
   if (zero) {
     memset(ptr, 0, size);
   }
+  num_allocations_++;
   return ptr;
 }
 
@@ -109,9 +135,10 @@
   for (int i = 0; i < kNumAllocKinds; i++) {
     total += alloc_stats_[i];
   }
-  os << " MEM: used: " << total << ", allocated: " << malloc_bytes_ << ", wasted: "
-     << malloc_bytes_ - (total + (num_arena_blocks_ * sizeof(ArenaMemBlock))) << "\n";
-  os << "Number of arenas allocated: " << num_arena_blocks_ << "\n";
+  os << " MEM: used: " << total << ", allocated: " << malloc_bytes_
+     << ", lost: " << lost_bytes_ << "\n";
+  os << "Number of blocks allocated: " << num_arena_blocks_ << ", Number of allocations: "
+     << num_allocations_ << ", avg: " << total / num_allocations_ << "\n";
   os << "===== Allocation by kind\n";
   for (int i = 0; i < kNumAllocKinds; i++) {
       os << alloc_names[i] << std::setw(10) << alloc_stats_[i] << "\n";
diff --git a/src/compiler/dex/arena_allocator.h b/src/compiler/dex/arena_allocator.h
index 26294b6..78d4614 100644
--- a/src/compiler/dex/arena_allocator.h
+++ b/src/compiler/dex/arena_allocator.h
@@ -24,6 +24,7 @@
 namespace art {
 
 #define ARENA_DEFAULT_BLOCK_SIZE (256 * 1024)
+#define ARENA_HIGH_WATER (16 * 1024)
 
 class ArenaAllocator {
   public:
@@ -65,15 +66,17 @@
       char ptr[0];
     };
 
-    ArenaMemBlock* EmptyArena();
+    ArenaMemBlock* EmptyArenaBlock();
 
     size_t default_size_;                    // Smallest size of new allocation block.
     size_t block_size_;                      // Amount of allocatable bytes on a default block.
     ArenaMemBlock* arena_head_;              // Head of linked list of allocation blocks.
-    ArenaMemBlock* current_arena_;           // NOTE: code assumes there's always at least 1 block.
+    ArenaMemBlock* current_block_;           // NOTE: code assumes there's always at least 1 block.
     int num_arena_blocks_;
-    uint32_t malloc_bytes_;                 // Number of actual bytes malloc'd
-    uint32_t alloc_stats_[kNumAllocKinds];  // Bytes used by various allocation kinds.
+    uint32_t malloc_bytes_;                  // Number of actual bytes malloc'd
+    uint32_t alloc_stats_[kNumAllocKinds];   // Bytes used by various allocation kinds.
+    uint32_t lost_bytes_;                    // Lost memory at end of too-small region
+    uint32_t num_allocations_;
 
 };  // ArenaAllocator
 
diff --git a/src/compiler/dex/arena_bit_vector.cc b/src/compiler/dex/arena_bit_vector.cc
index 6b0fe8f..6f664e5 100644
--- a/src/compiler/dex/arena_bit_vector.cc
+++ b/src/compiler/dex/arena_bit_vector.cc
@@ -38,7 +38,6 @@
      storage_(static_cast<uint32_t*>(arena_->NewMem(storage_size_ * sizeof(uint32_t), true,
                                                     ArenaAllocator::kAllocGrowableBitMap))) {
   DCHECK_EQ(sizeof(storage_[0]), 4U);    // Assuming 32-bit units.
-  // TODO: collect detailed memory usage stats by bit vector kind.
 }
 
 /*
diff --git a/src/compiler/dex/arena_bit_vector.h b/src/compiler/dex/arena_bit_vector.h
index ffc2160..f5c471c 100644
--- a/src/compiler/dex/arena_bit_vector.h
+++ b/src/compiler/dex/arena_bit_vector.h
@@ -24,27 +24,6 @@
 
 namespace art {
 
-// Type of growable bitmap for memory tuning.
-enum OatBitMapKind {
-  kBitMapMisc = 0,
-  kBitMapUse,
-  kBitMapDef,
-  kBitMapLiveIn,
-  kBitMapBMatrix,
-  kBitMapDominators,
-  kBitMapIDominated,
-  kBitMapDomFrontier,
-  kBitMapPhi,
-  kBitMapTmpBlocks,
-  kBitMapInputBlocks,
-  kBitMapRegisterV,
-  kBitMapTempSSARegisterV,
-  kBitMapNullCheck,
-  kBitMapTmpBlockV,
-  kBitMapPredecessors,
-  kNumBitMapKinds
-};
-
 /*
  * Expanding bitmap, used for tracking resources.  Bits are numbered starting
  * from zero.  All operations on a BitVector are unsynchronized.
diff --git a/src/compiler/dex/compiler_enums.h b/src/compiler/dex/compiler_enums.h
index 71d7d65..bc456b2 100644
--- a/src/compiler/dex/compiler_enums.h
+++ b/src/compiler/dex/compiler_enums.h
@@ -387,7 +387,30 @@
   kSelectGoto
 };
 
-std::ostream& operator<<(std::ostream& os, const OpFeatureFlags& flag);
+std::ostream& operator<<(std::ostream& os, const SelectInstructionKind& kind);
+
+// Type of growable bitmap for memory tuning.
+enum OatBitMapKind {
+  kBitMapMisc = 0,
+  kBitMapUse,
+  kBitMapDef,
+  kBitMapLiveIn,
+  kBitMapBMatrix,
+  kBitMapDominators,
+  kBitMapIDominated,
+  kBitMapDomFrontier,
+  kBitMapPhi,
+  kBitMapTmpBlocks,
+  kBitMapInputBlocks,
+  kBitMapRegisterV,
+  kBitMapTempSSARegisterV,
+  kBitMapNullCheck,
+  kBitMapTmpBlockV,
+  kBitMapPredecessors,
+  kNumBitMapKinds
+};
+
+std::ostream& operator<<(std::ostream& os, const OatBitMapKind& kind);
 
 }  // namespace art
 
diff --git a/src/compiler/dex/frontend.cc b/src/compiler/dex/frontend.cc
index b212e5b..ca751ab 100644
--- a/src/compiler/dex/frontend.cc
+++ b/src/compiler/dex/frontend.cc
@@ -101,6 +101,7 @@
   //(1 << kDebugDumpCheckStats) |
   //(1 << kDebugDumpBitcodeFile) |
   //(1 << kDebugVerifyBitcode) |
+  //(1 << kDebugShowSummaryMemoryUsage) |
   0;
 
 static CompiledMethod* CompileMethod(CompilerDriver& compiler,
@@ -249,6 +250,11 @@
     }
   }
 
+  if (cu->enable_debug & (1 << kDebugShowSummaryMemoryUsage)) {
+    LOG(INFO) << "MEMINFO " << cu->arena.BytesAllocated() << " " << cu->mir_graph->GetNumBlocks()
+              << " " << PrettyMethod(method_idx, dex_file);
+  }
+
   return result;
 }
 
diff --git a/src/compiler/dex/frontend.h b/src/compiler/dex/frontend.h
index 49e0852..dc57a23 100644
--- a/src/compiler/dex/frontend.h
+++ b/src/compiler/dex/frontend.h
@@ -71,6 +71,7 @@
   kDebugDumpCheckStats,
   kDebugDumpBitcodeFile,
   kDebugVerifyBitcode,
+  kDebugShowSummaryMemoryUsage,
 };
 
 class LLVMInfo {
diff --git a/src/compiler/dex/growable_array.h b/src/compiler/dex/growable_array.h
index eb865d5..c4684a7 100644
--- a/src/compiler/dex/growable_array.h
+++ b/src/compiler/dex/growable_array.h
@@ -79,26 +79,26 @@
 
     GrowableArray(ArenaAllocator* arena, size_t init_length, OatListKind kind = kGrowableArrayMisc)
       : arena_(arena),
-        num_allocated_(0),
+        num_allocated_(init_length),
         num_used_(0),
         kind_(kind) {
       elem_list_ = static_cast<T*>(arena_->NewMem(sizeof(T) * init_length, true,
-                                                 ArenaAllocator::kAllocGrowableArray));
-      // TODO: Collect detailed memory usage stats by list kind.
+                                                  ArenaAllocator::kAllocGrowableArray));
     };
 
 
     // Expand the list size to at least new length.
     void Resize(size_t new_length) {
       if (new_length <= num_allocated_) return;
-      size_t target_length = (num_allocated_ < 128) ? num_allocated_ << 1 : num_allocated_ + 128;
+      // If it's a small list double the size, else grow 1.5x.
+      size_t target_length =
+          (num_allocated_ < 128) ? num_allocated_ << 1 : num_allocated_ + (num_allocated_ >> 1);
       if (new_length > target_length) {
          target_length = new_length;
       }
       T* new_array = static_cast<T*>(arena_->NewMem(sizeof(T) * target_length, true,
                                                     ArenaAllocator::kAllocGrowableArray));
       memcpy(new_array, elem_list_, sizeof(T) * num_allocated_);
-      // TODO: Collect stats on wasted resize memory.
       num_allocated_ = target_length;
       elem_list_ = new_array;
     };
diff --git a/src/compiler/dex/mir_dataflow.cc b/src/compiler/dex/mir_dataflow.cc
index 444874d..9f61d73 100644
--- a/src/compiler/dex/mir_dataflow.cc
+++ b/src/compiler/dex/mir_dataflow.cc
@@ -1202,13 +1202,6 @@
   }
 }
 
-/* Clear the visited flag for each BB */
-bool MIRGraph::ClearVisitedFlag(struct BasicBlock* bb)
-{
-  bb->visited = false;
-  return true;
-}
-
 /*
  * This function will make a best guess at whether the invoke will
  * end up using Method*.  It isn't critical to get it exactly right,
diff --git a/src/compiler/dex/mir_graph.h b/src/compiler/dex/mir_graph.h
index fd81967..882a508 100644
--- a/src/compiler/dex/mir_graph.h
+++ b/src/compiler/dex/mir_graph.h
@@ -592,7 +592,7 @@
    void DataFlowSSAFormat35C(MIR* mir);
    void DataFlowSSAFormat3RC(MIR* mir);
    bool FindLocalLiveIn(BasicBlock* bb);
-   bool ClearVisitedFlag(struct BasicBlock* bb);
+   void ClearAllVisitedFlags();
    bool CountUses(struct BasicBlock* bb);
    bool InferTypeAndSize(BasicBlock* bb);
    bool VerifyPredInfo(BasicBlock* bb);
@@ -621,7 +621,7 @@
    bool ComputeBlockLiveIns(BasicBlock* bb);
    bool InsertPhiNodeOperands(BasicBlock* bb);
    bool ComputeDominanceFrontier(BasicBlock* bb);
-   bool DoConstantPropogation(BasicBlock* bb);
+   void DoConstantPropogation(BasicBlock* bb);
    void CountChecks(BasicBlock* bb);
    bool CombineBlocks(BasicBlock* bb);
 
diff --git a/src/compiler/dex/mir_optimization.cc b/src/compiler/dex/mir_optimization.cc
index 54a9a83..5345501 100644
--- a/src/compiler/dex/mir_optimization.cc
+++ b/src/compiler/dex/mir_optimization.cc
@@ -39,7 +39,7 @@
   constant_values_[ssa_reg + 1] = High32Bits(value);
 }
 
-bool MIRGraph::DoConstantPropogation(BasicBlock* bb)
+void MIRGraph::DoConstantPropogation(BasicBlock* bb)
 {
   MIR* mir;
 
@@ -94,7 +94,6 @@
     }
   }
   /* TODO: implement code to handle arithmetic operations */
-  return true;
 }
 
 void MIRGraph::PropagateConstants()
@@ -848,11 +847,7 @@
 {
   if (!(cu_->disable_opt & (1 << kBBOpt))) {
     DCHECK_EQ(cu_->num_compiler_temps, 0);
-    // Mark all blocks as not visited
-    AllNodesIterator iter(this, false /* not iterative */);
-    for (BasicBlock* bb = iter.Next(); bb != NULL; bb = iter.Next()) {
-      ClearVisitedFlag(bb);
-    }
+    ClearAllVisitedFlags();
     PreOrderDfsIterator iter2(this, false /* not iterative */);
     for (BasicBlock* bb = iter2.Next(); bb != NULL; bb = iter2.Next()) {
       BuildExtendedBBList(bb);
diff --git a/src/compiler/dex/quick/gen_invoke.cc b/src/compiler/dex/quick/gen_invoke.cc
index efacff0..9fd4a86 100644
--- a/src/compiler/dex/quick/gen_invoke.cc
+++ b/src/compiler/dex/quick/gen_invoke.cc
@@ -1158,11 +1158,13 @@
   }
   RegLocation rl_object = LoadValue(rl_src_obj, kCoreReg);
   RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
-  RegLocation rl_value = LoadValue(rl_src_value, kCoreReg);
+  RegLocation rl_value;
   if (is_long) {
+    rl_value = LoadValueWide(rl_src_value, kCoreReg);
     OpRegReg(kOpAdd, rl_object.low_reg, rl_offset.low_reg);
     StoreBaseDispWide(rl_object.low_reg, 0, rl_value.low_reg, rl_value.high_reg);
   } else {
+    rl_value = LoadValue(rl_src_value, kCoreReg);
     StoreBaseIndexed(rl_object.low_reg, rl_offset.low_reg, rl_value.low_reg, 0, kWord);
   }
   if (is_volatile) {
diff --git a/src/compiler/dex/quick/ralloc_util.cc b/src/compiler/dex/quick/ralloc_util.cc
index dd38914..30ed1b7 100644
--- a/src/compiler/dex/quick/ralloc_util.cc
+++ b/src/compiler/dex/quick/ralloc_util.cc
@@ -37,6 +37,10 @@
     if (reg_pool_->FPRegs[i].is_temp)
       reg_pool_->FPRegs[i].in_use = false;
   }
+  // Reset temp tracking sanity check.
+  if (kIsDebugBuild) {
+    live_sreg_ = INVALID_SREG;
+  }
 }
 
  /*
diff --git a/src/compiler/dex/ssa_transformation.cc b/src/compiler/dex/ssa_transformation.cc
index 02982e5..a90d705 100644
--- a/src/compiler/dex/ssa_transformation.cc
+++ b/src/compiler/dex/ssa_transformation.cc
@@ -21,6 +21,13 @@
 
 namespace art {
 
+void MIRGraph::ClearAllVisitedFlags() {
+  AllNodesIterator iter(this, false /* not iterative */);
+  for (BasicBlock* bb = iter.Next(); bb != NULL; bb = iter.Next()) {
+    bb->visited = false;
+  }
+}
+
 BasicBlock* MIRGraph::NeedsVisit(BasicBlock* bb) {
   if (bb != NULL) {
     if (bb->visited || bb->hidden) {
@@ -96,10 +103,8 @@
   }
 
   // Reset visited flags from all nodes
-  AllNodesIterator iter(this, false /* not iterative */);
-  for (BasicBlock* bb = iter.Next(); bb != NULL; bb = iter.Next()) {
-    ClearVisitedFlag(bb);
-  }
+  ClearAllVisitedFlags();
+
   // Record dfs orders
   RecordDFSOrders(GetEntryBlock());
 
@@ -158,26 +163,42 @@
   }
 }
 
-/* Compute the post-order traversal of the CFG */
-void MIRGraph::ComputeDomPostOrderTraversal(BasicBlock* bb)
-{
-  ArenaBitVector::Iterator bv_iterator(bb->i_dominated);
-
-  /* Iterate through the dominated blocks first */
-  while (true) {
-    //TUNING: hot call to BitVectorIteratorNext
-    int bb_idx = bv_iterator.Next();
-    if (bb_idx == -1) break;
-    BasicBlock* dominated_bb = GetBasicBlock(bb_idx);
-    ComputeDomPostOrderTraversal(dominated_bb);
+void MIRGraph::ComputeDomPostOrderTraversal(BasicBlock* bb) {
+  if (dom_post_order_traversal_ == NULL) {
+    // First time - create the array.
+    dom_post_order_traversal_ =
+        new (arena_) GrowableArray<int>(arena_, num_reachable_blocks_,
+                                        kGrowableArrayDomPostOrderTraversal);
+  } else {
+    dom_post_order_traversal_->Reset();
   }
+  ClearAllVisitedFlags();
+  std::vector<std::pair<BasicBlock*, ArenaBitVector::Iterator*> > work_stack;
+  bb->visited = true;
+  work_stack.push_back(std::make_pair(bb, new (arena_) ArenaBitVector::Iterator(bb->i_dominated)));
+  while (!work_stack.empty()) {
+    std::pair<BasicBlock*, ArenaBitVector::Iterator*> curr = work_stack.back();
+    BasicBlock* curr_bb = curr.first;
+    ArenaBitVector::Iterator* curr_idom_iter = curr.second;
+    int bb_idx = curr_idom_iter->Next();
+    while ((bb_idx != -1) && (NeedsVisit(GetBasicBlock(bb_idx)) == NULL)) {
+      bb_idx = curr_idom_iter->Next();
+    }
+    if (bb_idx != -1) {
+      BasicBlock* new_bb = GetBasicBlock(bb_idx);
+      new_bb->visited = true;
+      work_stack.push_back(
+          std::make_pair(new_bb, new (arena_) ArenaBitVector::Iterator(new_bb->i_dominated)));
+    } else {
+      // no successor/next
+      dom_post_order_traversal_->Insert(curr_bb->id);
+      work_stack.pop_back();
 
-  /* Enter the current block id */
-  dom_post_order_traversal_->Insert(bb->id);
-
-  /* hacky loop detection */
-  if (bb->taken && bb->dominators->IsBitSet(bb->taken->id)) {
-    attributes_ |= METHOD_HAS_LOOP;
+      /* hacky loop detection */
+      if (curr_bb->taken && curr_bb->dominators->IsBitSet(curr_bb->taken->id)) {
+        attributes_ |= METHOD_HAS_LOOP;
+      }
+    }
   }
 }
 
@@ -401,21 +422,8 @@
     ComputeBlockDominators(bb);
   }
 
-  /*
-   * Now go ahead and compute the post order traversal based on the
-   * i_dominated sets.
-   */
-  if (dom_post_order_traversal_ == NULL) {
-    dom_post_order_traversal_ =
-        new (arena_) GrowableArray<int>(arena_, num_reachable_blocks, kGrowableArrayDomPostOrderTraversal);
-  } else {
-    dom_post_order_traversal_->Reset();
-  }
-
+  // Compute the dominance frontier for each block.
   ComputeDomPostOrderTraversal(GetEntryBlock());
-  DCHECK_EQ(dom_post_order_traversal_->Size(), num_reachable_blocks_);
-
-  /* Now compute the dominance frontier for each block */
   PostOrderDOMIterator iter5(this, false /* not iterative */);
   for (BasicBlock* bb = iter5.Next(); bb != NULL; bb = iter5.Next()) {
     ComputeDominanceFrontier(bb);
@@ -673,10 +681,7 @@
   InsertPhiNodes();
 
   /* Rename register names by local defs and phi nodes */
-  AllNodesIterator iter1(this, false /* not iterative */);
-  for (BasicBlock* bb = iter1.Next(); bb != NULL; bb = iter1.Next()) {
-    ClearVisitedFlag(bb);
-  }
+  ClearAllVisitedFlags();
   DoDFSPreOrderSSARename(GetEntryBlock());
 
   /*