optimizing: Add statistics for # of constructor fences added/removed

Statistics are attributed as follows:

Added because:
* HNewInstance requires an HConstructorFence following it.
* HReturnVoid requires an HConstructorFence (for final fields) preceding it.

Removed because:
* Optimized in Load-Store-Elimination.
* Optimized in Prepare-For-Register-Allocation.

Test: art/test.py
Bug: 36656456
Change-Id: Ic119441c5151a5a840fc6532b411340e2d68e5eb
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index b66883f..ca3b191 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -664,6 +664,9 @@
       DCHECK(fence_target != nullptr);
 
       AppendInstruction(new (arena_) HConstructorFence(fence_target, dex_pc, arena_));
+      MaybeRecordStat(
+          compilation_stats_,
+          MethodCompilationStat::kConstructorFenceGeneratedFinal);
     }
     AppendInstruction(new (arena_) HReturnVoid(dex_pc));
   } else {
@@ -1034,6 +1037,9 @@
   HConstructorFence* ctor_fence =
       new (arena_) HConstructorFence(allocation, allocation->GetDexPc(), arena_);
   AppendInstruction(ctor_fence);
+  MaybeRecordStat(
+      compilation_stats_,
+      MethodCompilationStat::kConstructorFenceGeneratedNew);
 }
 
 static bool IsSubClass(mirror::Class* to_test, mirror::Class* super_class)
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index fddda3d..98b8592 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -40,8 +40,9 @@
  public:
   LSEVisitor(HGraph* graph,
              const HeapLocationCollector& heap_locations_collector,
-             const SideEffectsAnalysis& side_effects)
-      : HGraphVisitor(graph),
+             const SideEffectsAnalysis& side_effects,
+             OptimizingCompilerStats* stats)
+      : HGraphVisitor(graph, stats),
         heap_location_collector_(heap_locations_collector),
         side_effects_(side_effects),
         heap_values_for_(graph->GetBlocks().size(),
@@ -100,7 +101,10 @@
     //   * - Constructor fences (they never escape this thread).
     //   * - Allocations (if they are unused).
     for (HInstruction* new_instance : singleton_new_instances_) {
-      HConstructorFence::RemoveConstructorFences(new_instance);
+      size_t removed = HConstructorFence::RemoveConstructorFences(new_instance);
+      MaybeRecordStat(stats_,
+                      MethodCompilationStat::kConstructorFenceRemovedLSE,
+                      removed);
 
       if (!new_instance->HasNonEnvironmentUses()) {
         new_instance->RemoveEnvironmentUsers();
@@ -108,7 +112,10 @@
       }
     }
     for (HInstruction* new_array : singleton_new_arrays_) {
-      HConstructorFence::RemoveConstructorFences(new_array);
+      size_t removed = HConstructorFence::RemoveConstructorFences(new_array);
+      MaybeRecordStat(stats_,
+                      MethodCompilationStat::kConstructorFenceRemovedLSE,
+                      removed);
 
       if (!new_array->HasNonEnvironmentUses()) {
         new_array->RemoveEnvironmentUsers();
@@ -663,7 +670,7 @@
     return;
   }
 
-  LSEVisitor lse_visitor(graph_, heap_location_collector, side_effects_);
+  LSEVisitor lse_visitor(graph_, heap_location_collector, side_effects_, stats_);
   for (HBasicBlock* block : graph_->GetReversePostOrder()) {
     lse_visitor.VisitBasicBlock(block);
   }
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 3a1864b..8644f67 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1198,11 +1198,14 @@
   DCHECK_EQ(0u, InputCount());
 }
 
-void HConstructorFence::RemoveConstructorFences(HInstruction* instruction) {
+size_t HConstructorFence::RemoveConstructorFences(HInstruction* instruction) {
   DCHECK(instruction->GetBlock() != nullptr);
   // Removing constructor fences only makes sense for instructions with an object return type.
   DCHECK_EQ(Primitive::kPrimNot, instruction->GetType());
 
+  // Return how many instructions were removed for statistic purposes.
+  size_t remove_count = 0;
+
   // Efficient implementation that simultaneously (in one pass):
   // * Scans the uses list for all constructor fences.
   // * Deletes that constructor fence from the uses list of `instruction`.
@@ -1250,6 +1253,7 @@
       // is removed.
       if (ctor_fence->InputCount() == 0u) {
         ctor_fence->GetBlock()->RemoveInstruction(ctor_fence);
+        ++remove_count;
       }
     }
   }
@@ -1263,6 +1267,8 @@
     }
     CHECK(instruction->GetBlock() != nullptr);
   }
+
+  return remove_count;
 }
 
 HInstruction* HConstructorFence::GetAssociatedAllocation() {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e443142..29be8ac 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -6630,7 +6630,9 @@
   // This must *not* be called during/after prepare_for_register_allocation,
   // because that removes all the inputs to the fences but the fence is actually
   // still considered live.
-  static void RemoveConstructorFences(HInstruction* instruction);
+  //
+  // Returns how many HConstructorFence instructions were removed from graph.
+  static size_t RemoveConstructorFences(HInstruction* instruction);
 
   // Check if this constructor fence is protecting
   // an HNewInstance or HNewArray that is also the immediate
@@ -6878,9 +6880,13 @@
 
 namespace art {
 
+class OptimizingCompilerStats;
+
 class HGraphVisitor : public ValueObject {
  public:
-  explicit HGraphVisitor(HGraph* graph) : graph_(graph) {}
+  explicit HGraphVisitor(HGraph* graph, OptimizingCompilerStats* stats = nullptr)
+      : stats_(stats),
+        graph_(graph) {}
   virtual ~HGraphVisitor() {}
 
   virtual void VisitInstruction(HInstruction* instruction ATTRIBUTE_UNUSED) {}
@@ -6902,6 +6908,9 @@
 
 #undef DECLARE_VISIT_INSTRUCTION
 
+ protected:
+  OptimizingCompilerStats* stats_;
+
  private:
   HGraph* const graph_;
 
@@ -6910,7 +6919,8 @@
 
 class HGraphDelegateVisitor : public HGraphVisitor {
  public:
-  explicit HGraphDelegateVisitor(HGraph* graph) : HGraphVisitor(graph) {}
+  explicit HGraphDelegateVisitor(HGraph* graph, OptimizingCompilerStats* stats = nullptr)
+      : HGraphVisitor(graph, stats) {}
   virtual ~HGraphDelegateVisitor() {}
 
   // Visit functions that delegate to to super class.
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 70bbc38..435ca1c 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -711,11 +711,12 @@
 static void AllocateRegisters(HGraph* graph,
                               CodeGenerator* codegen,
                               PassObserver* pass_observer,
-                              RegisterAllocator::Strategy strategy) {
+                              RegisterAllocator::Strategy strategy,
+                              OptimizingCompilerStats* stats) {
   {
     PassScope scope(PrepareForRegisterAllocation::kPrepareForRegisterAllocationPassName,
                     pass_observer);
-    PrepareForRegisterAllocation(graph).Run();
+    PrepareForRegisterAllocation(graph, stats).Run();
   }
   SsaLivenessAnalysis liveness(graph, codegen);
   {
@@ -1035,7 +1036,8 @@
   AllocateRegisters(graph,
                     codegen.get(),
                     &pass_observer,
-                    regalloc_strategy);
+                    regalloc_strategy,
+                    compilation_stats_.get());
 
   codegen->Compile(code_allocator);
   pass_observer.DumpDisassembly();
diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h
index 098d23d..d6da73c 100644
--- a/compiler/optimizing/optimizing_compiler_stats.h
+++ b/compiler/optimizing/optimizing_compiler_stats.h
@@ -23,6 +23,7 @@
 #include <type_traits>
 
 #include "atomic.h"
+#include "globals.h"
 
 namespace art {
 
@@ -86,6 +87,10 @@
   kNotInlinedWont,
   kNotInlinedRecursiveBudget,
   kNotInlinedProxy,
+  kConstructorFenceGeneratedNew,
+  kConstructorFenceGeneratedFinal,
+  kConstructorFenceRemovedLSE,
+  kConstructorFenceRemovedPFRA,
   kLastStat
 };
 
@@ -202,6 +207,10 @@
       case kNotInlinedWont: name = "NotInlinedWont"; break;
       case kNotInlinedRecursiveBudget: name = "NotInlinedRecursiveBudget"; break;
       case kNotInlinedProxy: name = "NotInlinedProxy"; break;
+      case kConstructorFenceGeneratedNew: name = "ConstructorFenceGeneratedNew"; break;
+      case kConstructorFenceGeneratedFinal: name = "ConstructorFenceGeneratedFinal"; break;
+      case kConstructorFenceRemovedLSE: name = "ConstructorFenceRemovedLSE"; break;
+      case kConstructorFenceRemovedPFRA: name = "ConstructorFenceRemovedPFRA"; break;
 
       case kLastStat:
         LOG(FATAL) << "invalid stat "
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index 7c6b69f..5de707a 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -17,6 +17,7 @@
 #include "prepare_for_register_allocation.h"
 
 #include "jni_internal.h"
+#include "optimizing_compiler_stats.h"
 #include "well_known_classes.h"
 
 namespace art {
@@ -190,8 +191,9 @@
       // TODO: GetAssociatedAllocation should not care about multiple inputs
       // if we are in prepare_for_register_allocation pass only.
       constructor_fence->GetBlock()->RemoveInstruction(constructor_fence);
+      MaybeRecordStat(stats_,
+                      MethodCompilationStat::kConstructorFenceRemovedPFRA);
       return;
-      // TODO: actually remove the dmb from the .S entrypoints (initialized variants only).
     }
 
     // HNewArray does not need this check because the art_quick_alloc_array does not itself
diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h
index 395d4ba..2c64f01 100644
--- a/compiler/optimizing/prepare_for_register_allocation.h
+++ b/compiler/optimizing/prepare_for_register_allocation.h
@@ -21,6 +21,8 @@
 
 namespace art {
 
+class OptimizingCompilerStats;
+
 /**
  * A simplification pass over the graph before doing register allocation.
  * For example it changes uses of null checks and bounds checks to the original
@@ -28,7 +30,9 @@
  */
 class PrepareForRegisterAllocation : public HGraphDelegateVisitor {
  public:
-  explicit PrepareForRegisterAllocation(HGraph* graph) : HGraphDelegateVisitor(graph) {}
+  explicit PrepareForRegisterAllocation(HGraph* graph,
+                                        OptimizingCompilerStats* stats = nullptr)
+      : HGraphDelegateVisitor(graph, stats) {}
 
   void Run();