ART: add weighted allocated bytes metrics

`process_cpu_start_time_` is moved from `art::Runtime` to `art::gc:Heap`
so that it could be reset when the app starts (current process is forked
from Zygote).

`process_cpu_end_time_` is removed from `art::Runtime` because it's only
used when the runtime is about to shutdown, so it suffices to capture it
in a local variable.

Test: Run art with -XX:DumpGCPerformanceOnShutdown on some benchmarks.
Bug: 112187497
Change-Id: I154fdb6acdf4e0c21dff835807bd4e2cf311e3d1
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index 8477c9d..46ff7dc 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -90,6 +90,7 @@
   Thread* self = Thread::Current();
   uint64_t start_time = NanoTime();
   uint64_t thread_cpu_start_time = ThreadCpuNanoTime();
+  GetHeap()->CalculateWeightedAllocatedBytes();
   Iteration* current_iteration = GetCurrentIteration();
   current_iteration->Reset(gc_cause, clear_soft_references);
   // Note transaction mode is single-threaded and there's no asynchronous GC and this flag doesn't
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index f767360..86135c1 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -210,6 +210,9 @@
       low_memory_mode_(low_memory_mode),
       long_pause_log_threshold_(long_pause_log_threshold),
       long_gc_log_threshold_(long_gc_log_threshold),
+      process_cpu_start_time_ns_(ProcessCpuNanoTime()),
+      last_process_cpu_time_ns_(process_cpu_start_time_ns_),
+      weighted_allocated_bytes_(0u),
       ignore_max_footprint_(ignore_max_footprint),
       zygote_creation_lock_("zygote creation lock", kZygoteCreationLock),
       zygote_space_(nullptr),
@@ -1062,6 +1065,14 @@
   }
 }
 
+void Heap::CalculateWeightedAllocatedBytes() {
+  uint64_t current_process_cpu_time = ProcessCpuNanoTime();
+  uint64_t bytes_allocated = GetBytesAllocated();
+  uint64_t weight = current_process_cpu_time - last_process_cpu_time_ns_;
+  weighted_allocated_bytes_ += weight * bytes_allocated;
+  last_process_cpu_time_ns_ = current_process_cpu_time;
+}
+
 uint64_t Heap::GetTotalGcCpuTime() {
   uint64_t sum = 0;
   for (auto* collector : garbage_collectors_) {
@@ -1139,6 +1150,11 @@
   for (auto* collector : garbage_collectors_) {
     collector->ResetMeasurements();
   }
+
+  process_cpu_start_time_ns_ = ProcessCpuNanoTime();
+  last_process_cpu_time_ns_ = process_cpu_start_time_ns_;
+  weighted_allocated_bytes_ = 0u;
+
   total_bytes_freed_ever_ = 0;
   total_objects_freed_ever_ = 0;
   total_wait_time_ = 0;
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index a43f315..411a446 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -395,8 +395,17 @@
     REQUIRES(!Locks::heap_bitmap_lock_)
     REQUIRES(Locks::mutator_lock_);
 
+  uint64_t GetWeightedAllocatedBytes() const {
+    return weighted_allocated_bytes_;
+  }
+
+  void CalculateWeightedAllocatedBytes();
   uint64_t GetTotalGcCpuTime();
 
+  uint64_t GetProcessCpuStartTime() const {
+    return process_cpu_start_time_ns_;
+  }
+
   // Set target ideal heap utilization ratio, implements
   // dalvik.system.VMRuntime.setTargetHeapUtilization.
   void SetTargetHeapUtilization(float target);
@@ -1161,6 +1170,15 @@
   // If we get a GC longer than long GC log threshold, then we print out the GC after it finishes.
   const size_t long_gc_log_threshold_;
 
+  // Starting time of the new process; meant to be used for measuring total process CPU time.
+  uint64_t process_cpu_start_time_ns_;
+
+  // Last time GC started; meant to be used to measure the duration between two GCs.
+  uint64_t last_process_cpu_time_ns_;
+
+  // allocated_bytes * (current_process_cpu_time - last_process_cpu_time)
+  uint64_t weighted_allocated_bytes_;
+
   // If we ignore the max footprint it lets the heap grow until it hits the heap capacity, this is
   // useful for benchmarking since it reduces time spent in GC to a low %.
   const bool ignore_max_footprint_;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index ab79b9e..5323cee 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -278,7 +278,6 @@
       // Initially assume we perceive jank in case the process state is never updated.
       process_state_(kProcessStateJankPerceptible),
       zygote_no_threads_(false),
-      process_cpu_start_time_(ProcessCpuNanoTime()),
       verifier_logging_threshold_ms_(100) {
   static_assert(Runtime::kCalleeSaveSize ==
                     static_cast<uint32_t>(CalleeSaveType::kLastCalleeSaveType), "Unexpected size");
@@ -322,20 +321,26 @@
   }
 
   if (dump_gc_performance_on_shutdown_) {
-    process_cpu_end_time_ = ProcessCpuNanoTime();
+    heap_->CalculateWeightedAllocatedBytes();
+    uint64_t process_cpu_end_time = ProcessCpuNanoTime();
     ScopedLogSeverity sls(LogSeverity::INFO);
     // This can't be called from the Heap destructor below because it
     // could call RosAlloc::InspectAll() which needs the thread_list
     // to be still alive.
     heap_->DumpGcPerformanceInfo(LOG_STREAM(INFO));
 
-    uint64_t process_cpu_time = process_cpu_end_time_ - process_cpu_start_time_;
+    uint64_t process_cpu_time = process_cpu_end_time - heap_->GetProcessCpuStartTime();
     uint64_t gc_cpu_time = heap_->GetTotalGcCpuTime();
     float ratio = static_cast<float>(gc_cpu_time) / process_cpu_time;
     LOG_STREAM(INFO) << "GC CPU time " << PrettyDuration(gc_cpu_time)
         << " out of process CPU time " << PrettyDuration(process_cpu_time)
         << " (" << ratio << ")"
         << "\n";
+    float weighted_allocated_bytes =
+      static_cast<float>(heap_->GetWeightedAllocatedBytes()) / process_cpu_time;
+    LOG_STREAM(INFO) << "Weighted bytes allocated over CPU time: "
+        << " (" <<  PrettySize(weighted_allocated_bytes)  << ")"
+        << "\n";
   }
 
   if (jit_ != nullptr) {
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 0ccc7b7..1d58ad7 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -1101,9 +1101,6 @@
 
   MemMap protected_fault_page_;
 
-  uint64_t process_cpu_start_time_;
-  uint64_t process_cpu_end_time_;
-
   uint32_t verifier_logging_threshold_ms_;
 
   DISALLOW_COPY_AND_ASSIGN(Runtime);