Clean up sampling tracing.

- Moved the per-thread sampling maps into fields on Thread.
- Added a temporary trace field so stack traces are reused, reducing the
  number of traces allocated and freed (see the sketch below).
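
A stand-alone sketch of the reuse pattern, for illustration only (the
TraceCache class and the int element type are invented stand-ins here;
the real helpers are Trace::AllocStackTrace()/FreeStackTrace(), which
cache a single std::vector<mirror::ArtMethod*> in a static UniquePtr):

    #include <memory>
    #include <vector>

    // Single-slot cache: each sampling tick frees at most one old trace
    // before allocating a new one, so caching a single vector removes
    // most of the allocate/free churn.
    class TraceCache {
     public:
      // Return the cached vector if present (caller takes ownership),
      // otherwise allocate a fresh one.
      std::vector<int>* Alloc() {
        if (cache_ != nullptr) {
          return cache_.release();
        }
        return new std::vector<int>();
      }

      // Clear the vector and stash it for the next Alloc(). clear()
      // keeps the heap buffer, so the next sample refills it without
      // re-allocating.
      void Free(std::vector<int>* trace) {
        trace->clear();
        cache_.reset(trace);
      }

     private:
      std::unique_ptr<std::vector<int>> cache_;
    };

As in the change itself, this is only safe because a single thread (the
sampling pthread) performs every Alloc()/Free() call, so the one slot
needs no locking.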

Change-Id: I23fe25a85ad2894cc6917f87d4046cdececf0739
diff --git a/runtime/thread.cc b/runtime/thread.cc
index c4077de..8fc744d 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -961,6 +961,8 @@
       jpeer_(NULL),
       stack_begin_(NULL),
       stack_size_(0),
+      stack_trace_sample_(NULL),
+      trace_clock_base_(0),
       thin_lock_id_(0),
       tid_(0),
       wait_mutex_(new Mutex("a thread wait mutex")),
@@ -1077,6 +1079,7 @@
   delete debug_invoke_req_;
   delete instrumentation_stack_;
   delete name_;
+  delete stack_trace_sample_;
 
   TearDownAlternateSignalStack();
 }
diff --git a/runtime/thread.h b/runtime/thread.h
index f16695d..40e3f5f 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -552,6 +552,22 @@
     return instrumentation_stack_;
   }
 
+  std::vector<mirror::ArtMethod*>* GetStackTraceSample() const {
+    return stack_trace_sample_;
+  }
+
+  void SetStackTraceSample(std::vector<mirror::ArtMethod*>* sample) {
+    stack_trace_sample_ = sample;
+  }
+
+  uint64_t GetTraceClockBase() const {
+    return trace_clock_base_;
+  }
+
+  void SetTraceClockBase(uint64_t clock_base) {
+    trace_clock_base_ = clock_base;
+  }
+
   BaseMutex* GetHeldMutex(LockLevel level) const {
     return held_mutexes_[level];
   }
@@ -698,6 +714,12 @@
   // Size of the stack
   size_t stack_size_;
 
+  // Pointer to the previous stack trace captured by the sampling profiler.
+  std::vector<mirror::ArtMethod*>* stack_trace_sample_;
+
+  // The clock base used for tracing.
+  uint64_t trace_clock_base_;
+
   // Thin lock thread id. This is a small integer used by the thin lock implementation.
   // This is not to be confused with the native thread's tid, nor is it the value returned
   // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
diff --git a/runtime/trace.cc b/runtime/trace.cc
index d435129..7dbd35a 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -87,7 +87,7 @@
 class BuildStackTraceVisitor : public StackVisitor {
  public:
   explicit BuildStackTraceVisitor(Thread* thread) : StackVisitor(thread, NULL),
-      method_trace_(new std::vector<mirror::ArtMethod*>) {}
+      method_trace_(Trace::AllocStackTrace()) {}
 
   bool VisitFrame() {
     mirror::ArtMethod* m = GetMethod();
@@ -126,6 +126,7 @@
 bool Trace::sampling_enabled_ = true;
 uint32_t Trace::sampling_interval_us_ = 10000;
 pthread_t Trace::sampling_pthread_ = 0U;
+UniquePtr<std::vector<mirror::ArtMethod*> > Trace::temp_stack_trace_;
 
 static mirror::ArtMethod* DecodeTraceMethodId(uint32_t tmid) {
   return reinterpret_cast<mirror::ArtMethod*>(tmid & ~kTraceMethodActionMask);
@@ -142,12 +143,25 @@
   return tmid;
 }
 
+std::vector<mirror::ArtMethod*>* Trace::AllocStackTrace() {
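+  // Hand back the cached trace when one exists; the caller takes ownership.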
+  if (temp_stack_trace_.get() != NULL) {
+    return temp_stack_trace_.release();
+  } else {
+    return new std::vector<mirror::ArtMethod*>();
+  }
+}
+
+void Trace::FreeStackTrace(std::vector<mirror::ArtMethod*>* stack_trace) {
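+  // clear() keeps the vector's capacity, so a reused trace refills without re-allocating its buffer.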
+  stack_trace->clear();
+  temp_stack_trace_.reset(stack_trace);
+}
+
 void Trace::SetDefaultClockSource(ProfilerClockSource clock_source) {
 #if defined(HAVE_POSIX_CLOCKS)
   default_clock_source_ = clock_source;
 #else
   if (clock_source != kProfilerClockSourceWall) {
-    LOG(WARNING) << "Ignoring tracing request to use ";
+    LOG(WARNING) << "Ignoring tracing request to use CPU time.";
   }
 #endif
 }
@@ -181,7 +195,8 @@
   }
 }
 
-static uint32_t GetClockOverhead(Trace* trace) {
+// Compute the average time, in nanoseconds, that a single clock measurement takes.
+static uint32_t GetClockOverheadNanoSeconds(Trace* trace) {
   Thread* self = Thread::Current();
   uint64_t start = self->GetCpuMicroTime();
 
@@ -196,34 +211,34 @@
     MeasureClockOverhead(trace);
   }
 
-  uint64_t elapsed = self->GetCpuMicroTime() - start;
-  return uint32_t (elapsed / 32);
+  uint64_t elapsed_us = self->GetCpuMicroTime() - start;
+  return static_cast<uint32_t>(elapsed_us / 32);
 }
 
 // TODO: put this somewhere with the big-endian equivalent used by JDWP.
 static void Append2LE(uint8_t* buf, uint16_t val) {
-  *buf++ = (uint8_t) val;
-  *buf++ = (uint8_t) (val >> 8);
+  *buf++ = static_cast<uint8_t>(val);
+  *buf++ = static_cast<uint8_t>(val >> 8);
 }
 
 // TODO: put this somewhere with the big-endian equivalent used by JDWP.
 static void Append4LE(uint8_t* buf, uint32_t val) {
-  *buf++ = (uint8_t) val;
-  *buf++ = (uint8_t) (val >> 8);
-  *buf++ = (uint8_t) (val >> 16);
-  *buf++ = (uint8_t) (val >> 24);
+  *buf++ = static_cast<uint8_t>(val);
+  *buf++ = static_cast<uint8_t>(val >> 8);
+  *buf++ = static_cast<uint8_t>(val >> 16);
+  *buf++ = static_cast<uint8_t>(val >> 24);
 }
 
 // TODO: put this somewhere with the big-endian equivalent used by JDWP.
 static void Append8LE(uint8_t* buf, uint64_t val) {
-  *buf++ = (uint8_t) val;
-  *buf++ = (uint8_t) (val >> 8);
-  *buf++ = (uint8_t) (val >> 16);
-  *buf++ = (uint8_t) (val >> 24);
-  *buf++ = (uint8_t) (val >> 32);
-  *buf++ = (uint8_t) (val >> 40);
-  *buf++ = (uint8_t) (val >> 48);
-  *buf++ = (uint8_t) (val >> 56);
+  *buf++ = static_cast<uint8_t>(val);
+  *buf++ = static_cast<uint8_t>(val >> 8);
+  *buf++ = static_cast<uint8_t>(val >> 16);
+  *buf++ = static_cast<uint8_t>(val >> 24);
+  *buf++ = static_cast<uint8_t>(val >> 32);
+  *buf++ = static_cast<uint8_t>(val >> 40);
+  *buf++ = static_cast<uint8_t>(val >> 48);
+  *buf++ = static_cast<uint8_t>(val >> 56);
 }
 
 static void GetSample(Thread* thread, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -234,14 +249,22 @@
   the_trace->CompareAndUpdateStackTrace(thread, stack_trace);
 }
 
+static void ClearThreadStackTraceAndClockBase(Thread* thread, void* arg) {
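+  // Called once per thread (via ThreadList::ForEach) when sampling stops.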
+  thread->SetTraceClockBase(0);
+  std::vector<mirror::ArtMethod*>* stack_trace = thread->GetStackTraceSample();
+  thread->SetStackTraceSample(NULL);
+  delete stack_trace;
+}
+
 void Trace::CompareAndUpdateStackTrace(Thread* thread,
                                        std::vector<mirror::ArtMethod*>* stack_trace) {
   CHECK_EQ(pthread_self(), sampling_pthread_);
-  SafeMap<Thread*, std::vector<mirror::ArtMethod*>*>::iterator map_it = thread_stack_trace_map_.find(thread);
-  if (map_it == thread_stack_trace_map_.end()) {
-    // If there's no existing stack trace in the map for this thread, log an entry event for all
+  std::vector<mirror::ArtMethod*>* old_stack_trace = thread->GetStackTraceSample();
+  // Update the thread's stack trace sample.
+  thread->SetStackTraceSample(stack_trace);
+  if (old_stack_trace == NULL) {
+    // If there's no previous stack trace sample for this thread, log an entry event for all
     // methods in the trace.
-    thread_stack_trace_map_.Put(thread, stack_trace);
     for (std::vector<mirror::ArtMethod*>::reverse_iterator rit = stack_trace->rbegin();
          rit != stack_trace->rend(); ++rit) {
       LogMethodTraceEvent(thread, *rit, instrumentation::Instrumentation::kMethodEntered);
@@ -249,11 +272,8 @@
   } else {
     // If there's a previous stack trace for this thread, diff the traces and emit entry and exit
     // events accordingly.
-    std::vector<mirror::ArtMethod*>* old_stack_trace = map_it->second;
-    thread_stack_trace_map_.Overwrite(thread, stack_trace);
     std::vector<mirror::ArtMethod*>::reverse_iterator old_rit = old_stack_trace->rbegin();
     std::vector<mirror::ArtMethod*>::reverse_iterator rit = stack_trace->rbegin();
-
     // Iterate bottom-up over both traces until there's a difference between them.
     while (old_rit != old_stack_trace->rend() && rit != stack_trace->rend() && *old_rit == *rit) {
       old_rit++;
@@ -268,7 +288,7 @@
     for (; rit != stack_trace->rend(); ++rit) {
       LogMethodTraceEvent(thread, *rit, instrumentation::Instrumentation::kMethodEntered);
     }
-    delete old_stack_trace;
+    FreeStackTrace(old_stack_trace);
   }
 }
 
@@ -279,7 +299,7 @@
 
   while (true) {
     usleep(sampling_interval_us_);
-
+    ATRACE_BEGIN("Profile sampling");
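+    // Paired with the ATRACE_END() below so each sampling pass shows up in systrace.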
     Thread* self = Thread::Current();
     Trace* the_trace;
     {
@@ -296,6 +316,7 @@
       runtime->GetThreadList()->ForEach(GetSample, the_trace);
     }
     runtime->GetThreadList()->ResumeAll();
+    ATRACE_END();
   }
 
   runtime->DetachCurrentThread();
@@ -379,7 +400,10 @@
   if (the_trace != NULL) {
     the_trace->FinishTracing();
 
-    if (!sampling_enabled_) {
+    if (sampling_enabled_) {
+      MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+      runtime->GetThreadList()->ForEach(ClearThreadStackTraceAndClockBase, NULL);
+    } else {
       runtime->GetInstrumentation()->RemoveListener(the_trace,
                                                     instrumentation::Instrumentation::kMethodEntered |
                                                     instrumentation::Instrumentation::kMethodExited |
@@ -427,7 +451,6 @@
 
 Trace::~Trace() {
   CHECK_EQ(sampling_pthread_, static_cast<pthread_t>(0U));
-  STLDeleteValues(&thread_stack_trace_map_);
 }
 
 static void DumpBuf(uint8_t* buf, size_t buf_size, ProfilerClockSource clock_source)
@@ -449,7 +472,7 @@
   uint64_t elapsed = MicroTime() - start_time_;
 
   size_t final_offset = cur_offset_;
-  uint32_t clock_overhead = GetClockOverhead(this);
+  uint32_t clock_overhead_ns = GetClockOverheadNanoSeconds(this);
 
   if ((flags_ & kTraceCountAllocs) != 0) {
     Runtime::Current()->SetStatsEnabled(false);
@@ -475,7 +498,7 @@
   os << StringPrintf("elapsed-time-usec=%llu\n", elapsed);
   size_t num_records = (final_offset - kTraceHeaderLength) / GetRecordSize(clock_source_);
   os << StringPrintf("num-method-calls=%zd\n", num_records);
-  os << StringPrintf("clock-call-overhead-nsec=%d\n", clock_overhead);
+  os << StringPrintf("clock-call-overhead-nsec=%d\n", clock_overhead_ns);
   os << StringPrintf("vm=art\n");
   if ((flags_ & kTraceCountAllocs) != 0) {
     os << StringPrintf("alloc-count=%d\n", Runtime::Current()->GetStat(KIND_ALLOCATED_OBJECTS));
@@ -578,16 +601,14 @@
   ptr += 6;
 
   if (UseThreadCpuClock()) {
-    // TODO: this isn't vaguely thread safe.
-    SafeMap<Thread*, uint64_t>::iterator it = thread_clock_base_map_.find(thread);
+    uint64_t clock_base = thread->GetTraceClockBase();
     uint32_t thread_clock_diff = 0;
-    if (UNLIKELY(it == thread_clock_base_map_.end())) {
-      // First event, the diff is 0, record the base time in the map.
+    if (UNLIKELY(clock_base == 0)) {
+      // First event; the diff stays 0, so record the base time on the thread.
       uint64_t time = thread->GetCpuMicroTime();
-      thread_clock_base_map_.Put(thread, time);
+      thread->SetTraceClockBase(time);
     } else {
-      uint64_t thread_clock_base = it->second;
-      thread_clock_diff = thread->GetCpuMicroTime() - thread_clock_base;
+      thread_clock_diff = thread->GetCpuMicroTime() - clock_base;
     }
     Append4LE(ptr, thread_clock_diff);
     ptr += 4;
diff --git a/runtime/trace.h b/runtime/trace.h
index 5733929..9fe2cc6 100644
--- a/runtime/trace.h
+++ b/runtime/trace.h
@@ -83,6 +83,11 @@
                                mirror::Throwable* exception_object)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Reuse an old stack trace if it exists, otherwise allocate a new one.
+  static std::vector<mirror::ArtMethod*>* AllocStackTrace();
+  // Clear and store an old stack trace for later use.
+  static void FreeStackTrace(std::vector<mirror::ArtMethod*>* stack_trace);
+
   ~Trace();
 
  private:
@@ -116,11 +121,8 @@
   // Sampling thread, non-zero when sampling.
   static pthread_t sampling_pthread_;
 
-  // Maps a thread to its most recent stack trace sample.
-  SafeMap<Thread*, std::vector<mirror::ArtMethod*>*> thread_stack_trace_map_;
-
-  // Maps a thread to its clock base.
-  SafeMap<Thread*, uint64_t> thread_clock_base_map_;
+  // Used to remember an unused stack trace to avoid re-allocation during sampling.
+  static UniquePtr<std::vector<mirror::ArtMethod*> > temp_stack_trace_;
 
   // File to write trace data out to, NULL if direct to ddms.
   UniquePtr<File> trace_file_;