ART: Use ThreadList RunCheckpoint for GetAllStackTraces

Move to the ThreadList checkpoint code to avoid races. The tradeoff is more
complicated infrastructure for collecting and storing the traces.
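
The new flow, roughly (a simplified sketch of the checkpoint pattern, not
the exact code; see ti_stack.cc below for the real closure and data types):

  struct CollectClosure : public art::Closure {
    void Run(art::Thread* thread) OVERRIDE
        REQUIRES_SHARED(art::Locks::mutator_lock_) {
      // Runs once per thread at a checkpoint; appends the thread's frames
      // to shared, mutex-guarded storage.
    }
  };

  CollectClosure closure;
  // Blocks until the closure has run for every thread.
  art::Runtime::Current()->GetThreadList()->RunCheckpoint(&closure, nullptr);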

Bug: 62353392
Test: m test-art-host
Test: art/test/testrunner/testrunner.py -b --host -t 911
Change-Id: I71e957dbfbd619f6cfa0cdd1e6cb894e25cf5483
diff --git a/runtime/openjdkjvmti/ti_stack.cc b/runtime/openjdkjvmti/ti_stack.cc
index 550b972..9f4002d 100644
--- a/runtime/openjdkjvmti/ti_stack.cc
+++ b/runtime/openjdkjvmti/ti_stack.cc
@@ -286,6 +286,37 @@
                               count_ptr);
 }
 
+template <typename Data>
+struct GetAllStackTracesVectorClosure : public art::Closure {
+  GetAllStackTracesVectorClosure(size_t stop, Data* data_) : stop_input(stop), data(data_) {}
+
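+  // Invoked by ThreadList::RunCheckpoint: either by the target thread itself at a
+  // suspend point, or by the checkpoint runner on behalf of a suspended thread.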
+  void Run(art::Thread* thread) OVERRIDE
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      REQUIRES(!data->mutex) {
+    art::Thread* self = art::Thread::Current();
+
+    // Skip threads that are still starting.
+    if (thread->IsStillStarting()) {
+      return;
+    }
+
+    std::vector<jvmtiFrameInfo>* thread_frames = data->GetFrameStorageFor(self, thread);
+    if (thread_frames == nullptr) {
+      return;
+    }
+
+    // Now collect the data.
+    auto frames_fn = [&](jvmtiFrameInfo info) {
+      thread_frames->push_back(info);
+    };
+    auto visitor = MakeStackTraceVisitor(thread, 0u, stop_input, frames_fn);
+    visitor.WalkStack(/* include_transitions */ false);
+  }
+
+  const size_t stop_input;
+  Data* data;
+};
+
 jvmtiError StackUtil::GetAllStackTraces(jvmtiEnv* env,
                                         jint max_frame_count,
                                         jvmtiStackInfo** stack_info_ptr,
@@ -297,58 +328,66 @@
     return ERR(NULL_POINTER);
   }
 
-
-  art::Thread* current = art::Thread::Current();
-  art::ScopedObjectAccess soa(current);      // Now we know we have the shared lock.
-  art::ScopedThreadSuspension sts(current, art::kWaitingForDebuggerSuspension);
-  art::ScopedSuspendAll ssa("GetAllStackTraces");
-
-  std::vector<art::Thread*> threads;
-  std::vector<std::vector<jvmtiFrameInfo>> frames;
-  {
-    std::list<art::Thread*> thread_list;
-    {
-      art::MutexLock mu(current, *art::Locks::thread_list_lock_);
-      thread_list = art::Runtime::Current()->GetThreadList()->GetList();
+  struct AllStackTracesData {
+    AllStackTracesData() : mutex("GetAllStackTraces", art::LockLevel::kAbortLock) {}
+    ~AllStackTracesData() {
+      JNIEnv* jni_env = art::Thread::Current()->GetJniEnv();
+      for (jthread global_thread_ref : thread_peers) {
+        jni_env->DeleteGlobalRef(global_thread_ref);
+      }
     }
 
-    for (art::Thread* thread : thread_list) {
-      // Skip threads that are still starting.
-      if (thread->IsStillStarting()) {
-        continue;
-      }
-
-      GetStackTraceVectorClosure closure(0u, static_cast<size_t>(max_frame_count));
-      thread->RequestSynchronousCheckpoint(&closure);
+    std::vector<jvmtiFrameInfo>* GetFrameStorageFor(art::Thread* self, art::Thread* thread)
+        REQUIRES_SHARED(art::Locks::mutator_lock_)
+        REQUIRES(!mutex) {
+      art::MutexLock mu(self, mutex);
 
       threads.push_back(thread);
-      frames.emplace_back();
-      frames.back().swap(closure.frames);
-    }
-  }
 
-  // Convert the data into our output format. Note: we need to keep the threads suspended,
-  // as we need to access them for their peers.
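+      // Take a global ref on the peer so it stays valid once the checkpoint has
+      // finished and the thread may have exited; released in ~AllStackTracesData.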
+      jthread peer = art::Runtime::Current()->GetJavaVM()->AddGlobalRef(
+          self, thread->GetPeerFromOtherThread());
+      thread_peers.push_back(peer);
+
+      frames.emplace_back();
+      return &frames.back();
+    }
+
+    art::Mutex mutex;
+
+    // Storage. Only access this directly after the checkpoint has completed.
+
+    std::vector<art::Thread*> threads;
+    // "thread_peers" contains global references to their peers.
+    std::vector<jthread> thread_peers;
+
+    std::vector<std::vector<jvmtiFrameInfo>> frames;
+  };
+
+  AllStackTracesData data;
+  GetAllStackTracesVectorClosure<AllStackTracesData> closure(
+      static_cast<size_t>(max_frame_count), &data);
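+  // RunCheckpoint blocks until the closure has run on behalf of every thread.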
+  art::Runtime::Current()->GetThreadList()->RunCheckpoint(&closure, nullptr);
+
+  art::Thread* current = art::Thread::Current();
+
+  // Convert the data into our output format.
 
   // Note: we use an array of jvmtiStackInfo for convenience. The spec says we need to
   //       allocate one big chunk for this and the actual frames, which means we need
   //       to either be conservative or rearrange things later (the latter is implemented).
-  std::unique_ptr<jvmtiStackInfo[]> stack_info_array(new jvmtiStackInfo[frames.size()]);
+  std::unique_ptr<jvmtiStackInfo[]> stack_info_array(new jvmtiStackInfo[data.frames.size()]);
   std::vector<std::unique_ptr<jvmtiFrameInfo[]>> frame_infos;
-  frame_infos.reserve(frames.size());
+  frame_infos.reserve(data.frames.size());
 
   // Now run through and add data for each thread.
   size_t sum_frames = 0;
-  for (size_t index = 0; index < frames.size(); ++index) {
+  for (size_t index = 0; index < data.frames.size(); ++index) {
     jvmtiStackInfo& stack_info = stack_info_array.get()[index];
     memset(&stack_info, 0, sizeof(jvmtiStackInfo));
 
-    art::Thread* self = threads[index];
-    const std::vector<jvmtiFrameInfo>& thread_frames = frames[index];
+    const std::vector<jvmtiFrameInfo>& thread_frames = data.frames[index];
 
-    // For the time being, set the thread to null. We don't have good ScopedLocalRef
-    // infrastructure.
-    DCHECK(self->GetPeerFromOtherThread() != nullptr);
+    // For the time being, set the thread to null. We'll fix it up in the second stage.
     stack_info.thread = nullptr;
     stack_info.state = JVMTI_THREAD_STATE_SUSPENDED;
 
@@ -377,7 +416,7 @@
   }
 
   // No errors, yet. Now put it all into an output buffer.
-  size_t rounded_stack_info_size = art::RoundUp(sizeof(jvmtiStackInfo) * frames.size(),
+  size_t rounded_stack_info_size = art::RoundUp(sizeof(jvmtiStackInfo) * data.frames.size(),
                                                 alignof(jvmtiFrameInfo));
   size_t chunk_size = rounded_stack_info_size + sum_frames * sizeof(jvmtiFrameInfo);
   unsigned char* chunk_data;
@@ -388,18 +427,18 @@
 
   jvmtiStackInfo* stack_info = reinterpret_cast<jvmtiStackInfo*>(chunk_data);
   // First copy in all the basic data.
-  memcpy(stack_info, stack_info_array.get(), sizeof(jvmtiStackInfo) * frames.size());
+  memcpy(stack_info, stack_info_array.get(), sizeof(jvmtiStackInfo) * data.frames.size());
 
   // Now copy the frames and fix up the pointers.
   jvmtiFrameInfo* frame_info = reinterpret_cast<jvmtiFrameInfo*>(
       chunk_data + rounded_stack_info_size);
-  for (size_t i = 0; i < frames.size(); ++i) {
+  for (size_t i = 0; i < data.frames.size(); ++i) {
     jvmtiStackInfo& old_stack_info = stack_info_array.get()[i];
     jvmtiStackInfo& new_stack_info = stack_info[i];
 
-    jthread thread_peer = current->GetJniEnv()->AddLocalReference<jthread>(
-        threads[i]->GetPeerFromOtherThread());
-    new_stack_info.thread = thread_peer;
+    // Translate the global ref into a local ref.
+    new_stack_info.thread =
+        static_cast<JNIEnv*>(current->GetJniEnv())->NewLocalRef(data.thread_peers[i]);
 
     if (old_stack_info.frame_count > 0) {
       // Only copy when there's data - leave the nullptr alone.
@@ -411,7 +450,7 @@
   }
 
   *stack_info_ptr = stack_info;
-  *thread_count_ptr = static_cast<jint>(frames.size());
+  *thread_count_ptr = static_cast<jint>(data.frames.size());
 
   return ERR(NONE);
 }
@@ -438,9 +477,46 @@
   art::Thread* current = art::Thread::Current();
   art::ScopedObjectAccess soa(current);      // Now we know we have the shared lock.
 
+  struct SelectStackTracesData {
+    SelectStackTracesData() : mutex("GetSelectStackTraces", art::LockLevel::kAbortLock) {}
+
+    std::vector<jvmtiFrameInfo>* GetFrameStorageFor(art::Thread* self, art::Thread* thread)
+        REQUIRES_SHARED(art::Locks::mutator_lock_)
+        REQUIRES(!mutex) {
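+      // Filter: only threads whose peer is in the selection get storage; returning
+      // nullptr makes the closure skip the thread.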
+      art::ObjPtr<art::mirror::Object> peer = thread->GetPeerFromOtherThread();
+      for (size_t index = 0; index != handles.size(); ++index) {
+        if (peer == handles[index].Get()) {
+          // Found the thread.
+          art::MutexLock mu(self, mutex);
+
+          threads.push_back(thread);
+          thread_list_indices.push_back(index);
+
+          frames.emplace_back();
+          return &frames.back();
+        }
+      }
+      return nullptr;
+    }
+
+    art::Mutex mutex;
+
+    // Selection data.
+
+    std::vector<art::Handle<art::mirror::Object>> handles;
+
+    // Storage. Only access directly after completion.
+
+    std::vector<art::Thread*> threads;
+    std::vector<size_t> thread_list_indices;
+
+    std::vector<std::vector<jvmtiFrameInfo>> frames;
+  };
+
+  SelectStackTracesData data;
+
   // Decode all threads to raw pointers. Put them into a handle scope to avoid any moving GC bugs.
   art::VariableSizedHandleScope hs(current);
-  std::vector<art::Handle<art::mirror::Object>> handles;
   for (jint i = 0; i != thread_count; ++i) {
     if (thread_list[i] == nullptr) {
       return ERR(INVALID_THREAD);
@@ -448,70 +524,30 @@
     if (!soa.Env()->IsInstanceOf(thread_list[i], art::WellKnownClasses::java_lang_Thread)) {
       return ERR(INVALID_THREAD);
     }
-    handles.push_back(hs.NewHandle(soa.Decode<art::mirror::Object>(thread_list[i])));
+    data.handles.push_back(hs.NewHandle(soa.Decode<art::mirror::Object>(thread_list[i])));
   }
 
-  std::vector<art::Thread*> threads;
-  std::vector<size_t> thread_list_indices;
-  std::vector<std::vector<jvmtiFrameInfo>> frames;
-
-  {
-    art::ScopedThreadSuspension sts(current, art::kWaitingForDebuggerSuspension);
-    art::ScopedSuspendAll ssa("GetThreadListStackTraces");
-
-    {
-      std::list<art::Thread*> art_thread_list;
-      {
-        art::MutexLock mu(current, *art::Locks::thread_list_lock_);
-        art_thread_list = art::Runtime::Current()->GetThreadList()->GetList();
-      }
-
-      for (art::Thread* thread : art_thread_list) {
-        if (thread->IsStillStarting()) {
-          // Skip this. We can't get the jpeer, and if it is for a thread in the thread_list,
-          // we'll just report STARTING.
-          continue;
-        }
-
-        // Get the peer, and check whether we know it.
-        art::ObjPtr<art::mirror::Object> peer = thread->GetPeerFromOtherThread();
-        for (size_t index = 0; index != handles.size(); ++index) {
-          if (peer == handles[index].Get()) {
-            // Found the thread.
-            GetStackTraceVectorClosure closure(0u, static_cast<size_t>(max_frame_count));
-            thread->RequestSynchronousCheckpoint(&closure);
-
-            threads.push_back(thread);
-            thread_list_indices.push_back(index);
-            frames.emplace_back();
-            frames.back().swap(closure.frames);
-
-            continue;
-          }
-        }
-
-        // Must be not started, or dead. We'll deal with it at the end.
-      }
-    }
-  }
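+  // Run the same checkpoint-based collection as GetAllStackTraces; threads outside
+  // the selection are filtered out by GetFrameStorageFor.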
+  GetAllStackTracesVectorClosure<SelectStackTracesData> closure(
+      static_cast<size_t>(max_frame_count), &data);
+  art::Runtime::Current()->GetThreadList()->RunCheckpoint(&closure, nullptr);
 
   // Convert the data into our output format.
 
   // Note: we use an array of jvmtiStackInfo for convenience. The spec says we need to
   //       allocate one big chunk for this and the actual frames, which means we need
   //       to either be conservative or rearrange things later (the latter is implemented).
-  std::unique_ptr<jvmtiStackInfo[]> stack_info_array(new jvmtiStackInfo[frames.size()]);
+  std::unique_ptr<jvmtiStackInfo[]> stack_info_array(new jvmtiStackInfo[data.frames.size()]);
   std::vector<std::unique_ptr<jvmtiFrameInfo[]>> frame_infos;
-  frame_infos.reserve(frames.size());
+  frame_infos.reserve(data.frames.size());
 
   // Now run through and add data for each thread.
   size_t sum_frames = 0;
-  for (size_t index = 0; index < frames.size(); ++index) {
+  for (size_t index = 0; index < data.frames.size(); ++index) {
     jvmtiStackInfo& stack_info = stack_info_array.get()[index];
     memset(&stack_info, 0, sizeof(jvmtiStackInfo));
 
-    art::Thread* self = threads[index];
-    const std::vector<jvmtiFrameInfo>& thread_frames = frames[index];
+    art::Thread* self = data.threads[index];
+    const std::vector<jvmtiFrameInfo>& thread_frames = data.frames[index];
 
     // For the time being, set the thread to null. We don't have good ScopedLocalRef
     // infrastructure.
@@ -562,8 +598,8 @@
     // Check whether we found a running thread for this.
     // Note: For simplicity, and with the expectation that the list is usually small, use a simple
     //       search. (The list is *not* sorted!)
-    auto it = std::find(thread_list_indices.begin(), thread_list_indices.end(), i);
-    if (it == thread_list_indices.end()) {
+    auto it = std::find(data.thread_list_indices.begin(), data.thread_list_indices.end(), i);
+    if (it == data.thread_list_indices.end()) {
       // No native thread. Must be new or dead. We need to fill out the stack info now.
       // (Need to read the Java "started" field to know whether this is starting or terminated.)
       art::ObjPtr<art::mirror::Object> peer = soa.Decode<art::mirror::Object>(thread_list[i]);
@@ -580,7 +616,7 @@
       stack_info[i].frame_buffer = nullptr;
     } else {
       // Had a native thread and frames.
-      size_t f_index = it - thread_list_indices.begin();
+      size_t f_index = it - data.thread_list_indices.begin();
 
       jvmtiStackInfo& old_stack_info = stack_info_array.get()[f_index];
       jvmtiStackInfo& new_stack_info = stack_info[i];
@@ -598,7 +634,7 @@
     }
   }
 
-  * stack_info_ptr = stack_info;
+  *stack_info_ptr = stack_info;
 
   return ERR(NONE);
 }