ART: Streaming trace mode

Add a streaming mode for tracing. Streaming uses a buffer of 16KB
and writes to the output when that buffer gets full. Streaming mode
can be enabled with -Xmethod-trace-stream and is currently not
exposed otherwise.

Add a python script that can parse the streaming format, which
simply contains strings for newly encountered threads and methods
inline, and create output that can be used with traceview.

Add Trace::Pause and Trace::Abort, which can pause and abort tracing.
Abort is different from Stop in that it does not write the data.

Add code to the zygote hooks JNI implementation that pauses tracing
before the fork, making sure that a child cannot clobber the parent's
data.

Add code to the zygote hooks JNI implementation that aborts old
tracing and starts new tracing in the child after the fork. Currently
base the output on the pid. This will not work on an unmodified
device, as the profiles directory is not generally writable, but
we do not have enough information at that point. Consider a scheme
that restarts tracing later.

Change-Id: I93c7bf87e35af582bdfdd3ecc7c52454514220dd
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index f8c0e83..51600f7 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -50,11 +50,6 @@
 
 const bool kVerboseInstrumentation = false;
 
-// Do we want to deoptimize for method entry and exit listeners or just try to intercept
-// invocations? Deoptimization forces all code to run in the interpreter and considerably hurts the
-// application's performance.
-static constexpr bool kDeoptimizeForAccurateMethodEntryExitListeners = true;
-
 static bool InstallStubsClassVisitor(mirror::Class* klass, void* arg)
     EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
   Instrumentation* instrumentation = reinterpret_cast<Instrumentation*>(arg);
@@ -846,8 +841,7 @@
   ConfigureStubs(false, false);
 }
 
-void Instrumentation::EnableMethodTracing() {
-  bool require_interpreter = kDeoptimizeForAccurateMethodEntryExitListeners;
+void Instrumentation::EnableMethodTracing(bool require_interpreter) {
   ConfigureStubs(!require_interpreter, require_interpreter);
 }
 
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 41821a6..8b7fcca 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -49,6 +49,11 @@
   kNumHandlerTables
 };
 
+// Do we want to deoptimize for method entry and exit listeners or just try to intercept
+// invocations? Deoptimization forces all code to run in the interpreter and considerably hurts the
+// application's performance.
+static constexpr bool kDeoptimizeForAccurateMethodEntryExitListeners = true;
+
 // Instrumentation event listener API. Registered listeners will get the appropriate call back for
 // the events they are listening for. The call backs supply the thread, method and dex_pc the event
 // occurred upon. The thread may or may not be Thread::Current().
@@ -170,7 +175,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Enable method tracing by installing instrumentation entry/exit stubs.
-  void EnableMethodTracing()
+  void EnableMethodTracing(
+      bool require_interpreter = kDeoptimizeForAccurateMethodEntryExitListeners)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
       LOCKS_EXCLUDED(Locks::thread_list_lock_, Locks::classlinker_classes_lock_);
 
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
index 022c56f..af01a02 100644
--- a/runtime/native/dalvik_system_ZygoteHooks.cc
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -18,14 +18,18 @@
 
 #include <stdlib.h>
 
+#include <cutils/process_name.h>
+
 #include "arch/instruction_set.h"
 #include "debugger.h"
 #include "java_vm_ext.h"
 #include "jit/jit.h"
 #include "jni_internal.h"
 #include "JNIHelp.h"
+#include "scoped_thread_state_change.h"
 #include "ScopedUtfChars.h"
 #include "thread-inl.h"
+#include "trace.h"
 
 #if defined(__linux__)
 #include <sys/prctl.h>
@@ -121,6 +125,11 @@
 
   runtime->PreZygoteFork();
 
+  if (Trace::GetMethodTracingMode() != TracingMode::kTracingInactive) {
+    // Tracing active, pause it.
+    Trace::Pause();
+  }
+
   // Grab thread before fork potentially makes Thread::pthread_key_self_ unusable.
   return reinterpret_cast<jlong>(ThreadForEnv(env));
 }
@@ -132,6 +141,49 @@
   thread->InitAfterFork();
   EnableDebugFeatures(debug_flags);
 
+  // Update tracing.
+  if (Trace::GetMethodTracingMode() != TracingMode::kTracingInactive) {
+    Trace::TraceOutputMode output_mode = Trace::GetOutputMode();
+    Trace::TraceMode trace_mode = Trace::GetMode();
+
+    // Just drop it.
+    Trace::Abort();
+
+    // Only restart if it was streaming mode.
+    // TODO: Expose buffer size, so we can also do file mode.
+    if (output_mode == Trace::TraceOutputMode::kStreaming) {
+      const char* proc_name_cutils = get_process_name();
+      std::string proc_name;
+      if (proc_name_cutils != nullptr) {
+        proc_name = proc_name_cutils;
+      }
+      if (proc_name_cutils == nullptr || proc_name == "zygote" || proc_name == "zygote64") {
+        // Either no process name, or the name hasn't been changed, yet. Just use pid.
+        pid_t pid = getpid();
+        proc_name = StringPrintf("%u", static_cast<uint32_t>(pid));
+      }
+
+      std::string profiles_dir(GetDalvikCache("profiles", false /* create_if_absent */));
+      if (!profiles_dir.empty()) {
+        std::string trace_file = StringPrintf("%s/%s.trace.bin", profiles_dir.c_str(),
+                                              proc_name.c_str());
+        Trace::Start(trace_file.c_str(),
+                     -1,
+                     -1,  // TODO: Expose buffer size.
+                     0,   // TODO: Expose flags.
+                     output_mode,
+                     trace_mode,
+                     0);  // TODO: Expose interval.
+        if (thread->IsExceptionPending()) {
+          ScopedObjectAccess soa(env);
+          thread->ClearException();
+        }
+      } else {
+        LOG(ERROR) << "Profiles dir is empty?!?!";
+      }
+    }
+  }
+
   if (instruction_set != nullptr) {
     ScopedUtfChars isa_string(env, instruction_set);
     InstructionSet isa = GetInstructionSetFromString(isa_string.c_str());
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index c23f744..0758b27 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -216,6 +216,8 @@
       .Define("-Xmethod-trace-file-size:_")
           .WithType<unsigned int>()
           .IntoKey(M::MethodTraceFileSize)
+      .Define("-Xmethod-trace-stream")
+          .IntoKey(M::MethodTraceStreaming)
       .Define("-Xprofile:_")
           .WithType<TraceClockSource>()
           .WithValueMap({{"threadcpuclock", TraceClockSource::kThreadCpu},
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 543b9dc..7bebb96 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -552,6 +552,17 @@
     StartProfiler(profile_output_filename_.c_str());
   }
 
+  if (trace_config_.get() != nullptr && trace_config_->trace_file != "") {
+    ScopedThreadStateChange tsc(self, kWaitingForMethodTracingStart);
+    Trace::Start(trace_config_->trace_file.c_str(),
+                 -1,
+                 static_cast<int>(trace_config_->trace_file_size),
+                 0,
+                 trace_config_->trace_output_mode,
+                 trace_config_->trace_mode,
+                 0);
+  }
+
   return true;
 }
 
@@ -1000,7 +1011,9 @@
     trace_config_->trace_file = runtime_options.ReleaseOrDefault(Opt::MethodTraceFile);
     trace_config_->trace_file_size = runtime_options.ReleaseOrDefault(Opt::MethodTraceFileSize);
     trace_config_->trace_mode = Trace::TraceMode::kMethodTracing;
-    trace_config_->trace_output_mode = Trace::TraceOutputMode::kFile;
+    trace_config_->trace_output_mode = runtime_options.Exists(Opt::MethodTraceStreaming) ?
+        Trace::TraceOutputMode::kStreaming :
+        Trace::TraceOutputMode::kFile;
   }
 
   {
@@ -1026,17 +1039,6 @@
   // TODO: move this to just be an Trace::Start argument
   Trace::SetDefaultClockSource(runtime_options.GetOrDefault(Opt::ProfileClock));
 
-  if (trace_config_.get() != nullptr) {
-    ScopedThreadStateChange tsc(self, kWaitingForMethodTracingStart);
-    Trace::Start(trace_config_->trace_file.c_str(),
-                 -1,
-                 static_cast<int>(trace_config_->trace_file_size),
-                 0,
-                 trace_config_->trace_output_mode,
-                 trace_config_->trace_mode,
-                 0);
-  }
-
   // Pre-allocate an OutOfMemoryError for the double-OOME case.
   self->ThrowNewException("Ljava/lang/OutOfMemoryError;",
                           "OutOfMemoryError thrown while trying to throw OutOfMemoryError; "
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 339f925..eff787a 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -95,6 +95,7 @@
 RUNTIME_OPTIONS_KEY (Unit,                MethodTrace)
 RUNTIME_OPTIONS_KEY (std::string,         MethodTraceFile,                "/data/method-trace-file.bin")
 RUNTIME_OPTIONS_KEY (unsigned int,        MethodTraceFileSize,            10 * MB)
+RUNTIME_OPTIONS_KEY (Unit,                MethodTraceStreaming)
 RUNTIME_OPTIONS_KEY (TraceClockSource,    ProfileClock,                   kDefaultTraceClockSource)  // -Xprofile:
 RUNTIME_OPTIONS_KEY (TestProfilerOptions, ProfilerOpts)  // -Xenable-profiler, -Xprofile-*
 RUNTIME_OPTIONS_KEY (std::string,         Compiler)
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 7326865..5322f9f 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -31,7 +31,7 @@
 #include "instrumentation.h"
 #include "mirror/art_method-inl.h"
 #include "mirror/class-inl.h"
-#include "mirror/dex_cache.h"
+#include "mirror/dex_cache-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/object-inl.h"
 #include "os.h"
@@ -85,9 +85,12 @@
     kTraceMethodActionMask = 0x03,  // two bits
 };
 
+static constexpr uint8_t kOpNewMethod = 1U;
+static constexpr uint8_t kOpNewThread = 2U;
+
 class BuildStackTraceVisitor : public StackVisitor {
  public:
-  explicit BuildStackTraceVisitor(Thread* thread) : StackVisitor(thread, NULL),
+  explicit BuildStackTraceVisitor(Thread* thread) : StackVisitor(thread, nullptr),
       method_trace_(Trace::AllocStackTrace()) {}
 
   bool VisitFrame() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -118,7 +121,7 @@
 
 TraceClockSource Trace::default_clock_source_ = kDefaultTraceClockSource;
 
-Trace* volatile Trace::the_trace_ = NULL;
+Trace* volatile Trace::the_trace_ = nullptr;
 pthread_t Trace::sampling_pthread_ = 0U;
 std::unique_ptr<std::vector<mirror::ArtMethod*>> Trace::temp_stack_trace_;
 
@@ -138,7 +141,7 @@
 }
 
 std::vector<mirror::ArtMethod*>* Trace::AllocStackTrace() {
-  if (temp_stack_trace_.get() != NULL) {
+  if (temp_stack_trace_.get() != nullptr) {
     return temp_stack_trace_.release();
   } else {
     return new std::vector<mirror::ArtMethod*>();
@@ -246,7 +249,7 @@
 static void ClearThreadStackTraceAndClockBase(Thread* thread, void* arg ATTRIBUTE_UNUSED) {
   thread->SetTraceClockBase(0);
   std::vector<mirror::ArtMethod*>* stack_trace = thread->GetStackTraceSample();
-  thread->SetStackTraceSample(NULL);
+  thread->SetStackTraceSample(nullptr);
   delete stack_trace;
 }
 
@@ -260,7 +263,7 @@
   uint32_t thread_clock_diff = 0;
   uint32_t wall_clock_diff = 0;
   ReadClocks(thread, &thread_clock_diff, &wall_clock_diff);
-  if (old_stack_trace == NULL) {
+  if (old_stack_trace == nullptr) {
     // If there's no previous stack trace sample for this thread, log an entry event for all
     // methods in the trace.
     for (std::vector<mirror::ArtMethod*>::reverse_iterator rit = stack_trace->rbegin();
@@ -308,7 +311,7 @@
     {
       MutexLock mu(self, *Locks::trace_lock_);
       the_trace = the_trace_;
-      if (the_trace == NULL) {
+      if (the_trace == nullptr) {
         break;
       }
     }
@@ -323,7 +326,7 @@
   }
 
   runtime->DetachCurrentThread();
-  return NULL;
+  return nullptr;
 }
 
 void Trace::Start(const char* trace_filename, int trace_fd, int buffer_size, int flags,
@@ -331,7 +334,7 @@
   Thread* self = Thread::Current();
   {
     MutexLock mu(self, *Locks::trace_lock_);
-    if (the_trace_ != NULL) {
+    if (the_trace_ != nullptr) {
       LOG(ERROR) << "Trace already in progress, ignoring this request";
       return;
     }
@@ -354,7 +357,7 @@
       trace_file.reset(new File(trace_fd, "tracefile"));
       trace_file->DisableAutoClose();
     }
-    if (trace_file.get() == NULL) {
+    if (trace_file.get() == nullptr) {
       PLOG(ERROR) << "Unable to open trace file '" << trace_filename << "'";
       ScopedObjectAccess soa(self);
       ThrowRuntimeException("Unable to open trace file '%s'", trace_filename);
@@ -372,20 +375,23 @@
   // Create Trace object.
   {
     MutexLock mu(self, *Locks::trace_lock_);
-    if (the_trace_ != NULL) {
+    if (the_trace_ != nullptr) {
       LOG(ERROR) << "Trace already in progress, ignoring this request";
     } else {
       enable_stats = (flags && kTraceCountAllocs) != 0;
-      the_trace_ = new Trace(trace_file.release(), buffer_size, flags, trace_mode);
+      the_trace_ = new Trace(trace_file.release(), trace_filename, buffer_size, flags, output_mode,
+                             trace_mode);
       if (trace_mode == TraceMode::kSampling) {
-        CHECK_PTHREAD_CALL(pthread_create, (&sampling_pthread_, NULL, &RunSamplingThread,
+        CHECK_PTHREAD_CALL(pthread_create, (&sampling_pthread_, nullptr, &RunSamplingThread,
                                             reinterpret_cast<void*>(interval_us)),
                                             "Sampling profiler thread");
+        the_trace_->interval_us_ = interval_us;
       } else {
         runtime->GetInstrumentation()->AddListener(the_trace_,
                                                    instrumentation::Instrumentation::kMethodEntered |
                                                    instrumentation::Instrumentation::kMethodExited |
                                                    instrumentation::Instrumentation::kMethodUnwind);
+        // TODO: In full-PIC mode, we don't need to fully deopt.
         runtime->GetInstrumentation()->EnableMethodTracing();
       }
     }
@@ -399,18 +405,18 @@
   }
 }
 
-void Trace::Stop() {
+void Trace::StopTracing(bool finish_tracing, bool flush_file) {
   bool stop_alloc_counting = false;
   Runtime* const runtime = Runtime::Current();
   Trace* the_trace = nullptr;
   pthread_t sampling_pthread = 0U;
   {
     MutexLock mu(Thread::Current(), *Locks::trace_lock_);
-    if (the_trace_ == NULL) {
+    if (the_trace_ == nullptr) {
       LOG(ERROR) << "Trace stop requested, but no trace currently running";
     } else {
       the_trace = the_trace_;
-      the_trace_ = NULL;
+      the_trace_ = nullptr;
       sampling_pthread = sampling_pthread_;
     }
   }
@@ -418,13 +424,16 @@
   // the sampling thread access a stale pointer. This finishes since the sampling thread exits when
   // the_trace_ is null.
   if (sampling_pthread != 0U) {
-    CHECK_PTHREAD_CALL(pthread_join, (sampling_pthread, NULL), "sampling thread shutdown");
+    CHECK_PTHREAD_CALL(pthread_join, (sampling_pthread, nullptr), "sampling thread shutdown");
     sampling_pthread_ = 0U;
   }
   runtime->GetThreadList()->SuspendAll(__FUNCTION__);
+
   if (the_trace != nullptr) {
-    stop_alloc_counting = (the_trace->flags_ & kTraceCountAllocs) != 0;
-    the_trace->FinishTracing();
+    stop_alloc_counting = (the_trace->flags_ & Trace::kTraceCountAllocs) != 0;
+    if (finish_tracing) {
+      the_trace->FinishTracing();
+    }
 
     if (the_trace->trace_mode_ == TraceMode::kSampling) {
       MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
@@ -438,8 +447,12 @@
     }
     if (the_trace->trace_file_.get() != nullptr) {
       // Do not try to erase, so flush and close explicitly.
-      if (the_trace->trace_file_->Flush() != 0) {
-        PLOG(ERROR) << "Could not flush trace file.";
+      if (flush_file) {
+        if (the_trace->trace_file_->Flush() != 0) {
+          PLOG(ERROR) << "Could not flush trace file.";
+        }
+      } else {
+        the_trace->trace_file_->MarkUnchecked();  // Do not trigger guard.
       }
       if (the_trace->trace_file_->Close() != 0) {
         PLOG(ERROR) << "Could not close trace file.";
@@ -454,15 +467,118 @@
   }
 }
 
+void Trace::Abort() {
+  // Do not write anything anymore.
+  StopTracing(false, false);
+}
+
+void Trace::Stop() {
+  // Finish writing.
+  StopTracing(true, true);
+}
+
 void Trace::Shutdown() {
   if (GetMethodTracingMode() != kTracingInactive) {
     Stop();
   }
 }
 
+void Trace::Pause() {
+  bool stop_alloc_counting = false;
+  Runtime* runtime = Runtime::Current();
+  Trace* the_trace = nullptr;
+
+  pthread_t sampling_pthread = 0U;
+  {
+    MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+    if (the_trace_ == nullptr) {
+      LOG(ERROR) << "Trace pause requested, but no trace currently running";
+      return;
+    } else {
+      the_trace = the_trace_;
+      sampling_pthread = sampling_pthread_;
+    }
+  }
+
+  if (sampling_pthread != 0U) {
+    {
+      MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+      the_trace_ = nullptr;
+    }
+    CHECK_PTHREAD_CALL(pthread_join, (sampling_pthread, nullptr), "sampling thread shutdown");
+    sampling_pthread_ = 0U;
+    {
+      MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+      the_trace_ = the_trace;
+    }
+  }
+
+  if (the_trace != nullptr) {
+    runtime->GetThreadList()->SuspendAll(__FUNCTION__);
+    stop_alloc_counting = (the_trace->flags_ & Trace::kTraceCountAllocs) != 0;
+
+    if (the_trace->trace_mode_ == TraceMode::kSampling) {
+      MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+      runtime->GetThreadList()->ForEach(ClearThreadStackTraceAndClockBase, nullptr);
+    } else {
+      runtime->GetInstrumentation()->DisableMethodTracing();
+      runtime->GetInstrumentation()->RemoveListener(the_trace,
+                                                    instrumentation::Instrumentation::kMethodEntered |
+                                                    instrumentation::Instrumentation::kMethodExited |
+                                                    instrumentation::Instrumentation::kMethodUnwind);
+    }
+    runtime->GetThreadList()->ResumeAll();
+  }
+
+  if (stop_alloc_counting) {
+    // Can be racy since SetStatsEnabled is not guarded by any locks.
+    Runtime::Current()->SetStatsEnabled(false);
+  }
+}
+
+void Trace::Resume() {
+  Thread* self = Thread::Current();
+  Trace* the_trace;
+  {
+    MutexLock mu(self, *Locks::trace_lock_);
+    if (the_trace_ == nullptr) {
+      LOG(ERROR) << "No trace to resume (or sampling mode), ignoring this request";
+      return;
+    }
+    the_trace = the_trace_;
+  }
+
+  Runtime* runtime = Runtime::Current();
+
+  // Enable count of allocs if specified in the flags.
+  bool enable_stats = (the_trace->flags_ & kTraceCountAllocs) != 0;
+
+  runtime->GetThreadList()->SuspendAll(__FUNCTION__);
+
+  // Reenable.
+  if (the_trace->trace_mode_ == TraceMode::kSampling) {
+    CHECK_PTHREAD_CALL(pthread_create, (&sampling_pthread_, nullptr, &RunSamplingThread,
+        reinterpret_cast<void*>(the_trace->interval_us_)), "Sampling profiler thread");
+  } else {
+    runtime->GetInstrumentation()->AddListener(the_trace,
+                                               instrumentation::Instrumentation::kMethodEntered |
+                                               instrumentation::Instrumentation::kMethodExited |
+                                               instrumentation::Instrumentation::kMethodUnwind);
+    // TODO: In full-PIC mode, we don't need to fully deopt.
+    runtime->GetInstrumentation()->EnableMethodTracing();
+  }
+
+  runtime->GetThreadList()->ResumeAll();
+
+  // Can't call this when holding the mutator lock.
+  if (enable_stats) {
+    runtime->SetStatsEnabled(true);
+  }
+}
+
 TracingMode Trace::GetMethodTracingMode() {
   MutexLock mu(Thread::Current(), *Locks::trace_lock_);
-  if (the_trace_ == NULL) {
+  if (the_trace_ == nullptr) {
     return kTracingInactive;
   } else {
     switch (the_trace_->trace_mode_) {
@@ -476,13 +592,26 @@
   }
 }
 
-Trace::Trace(File* trace_file, int buffer_size, int flags, TraceMode trace_mode)
-    : trace_file_(trace_file), buf_(new uint8_t[buffer_size]()), flags_(flags),
-      trace_mode_(trace_mode), clock_source_(default_clock_source_),
-      buffer_size_(buffer_size), start_time_(MicroTime()),
-      clock_overhead_ns_(GetClockOverheadNanoSeconds()), cur_offset_(0), overflow_(false) {
-  // Set up the beginning of the trace.
+static constexpr size_t kStreamingBufferSize = 16 * KB;
+
+Trace::Trace(File* trace_file, const char* trace_name, int buffer_size, int flags,
+             TraceOutputMode output_mode, TraceMode trace_mode)
+    : trace_file_(trace_file),
+      buf_(new uint8_t[output_mode == TraceOutputMode::kStreaming ?
+          kStreamingBufferSize :
+          buffer_size]()),
+      flags_(flags), trace_output_mode_(output_mode), trace_mode_(trace_mode),
+      clock_source_(default_clock_source_),
+      buffer_size_(output_mode == TraceOutputMode::kStreaming ?
+          kStreamingBufferSize :
+          buffer_size),
+      start_time_(MicroTime()), clock_overhead_ns_(GetClockOverheadNanoSeconds()), cur_offset_(0),
+      overflow_(false), interval_us_(0), streaming_lock_(nullptr) {
   uint16_t trace_version = GetTraceVersion(clock_source_);
+  if (output_mode == TraceOutputMode::kStreaming) {
+    trace_version |= 0xF0U;
+  }
+  // Set up the beginning of the trace.
   memset(buf_.get(), 0, kTraceHeaderLength);
   Append4LE(buf_.get(), kTraceMagicValue);
   Append2LE(buf_.get() + 4, trace_version);
@@ -495,6 +624,16 @@
 
   // Update current offset.
   cur_offset_.StoreRelaxed(kTraceHeaderLength);
+
+  if (output_mode == TraceOutputMode::kStreaming) {
+    streaming_file_name_ = trace_name;
+    streaming_lock_ = new Mutex("tracing lock");
+    seen_threads_.reset(new ThreadIDBitSet());
+  }
+}
+
+Trace::~Trace() {
+  delete streaming_lock_;
 }
 
 static void DumpBuf(uint8_t* buf, size_t buf_size, TraceClockSource clock_source)
@@ -511,14 +650,38 @@
   }
 }
 
-void Trace::FinishTracing() {
-  // Compute elapsed time.
-  uint64_t elapsed = MicroTime() - start_time_;
+static void GetVisitedMethodsFromBitSets(
+    const std::map<mirror::DexCache*, DexIndexBitSet*>& seen_methods,
+    std::set<mirror::ArtMethod*>* visited_methods) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  for (auto& e : seen_methods) {
+    DexIndexBitSet* bit_set = e.second;
+    for (uint32_t i = 0; i < bit_set->size(); ++i) {
+      if ((*bit_set)[i]) {
+        visited_methods->insert(e.first->GetResolvedMethod(i));
+      }
+    }
+  }
+}
 
-  size_t final_offset = cur_offset_.LoadRelaxed();
+void Trace::FinishTracing() {
+  size_t final_offset = 0;
 
   std::set<mirror::ArtMethod*> visited_methods;
-  GetVisitedMethods(final_offset, &visited_methods);
+  if (trace_output_mode_ == TraceOutputMode::kStreaming) {
+    // Write the secondary file with all the method names.
+    GetVisitedMethodsFromBitSets(seen_methods_, &visited_methods);
+
+    // Clean up.
+    for (auto& e : seen_methods_) {
+      delete e.second;
+    }
+  } else {
+    final_offset = cur_offset_.LoadRelaxed();
+    GetVisitedMethods(final_offset, &visited_methods);
+  }
+
+  // Compute elapsed time.
+  uint64_t elapsed = MicroTime() - start_time_;
 
   std::ostringstream os;
 
@@ -535,8 +698,10 @@
     os << StringPrintf("clock=wall\n");
   }
   os << StringPrintf("elapsed-time-usec=%" PRIu64 "\n", elapsed);
-  size_t num_records = (final_offset - kTraceHeaderLength) / GetRecordSize(clock_source_);
-  os << StringPrintf("num-method-calls=%zd\n", num_records);
+  if (trace_output_mode_ != TraceOutputMode::kStreaming) {
+    size_t num_records = (final_offset - kTraceHeaderLength) / GetRecordSize(clock_source_);
+    os << StringPrintf("num-method-calls=%zd\n", num_records);
+  }
   os << StringPrintf("clock-call-overhead-nsec=%d\n", clock_overhead_ns_);
   os << StringPrintf("vm=art\n");
   os << StringPrintf("pid=%d\n", getpid());
@@ -550,27 +715,44 @@
   os << StringPrintf("%cmethods\n", kTraceTokenChar);
   DumpMethodList(os, visited_methods);
   os << StringPrintf("%cend\n", kTraceTokenChar);
-
   std::string header(os.str());
-  if (trace_file_.get() == NULL) {
-    iovec iov[2];
-    iov[0].iov_base = reinterpret_cast<void*>(const_cast<char*>(header.c_str()));
-    iov[0].iov_len = header.length();
-    iov[1].iov_base = buf_.get();
-    iov[1].iov_len = final_offset;
-    Dbg::DdmSendChunkV(CHUNK_TYPE("MPSE"), iov, 2);
-    const bool kDumpTraceInfo = false;
-    if (kDumpTraceInfo) {
-      LOG(INFO) << "Trace sent:\n" << header;
-      DumpBuf(buf_.get(), final_offset, clock_source_);
+
+  if (trace_output_mode_ == TraceOutputMode::kStreaming) {
+    File file;
+    if (!file.Open(streaming_file_name_ + ".sec", O_CREAT | O_WRONLY)) {
+      LOG(WARNING) << "Could not open secondary trace file!";
+      return;
     }
-  } else {
-    if (!trace_file_->WriteFully(header.c_str(), header.length()) ||
-        !trace_file_->WriteFully(buf_.get(), final_offset)) {
+    if (!file.WriteFully(header.c_str(), header.length())) {
+      file.Erase();
       std::string detail(StringPrintf("Trace data write failed: %s", strerror(errno)));
       PLOG(ERROR) << detail;
       ThrowRuntimeException("%s", detail.c_str());
     }
+    if (file.FlushCloseOrErase() != 0) {
+      PLOG(ERROR) << "Could not write secondary file";
+    }
+  } else {
+    if (trace_file_.get() == nullptr) {
+      iovec iov[2];
+      iov[0].iov_base = reinterpret_cast<void*>(const_cast<char*>(header.c_str()));
+      iov[0].iov_len = header.length();
+      iov[1].iov_base = buf_.get();
+      iov[1].iov_len = final_offset;
+      Dbg::DdmSendChunkV(CHUNK_TYPE("MPSE"), iov, 2);
+      const bool kDumpTraceInfo = false;
+      if (kDumpTraceInfo) {
+        LOG(INFO) << "Trace sent:\n" << header;
+        DumpBuf(buf_.get(), final_offset, clock_source_);
+      }
+    } else {
+      if (!trace_file_->WriteFully(header.c_str(), header.length()) ||
+          !trace_file_->WriteFully(buf_.get(), final_offset)) {
+        std::string detail(StringPrintf("Trace data write failed: %s", strerror(errno)));
+        PLOG(ERROR) << detail;
+        ThrowRuntimeException("%s", detail.c_str());
+      }
+    }
   }
 }
 
@@ -654,20 +836,76 @@
   }
 }
 
+bool Trace::RegisterMethod(mirror::ArtMethod* method) {
+  mirror::DexCache* dex_cache = method->GetDexCache();
+  if (dex_cache->GetResolvedMethod(method->GetDexMethodIndex()) != method) {
+    DCHECK(dex_cache->GetResolvedMethod(method->GetDexMethodIndex()) == nullptr);
+    dex_cache->SetResolvedMethod(method->GetDexMethodIndex(), method);
+  }
+  if (seen_methods_.find(dex_cache) == seen_methods_.end()) {
+    seen_methods_.insert(std::make_pair(dex_cache, new DexIndexBitSet()));
+  }
+  DexIndexBitSet* bit_set = seen_methods_.find(dex_cache)->second;
+  if (!(*bit_set)[method->GetDexMethodIndex()]) {
+    bit_set->set(method->GetDexMethodIndex());
+    return true;
+  }
+  return false;
+}
+
+bool Trace::RegisterThread(Thread* thread) {
+  pid_t tid = thread->GetTid();
+  CHECK_LT(0U, static_cast<uint32_t>(tid));
+  CHECK_LT(static_cast<uint32_t>(tid), 65536U);
+
+  if (!(*seen_threads_)[tid]) {
+    seen_threads_->set(tid);
+    return true;
+  }
+  return false;
+}
+
+static std::string GetMethodLine(mirror::ArtMethod* method)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  return StringPrintf("%p\t%s\t%s\t%s\t%s\n", method,
+      PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(), method->GetName(),
+      method->GetSignature().ToString().c_str(), method->GetDeclaringClassSourceFile());
+}
+
+void Trace::WriteToBuf(const uint8_t* src, size_t src_size) {
+  int32_t old_offset = cur_offset_.LoadRelaxed();
+  int32_t new_offset = old_offset + static_cast<int32_t>(src_size);
+  if (new_offset > buffer_size_) {
+    // Flush buffer.
+    if (!trace_file_->WriteFully(buf_.get(), old_offset)) {
+      PLOG(WARNING) << "Failed streaming a tracing event.";
+    }
+    old_offset = 0;
+    new_offset = static_cast<int32_t>(src_size);
+  }
+  cur_offset_.StoreRelease(new_offset);
+  // Fill in data.
+  memcpy(buf_.get() + old_offset, src, src_size);
+}
+
 void Trace::LogMethodTraceEvent(Thread* thread, mirror::ArtMethod* method,
                                 instrumentation::Instrumentation::InstrumentationEvent event,
                                 uint32_t thread_clock_diff, uint32_t wall_clock_diff) {
   // Advance cur_offset_ atomically.
   int32_t new_offset;
-  int32_t old_offset;
-  do {
-    old_offset = cur_offset_.LoadRelaxed();
-    new_offset = old_offset + GetRecordSize(clock_source_);
-    if (new_offset > buffer_size_) {
-      overflow_ = true;
-      return;
-    }
-  } while (!cur_offset_.CompareExchangeWeakSequentiallyConsistent(old_offset, new_offset));
+  int32_t old_offset = 0;
+
+  // We do a busy loop here trying to acquire the next offset.
+  if (trace_output_mode_ != TraceOutputMode::kStreaming) {
+    do {
+      old_offset = cur_offset_.LoadRelaxed();
+      new_offset = old_offset + GetRecordSize(clock_source_);
+      if (new_offset > buffer_size_) {
+        overflow_ = true;
+        return;
+      }
+    } while (!cur_offset_.CompareExchangeWeakSequentiallyConsistent(old_offset, new_offset));
+  }
 
   TraceAction action = kTraceMethodEnter;
   switch (event) {
@@ -687,7 +925,15 @@
   uint32_t method_value = EncodeTraceMethodAndAction(method, action);
 
   // Write data
-  uint8_t* ptr = buf_.get() + old_offset;
+  uint8_t* ptr;
+  static constexpr size_t kPacketSize = 14U;  // The maximum size of data in a packet.
+  uint8_t stack_buf[kPacketSize];             // Space to store a packet when in streaming mode.
+  if (trace_output_mode_ == TraceOutputMode::kStreaming) {
+    ptr = stack_buf;
+  } else {
+    ptr = buf_.get() + old_offset;
+  }
+
   Append2LE(ptr, thread->GetTid());
   Append4LE(ptr + 2, method_value);
   ptr += 6;
@@ -699,6 +945,34 @@
   if (UseWallClock()) {
     Append4LE(ptr, wall_clock_diff);
   }
+  static_assert(kPacketSize == 2 + 4 + 4 + 4, "Packet size incorrect.");
+
+  if (trace_output_mode_ == TraceOutputMode::kStreaming) {
+    MutexLock mu(Thread::Current(), *streaming_lock_);  // To serialize writing.
+    if (RegisterMethod(method)) {
+      // Write a special block with the name.
+      std::string method_line(GetMethodLine(method));
+      uint8_t buf2[5];
+      Append2LE(buf2, 0);
+      buf2[2] = kOpNewMethod;
+      Append2LE(buf2 + 3, static_cast<uint16_t>(method_line.length()));
+      WriteToBuf(buf2, sizeof(buf2));
+      WriteToBuf(reinterpret_cast<const uint8_t*>(method_line.c_str()), method_line.length());
+    }
+    if (RegisterThread(thread)) {
+      // It might be better to postpone this. Threads might not have received names...
+      std::string thread_name;
+      thread->GetThreadName(thread_name);
+      uint8_t buf2[7];
+      Append2LE(buf2, 0);
+      buf2[2] = kOpNewThread;
+      Append2LE(buf2 + 3, static_cast<uint16_t>(thread->GetTid()));
+      Append2LE(buf2 + 5, static_cast<uint16_t>(thread_name.length()));
+      WriteToBuf(buf2, sizeof(buf2));
+      WriteToBuf(reinterpret_cast<const uint8_t*>(thread_name.c_str()), thread_name.length());
+    }
+    WriteToBuf(stack_buf, sizeof(stack_buf));
+  }
 }
 
 void Trace::GetVisitedMethods(size_t buf_size,
@@ -716,9 +990,7 @@
 
 void Trace::DumpMethodList(std::ostream& os, const std::set<mirror::ArtMethod*>& visited_methods) {
   for (const auto& method : visited_methods) {
-    os << StringPrintf("%p\t%s\t%s\t%s\t%s\n", method,
-        PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(), method->GetName(),
-        method->GetSignature().ToString().c_str(), method->GetDeclaringClassSourceFile());
+    os << GetMethodLine(method);
   }
 }
 
@@ -750,4 +1022,16 @@
   }
 }
 
+Trace::TraceOutputMode Trace::GetOutputMode() {
+  MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+  CHECK(the_trace_ != nullptr) << "Trace output mode requested, but no trace currently running";
+  return the_trace_->trace_output_mode_;
+}
+
+Trace::TraceMode Trace::GetMode() {
+  MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+  CHECK(the_trace_ != nullptr) << "Trace mode requested, but no trace currently running";
+  return the_trace_->trace_mode_;
+}
+
 }  // namespace art
diff --git a/runtime/trace.h b/runtime/trace.h
index d8bd428..b8329ff 100644
--- a/runtime/trace.h
+++ b/runtime/trace.h
@@ -17,6 +17,8 @@
 #ifndef ART_RUNTIME_TRACE_H_
 #define ART_RUNTIME_TRACE_H_
 
+#include <bitset>
+#include <map>
 #include <memory>
 #include <ostream>
 #include <set>
@@ -34,11 +36,15 @@
 
 namespace mirror {
   class ArtMethod;
+  class DexCache;
 }  // namespace mirror
 
 class ArtField;
 class Thread;
 
+using DexIndexBitSet = std::bitset<65536>;
+using ThreadIDBitSet = std::bitset<65536>;
+
 enum TracingMode {
   kTracingInactive,
   kMethodTracingActive,
@@ -53,7 +59,8 @@
 
   enum class TraceOutputMode {
     kFile,
-    kDDMS
+    kDDMS,
+    kStreaming
   };
 
   enum class TraceMode {
@@ -61,6 +68,8 @@
     kSampling
   };
 
+  ~Trace();
+
   static void SetDefaultClockSource(TraceClockSource clock_source);
 
   static void Start(const char* trace_filename, int trace_fd, int buffer_size, int flags,
@@ -69,7 +78,16 @@
                      Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_,
                      Locks::trace_lock_);
+  static void Pause() LOCKS_EXCLUDED(Locks::trace_lock_, Locks::thread_list_lock_);
+  static void Resume() LOCKS_EXCLUDED(Locks::trace_lock_);
+
+  // Stop tracing. This will finish the trace and write it to file/send it via DDMS.
   static void Stop()
+        LOCKS_EXCLUDED(Locks::mutator_lock_,
+                       Locks::thread_list_lock_,
+                       Locks::trace_lock_);
+  // Abort tracing. This will just stop tracing and *not* write/send the collected data.
+  static void Abort()
       LOCKS_EXCLUDED(Locks::mutator_lock_,
                      Locks::thread_list_lock_,
                      Locks::trace_lock_);
@@ -116,19 +134,25 @@
   // Save id and name of a thread before it exits.
   static void StoreExitingThreadInfo(Thread* thread);
 
+  static TraceOutputMode GetOutputMode() LOCKS_EXCLUDED(Locks::trace_lock_);
+  static TraceMode GetMode() LOCKS_EXCLUDED(Locks::trace_lock_);
+
  private:
-  explicit Trace(File* trace_file, int buffer_size, int flags, TraceMode trace_mode);
+  Trace(File* trace_file, const char* trace_name, int buffer_size, int flags,
+        TraceOutputMode output_mode, TraceMode trace_mode);
 
   // The sampling interval in microseconds is passed as an argument.
   static void* RunSamplingThread(void* arg) LOCKS_EXCLUDED(Locks::trace_lock_);
 
+  static void StopTracing(bool finish_tracing, bool flush_file);
   void FinishTracing() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void ReadClocks(Thread* thread, uint32_t* thread_clock_diff, uint32_t* wall_clock_diff);
 
   void LogMethodTraceEvent(Thread* thread, mirror::ArtMethod* method,
                            instrumentation::Instrumentation::InstrumentationEvent event,
-                           uint32_t thread_clock_diff, uint32_t wall_clock_diff);
+                           uint32_t thread_clock_diff, uint32_t wall_clock_diff)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Methods to output traced methods and threads.
   void GetVisitedMethods(size_t end_offset, std::set<mirror::ArtMethod*>* visited_methods);
@@ -136,6 +160,18 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void DumpThreadList(std::ostream& os) LOCKS_EXCLUDED(Locks::thread_list_lock_);
 
+  // Methods to register seen entities in streaming mode. The methods return true if the entity
+  // is newly discovered.
+  bool RegisterMethod(mirror::ArtMethod* method)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) EXCLUSIVE_LOCKS_REQUIRED(streaming_lock_);
+  bool RegisterThread(Thread* thread)
+      EXCLUSIVE_LOCKS_REQUIRED(streaming_lock_);
+
+  // Copy a temporary buffer to the main buffer. Used for streaming. Exposed here for lock
+  // annotation.
+  void WriteToBuf(const uint8_t* src, size_t src_size)
+      EXCLUSIVE_LOCKS_REQUIRED(streaming_lock_);
+
   // Singleton instance of the Trace or NULL when no method tracing is active.
   static Trace* volatile the_trace_ GUARDED_BY(Locks::trace_lock_);
 
@@ -157,7 +193,10 @@
   // Flags enabling extra tracing of things such as alloc counts.
   const int flags_;
 
-  // True if traceview should sample instead of instrumenting method entry/exit.
+  // The kind of output for this tracing.
+  const TraceOutputMode trace_output_mode_;
+
+  // The tracing method.
   const TraceMode trace_mode_;
 
   const TraceClockSource clock_source_;
@@ -180,6 +219,15 @@
   // Map of thread ids and names that have already exited.
   SafeMap<pid_t, std::string> exited_threads_;
 
+  // Sampling profiler sampling interval.
+  int interval_us_;
+
+  // Streaming mode data.
+  std::string streaming_file_name_;
+  Mutex* streaming_lock_;
+  std::map<mirror::DexCache*, DexIndexBitSet*> seen_methods_;
+  std::unique_ptr<ThreadIDBitSet> seen_threads_;
+
   DISALLOW_COPY_AND_ASSIGN(Trace);
 };
 
diff --git a/runtime/utils.cc b/runtime/utils.cc
index f13da8b..a303aa4 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -1440,32 +1440,58 @@
   }
 }
 
-std::string GetDalvikCacheOrDie(const char* subdir, const bool create_if_absent) {
+static std::string GetDalvikCacheImpl(const char* subdir,
+                                      const bool create_if_absent,
+                                      const bool abort_on_error) {
   CHECK(subdir != nullptr);
   const char* android_data = GetAndroidData();
   const std::string dalvik_cache_root(StringPrintf("%s/dalvik-cache/", android_data));
   const std::string dalvik_cache = dalvik_cache_root + subdir;
-  if (create_if_absent && !OS::DirectoryExists(dalvik_cache.c_str())) {
+  if (!OS::DirectoryExists(dalvik_cache.c_str())) {
+    if (!create_if_absent) {
+      // TODO: Check callers. Traditional behavior is to not abort, even when abort_on_error.
+      return "";
+    }
+
     // Don't create the system's /data/dalvik-cache/... because it needs special permissions.
-    if (strcmp(android_data, "/data") != 0) {
-      int result = mkdir(dalvik_cache_root.c_str(), 0700);
-      if (result != 0 && errno != EEXIST) {
-        PLOG(FATAL) << "Failed to create dalvik-cache directory " << dalvik_cache_root;
-        return "";
+    if (strcmp(android_data, "/data") == 0) {
+      if (abort_on_error) {
+        LOG(FATAL) << "Failed to find dalvik-cache directory " << dalvik_cache
+                   << ", cannot create /data dalvik-cache.";
+        UNREACHABLE();
       }
-      result = mkdir(dalvik_cache.c_str(), 0700);
-      if (result != 0) {
+      return "";
+    }
+
+    int result = mkdir(dalvik_cache_root.c_str(), 0700);
+    if (result != 0 && errno != EEXIST) {
+      if (abort_on_error) {
+        PLOG(FATAL) << "Failed to create dalvik-cache root directory " << dalvik_cache_root;
+        UNREACHABLE();
+      }
+      return "";
+    }
+
+    result = mkdir(dalvik_cache.c_str(), 0700);
+    if (result != 0) {
+      if (abort_on_error) {
         PLOG(FATAL) << "Failed to create dalvik-cache directory " << dalvik_cache;
-        return "";
+        UNREACHABLE();
       }
-    } else {
-      LOG(FATAL) << "Failed to find dalvik-cache directory " << dalvik_cache;
       return "";
     }
   }
   return dalvik_cache;
 }
 
+std::string GetDalvikCache(const char* subdir, const bool create_if_absent) {
+  return GetDalvikCacheImpl(subdir, create_if_absent, false);
+}
+
+std::string GetDalvikCacheOrDie(const char* subdir, const bool create_if_absent) {
+  return GetDalvikCacheImpl(subdir, create_if_absent, true);
+}
+
 bool GetDalvikCacheFilename(const char* location, const char* cache_location,
                             std::string* filename, std::string* error_msg) {
   if (location[0] != '/') {
diff --git a/runtime/utils.h b/runtime/utils.h
index 6dee5fe..6708c67 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -511,6 +511,9 @@
 // Find $ANDROID_DATA, /data, or return nullptr.
 const char* GetAndroidDataSafe(std::string* error_msg);
 
+// Returns the dalvik-cache location, with subdir appended. Returns the empty string if the cache
+// could not be found (or created).
+std::string GetDalvikCache(const char* subdir, bool create_if_absent = true);
 // Returns the dalvik-cache location, or dies trying. subdir will be
 // appended to the cache location.
 std::string GetDalvikCacheOrDie(const char* subdir, bool create_if_absent = true);
diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc
index aa7bc64..6ccbd13 100644
--- a/runtime/utils_test.cc
+++ b/runtime/utils_test.cc
@@ -367,6 +367,15 @@
                GetDalvikCacheFilenameOrDie("/system/framework/boot.oat", "/foo").c_str());
 }
 
+TEST_F(UtilsTest, GetDalvikCache) {
+  EXPECT_STREQ("", GetDalvikCache("should-not-exist123", false).c_str());
+
+  EXPECT_STREQ((android_data_ + "/dalvik-cache/.").c_str(), GetDalvikCache(".", false).c_str());
+  EXPECT_STREQ((android_data_ + "/dalvik-cache/should-not-be-there").c_str(),
+               GetDalvikCache("should-not-be-there", true).c_str());
+}
+
+
 TEST_F(UtilsTest, GetSystemImageFilename) {
   EXPECT_STREQ("/system/framework/arm/boot.art",
                GetSystemImageFilename("/system/framework/boot.art", kArm).c_str());