Merge "perfetto-ui: Draw area selection as a box"
diff --git a/Android.bp b/Android.bp
index d29f875..c1ca304 100644
--- a/Android.bp
+++ b/Android.bp
@@ -478,15 +478,18 @@
     ":perfetto_src_ipc_common",
     ":perfetto_src_ipc_host",
     ":perfetto_src_protozero_protozero",
-    ":perfetto_src_tracing_client_api",
+    ":perfetto_src_tracing_client_api_base",
+    ":perfetto_src_tracing_client_api_system_backend_only",
     ":perfetto_src_tracing_common",
     ":perfetto_src_tracing_core_core",
     ":perfetto_src_tracing_core_service",
+    ":perfetto_src_tracing_in_process_backend_fake",
     ":perfetto_src_tracing_ipc_common",
     ":perfetto_src_tracing_ipc_consumer_consumer",
     ":perfetto_src_tracing_ipc_producer_producer",
     ":perfetto_src_tracing_ipc_service_service",
     ":perfetto_src_tracing_platform_posix",
+    ":perfetto_src_tracing_system_process_backend",
   ],
   export_include_dirs: [
     "include",
@@ -1360,15 +1363,17 @@
     ":perfetto_src_traced_probes_probes_src",
     ":perfetto_src_traced_probes_ps_ps",
     ":perfetto_src_traced_probes_sys_stats_sys_stats",
-    ":perfetto_src_tracing_client_api",
+    ":perfetto_src_tracing_client_api_base",
     ":perfetto_src_tracing_common",
     ":perfetto_src_tracing_core_core",
     ":perfetto_src_tracing_core_service",
+    ":perfetto_src_tracing_in_process_backend",
     ":perfetto_src_tracing_ipc_common",
     ":perfetto_src_tracing_ipc_consumer_consumer",
     ":perfetto_src_tracing_ipc_producer_producer",
     ":perfetto_src_tracing_ipc_service_service",
     ":perfetto_src_tracing_platform_posix",
+    ":perfetto_src_tracing_system_process_backend",
     ":perfetto_src_tracing_test_api_test_support",
     ":perfetto_src_tracing_test_client_api_integrationtests",
     ":perfetto_test_end_to_end_integrationtests",
@@ -6500,15 +6505,13 @@
   ],
 }
 
-// GN: //src/tracing:client_api
+// GN: //src/tracing:client_api_base
 filegroup {
-  name: "perfetto_src_tracing_client_api",
+  name: "perfetto_src_tracing_client_api_base",
   srcs: [
     "src/tracing/data_source.cc",
     "src/tracing/debug_annotation.cc",
     "src/tracing/event_context.cc",
-    "src/tracing/internal/in_process_tracing_backend.cc",
-    "src/tracing/internal/system_tracing_backend.cc",
     "src/tracing/internal/tracing_muxer_impl.cc",
     "src/tracing/internal/track_event_internal.cc",
     "src/tracing/platform.cc",
@@ -6519,6 +6522,11 @@
   ],
 }
 
+// GN: //src/tracing:client_api_system_backend_only
+filegroup {
+  name: "perfetto_src_tracing_client_api_system_backend_only",
+}
+
 // GN: //src/tracing:common
 filegroup {
   name: "perfetto_src_tracing_common",
@@ -6588,6 +6596,22 @@
   ],
 }
 
+// GN: //src/tracing:in_process_backend
+filegroup {
+  name: "perfetto_src_tracing_in_process_backend",
+  srcs: [
+    "src/tracing/internal/in_process_tracing_backend.cc",
+  ],
+}
+
+// GN: //src/tracing:in_process_backend_fake
+filegroup {
+  name: "perfetto_src_tracing_in_process_backend_fake",
+  srcs: [
+    "src/tracing/internal/in_process_tracing_backend_fake.cc",
+  ],
+}
+
 // GN: //src/tracing/ipc:common
 filegroup {
   name: "perfetto_src_tracing_ipc_common",
@@ -6639,6 +6663,14 @@
   ],
 }
 
+// GN: //src/tracing:system_process_backend
+filegroup {
+  name: "perfetto_src_tracing_system_process_backend",
+  srcs: [
+    "src/tracing/internal/system_tracing_backend.cc",
+  ],
+}
+
 // GN: //src/tracing/test:api_test_support
 filegroup {
   name: "perfetto_src_tracing_test_api_test_support",
diff --git a/BUILD b/BUILD
index be3b241..1d891cc 100644
--- a/BUILD
+++ b/BUILD
@@ -685,6 +685,7 @@
         "src/trace_processor/db/table.cc",
         "src/trace_processor/db/table.h",
         "src/trace_processor/db/typed_column.h",
+        "src/trace_processor/db/typed_column_internal.h",
     ],
 )
 
@@ -1230,16 +1231,14 @@
     ],
 )
 
-# GN target: //src/tracing:client_api
+# GN target: //src/tracing:client_api_base
 filegroup(
-    name = "src_tracing_client_api",
+    name = "src_tracing_client_api_base",
     srcs = [
         "src/tracing/data_source.cc",
         "src/tracing/debug_annotation.cc",
         "src/tracing/event_context.cc",
-        "src/tracing/internal/in_process_tracing_backend.cc",
         "src/tracing/internal/in_process_tracing_backend.h",
-        "src/tracing/internal/system_tracing_backend.cc",
         "src/tracing/internal/system_tracing_backend.h",
         "src/tracing/internal/tracing_muxer_impl.cc",
         "src/tracing/internal/tracing_muxer_impl.h",
@@ -1260,6 +1259,14 @@
     ],
 )
 
+# GN target: //src/tracing:in_process_backend
+filegroup(
+    name = "src_tracing_in_process_backend",
+    srcs = [
+        "src/tracing/internal/in_process_tracing_backend.cc",
+    ],
+)
+
 # GN target: //src/tracing:platform_posix
 filegroup(
     name = "src_tracing_platform_posix",
@@ -1268,6 +1275,14 @@
     ],
 )
 
+# GN target: //src/tracing:system_process_backend
+filegroup(
+    name = "src_tracing_system_process_backend",
+    srcs = [
+        "src/tracing/internal/system_tracing_backend.cc",
+    ],
+)
+
 # GN target: //tools/trace_to_text:common
 filegroup(
     name = "tools_trace_to_text_common",
@@ -2474,15 +2489,17 @@
         ":src_ipc_common",
         ":src_ipc_host",
         ":src_protozero_protozero",
-        ":src_tracing_client_api",
+        ":src_tracing_client_api_base",
         ":src_tracing_common",
         ":src_tracing_core_core",
         ":src_tracing_core_service",
+        ":src_tracing_in_process_backend",
         ":src_tracing_ipc_common",
         ":src_tracing_ipc_consumer_consumer",
         ":src_tracing_ipc_producer_producer",
         ":src_tracing_ipc_service_service",
         ":src_tracing_platform_posix",
+        ":src_tracing_system_process_backend",
     ],
     hdrs = [
         ":include_perfetto_base_base",
diff --git a/BUILD.gn b/BUILD.gn
index af82637..c57fd69 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -132,6 +132,9 @@
 
 # Used when updating the ftrace protos
     "protos/perfetto/trace/ftrace:descriptor",
+
+    # Checks that the "fake" backend implementations build.
+    "src/tracing:client_api_no_backends_compile_test",
   ]
 }
 
@@ -200,7 +203,7 @@
 }
 
 if (!build_with_chromium) {
-  # Client library target.
+  # Client library target exposed to the Android tree.
   # Still in experimental stage and not API stable yet.
   # See "libperfetto_client_example" (in Android.bp.extras) for an example
   # on how to use the Perfetto Client API from the android tree.
@@ -208,10 +211,18 @@
     complete_static_lib = true
     public_deps = [
       "gn:default_deps",
-      "src/tracing:client_api",
       "src/tracing:platform_posix",
       "src/tracing/core",
     ]
+
+    if (perfetto_build_with_android) {
+      # Android in-tree builds expose only the system backend and don't expose
+      # the in-process backend. This is to save binary size and memory (see
+      # b/148198993).
+      public_deps += [ "src/tracing:client_api_system_backend_only" ]
+    } else {
+      public_deps += [ "src/tracing:client_api" ]
+    }
     sources = [ "include/perfetto/tracing.h" ]
     assert_no_deps = [ "//gn:protobuf_lite" ]
   }
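Which backend gets linked is also visible at the client API level: an in-tree Android client can only initialize the system backend, while standalone builds may also use the in-process one. A minimal sketch of client-side initialization, assuming the standard perfetto::Tracing setup API:

```cpp
#include "perfetto/tracing.h"

int main() {
  perfetto::TracingInitArgs args;
  // With client_api_system_backend_only linked in, only the system backend
  // (which talks to the traced service) is usable; the in-process backend is
  // presumably replaced by the "fake" filegroup added in this change.
  args.backends = perfetto::kSystemBackend;
  perfetto::Tracing::Initialize(args);
}
```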
@@ -248,7 +259,10 @@
       "protos/perfetto/trace/track_event:zero",
     ]
     if (enable_perfetto_ipc) {
-      deps += [ "src/tracing/ipc/producer" ]
+      deps += [
+        "src/tracing/ipc/producer",
+        "src/tracing/ipc/service",
+      ]
       public_deps += [ "include/perfetto/ext/tracing/ipc:ipc" ]
     }
   }
diff --git a/buildtools/BUILD.gn b/buildtools/BUILD.gn
index 2ef83f4..a0d5f33 100644
--- a/buildtools/BUILD.gn
+++ b/buildtools/BUILD.gn
@@ -820,6 +820,7 @@
     "-DSQLITE_TEMP_STORE=3",
     "-DSQLITE_OMIT_LOAD_EXTENSION",
     "-DSQLITE_OMIT_RANDOMNESS",
+    "-DSQLITE_OMIT_AUTOINIT",
   ]
 }
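SQLITE_OMIT_AUTOINIT removes the lazy sqlite3_initialize() call from entry points such as sqlite3_open(), shaving a check off every API call; the embedder then has to initialize the library explicitly. A sketch of what callers need to do once this define is active:

```cpp
#include <sqlite3.h>

int OpenDb(const char* path, sqlite3** db) {
  // With -DSQLITE_OMIT_AUTOINIT, sqlite3_open() no longer initializes the
  // library on demand; sqlite3_initialize() must run once before first use.
  int err = sqlite3_initialize();
  if (err != SQLITE_OK)
    return err;
  return sqlite3_open(path, db);
}
```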
 
diff --git a/gn/standalone/BUILD.gn b/gn/standalone/BUILD.gn
index ceb928f..387e258 100644
--- a/gn/standalone/BUILD.gn
+++ b/gn/standalone/BUILD.gn
@@ -16,6 +16,15 @@
 import("//gn/standalone/sanitizers/sanitizers.gni")
 import("//gn/standalone/wasm.gni")
 
+# These warnings were introduced by the newest version of clang (hence they
+# show up only in the hermetic build) and turn into errors due to -Werror.
+# TODO(primiano): we should look into Wimplicit-int-float-conversion. It seems
+# to fail mostly in tests though.
+hermetic_clang_suppressions = [
+  "-Wno-c99-designator",
+  "-Wno-implicit-int-float-conversion",
+]
+
 config("extra_warnings") {
   cflags = [
     "-Wall",
@@ -96,6 +105,12 @@
     ]
   }
 
+  if (is_hermetic_clang && is_linux && !is_wasm) {
+    cflags += hermetic_clang_suppressions
+  } else {
+    not_needed([ "hermetic_clang_suppressions" ])
+  }
+
   if (is_lto) {
     cflags += [ "-flto=full" ]
     ldflags += [ "-flto=full" ]
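For context, hypothetical snippets of the kind of code each suppressed warning fires on (not taken from the tree):

```cpp
void WarningExamples() {
  // -Wc99-designator: array designators are a C99 extension in C++.
  int lut[4] = {[0] = 1, [3] = 2};
  (void)lut;

  // -Wimplicit-int-float-conversion: the constant below is not exactly
  // representable as a float, so the implicit conversion changes its value.
  float f = (1ll << 53) + 1;
  (void)f;
}
```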
diff --git a/include/perfetto/ext/ipc/service_proxy.h b/include/perfetto/ext/ipc/service_proxy.h
index 98f111c..02c2dc2 100644
--- a/include/perfetto/ext/ipc/service_proxy.h
+++ b/include/perfetto/ext/ipc/service_proxy.h
@@ -26,6 +26,7 @@
 #include <memory>
 #include <string>
 
+#include "perfetto/base/export.h"
 #include "perfetto/ext/base/weak_ptr.h"
 #include "perfetto/ext/ipc/deferred.h"
 
@@ -38,7 +39,7 @@
 // The base class for the client-side autogenerated stubs that forward method
 // invocations to the host. All the methods of this class are meant to be called
 // only by the autogenerated code.
-class ServiceProxy {
+class PERFETTO_EXPORT ServiceProxy {
  public:
   class EventListener {
    public:
diff --git a/include/perfetto/ext/tracing/core/tracing_service.h b/include/perfetto/ext/tracing/core/tracing_service.h
index c097a6e..695d78e 100644
--- a/include/perfetto/ext/tracing/core/tracing_service.h
+++ b/include/perfetto/ext/tracing/core/tracing_service.h
@@ -106,11 +106,12 @@
       BufferExhaustedPolicy buffer_exhausted_policy =
           BufferExhaustedPolicy::kDefault) = 0;
 
-  // If TracingService::ConnectProducer is called with |in_process=true|,
-  // this returns the producer's SharedMemoryArbiter which can be used
-  // to create TraceWriters which is able to directly commit chunks
-  // without going through an IPC layer.
-  virtual SharedMemoryArbiter* GetInProcessShmemArbiter() = 0;
+  // In some cases you can access the producer's SharedMemoryArbiter (for
+  // example if TracingService::ConnectProducer is called with
+  // |in_process=true|). The SharedMemoryArbiter can be used to create
+  // TraceWriters, which are able to directly commit chunks. For the
+  // |in_process=true| case this can be done without going through an IPC layer.
+  virtual SharedMemoryArbiter* MaybeSharedMemoryArbiter() = 0;
 
   // Called in response to a Producer::Flush(request_id) call after all data
   // for the flush request has been committed.
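The rename makes it explicit that the arbiter may not exist, so callers are expected to null-check. A hedged sketch of the intended call pattern on a ProducerEndpoint (buffer id is hypothetical):

```cpp
#include <memory>

void WriteFromProducer(perfetto::TracingService::ProducerEndpoint* endpoint,
                       perfetto::BufferID target_buffer) {
  perfetto::SharedMemoryArbiter* arbiter = endpoint->MaybeSharedMemoryArbiter();
  if (!arbiter)
    return;  // No direct arbiter access available for this producer.
  std::unique_ptr<perfetto::TraceWriter> writer =
      arbiter->CreateTraceWriter(target_buffer);
  // ... emit packets; for in-process producers the chunks are committed
  // without crossing an IPC layer.
}
```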
diff --git a/include/perfetto/tracing/internal/track_event_data_source.h b/include/perfetto/tracing/internal/track_event_data_source.h
index e6c8af0..e835155 100644
--- a/include/perfetto/tracing/internal/track_event_data_source.h
+++ b/include/perfetto/tracing/internal/track_event_data_source.h
@@ -117,61 +117,72 @@
 
   // Once we've determined tracing to be enabled for this category, actually
   // write a trace event onto this thread's default track. Outlined to avoid
-  // bloating code at the actual trace point.
-  // TODO(skyostil): Investigate whether this should be fully outlined to reduce
-  // binary size.
+  // bloating code (mostly stack depth) at the actual trace point.
+  //
+  // To minimize call overhead at each trace point, we provide the following
+  // trace point argument variants:
+  //
+  // - None
+  // - Lambda
+  // - One debug annotation
+  // - Two debug annotations
+  // - Track
+  // - Track + Lambda
+  // - Track + one debug annotation
+  // - Track + two debug annotations
+
+  // Trace point which takes no arguments.
+  template <size_t CategoryIndex>
+  static void TraceForCategory(uint32_t instances,
+                               const char* event_name,
+                               perfetto::protos::pbzero::TrackEvent::Type type)
+      PERFETTO_NO_INLINE {
+    TraceForCategoryImpl<CategoryIndex>(instances, event_name, type);
+  }
+
+  // Trace point which takes a lambda function argument.
   template <size_t CategoryIndex,
-            typename ArgumentFunction = void (*)(EventContext)>
-  static void TraceForCategory(
-      uint32_t instances,
-      const char* event_name,
-      perfetto::protos::pbzero::TrackEvent::Type type,
-      ArgumentFunction arg_function = [](EventContext) {},
-      typename std::enable_if<IsValidTraceLambda<ArgumentFunction>()>::type* =
-          nullptr) PERFETTO_NO_INLINE {
-    // We don't simply call TraceForCategory(..., Track(), ...) here, since that
-    // would add extra binary bloat to all trace points that target the default
-    // track.
-    Base::template TraceWithInstances<CategoryTracePointTraits<CategoryIndex>>(
-        instances, [&](typename Base::TraceContext ctx) {
-          // TODO(skyostil): Intern categories at compile time.
-          arg_function(TrackEventInternal::WriteEvent(
-              ctx.tls_inst_->trace_writer.get(), ctx.GetIncrementalState(),
-              Registry->GetCategory(CategoryIndex)->name, event_name, type));
-          // There's no need to emit a track descriptor for the default track
-          // here since that's done in ResetIncrementalState().
-        });
+            typename ArgumentFunction = void (*)(EventContext),
+            typename ArgumentFunctionCheck = typename std::enable_if<
+                IsValidTraceLambda<ArgumentFunction>()>::type>
+  static void TraceForCategory(uint32_t instances,
+                               const char* event_name,
+                               perfetto::protos::pbzero::TrackEvent::Type type,
+                               ArgumentFunction arg_function)
+      PERFETTO_NO_INLINE {
+    TraceForCategoryImpl<CategoryIndex>(instances, event_name, type, Track(),
+                                        std::move(arg_function));
   }
 
   // This variant of the inner trace point takes a Track argument which can be
   // used to emit events on a non-default track.
   template <size_t CategoryIndex,
             typename TrackType,
-            typename ArgumentFunction = void (*)(EventContext)>
-  static void TraceForCategory(
-      uint32_t instances,
-      const char* event_name,
-      perfetto::protos::pbzero::TrackEvent::Type type,
-      const TrackType& track,
-      ArgumentFunction arg_function = [](EventContext) {},
-      typename std::enable_if<IsValidTraceLambda<ArgumentFunction>()>::type* =
-          nullptr,
-      typename std::enable_if<
-          std::is_convertible<TrackType, Track>::value>::type* = nullptr)
+            typename TrackTypeCheck = typename std::enable_if<
+                std::is_convertible<TrackType, Track>::value>::type>
+  static void TraceForCategory(uint32_t instances,
+                               const char* event_name,
+                               perfetto::protos::pbzero::TrackEvent::Type type,
+                               const TrackType& track) PERFETTO_NO_INLINE {
+    TraceForCategoryImpl<CategoryIndex>(instances, event_name, type, track);
+  }
+
+  // Trace point with a track and a lambda function.
+  template <size_t CategoryIndex,
+            typename TrackType,
+            typename ArgumentFunction = void (*)(EventContext),
+            typename ArgumentFunctionCheck = typename std::enable_if<
+                IsValidTraceLambda<ArgumentFunction>()>::type,
+            typename TrackTypeCheck = typename std::enable_if<
+                std::is_convertible<TrackType, Track>::value>::type>
+  static void TraceForCategory(uint32_t instances,
+                               const char* event_name,
+                               perfetto::protos::pbzero::TrackEvent::Type type,
+                               const TrackType& track,
+                               ArgumentFunction arg_function)
       PERFETTO_NO_INLINE {
-    PERFETTO_DCHECK(track);
-    Base::template TraceWithInstances<CategoryTracePointTraits<CategoryIndex>>(
-        instances, [&](typename Base::TraceContext ctx) {
-          // TODO(skyostil): Intern categories at compile time.
-          auto event_ctx = TrackEventInternal::WriteEvent(
-              ctx.tls_inst_->trace_writer.get(), ctx.GetIncrementalState(),
-              Registry->GetCategory(CategoryIndex)->name, event_name, type);
-          event_ctx.event()->set_track_uuid(track.uuid);
-          arg_function(std::move(event_ctx));
-          TrackEventInternal::WriteTrackDescriptorIfNeeded(
-              track, ctx.tls_inst_->trace_writer.get(),
-              ctx.GetIncrementalState());
-        });
+    TraceForCategoryImpl<CategoryIndex>(instances, event_name, type, track,
+                                        std::move(arg_function));
   }
 
   // Trace point with one debug annotation.
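These overloads are what the public TRACE_EVENT macros resolve to, roughly one per argument shape. A sketch of call sites exercising each variant (category, names and values hypothetical):

```cpp
// No arguments.
TRACE_EVENT_BEGIN("cat", "Name");
// Lambda, for writing TrackEvent fields directly.
TRACE_EVENT_BEGIN("cat", "Name", [](perfetto::EventContext ctx) {
  ctx.event()->set_name("patched_name");
});
// One debug annotation (the two-annotation form adds another pair).
TRACE_EVENT_BEGIN("cat", "Name", "bytes", 42);
// Custom track, optionally combined with a lambda or annotations.
TRACE_EVENT_BEGIN("cat", "Name", perfetto::Track(1234));
TRACE_EVENT_END("cat");
```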
@@ -357,6 +368,41 @@
     }
   };
 
+  // TODO(skyostil): Make |CategoryIndex| a regular parameter to reuse trace
+  // point code across different categories.
+  template <size_t CategoryIndex,
+            typename TrackType = Track,
+            typename ArgumentFunction = void (*)(EventContext),
+            typename ArgumentFunctionCheck = typename std::enable_if<
+                IsValidTraceLambda<ArgumentFunction>()>::type,
+            typename TrackTypeCheck = typename std::enable_if<
+                std::is_convertible<TrackType, Track>::value>::type>
+  static void TraceForCategoryImpl(
+      uint32_t instances,
+      const char* event_name,
+      perfetto::protos::pbzero::TrackEvent::Type type,
+      const TrackType& track = Track(),
+      ArgumentFunction arg_function = [](EventContext) {
+      }) PERFETTO_ALWAYS_INLINE {
+    Base::template TraceWithInstances<CategoryTracePointTraits<CategoryIndex>>(
+        instances, [&](typename Base::TraceContext ctx) {
+          {
+            // TODO(skyostil): Intern categories at compile time.
+            auto event_ctx = TrackEventInternal::WriteEvent(
+                ctx.tls_inst_->trace_writer.get(), ctx.GetIncrementalState(),
+                Registry->GetCategory(CategoryIndex)->name, event_name, type);
+            if (track)
+              event_ctx.event()->set_track_uuid(track.uuid);
+            arg_function(std::move(event_ctx));
+          }
+          if (track) {
+            TrackEventInternal::WriteTrackDescriptorIfNeeded(
+                track, ctx.tls_inst_->trace_writer.get(),
+                ctx.GetIncrementalState());
+          }
+        });
+  }
+
   // Records a track descriptor into the track descriptor registry and, if we
   // are tracing, also mirrors the descriptor into the trace.
   template <typename TrackType>
diff --git a/protos/perfetto/trace/perfetto_trace.proto b/protos/perfetto/trace/perfetto_trace.proto
index 674423d..d70c75e 100644
--- a/protos/perfetto/trace/perfetto_trace.proto
+++ b/protos/perfetto/trace/perfetto_trace.proto
@@ -4150,6 +4150,9 @@
   // Chrome, these are usually static strings known at compile time, or
   // concatenations of multiple such static strings).
   optional string action = 1;
+
+  // MD5 hash of the action string.
+  optional uint64 action_hash = 2;
 }
 
 // End of protos/perfetto/trace/track_event/chrome_user_event.proto
diff --git a/protos/perfetto/trace/track_event/chrome_user_event.proto b/protos/perfetto/trace/track_event/chrome_user_event.proto
index 6f7c2e0..d071830 100644
--- a/protos/perfetto/trace/track_event/chrome_user_event.proto
+++ b/protos/perfetto/trace/track_event/chrome_user_event.proto
@@ -25,4 +25,7 @@
   // Chrome, these are usually static strings known at compile time, or
   // concatenations of multiple such static strings).
   optional string action = 1;
+
+  // MD5 hash of the action string.
+  optional uint64 action_hash = 2;
 }
diff --git a/src/perfetto_cmd/packet_writer.cc b/src/perfetto_cmd/packet_writer.cc
index 2fb0562..45aad47 100644
--- a/src/perfetto_cmd/packet_writer.cc
+++ b/src/perfetto_cmd/packet_writer.cc
@@ -191,7 +191,7 @@
   // Reinitialize the compressor if needed:
   if (!is_compressing_) {
     memset(&stream_, 0, sizeof(stream_));
-    CheckEq(deflateInit(&stream_, 9), Z_OK);
+    CheckEq(deflateInit(&stream_, 6), Z_OK);
     is_compressing_ = true;
     stream_.next_out = start_;
     stream_.avail_out = static_cast<unsigned int>(end_ - start_);
diff --git a/src/profiling/memory/heapprofd_producer_unittest.cc b/src/profiling/memory/heapprofd_producer_unittest.cc
index 66bd452..4e4b6ed 100644
--- a/src/profiling/memory/heapprofd_producer_unittest.cc
+++ b/src/profiling/memory/heapprofd_producer_unittest.cc
@@ -41,7 +41,7 @@
   MOCK_CONST_METHOD0(shared_buffer_page_size_kb, size_t());
   MOCK_METHOD2(CreateTraceWriter,
                std::unique_ptr<TraceWriter>(BufferID, BufferExhaustedPolicy));
-  MOCK_METHOD0(GetInProcessShmemArbiter, SharedMemoryArbiter*());
+  MOCK_METHOD0(MaybeSharedMemoryArbiter, SharedMemoryArbiter*());
   MOCK_METHOD1(ActivateTriggers, void(const std::vector<std::string>&));
 
   MOCK_METHOD1(RegisterDataSource, void(const DataSourceDescriptor&));
diff --git a/src/profiling/memory/shared_ring_buffer.cc b/src/profiling/memory/shared_ring_buffer.cc
index 99a9c89..826425d 100644
--- a/src/profiling/memory/shared_ring_buffer.cc
+++ b/src/profiling/memory/shared_ring_buffer.cc
@@ -104,6 +104,18 @@
     size_t outer_size = kMetaPageSize + size_ * 2 + kGuardSize;
     munmap(meta_, outer_size);
   }
+
+  // This is a work-around for code like the following:
+  // https://android.googlesource.com/platform/libcore/+/4ecb71f94378716f88703b9f7548b5d24839262f/ojluni/src/main/native/UNIXProcess_md.c#427
+  // They fork, close all fds by iterating over /proc/self/fd using opendir.
+  // Unfortunately closedir calls free, which detects the fork, and then tries
+  // to destruct the Client that holds this SharedRingBuffer.
+  //
+  // ScopedResource crashes on failure to close, so we explicitly ignore
+  // failures here.
+  int fd = mem_fd_.release();
+  if (fd != -1)
+    close(fd);
 }
 
 void SharedRingBuffer::Initialize(base::ScopedFile mem_fd) {
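A minimal sketch of the pattern the comment refers to, using only POSIX APIs: a forked child closes every inherited fd by scanning /proc/self/fd, and the final closedir() is what ends up calling free():

```cpp
#include <dirent.h>
#include <stdlib.h>
#include <unistd.h>

static void CloseAllFdsAfterFork() {
  DIR* dir = opendir("/proc/self/fd");
  if (!dir)
    return;
  int dir_fd = dirfd(dir);
  for (struct dirent* e = readdir(dir); e; e = readdir(dir)) {
    int fd = atoi(e->d_name);
    if (fd > 2 && fd != dir_fd)
      close(fd);
  }
  // closedir() calls free(); heapprofd's free interceptor detects the fork
  // and tries to destruct the Client that holds the SharedRingBuffer.
  closedir(dir);
}
```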
diff --git a/src/trace_processor/clock_tracker.cc b/src/trace_processor/clock_tracker.cc
index 696a110..43a6819 100644
--- a/src/trace_processor/clock_tracker.cc
+++ b/src/trace_processor/clock_tracker.cc
@@ -41,6 +41,10 @@
 void ClockTracker::AddSnapshot(const std::vector<ClockValue>& clocks) {
   const auto snapshot_id = cur_snapshot_id_++;
 
+  // Clear the cache
+  static_assert(std::is_trivial<decltype(cache_)>::value, "must be trivial");
+  memset(&cache_[0], 0, sizeof(cache_));
+
   // Compute the fingerprint of the snapshot by hashing all clock ids. This is
   // used by the clock pathfinding logic.
   base::Hash hasher;
@@ -192,25 +196,33 @@
   return ClockPath();  // invalid path.
 }
 
-base::Optional<int64_t> ClockTracker::Convert(ClockId src_clock_id,
-                                              int64_t src_timestamp,
-                                              ClockId target_clock_id) {
-  // TODO(primiano): optimization: I bet A simple LRU cache of the form
-  // (src_clock_id, target_clock_id, latest_timestamp, translation_ns) might
-  // speed up most conversion allowing to skip FindPath and the iterations.
-
+base::Optional<int64_t> ClockTracker::ConvertSlowpath(ClockId src_clock_id,
+                                                      int64_t src_timestamp,
+                                                      ClockId target_clock_id) {
   PERFETTO_DCHECK(!IsReservedSeqScopedClockId(src_clock_id));
   PERFETTO_DCHECK(!IsReservedSeqScopedClockId(target_clock_id));
 
+  context_->storage->IncrementStats(stats::clock_sync_cache_miss);
+
   ClockPath path = FindPath(src_clock_id, target_clock_id);
   if (!path.valid()) {
     context_->storage->IncrementStats(stats::clock_sync_failure);
     return base::nullopt;
   }
 
+  // We can cache only single-path resolutions between two clocks.
+  // Caching multi-path resolutions is harder because the (src,target) tuple
+  // is not enough as a cache key: at any step the |ns| value can lead to a
+  // different choice of the next snapshot. Multi-path resolutions don't seem
+  // too frequent these days, so we focus only on caching the more frequent
+  // one-step resolutions (typically from any clock to the trace clock).
+  const bool cacheable = path.len == 1;
+  CachedClockPath cache_entry{};
+
   // Iterate through the path found and translate timestamps onto the new clock
   // domain on each step, until the target domain is reached.
-  int64_t ns = GetClock(src_clock_id)->ToNs(src_timestamp);
+  ClockDomain* src_domain = GetClock(src_clock_id);
+  int64_t ns = src_domain->ToNs(src_timestamp);
   for (uint32_t i = 0; i < path.len; ++i) {
     const ClockGraphEdge edge = path.at(i);
     ClockDomain* cur_clock = GetClock(std::get<0>(edge));
@@ -246,12 +258,33 @@
     // The translated timestamp is the relative delta of the source timestamp
     // from the closest snapshot found (ns - *it), plus the timestamp in
     // the new clock domain for the same snapshot id.
-    ns = (ns - *it) + next_timestamp_ns;
+    const int64_t adj = next_timestamp_ns - *it;
+    ns += adj;
+
+    // On the first iteration, keep track of the bounds for the cache entry.
+    // This will allow future Convert() calls to skip the pathfinder logic
+    // as long as the query stays within the bound.
+    if (cacheable) {
+      PERFETTO_DCHECK(i == 0);
+      const int64_t kInt64Min = std::numeric_limits<int64_t>::min();
+      const int64_t kInt64Max = std::numeric_limits<int64_t>::max();
+      cache_entry.min_ts_ns = it == ts_vec.begin() ? kInt64Min : *it;
+      auto ubound = it + 1;
+      cache_entry.max_ts_ns = ubound == ts_vec.end() ? kInt64Max : *ubound;
+      cache_entry.translation_ns = adj;
+    }
 
     // The last clock in the path must be the target clock.
     PERFETTO_DCHECK(i < path.len - 1 || std::get<1>(edge) == target_clock_id);
   }
 
+  if (cacheable) {
+    cache_entry.src = src_clock_id;
+    cache_entry.src_domain = src_domain;
+    cache_entry.target = target_clock_id;
+    cache_[rnd_() % cache_.size()] = cache_entry;
+  }
+
   return ns;
 }
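A self-contained sketch (not the real ClockTracker API) of how a cached single-path entry answers later conversions. Assume snapshots pair MONOTONIC {50, 140, 250} with BOOTTIME {100, 200, 310}: a slow-path Convert(MONOTONIC, 150, BOOTTIME) picks the snapshot at 140 (translation 200 - 140 = 60) and caches the bounds [140, 250):

```cpp
#include <cassert>
#include <cstdint>

struct CachedPath {
  int64_t min_ts_ns;       // closest snapshot at or below the query
  int64_t max_ts_ns;       // next snapshot above the query
  int64_t translation_ns;  // target_ts - source_ts at the chosen snapshot
};

int64_t ConvertFast(const CachedPath& ce, int64_t src_ns) {
  assert(src_ns >= ce.min_ts_ns && src_ns < ce.max_ts_ns);
  return src_ns + ce.translation_ns;
}

int main() {
  CachedPath ce{140, 250, 60};
  assert(ConvertFast(ce, 150) == 210);  // matches the slow path
  assert(ConvertFast(ce, 249) == 309);  // still within the cached bounds
}
```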
 
diff --git a/src/trace_processor/clock_tracker.h b/src/trace_processor/clock_tracker.h
index c261249..63fe8e1 100644
--- a/src/trace_processor/clock_tracker.h
+++ b/src/trace_processor/clock_tracker.h
@@ -21,6 +21,7 @@
 
 #include <array>
 #include <map>
+#include <random>
 #include <set>
 #include <vector>
 
@@ -154,9 +155,27 @@
   // This is typically called by the code that reads the ClockSnapshot packet.
   void AddSnapshot(const std::vector<ClockValue>&);
 
+  // Converts a timestamp between two clock domains. Tries to use the cache
+  // first (only for single-path resolutions), then falls back on path finding
+  // as described in the header.
   base::Optional<int64_t> Convert(ClockId src_clock_id,
                                   int64_t src_timestamp,
-                                  ClockId target_clock_id);
+                                  ClockId target_clock_id) {
+    if (PERFETTO_LIKELY(!cache_lookups_disabled_for_testing_)) {
+      for (const auto& ce : cache_) {
+        if (ce.src != src_clock_id || ce.target != target_clock_id)
+          continue;
+        int64_t ns = ce.src_domain->ToNs(src_timestamp);
+        if (ns >= ce.min_ts_ns && ns < ce.max_ts_ns)
+          return ns + ce.translation_ns;
+      }
+    }
+    return ConvertSlowpath(src_clock_id, src_timestamp, target_clock_id);
+  }
+
+  base::Optional<int64_t> ConvertSlowpath(ClockId src_clock_id,
+                                          int64_t src_timestamp,
+                                          ClockId target_clock_id);
 
   base::Optional<int64_t> ToTraceTime(ClockId clock_id, int64_t timestamp) {
     if (clock_id == trace_time_clock_id_)
@@ -169,6 +188,10 @@
     trace_time_clock_id_ = clock_id;
   }
 
+  void set_cache_lookups_disabled_for_testing(bool v) {
+    cache_lookups_disabled_for_testing_ = v;
+  }
+
  private:
   using SnapshotHash = uint32_t;
 
@@ -245,6 +268,17 @@
     }
   };
 
+  // Holds data for cached entries. At the moment only single-path
+  // resolutions are cached.
+  struct CachedClockPath {
+    ClockId src;
+    ClockId target;
+    ClockDomain* src_domain;
+    int64_t min_ts_ns;
+    int64_t max_ts_ns;
+    int64_t translation_ns;
+  };
+
   ClockTracker(const ClockTracker&) = delete;
   ClockTracker& operator=(const ClockTracker&) = delete;
 
@@ -261,6 +295,9 @@
   std::map<ClockId, ClockDomain> clocks_;
   std::set<ClockGraphEdge> graph_;
   std::set<ClockId> non_monotonic_clocks_;
+  std::array<CachedClockPath, 2> cache_{};
+  bool cache_lookups_disabled_for_testing_ = false;
+  std::minstd_rand rnd_;  // For cache eviction.
   uint32_t cur_snapshot_id_ = 0;
 };
 
diff --git a/src/trace_processor/clock_tracker_unittest.cc b/src/trace_processor/clock_tracker_unittest.cc
index e1628af..2fc8e59 100644
--- a/src/trace_processor/clock_tracker_unittest.cc
+++ b/src/trace_processor/clock_tracker_unittest.cc
@@ -16,6 +16,8 @@
 
 #include "src/trace_processor/clock_tracker.h"
 
+#include <random>
+
 #include "perfetto/ext/base/optional.h"
 #include "src/trace_processor/trace_processor_context.h"
 #include "src/trace_processor/trace_storage.h"
@@ -207,6 +209,55 @@
   EXPECT_EQ(*ct_.ToTraceTime(c66_2, 4 /* abs 30 */), 129000);
 }
 
+// Tests that the cache doesn't affect the results of Convert() in unexpected
+// ways.
+TEST_F(ClockTrackerTest, CacheDoesntAffectResults) {
+  std::minstd_rand rnd;
+  int last_mono = 0;
+  int last_boot = 0;
+  int last_raw = 0;
+  static const int increments[] = {1, 2, 10};
+  for (int i = 0; i < 1000; i++) {
+    last_mono += increments[rnd() % base::ArraySize(increments)];
+    last_boot += increments[rnd() % base::ArraySize(increments)];
+    ct_.AddSnapshot({{MONOTONIC, last_mono}, {BOOTTIME, last_boot}});
+
+    last_raw += increments[rnd() % base::ArraySize(increments)];
+    last_boot += increments[rnd() % base::ArraySize(increments)];
+    ct_.AddSnapshot({{MONOTONIC_RAW, last_raw}, {BOOTTIME, last_boot}});
+  }
+
+  for (int i = 0; i < 1000; i++) {
+    int64_t val = static_cast<int64_t>(rnd()) % 10000;
+    for (int j = 0; j < 5; j++) {
+      ClockTracker::ClockId src;
+      ClockTracker::ClockId tgt;
+      if (j == 0) {
+        std::tie(src, tgt) = std::make_tuple(MONOTONIC, BOOTTIME);
+      } else if (j == 1) {
+        std::tie(src, tgt) = std::make_tuple(MONOTONIC_RAW, BOOTTIME);
+      } else if (j == 2) {
+        std::tie(src, tgt) = std::make_tuple(BOOTTIME, MONOTONIC);
+      } else if (j == 3) {
+        std::tie(src, tgt) = std::make_tuple(BOOTTIME, MONOTONIC_RAW);
+      } else if (j == 4) {
+        std::tie(src, tgt) = std::make_tuple(MONOTONIC_RAW, MONOTONIC);
+      } else {
+        PERFETTO_FATAL("j out of bounds");
+      }
+      // It will still write the cache, just not look it up.
+      ct_.set_cache_lookups_disabled_for_testing(true);
+      auto not_cached = ct_.Convert(src, val, tgt);
+
+      // This should 100% hit the cache.
+      ct_.set_cache_lookups_disabled_for_testing(false);
+      auto cached = ct_.Convert(src, val, tgt);
+
+      ASSERT_EQ(not_cached, cached);
+    }
+  }
+}
+
 }  // namespace
 }  // namespace trace_processor
 }  // namespace perfetto
diff --git a/src/trace_processor/containers/bit_vector.h b/src/trace_processor/containers/bit_vector.h
index 5ef0f74..bfd8b4f 100644
--- a/src/trace_processor/containers/bit_vector.h
+++ b/src/trace_processor/containers/bit_vector.h
@@ -265,6 +265,42 @@
     size_ = size;
   }
 
+  // Creates a BitVector of size |end| with the bits between |start| and |end|
+  // filled by calling the filler function |f(index of bit)|.
+  //
+  // As an example, suppose Range(3, 7, [](x) { return x < 5 }). This would
+  // result in the following bitvector of size 7:
+  // [0 0 0 1 1 0 0]
+  template <typename Filler = bool(uint32_t)>
+  static BitVector Range(uint32_t start, uint32_t end, Filler f) {
+    // Compute the block index and bitvector index where we start and end
+    // working one block at a time.
+    uint32_t start_fast_block = BlockCeil(start);
+    uint32_t start_fast_idx = BlockToIndex(start_fast_block);
+    uint32_t end_fast_block = BlockFloor(end);
+    uint32_t end_fast_idx = BlockToIndex(end_fast_block);
+
+    // First, create the BitVector up to |start|, then fill up to
+    // |start_fast_idx| with values from the filler.
+    BitVector bv(start, false);
+    for (uint32_t i = start; i < start_fast_idx; ++i) {
+      bv.Append(f(i));
+    }
+
+    // At this point we can work one block at a time.
+    for (uint32_t i = start_fast_block; i < end_fast_block; ++i) {
+      bv.counts_.emplace_back(bv.GetNumBitsSet());
+      bv.blocks_.emplace_back(Block::FromFiller(bv.size_, f));
+      bv.size_ += Block::kBits;
+    }
+
+    // Add the last few elements to finish up to |end|.
+    for (uint32_t i = end_fast_idx; i < end; ++i) {
+      bv.Append(f(i));
+    }
+    return bv;
+  }
+
   // Updates the ith set bit of this bitvector with the value of
   // |other.IsSet(i)|.
   //
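To make the fast/slow boundaries concrete, a worked example assuming Block::kBits == 512 (8 words of 64 bits, per the constants below; namespace qualifiers omitted):

```cpp
// BitVector::Range(3, 1000, f):
//   start_fast_block = BlockCeil(3)     = 1  ->  start_fast_idx = 512
//   end_fast_block   = BlockFloor(1000) = 1  ->  end_fast_idx   = 512
// Bits 3..511 are appended one at a time, the block loop runs zero times
// (start_fast_block == end_fast_block) and bits 512..999 form the tail.
BitVector bv =
    BitVector::Range(3, 1000, [](uint32_t i) { return i % 2 == 0; });
```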
@@ -296,6 +332,17 @@
   // }
   SetBitsIterator IterateSetBits() const;
 
+  // Returns the approximate cost (in bytes) of storing a bitvector with size
+  // |n|. This can be used to make decisions about whether using a BitVector is
+  // worthwhile.
+  // This cost should not be treated as exact - it just gives an indication of
+  // the memory needed.
+  static constexpr uint32_t ApproxBytesCost(uint32_t n) {
+    // The two main contributors to a bitvector's size are the cost of the
+    // blocks of bits and the cost of the counts vector.
+    return BlockCeil(n) * Block::kBits + BlockCeil(n) * sizeof(uint32_t);
+  }
+
  private:
   friend class internal::BaseIterator;
   friend class internal::AllBitsIterator;
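Plugging a number into the formula as written (assuming Block::kBits == 512 and 4-byte count entries) gives a feel for the scale:

```cpp
// ApproxBytesCost(1024) == BlockCeil(1024) * 512 + BlockCeil(1024) * 4
//                       == 2 * 512 + 2 * 4 == 1032
static_assert(
    perfetto::trace_processor::BitVector::ApproxBytesCost(1024) == 1032,
    "arithmetic sketch; 512 assumes Block::kBits == 8 * 64");
```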
@@ -325,15 +372,18 @@
     // Returns whether the bit at the given index is set.
     bool IsSet(uint32_t idx) const {
       PERFETTO_DCHECK(idx < kBits);
-      return (word >> idx) & 1ull;
+      return (word_ >> idx) & 1ull;
     }
 
+    // Bitwise ors the given |mask| to the current value.
+    void Or(uint64_t mask) { word_ |= mask; }
+
     // Sets the bit at the given index to true.
     void Set(uint32_t idx) {
       PERFETTO_DCHECK(idx < kBits);
 
       // Or the value for the true shifted up to |idx| with the word.
-      word |= 1ull << idx;
+      Or(1ull << idx);
     }
 
     // Sets the bit at the given index to false.
@@ -341,11 +391,11 @@
       PERFETTO_DCHECK(idx < kBits);
 
       // And the integer of all bits set apart from |idx| with the word.
-      word &= ~(1ull << idx);
+      word_ &= ~(1ull << idx);
     }
 
     // Clears all the bits (i.e. sets the atom to zero).
-    void ClearAll() { word = 0; }
+    void ClearAll() { word_ = 0; }
 
     // Returns the index of the nth set bit.
     // Undefined if |n| >= |GetNumBitsSet()|.
@@ -367,13 +417,13 @@
       //
       // The code below was taken from the paper
       // http://vigna.di.unimi.it/ftp/papers/Broadword.pdf
-      uint64_t s = word - ((word & 0xAAAAAAAAAAAAAAAA) >> 1);
+      uint64_t s = word_ - ((word_ & 0xAAAAAAAAAAAAAAAA) >> 1);
       s = (s & 0x3333333333333333) + ((s >> 2) & 0x3333333333333333);
       s = ((s + (s >> 4)) & 0x0F0F0F0F0F0F0F0F) * L8;
 
       uint64_t b = (BwLessThan(s, n * L8) >> 7) * L8 >> 53 & ~7ull;
       uint64_t l = n - ((s << 8) >> b & 0xFF);
-      s = (BwGtZero(((word >> b & 0xFF) * L8) & 0x8040201008040201) >> 7) * L8;
+      s = (BwGtZero(((word_ >> b & 0xFF) * L8) & 0x8040201008040201) >> 7) * L8;
 
       uint64_t ret = b + ((BwLessThan(s, l * L8) >> 7) * L8 >> 56);
 
@@ -384,7 +434,7 @@
     uint32_t GetNumBitsSet() const {
       // We use __builtin_popcountll here as it's available natively for the two
       // targets we care most about (x64 and WASM).
-      return static_cast<uint32_t>(__builtin_popcountll(word));
+      return static_cast<uint32_t>(__builtin_popcountll(word_));
     }
 
     // Returns the number of set bits up to and including the bit at |idx|.
@@ -400,13 +450,13 @@
     // all bits after this point.
     void ClearAfter(uint32_t idx) {
       PERFETTO_DCHECK(idx < kBits);
-      word = WordUntil(idx);
+      word_ = WordUntil(idx);
     }
 
     // Sets all bits between the bit at |start| and |end| (inclusive).
     void Set(uint32_t start, uint32_t end) {
       uint32_t diff = end - start;
-      word |= (MaskAllBitsSetUntil(diff) << static_cast<uint64_t>(start));
+      word_ |= (MaskAllBitsSetUntil(diff) << static_cast<uint64_t>(start));
     }
 
    private:
@@ -447,7 +497,7 @@
       uint64_t mask = MaskAllBitsSetUntil(idx);
 
       // Finish up by ANDing the atom with the computed mask.
-      return word & mask;
+      return word_ & mask;
     }
 
     // Return a mask of all the bits up to and including bit at |idx|.
@@ -468,7 +518,7 @@
       return top - 1u;
     }
 
-    uint64_t word = 0;
+    uint64_t word_ = 0;
   };
 
   // Represents a group of bits with a bitcount such that it is
@@ -483,7 +533,7 @@
   class Block {
    public:
     // See class documentation for how these constants are chosen.
-    static constexpr uint32_t kWords = 8;
+    static constexpr uint16_t kWords = 8;
     static constexpr uint32_t kBits = kWords * BitWord::kBits;
 
     // Returns whether the bit at the given address is set.
@@ -589,6 +639,24 @@
       words_[end.word_idx].Set(0, end.bit_idx);
     }
 
+    template <typename Filler>
+    static Block FromFiller(uint32_t offset, Filler f) {
+      // We choose to iterate the bits as the outer loop as this allows us
+      // to reuse the mask and the bit offset between iterations of the loop.
+      // This has a small (but noticeable) impact on the performance of this
+      // function.
+      Block b;
+      for (uint32_t i = 0; i < BitWord::kBits; ++i) {
+        uint64_t mask = 1ull << i;
+        uint32_t offset_with_bit = offset + i;
+        for (uint32_t j = 0; j < Block::kWords; ++j) {
+          bool res = f(offset_with_bit + j * BitWord::kBits);
+          b.words_[j].Or(res ? mask : 0);
+        }
+      }
+      return b;
+    }
+
    private:
     std::array<BitWord, kWords> words_{};
   };
@@ -631,6 +699,17 @@
     blocks_[end.block_idx].Set(kFirstBlockOffset, end.block_offset);
   }
 
+  // Helper function to append a bit. Generally, prefer to call AppendTrue
+  // or AppendFalse instead of this function if you know the type - they will
+  // be faster.
+  void Append(bool value) {
+    if (value) {
+      AppendTrue();
+    } else {
+      AppendFalse();
+    }
+  }
+
   static Address IndexToAddress(uint32_t idx) {
     Address a;
     a.block_idx = idx / Block::kBits;
@@ -647,6 +726,29 @@
            addr.block_offset.bit_idx;
   }
 
+  // Rounds |idx| up to the nearest block boundary and returns the block
+  // index. If |idx| is already on a block boundary, the current block is
+  // returned.
+  //
+  // This is useful for finding indices where "fast" algorithms, which work
+  // on entire blocks, can start.
+  static constexpr uint32_t BlockCeil(uint32_t idx) {
+    // Adding |Block::kBits - 1| gives us a quick way to get the ceil. We
+    // do this instead of adding 1 at the end because that gives incorrect
+    // answers for index % Block::kBits == 0.
+    return (idx + Block::kBits - 1) / Block::kBits;
+  }
+
+  // Returns the index of the block which would store |idx|.
+  static constexpr uint32_t BlockFloor(uint32_t idx) {
+    return idx / Block::kBits;
+  }
+
+  // Converts a block index to an index in the BitVector.
+  static constexpr uint32_t BlockToIndex(uint32_t block) {
+    return block * Block::kBits;
+  }
+
   uint32_t size_ = 0;
   std::vector<uint32_t> counts_;
   std::vector<Block> blocks_;
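A standalone sketch of the three rounding helpers, with the assumed Block::kBits of 512 hardcoded, to show the boundary behaviour the comments describe:

```cpp
#include <cstdint>

constexpr uint32_t kBits = 512;  // assumed Block::kBits
constexpr uint32_t BlockCeil(uint32_t idx) { return (idx + kBits - 1) / kBits; }
constexpr uint32_t BlockFloor(uint32_t idx) { return idx / kBits; }
constexpr uint32_t BlockToIndex(uint32_t block) { return block * kBits; }

static_assert(BlockCeil(512) == 1 && BlockCeil(513) == 2, "ceil at boundary");
static_assert(BlockFloor(511) == 0 && BlockFloor(512) == 1, "floor");
static_assert(BlockToIndex(2) == 1024, "block index back to bit index");
```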
diff --git a/src/trace_processor/containers/bit_vector_benchmark.cc b/src/trace_processor/containers/bit_vector_benchmark.cc
index 2deb904..6578e73 100644
--- a/src/trace_processor/containers/bit_vector_benchmark.cc
+++ b/src/trace_processor/containers/bit_vector_benchmark.cc
@@ -211,6 +211,28 @@
 }
 BENCHMARK(BM_BitVectorResize);
 
+static void BM_BitVectorRangeFixedSize(benchmark::State& state) {
+  static constexpr uint32_t kRandomSeed = 42;
+  std::minstd_rand0 rnd_engine(kRandomSeed);
+
+  uint32_t size = static_cast<uint32_t>(state.range(0));
+  uint32_t set_percentage = static_cast<uint32_t>(state.range(1));
+
+  std::vector<uint32_t> resize_fill_pool(size);
+  for (uint32_t i = 0; i < size; ++i) {
+    resize_fill_pool[i] = rnd_engine() % 100 < set_percentage ? 90 : 100;
+  }
+
+  for (auto _ : state) {
+    auto filler = [&resize_fill_pool](uint32_t i) PERFETTO_ALWAYS_INLINE {
+      return resize_fill_pool[i] < 95;
+    };
+    BitVector bv = BitVector::Range(0, size, filler);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_BitVectorRangeFixedSize)->Apply(BitVectorArgs);
+
 static void BM_BitVectorUpdateSetBits(benchmark::State& state) {
   static constexpr uint32_t kRandomSeed = 42;
   std::minstd_rand0 rnd_engine(kRandomSeed);
diff --git a/src/trace_processor/containers/bit_vector_iterators.h b/src/trace_processor/containers/bit_vector_iterators.h
index 0047812..62094ff 100644
--- a/src/trace_processor/containers/bit_vector_iterators.h
+++ b/src/trace_processor/containers/bit_vector_iterators.h
@@ -132,6 +132,16 @@
   // Increments the iterator to point to the next bit.
   void Next() { SetIndex(index() + 1); }
 
+  // Increments the iterator to skip the next |n| bits and point to the
+  // following one.
+  // Precondition: n >= 1 && index() + n <= size().
+  void Skip(uint32_t n) {
+    PERFETTO_DCHECK(n >= 1);
+    PERFETTO_DCHECK(index() + n <= size());
+
+    SetIndex(index() + n);
+  }
+
   // Returns whether the iterator is valid.
   operator bool() const { return index() < size(); }
 };
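A hedged usage sketch, assuming Skip() is exposed on the all-bits iterator and |Visit| is a hypothetical consumer: advance by a known stride in one call instead of via repeated Next():

```cpp
// Visit every 8th bit of |bv|, respecting Skip()'s precondition.
for (auto it = bv.IterateAllBits(); it;) {
  Visit(it.index(), it.IsSet());
  if (it.index() + 8 > bv.size())
    break;
  it.Skip(8);
}
```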
diff --git a/src/trace_processor/containers/bit_vector_unittest.cc b/src/trace_processor/containers/bit_vector_unittest.cc
index b8e7ebb..4a78c2c 100644
--- a/src/trace_processor/containers/bit_vector_unittest.cc
+++ b/src/trace_processor/containers/bit_vector_unittest.cc
@@ -399,6 +399,18 @@
   ASSERT_FALSE(it);
 }
 
+TEST(BitVectorUnittest, Range) {
+  BitVector bv =
+      BitVector::Range(1, 1025, [](uint32_t t) { return t % 3 == 0; });
+
+  ASSERT_FALSE(bv.IsSet(0));
+  for (uint32_t i = 1; i < 1025; ++i) {
+    ASSERT_EQ(i % 3 == 0, bv.IsSet(i));
+  }
+  ASSERT_EQ(bv.size(), 1025u);
+  ASSERT_EQ(bv.GetNumBitsSet(), 341u);
+}
+
 TEST(BitVectorUnittest, QueryStressTest) {
   BitVector bv;
   std::vector<bool> bool_vec;
diff --git a/src/trace_processor/containers/row_map.h b/src/trace_processor/containers/row_map.h
index a053245..8cf91f1 100644
--- a/src/trace_processor/containers/row_map.h
+++ b/src/trace_processor/containers/row_map.h
@@ -250,11 +250,11 @@
     PERFETTO_DCHECK(idx < size());
     switch (mode_) {
       case Mode::kRange:
-        return start_idx_ + idx;
+        return GetRange(idx);
       case Mode::kBitVector:
-        return bit_vector_.IndexOfNthSet(idx);
+        return GetBitVector(idx);
       case Mode::kIndexVector:
-        return index_vector_[idx];
+        return GetIndexVector(idx);
     }
     PERFETTO_FATAL("For GCC");
   }
@@ -395,7 +395,7 @@
     }
 
     // TODO(lalitm): improve efficiency of this if we end up needing it.
-    RemoveIf([&other](uint32_t row) { return !other.Contains(row); });
+    Filter([&other](uint32_t row) { return other.Contains(row); });
   }
 
   // Filters the current RowMap into the RowMap given by |out| based on the
@@ -441,41 +441,52 @@
     // cases where |out| has only a few entries so we can scan |out| instead of
     // scanning |this|.
 
-    // TODO(lalit): investigate whether we should also scan |out| if |this| is
-    // a range or index vector as, in those cases, it would be fast to lookup
-    // |this| by index.
-
-    // We choose to full scan |this| rather than |out| as the performance
-    // penalty of incorrectly scanning |out| is much worse than mistakely
-    // scanning |this|.
-    // This is because scans on |out| involve an indexed lookup on |this| which
-    // (in the case of a bitvector) can be very expensive. On the other hand,
-    // scanning |this| means we never have to do indexed lookups but we may
-    // scan many more rows than necessary (as they may have already been
-    // excluded in out).
-    FilterIntoScanSelf(out, p);
-  }
-
-  template <typename Comparator>
-  void StableSort(std::vector<uint32_t>* out, Comparator c) const {
+    // Ideally, we'd always just scan the rows in |out| and keep those which
+    // meet |p|. However, if |this| is a BitVector, we end up needing expensive
+    // |IndexOfNthSet| calls (as we need to look up the row before passing it to
+    // |p|).
     switch (mode_) {
       case Mode::kRange: {
-        StableSort(out, c, [this](uint32_t off) { return start_idx_ + off; });
+        auto ip = [this, p](uint32_t idx) { return p(GetRange(idx)); };
+        out->Filter(ip);
         break;
       }
       case Mode::kBitVector: {
-        StableSort(out, c, [this](uint32_t off) {
-          return bit_vector_.IndexOfNthSet(off);
-        });
+        FilterIntoScanSelfBv(out, p);
         break;
       }
       case Mode::kIndexVector: {
-        StableSort(out, c, [this](uint32_t off) { return index_vector_[off]; });
+        auto ip = [this, p](uint32_t row) { return p(GetIndexVector(row)); };
+        out->Filter(ip);
         break;
       }
     }
   }
 
+  template <typename Comparator = bool(uint32_t, uint32_t)>
+  void StableSort(std::vector<uint32_t>* out, Comparator c) const {
+    switch (mode_) {
+      case Mode::kRange:
+        std::stable_sort(out->begin(), out->end(),
+                         [this, c](uint32_t a, uint32_t b) {
+                           return c(GetRange(a), GetRange(b));
+                         });
+        break;
+      case Mode::kBitVector:
+        std::stable_sort(out->begin(), out->end(),
+                         [this, c](uint32_t a, uint32_t b) {
+                           return c(GetBitVector(a), GetBitVector(b));
+                         });
+        break;
+      case Mode::kIndexVector:
+        std::stable_sort(out->begin(), out->end(),
+                         [this, c](uint32_t a, uint32_t b) {
+                           return c(GetIndexVector(a), GetIndexVector(b));
+                         });
+        break;
+    }
+  }
+
   // Returns the iterator over the rows in this RowMap.
   Iterator IterateRows() const { return Iterator(this); }
 
@@ -489,29 +500,35 @@
     kIndexVector,
   };
 
-  // Filters the current RowMap into |out| by performing a full scan on |this|.
-  // See |FilterInto| for a full breakdown of the semantics of this function.
+  // Filters the indices in |out| by keeping those which meet |p|.
   template <typename Predicate>
-  void FilterIntoScanSelf(RowMap* out, Predicate p) const {
+  void Filter(Predicate p) {
     switch (mode_) {
       case Mode::kRange:
-        FilterIntoScanSelf(out, RangeIterator(this), p);
+        FilterRange(p);
         break;
-      case Mode::kBitVector:
-        FilterIntoScanSelf(out, bit_vector_.IterateSetBits(), p);
+      case Mode::kBitVector: {
+        for (auto it = bit_vector_.IterateSetBits(); it; it.Next()) {
+          if (!p(it.index()))
+            it.Clear();
+        }
         break;
-      case Mode::kIndexVector:
-        FilterIntoScanSelf(out, IndexVectorIterator(this), p);
+      }
+      case Mode::kIndexVector: {
+        auto ret = std::remove_if(index_vector_.begin(), index_vector_.end(),
+                                  [p](uint32_t i) { return !p(i); });
+        index_vector_.erase(ret, index_vector_.end());
         break;
+      }
     }
   }
 
   // Filters the current RowMap into |out| by performing a full scan on |this|
-  // using the |it|, a strongly typed iterator on |this| (a strongly typed
-  // iterator is used for performance reasons).
+  // where |this| is a BitVector.
   // See |FilterInto| for a full breakdown of the semantics of this function.
-  template <typename Iterator, typename Predicate>
-  void FilterIntoScanSelf(RowMap* out, Iterator it, Predicate p) const {
+  template <typename Predicate>
+  void FilterIntoScanSelfBv(RowMap* out, Predicate p) const {
+    auto it = bit_vector_.IterateSetBits();
     switch (out->mode_) {
       case Mode::kRange: {
         // TODO(lalitm): investigate whether we can reuse the data inside
@@ -559,6 +576,53 @@
     }
   }
 
+  template <typename Predicate>
+  void FilterRange(Predicate p) {
+    uint32_t count = end_idx_ - start_idx_;
+
+    // Optimization: if we are only going to scan a few rows, it's not
+    // worth the hassle of working with a BitVector.
+    constexpr uint32_t kSmallRangeLimit = 2048;
+    bool is_small_range = count < kSmallRangeLimit;
+
+    // Optimization: if the cost of a BitVector is more than the highest
+    // possible cost an index vector could have, use the index vector.
+    uint32_t bit_vector_cost = BitVector::ApproxBytesCost(end_idx_);
+    uint32_t index_vector_cost_ub = sizeof(uint32_t) * count;
+
+    // If either of the conditions hold which make it better to use an
+    // index vector, use it instead.
+    if (is_small_range || index_vector_cost_ub <= bit_vector_cost) {
+      // Try to strike a good balance between not making the vector too
+      // big and good performance.
+      std::vector<uint32_t> iv(std::min(kSmallRangeLimit, count));
+
+      uint32_t out_idx = 0;
+      for (uint32_t i = 0; i < count; ++i) {
+        // If we reach the capacity add another small set of indices.
+        if (PERFETTO_UNLIKELY(out_idx == iv.size()))
+          iv.resize(iv.size() + kSmallRangeLimit);
+
+        // We keep this branch-free by always writing the index but only
+        // incrementing the out index if the return value is true.
+        bool value = p(i + start_idx_);
+        iv[out_idx] = i + start_idx_;
+        out_idx += value;
+      }
+
+      // Make the vector the correct size and as small as possible.
+      iv.resize(out_idx);
+      iv.shrink_to_fit();
+
+      *this = RowMap(std::move(iv));
+      return;
+    }
+
+    // Otherwise, create a bitvector which spans the full range using
+    // |p| as the filler for the bits between start and end.
+    *this = RowMap(BitVector::Range(start_idx_, end_idx_, p));
+  }
+
   void InsertIntoBitVector(uint32_t row) {
     PERFETTO_DCHECK(mode_ == Mode::kBitVector);
 
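A back-of-the-envelope check of the cost comparison above, mirroring the ApproxBytesCost formula from bit_vector.h (Block::kBits assumed 512):

```cpp
#include <cstdint>
#include <cstdio>

constexpr uint32_t ApproxBytesCost(uint32_t n) {
  uint32_t blocks = (n + 511) / 512;
  return blocks * 512 + blocks * 4;
}

int main() {
  // [99000, 100000): count = 1000 < kSmallRangeLimit, so an index vector is
  // chosen regardless of cost. For [0, 1 << 20) the index-vector upper bound
  // is ~4x the bitvector cost, so BitVector::Range() wins.
  uint32_t end = 1u << 20;
  std::printf("bv=%u bytes, iv<=%u bytes\n", ApproxBytesCost(end), 4u * end);
}
```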
@@ -567,41 +631,17 @@
     bit_vector_.Set(row);
   }
 
-  // Removes any row where |p(row)| returns false from this RowMap.
-  template <typename Predicate>
-  void RemoveIf(Predicate p) {
-    switch (mode_) {
-      case Mode::kRange: {
-        bit_vector_.Resize(start_idx_, false);
-        for (uint32_t i = start_idx_; i < end_idx_; ++i) {
-          if (p(i))
-            bit_vector_.AppendFalse();
-          else
-            bit_vector_.AppendTrue();
-        }
-        *this = RowMap(std::move(bit_vector_));
-        break;
-      }
-      case Mode::kBitVector: {
-        for (auto it = bit_vector_.IterateSetBits(); it; it.Next()) {
-          if (p(it.index()))
-            it.Clear();
-        }
-        break;
-      }
-      case Mode::kIndexVector: {
-        auto it = std::remove_if(index_vector_.begin(), index_vector_.end(), p);
-        index_vector_.erase(it, index_vector_.end());
-        break;
-      }
-    }
+  PERFETTO_ALWAYS_INLINE uint32_t GetRange(uint32_t idx) const {
+    PERFETTO_DCHECK(mode_ == Mode::kRange);
+    return start_idx_ + idx;
   }
-
-  template <typename Comparator, typename Indexer>
-  void StableSort(std::vector<uint32_t>* out, Comparator c, Indexer i) const {
-    std::stable_sort(
-        out->begin(), out->end(),
-        [&c, &i](uint32_t a, uint32_t b) { return c(i(a), i(b)); });
+  PERFETTO_ALWAYS_INLINE uint32_t GetBitVector(uint32_t idx) const {
+    PERFETTO_DCHECK(mode_ == Mode::kBitVector);
+    return bit_vector_.IndexOfNthSet(idx);
+  }
+  PERFETTO_ALWAYS_INLINE uint32_t GetIndexVector(uint32_t idx) const {
+    PERFETTO_DCHECK(mode_ == Mode::kIndexVector);
+    return index_vector_[idx];
   }
 
   RowMap SelectRowsSlow(const RowMap& selector) const;
diff --git a/src/trace_processor/containers/row_map_unittest.cc b/src/trace_processor/containers/row_map_unittest.cc
index bd92ae0..fde10d5 100644
--- a/src/trace_processor/containers/row_map_unittest.cc
+++ b/src/trace_processor/containers/row_map_unittest.cc
@@ -348,6 +348,26 @@
   ASSERT_EQ(filter.Get(1u), 5u);
 }
 
+TEST(RowMapUnittest, FilterIntoOffsetRangeWithRange) {
+  RowMap rm(100000, 100010);
+  RowMap filter(4, 7);
+  rm.FilterInto(&filter, [](uint32_t row) { return row == 100004u; });
+
+  ASSERT_EQ(filter.size(), 1u);
+  ASSERT_EQ(filter.Get(0u), 4u);
+}
+
+TEST(RowMapUnittest, FilterIntoLargeRangeWithRange) {
+  RowMap rm(0, 100000);
+  RowMap filter(0, 100000);
+  rm.FilterInto(&filter, [](uint32_t row) { return row % 2 == 0; });
+
+  ASSERT_EQ(filter.size(), 100000u / 2);
+  for (uint32_t i = 0; i < 100000 / 2; ++i) {
+    ASSERT_EQ(filter.Get(i), i * 2);
+  }
+}
+
 TEST(RowMapUnittest, FilterIntoBitVectorWithRange) {
   RowMap rm(
       BitVector{true, false, false, true, false, true, false, true, true});
diff --git a/src/trace_processor/db/BUILD.gn b/src/trace_processor/db/BUILD.gn
index 8260e3e..5e15f66 100644
--- a/src/trace_processor/db/BUILD.gn
+++ b/src/trace_processor/db/BUILD.gn
@@ -22,6 +22,7 @@
     "table.cc",
     "table.h",
     "typed_column.h",
+    "typed_column_internal.h",
   ]
   deps = [
     "../../../gn:default_deps",
diff --git a/src/trace_processor/db/column.cc b/src/trace_processor/db/column.cc
index fedfa7c..d8a88b2 100644
--- a/src/trace_processor/db/column.cc
+++ b/src/trace_processor/db/column.cc
@@ -454,8 +454,8 @@
     auto a_val = sv.GetNonNull(a_idx);
     auto b_val = sv.GetNonNull(b_idx);
 
-    int res = compare::Numeric(a_val, b_val);
-    return desc ? res > 0 : res < 0;
+    return desc ? compare::Numeric(a_val, b_val) > 0
+                : compare::Numeric(a_val, b_val) < 0;
   });
 }
 
diff --git a/src/trace_processor/db/column.h b/src/trace_processor/db/column.h
index 17ebe2d..7321502 100644
--- a/src/trace_processor/db/column.h
+++ b/src/trace_processor/db/column.h
@@ -309,13 +309,6 @@
   JoinKey join_key() const { return JoinKey{col_idx_in_table_}; }
 
  protected:
-  // Returns the string at the index |idx|.
-  // Should only be called when |type_| == ColumnType::kString.
-  NullTermStringView GetStringPoolStringAtIdx(uint32_t idx) const {
-    PERFETTO_DCHECK(type_ == ColumnType::kString);
-    return string_pool_->Get(sparse_vector<StringPool::Id>().GetNonNull(idx));
-  }
-
   // Returns the backing sparse vector cast to contain data of type T.
   // Should only be called when |type_| == ToColumnType<T>().
   template <typename T>
@@ -332,17 +325,6 @@
     return *static_cast<const SparseVector<T>*>(sparse_vector_);
   }
 
-  // Converts a primitive numeric value to an SqlValue of the correct type.
-  template <typename T>
-  static SqlValue NumericToSqlValue(T value) {
-    if (std::is_same<T, double>::value) {
-      return SqlValue::Double(value);
-    } else if (std::is_convertible<T, int64_t>::value) {
-      return SqlValue::Long(value);
-    }
-    PERFETTO_FATAL("Invalid type");
-  }
-
   const StringPool& string_pool() const { return *string_pool_; }
 
  private:
@@ -541,6 +523,13 @@
     }
   }
 
+  // Returns the string at the index |idx|.
+  // Should only be called when |type_| == ColumnType::kString.
+  NullTermStringView GetStringPoolStringAtIdx(uint32_t idx) const {
+    PERFETTO_DCHECK(type_ == ColumnType::kString);
+    return string_pool_->Get(sparse_vector<StringPool::Id>().GetNonNull(idx));
+  }
+
   // type_ is used to cast sparse_vector_ to the correct type.
   ColumnType type_ = ColumnType::kInt64;
   void* sparse_vector_ = nullptr;
diff --git a/src/trace_processor/db/typed_column.h b/src/trace_processor/db/typed_column.h
index 5be1dcc..272d465 100644
--- a/src/trace_processor/db/typed_column.h
+++ b/src/trace_processor/db/typed_column.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 The Android Open Source Project
+ * Copyright (C) 2020 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,11 +18,140 @@
 #define SRC_TRACE_PROCESSOR_DB_TYPED_COLUMN_H_
 
 #include "src/trace_processor/db/column.h"
+#include "src/trace_processor/db/typed_column_internal.h"
 
 namespace perfetto {
 namespace trace_processor {
 
+// TypedColumn<T>
+//
+// Introduction:
+// TypedColumn exists to allow efficient access to the data in a Column without
+// having to go through dynamic type checking. There are two main reasons for
+// this:
+// 1. Performance: dynamic type checking is not free and so if this is used
+//    in a particularly hot codepath, the typechecking can be a significant
+//    overhead.
+// 2. Ergonomics: having to convert back and forth from/to SqlValue causes
+//    significant clutter in parts of the code which can already be quite hard
+//    to follow (e.g. trackers like StackProfileTracker which perform cross
+//    checking of various ids).
+//
+// Implementation:
+// TypedColumn is implemented as a memberless subclass of Column. This allows
+// us to reinterpret cast from a Column* to a TypedColumn<T> where we know the
+// type T. The methods of TypedColumn are type-specialized methods of Column
+// which allow callers to pass raw types instead of using SqlValue.
+//
+// There are two helper classes (tc_internal::TypeHandler and
+// tc_internal::Serializer) where we specialize behaviour which needs to be
+// different based on T. See their class documentation and below for details
+// on their purpose.
+template <typename T>
+struct TypedColumn : public Column {
+ private:
+  using TH = tc_internal::TypeHandler<T>;
+
+  // The non-optional type of the data in this column.
+  using non_optional_type =
+      typename tc_internal::TypeHandler<T>::non_optional_type;
+
+  // The type of data in this column (including Optional wrapper if the type
+  // should be optional).
+  using get_type = typename tc_internal::TypeHandler<T>::get_type;
+
+  // The type which should be passed to SqlValue functions.
+  using sql_value_type = typename tc_internal::TypeHandler<T>::sql_value_type;
+
+  using Serializer = tc_internal::Serializer<non_optional_type>;
+
+ public:
+  // The type which should be stored in the SparseVector.
+  // Used by the macro code when actually constructing the SparseVectors.
+  using serialized_type = typename Serializer::serialized_type;
+
+  // Returns the data in the column at index |row|.
+  // Function chosen when TH::is_optional == true.
+  template <bool is_optional = TH::is_optional>
+  typename std::enable_if<is_optional, get_type>::type operator[](
+      uint32_t row) const {
+    return Serializer::Deserialize(sparse_vector().Get(row_map().Get(row)));
+  }
+
+  // Function chosen when TH::is_optional == false.
+  template <bool is_optional = TH::is_optional>
+  typename std::enable_if<!is_optional, get_type>::type operator[](
+      uint32_t row) const {
+    return Serializer::Deserialize(
+        sparse_vector().GetNonNull(row_map().Get(row)));
+  }
+
+  // Special function only for string types to allow retrieving the string
+  // directly from the column.
+  template <bool is_string = TH::is_string>
+  typename std::enable_if<is_string, NullTermStringView>::type GetString(
+      uint32_t row) const {
+    return string_pool().Get((*this)[row]);
+  }
+
+  // Sets the data in the column at index |row|.
+  void Set(uint32_t row, non_optional_type v) {
+    auto serialized = Serializer::Serialize(v);
+    mutable_sparse_vector()->Set(row_map().Get(row), serialized);
+  }
+
+  // Inserts the value at the end of the column.
+  void Append(T v) {
+    mutable_sparse_vector()->Append(Serializer::Serialize(v));
+  }
+
+  // Returns the row containing the given value in the Column.
+  base::Optional<uint32_t> IndexOf(sql_value_type v) const {
+    return Column::IndexOf(ToValue(v));
+  }
+
+  std::vector<get_type> ToVectorForTesting() const {
+    std::vector<get_type> result(row_map().size());
+    for (uint32_t i = 0; i < row_map().size(); ++i)
+      result[i] = (*this)[i];
+    return result;
+  }
+
+  // Helper functions to create constraints for the given value.
+  Constraint eq(sql_value_type v) const { return eq_value(ToValue(v)); }
+  Constraint gt(sql_value_type v) const { return gt_value(ToValue(v)); }
+  Constraint lt(sql_value_type v) const { return lt_value(ToValue(v)); }
+  Constraint ne(sql_value_type v) const { return ne_value(ToValue(v)); }
+  Constraint ge(sql_value_type v) const { return ge_value(ToValue(v)); }
+  Constraint le(sql_value_type v) const { return le_value(ToValue(v)); }
+
+  // Implements equality between two items of type |T|.
+  static constexpr bool Equals(T a, T b) { return TH::Equals(a, b); }
+
+  // Encodes the default flags for a column of the current type.
+  static constexpr uint32_t default_flags() {
+    return TH::is_optional ? Flag::kNoFlag : Flag::kNonNull;
+  }
+
+ private:
+  static SqlValue ToValue(double value) { return SqlValue::Double(value); }
+  static SqlValue ToValue(uint32_t value) { return SqlValue::Long(value); }
+  static SqlValue ToValue(int64_t value) { return SqlValue::Long(value); }
+  static SqlValue ToValue(NullTermStringView value) {
+    return SqlValue::String(value.c_str());
+  }
+
+  const SparseVector<serialized_type>& sparse_vector() const {
+    return Column::sparse_vector<serialized_type>();
+  }
+  SparseVector<serialized_type>* mutable_sparse_vector() {
+    return Column::mutable_sparse_vector<serialized_type>();
+  }
+};
+
 // Represents a column containing ids.
+// TODO(lalitm): think about unifying this with TypedColumn in the
+// future.
 template <typename Id>
 struct IdColumn : public Column {
   Id operator[](uint32_t row) const { return Id(row_map().Get(row)); }
@@ -39,182 +168,6 @@
   Constraint le(uint32_t v) const { return le_value(SqlValue::Long(v)); }
 };
 
-// Represents a column containing data with the given type T.
-//
-// This class exists as a memberless subclass of Column (i.e. sizeof(Column) ==
-// sizeof(TypedColumn<T>)); this is because Columns are type erased but we still
-// want low boilerplate methods to get/set rows in columns where we know the
-// type.
-template <typename T>
-struct TypedColumn : public Column {
-  using StoredType = T;
-
-  // Returns the data in the column at index |row|.
-  T operator[](uint32_t row) const {
-    return sparse_vector<T>().GetNonNull(row_map().Get(row));
-  }
-
-  // Sets the data in the column at index |row|.
-  void Set(uint32_t row, T v) {
-    mutable_sparse_vector<T>()->Set(row_map().Get(row), v);
-  }
-
-  // Inserts the value at the end of the column.
-  void Append(T v) { mutable_sparse_vector<T>()->Append(v); }
-
-  // Returns the row containing the given value in the Column.
-  base::Optional<uint32_t> IndexOf(T v) const {
-    return Column::IndexOf(NumericToSqlValue(v));
-  }
-
-  std::vector<T> ToVectorForTesting() const {
-    std::vector<T> result(row_map().size());
-    for (uint32_t i = 0; i < row_map().size(); ++i)
-      result[i] = (*this)[i];
-    return result;
-  }
-
-  // Implements equality between two items of type |T|.
-  static bool Equals(T a, T b) {
-    // We need to use equal_to here as it could be T == double and because we
-    // enable all compile time warnings, we will get complaints if we just use
-    // a == b.
-    return std::equal_to<T>()(a, b);
-  }
-
-  // Helper functions to create constraints for the given value.
-  Constraint eq(T v) const { return eq_value(NumericToSqlValue(v)); }
-  Constraint gt(T v) const { return gt_value(NumericToSqlValue(v)); }
-  Constraint lt(T v) const { return lt_value(NumericToSqlValue(v)); }
-  Constraint ne(T v) const { return ne_value(NumericToSqlValue(v)); }
-  Constraint ge(T v) const { return ge_value(NumericToSqlValue(v)); }
-  Constraint le(T v) const { return le_value(NumericToSqlValue(v)); }
-
-  // Encodes the default flags for a column of the current type.
-  static constexpr uint32_t default_flags() { return Flag::kNonNull; }
-};
-
-template <typename T>
-struct TypedColumn<base::Optional<T>> : public Column {
-  using StoredType = T;
-
-  // Returns the data in the column at index |row|.
-  base::Optional<T> operator[](uint32_t row) const {
-    return sparse_vector<T>().Get(row_map().Get(row));
-  }
-
-  // Sets the data in the column at index |row|.
-  void Set(uint32_t row, T v) {
-    mutable_sparse_vector<T>()->Set(row_map().Get(row), v);
-  }
-
-  // Inserts the value at the end of the column.
-  void Append(base::Optional<T> v) { mutable_sparse_vector<T>()->Append(v); }
-
-  std::vector<base::Optional<T>> ToVectorForTesting() const {
-    std::vector<T> result(row_map().size());
-    for (uint32_t i = 0; i < row_map().size(); ++i)
-      result[i] = (*this)[i];
-    return result;
-  }
-
-  // Implements equality between two items of type |T|.
-  static bool Equals(base::Optional<T> a, base::Optional<T> b) {
-    // We need to use equal_to here as it could be T == double and because we
-    // enable all compile time warnings, we will get complaints if we just use
-    // a == b. This is the same reason why we can't also just use equal_to using
-    // a and b directly because the optional implementation of equality uses
-    // == which again causes complaints.
-    return a.has_value() == b.has_value() &&
-           (!a.has_value() || std::equal_to<T>()(*a, *b));
-  }
-
-  // Helper functions to create constraints for the given value.
-  Constraint eq(T v) const { return eq_value(NumericToSqlValue(v)); }
-  Constraint gt(T v) const { return gt_value(NumericToSqlValue(v)); }
-  Constraint lt(T v) const { return lt_value(NumericToSqlValue(v)); }
-  Constraint ne(T v) const { return ne_value(NumericToSqlValue(v)); }
-  Constraint ge(T v) const { return ge_value(NumericToSqlValue(v)); }
-  Constraint le(T v) const { return le_value(NumericToSqlValue(v)); }
-
-  // Encodes the default flags for a column of the current type.
-  static constexpr uint32_t default_flags() { return Flag::kNoFlag; }
-};
-
-template <>
-struct TypedColumn<StringPool::Id> : public Column {
-  using StoredType = StringPool::Id;
-
-  // Returns the data in the column at index |row|.
-  StringPool::Id operator[](uint32_t row) const {
-    return sparse_vector<StringPool::Id>().GetNonNull(row_map().Get(row));
-  }
-
-  // Returns the string in the column by looking up the id at |row| in the
-  // StringPool.
-  NullTermStringView GetString(uint32_t row) const {
-    return GetStringPoolStringAtIdx(row_map().Get(row));
-  }
-
-  // Sets the data in the column at index |row|.
-  void Set(uint32_t row, StringPool::Id v) {
-    mutable_sparse_vector<StringPool::Id>()->Set(row_map().Get(row), v);
-  }
-
-  // Inserts the value at the end of the column.
-  void Append(StringPool::Id v) {
-    mutable_sparse_vector<StringPool::Id>()->Append(v);
-  }
-
-  // Returns the row containing the given value in the Column.
-  base::Optional<uint32_t> IndexOf(StringPool::Id v) const {
-    return Column::IndexOf(SqlValue::String(string_pool().Get(v).c_str()));
-  }
-
-  // Returns the row containing the given value in the Column.
-  base::Optional<uint32_t> IndexOf(NullTermStringView v) const {
-    return Column::IndexOf(SqlValue::String(v.c_str()));
-  }
-
-  // Implements equality between two items of type |T|.
-  static bool Equals(StringPool::Id a, StringPool::Id b) { return a == b; }
-
-  // Helper functions to create constraints for the given value.
-  Constraint eq(const char* v) const { return eq_value(SqlValue::String(v)); }
-  Constraint gt(const char* v) const { return gt_value(SqlValue::String(v)); }
-  Constraint lt(const char* v) const { return lt_value(SqlValue::String(v)); }
-  Constraint ne(const char* v) const { return ne_value(SqlValue::String(v)); }
-  Constraint ge(const char* v) const { return ge_value(SqlValue::String(v)); }
-  Constraint le(const char* v) const { return le_value(SqlValue::String(v)); }
-
-  static constexpr uint32_t default_flags() { return Flag::kNonNull; }
-};
-
-template <>
-struct TypedColumn<base::Optional<StringPool::Id>>
-    : public TypedColumn<StringPool::Id> {
-  // Inserts the value at the end of the column.
-  void Append(base::Optional<StringPool::Id> v) {
-    // Since StringPool::Id == 0 is always treated as null, rewrite
-    // base::nullopt -> 0 to remove an extra check at filter time for
-    // base::nullopt. Instead, that code can assume that the SparseVector
-    // layer always returns a valid id and can handle the nullability at the
-    // stringpool level.
-    // TODO(lalitm): remove this special casing if we migrate all tables over
-    // to macro tables and find that we can remove support for null stringids
-    // in the stringpool.
-    return TypedColumn<StringPool::Id>::Append(v ? *v : StringPool::Id::Null());
-  }
-
-  // Implements equality between two items of type |T|.
-  static bool Equals(base::Optional<StringPool::Id> a,
-                     base::Optional<StringPool::Id> b) {
-    return a == b;
-  }
-
-  static constexpr uint32_t default_flags() { return Flag::kNonNull; }
-};
-
 }  // namespace trace_processor
 }  // namespace perfetto
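
The enable_if trick in TypedColumn::operator[] above deserves a note: the
member is itself a template with a defaulted bool parameter, which is what
lets SFINAE discard the non-matching overload for each instantiation. A
minimal standalone sketch of the pattern (Opt, Handler and Col are
illustrative names, not part of this patch):

#include <iostream>
#include <type_traits>

template <typename T>
struct Opt {};  // stand-in for base::Optional

// Primary handler: non-optional types.
template <typename T>
struct Handler {
  static constexpr bool is_optional = false;
};

// Specialization: optional types.
template <typename T>
struct Handler<Opt<T>> {
  static constexpr bool is_optional = true;
};

template <typename T>
struct Col {
  // Making the member a template (with a defaulted parameter) is what
  // allows enable_if to remove the non-matching overload via SFINAE.
  template <bool O = Handler<T>::is_optional>
  typename std::enable_if<O, const char*>::type Get() const {
    return "optional path";
  }
  template <bool O = Handler<T>::is_optional>
  typename std::enable_if<!O, const char*>::type Get() const {
    return "non-optional path";
  }
};

int main() {
  std::cout << Col<int>().Get() << "\n";       // non-optional path
  std::cout << Col<Opt<int>>().Get() << "\n";  // optional path
}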
 
diff --git a/src/trace_processor/db/typed_column_internal.h b/src/trace_processor/db/typed_column_internal.h
new file mode 100644
index 0000000..181a3a2
--- /dev/null
+++ b/src/trace_processor/db/typed_column_internal.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_TRACE_PROCESSOR_DB_TYPED_COLUMN_INTERNAL_H_
+#define SRC_TRACE_PROCESSOR_DB_TYPED_COLUMN_INTERNAL_H_
+
+#include "src/trace_processor/db/column.h"
+
+namespace perfetto {
+namespace trace_processor {
+namespace tc_internal {
+
+// Serializer converts between the "public" type used by the rest of trace
+// processor and the type we store in the SparseVector.
+template <typename T, typename Enabled = void>
+struct Serializer {
+  using serialized_type = T;
+
+  static serialized_type Serialize(T value) { return value; }
+  static T Deserialize(serialized_type value) { return value; }
+
+  static base::Optional<serialized_type> Serialize(base::Optional<T> value) {
+    return value;
+  }
+  static base::Optional<T> Deserialize(base::Optional<serialized_type> value) {
+    return value;
+  }
+};
+
+// Specialization of Serializer for StringPool::Id types.
+template <>
+struct Serializer<StringPool::Id> {
+  using serialized_type = StringPool::Id;
+
+  static serialized_type Serialize(StringPool::Id value) { return value; }
+  static StringPool::Id Deserialize(serialized_type value) { return value; }
+
+  static serialized_type Serialize(base::Optional<StringPool::Id> value) {
+    // Since StringPool::Id == 0 is always treated as null, rewrite
+    // base::nullopt -> 0 to remove an extra check at filter time for
+    // base::nullopt. Instead, that code can assume that the SparseVector
+    // layer always returns a valid id and can handle the nullability at the
+    // stringpool level.
+    // TODO(lalitm): remove this special casing if we migrate all tables over
+    // to macro tables and find that we can remove support for null stringids
+    // in the stringpool.
+    return value ? Serialize(*value) : StringPool::Id::Null();
+  }
+  static base::Optional<serialized_type> Deserialize(
+      base::Optional<StringPool::Id>) {
+    PERFETTO_FATAL("Should never be storing optional StringPool ids");
+  }
+};
+
+// TypeHandler (and its specializations) allows for specialized handling of
+// functions of a TypedColumn based on what is being stored inside.
+// Default implementation of TypeHandler.
+template <typename T, typename Enable = void>
+struct TypeHandler {
+  using non_optional_type = T;
+  using get_type = T;
+  using sql_value_type = T;
+
+  static constexpr bool is_optional = false;
+  static constexpr bool is_string = false;
+
+  static bool Equals(T a, T b) {
+    // We need to use equal_to here as it could be T == double and because we
+    // enable all compile time warnings, we will get complaints if we just use
+    // a == b.
+    return std::equal_to<T>()(a, b);
+  }
+};
+
+// Specialization for Optional types.
+template <typename T>
+struct TypeHandler<base::Optional<T>> {
+  using non_optional_type = T;
+  using get_type = base::Optional<T>;
+  using sql_value_type = T;
+
+  static constexpr bool is_optional = true;
+  static constexpr bool is_string = false;
+
+  static bool Equals(base::Optional<T> a, base::Optional<T> b) {
+    // We need to use equal_to here as it could be T == double and because we
+    // enable all compile time warnings, we will get complaints if we just use
+    // a == b. This is the same reason why we can't also just use equal_to using
+    // a and b directly because the optional implementation of equality uses
+    // == which again causes complaints.
+    return a.has_value() == b.has_value() &&
+           (!a.has_value() || std::equal_to<T>()(*a, *b));
+  }
+};
+
+// Specialization for StringPool::Id types.
+template <>
+struct TypeHandler<StringPool::Id> {
+  // Strings are stored as StringPool ids but surfaced to SQL as
+  // NullTermStringView (see sql_value_type below).
+  using non_optional_type = StringPool::Id;
+  using get_type = StringPool::Id;
+  using sql_value_type = NullTermStringView;
+
+  static constexpr bool is_optional = false;
+  static constexpr bool is_string = true;
+
+  static bool Equals(StringPool::Id a, StringPool::Id b) { return a == b; }
+};
+
+// Specialization for Optional<StringId> types.
+template <>
+struct TypeHandler<base::Optional<StringPool::Id>> {
+  // get_type removes the base::Optional since we convert base::nullopt ->
+  // StringPool::Id::Null (see Serializer<StringPool::Id> above).
+  using non_optional_type = StringPool::Id;
+  using get_type = StringPool::Id;
+  using sql_value_type = NullTermStringView;
+
+  // is_optional is false again because we always unwrap
+  // base::Optional<StringPool::Id> into StringPool::Id.
+  static constexpr bool is_optional = false;
+  static constexpr bool is_string = true;
+
+  static bool Equals(base::Optional<StringPool::Id> a,
+                     base::Optional<StringPool::Id> b) {
+    // To match our handling of treating base::nullopt ==
+    // StringPool::Id::Null(), ensure that they both compare equal to each
+    // other.
+    return a == b || (!a && b->is_null()) || (!b && a->is_null());
+  }
+};
+
+}  // namespace tc_internal
+}  // namespace trace_processor
+}  // namespace perfetto
+
+#endif  // SRC_TRACE_PROCESSOR_DB_TYPED_COLUMN_INTERNAL_H_
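
The Serializer<StringPool::Id> specialization above folds base::nullopt into
the pool's reserved null id at write time, so readers never pay for an
optional check. A minimal sketch of that idea, using std::optional as a
stand-in for base::Optional and an invented Id type:

#include <cassert>
#include <optional>

// Stand-in for StringPool::Id, where raw id 0 is the reserved null string.
struct Id {
  unsigned raw = 0;
  bool is_null() const { return raw == 0; }
  static Id Null() { return Id{}; }
};

// Mirrors Serializer<StringPool::Id>::Serialize above: nullopt and
// Id::Null() collapse to the same stored value.
Id Serialize(std::optional<Id> value) {
  return value ? *value : Id::Null();
}

int main() {
  assert(Serialize(std::nullopt).is_null());
  assert(!Serialize(Id{42}).is_null());
}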
diff --git a/src/trace_processor/export_json.cc b/src/trace_processor/export_json.cc
index 1e78477..42370f7 100644
--- a/src/trace_processor/export_json.cc
+++ b/src/trace_processor/export_json.cc
@@ -33,6 +33,7 @@
 #include <limits>
 
 #include "perfetto/ext/base/string_splitter.h"
+#include "perfetto/ext/base/string_utils.h"
 #include "src/trace_processor/metadata.h"
 #include "src/trace_processor/trace_processor_context.h"
 #include "src/trace_processor/trace_processor_storage_impl.h"
@@ -592,7 +593,10 @@
         } else {  // A list item
           target = &(*target)[key_part.substr(0, bracketpos)];
           while (bracketpos != key_part.npos) {
-            std::string index =
+            // We constructed this string from an int earlier in trace_processor
+            // so it shouldn't be possible for this (or the StringToUInt32
+            // below) to fail.
+            std::string s =
                 key_part.substr(bracketpos + 1, key_part.find(']', bracketpos) -
                                                     bracketpos - 1);
             if (PERFETTO_UNLIKELY(!target->isNull() && !target->isArray())) {
@@ -601,7 +605,13 @@
                             args_sets_[set_id].toStyledString().c_str());
               return;
             }
-            target = &(*target)[stoi(index)];
+            base::Optional<uint32_t> index = base::StringToUInt32(s);
+            if (PERFETTO_UNLIKELY(!index)) {
+              PERFETTO_ELOG("Expected to be able to extract index from %s",
+                            key_part.c_str());
+              return;
+            }
+            target = &(*target)[index.value()];
             bracketpos = key_part.find('[', bracketpos + 1);
           }
         }
@@ -756,9 +766,11 @@
       auto track_args_id = track_table.source_arg_set_id()[track_row];
       const Json::Value* track_args = nullptr;
       bool legacy_chrome_track = false;
+      bool is_child_track = false;
       if (track_args_id) {
         track_args = &args_builder_.GetArgs(*track_args_id);
         legacy_chrome_track = (*track_args)["source"].asString() == "chrome";
+        is_child_track = track_args->isMember("parent_track_id");
       }
 
       const auto& thread_track = storage_->thread_track_table();
@@ -801,7 +813,7 @@
 
       auto opt_thread_track_row = thread_track.id().IndexOf(TrackId{track_id});
 
-      if (opt_thread_track_row) {
+      if (opt_thread_track_row && !is_child_track) {
         // Synchronous (thread) slice or instant event.
         UniqueTid utid = thread_track.utid()[*opt_thread_track_row];
         auto pid_and_tid = UtidToPidAndTid(utid);
@@ -842,7 +854,7 @@
           }
         }
         writer_.WriteCommonEvent(event);
-      } else if (!legacy_chrome_track ||
+      } else if (is_child_track ||
                  (legacy_chrome_track && track_args->isMember("source_id"))) {
         // Async event slice.
         auto opt_process_row = process_track.id().IndexOf(TrackId{track_id});
@@ -879,14 +891,20 @@
             event["id"] = PrintUint64(source_id);
           }
         } else {
-          if (opt_process_row) {
-            uint32_t upid = process_track.upid()[*opt_process_row];
+          if (opt_thread_track_row) {
+            UniqueTid utid = thread_track.utid()[*opt_thread_track_row];
+            auto pid_and_tid = UtidToPidAndTid(utid);
+            event["pid"] = Json::Int(pid_and_tid.first);
+            event["tid"] = Json::Int(pid_and_tid.second);
             event["id2"]["local"] = PrintUint64(track_id);
+          } else if (opt_process_row) {
+            uint32_t upid = process_track.upid()[*opt_process_row];
             uint32_t exported_pid = UpidToPid(upid);
             event["pid"] = Json::Int(exported_pid);
             event["tid"] =
                 Json::Int(legacy_utid ? UtidToPidAndTid(*legacy_utid).second
                                       : exported_pid);
+            event["id2"]["local"] = PrintUint64(track_id);
           } else {
             // Some legacy importers don't understand "id2" fields, so we use
             // the "usually" global "id" field instead. This works as long as
@@ -930,24 +948,31 @@
         }
       } else {
         // Global or process-scoped instant event.
-        PERFETTO_DCHECK(legacy_chrome_track);
-        PERFETTO_DCHECK(duration_ns == 0);
-        // Use "I" instead of "i" phase for backwards-compat with old consumers.
-        event["ph"] = "I";
-
-        auto opt_process_row = process_track.id().IndexOf(TrackId{track_id});
-        if (opt_process_row.has_value()) {
-          uint32_t upid = process_track.upid()[*opt_process_row];
-          uint32_t exported_pid = UpidToPid(upid);
-          event["pid"] = Json::Int(exported_pid);
-          event["tid"] =
-              Json::Int(legacy_utid ? UtidToPidAndTid(*legacy_utid).second
-                                    : exported_pid);
-          event["s"] = "p";
+        PERFETTO_DCHECK(legacy_chrome_track || !is_child_track);
+        if (duration_ns != 0) {
+          // We don't support exporting slices on the default global or process
+          // track to JSON (JSON only supports instant events on these tracks).
+          PERFETTO_DLOG(
+              "skipping non-instant slice on global or process track");
         } else {
-          event["s"] = "g";
+          // Use "I" instead of "i" phase for backwards-compat with old
+          // consumers.
+          event["ph"] = "I";
+
+          auto opt_process_row = process_track.id().IndexOf(TrackId{track_id});
+          if (opt_process_row.has_value()) {
+            uint32_t upid = process_track.upid()[*opt_process_row];
+            uint32_t exported_pid = UpidToPid(upid);
+            event["pid"] = Json::Int(exported_pid);
+            event["tid"] =
+                Json::Int(legacy_utid ? UtidToPidAndTid(*legacy_utid).second
+                                      : exported_pid);
+            event["s"] = "p";
+          } else {
+            event["s"] = "g";
+          }
+          writer_.WriteCommonEvent(event);
         }
-        writer_.WriteCommonEvent(event);
       }
     }
     return util::OkStatus();
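
The parsing change above swaps std::stoi, which throws on malformed input,
for base::StringToUInt32, which reports failure through an optional. A
hypothetical standalone sketch of the same checked conversion, modeled here
with std::from_chars rather than the perfetto helper:

#include <charconv>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>

// Any unparsable or partially-parsed index yields nullopt instead of
// throwing (std::stoi) or silently truncating.
std::optional<uint32_t> ToUInt32(const std::string& s) {
  uint32_t value = 0;
  auto res = std::from_chars(s.data(), s.data() + s.size(), value);
  if (res.ec != std::errc() || res.ptr != s.data() + s.size())
    return std::nullopt;
  return value;
}

int main() {
  std::string key_part = "list[3]";
  size_t open = key_part.find('[');
  std::string index_str =
      key_part.substr(open + 1, key_part.find(']', open) - open - 1);
  if (auto index = ToUInt32(index_str))
    std::printf("index = %u\n", *index);  // index = 3
  else
    std::printf("bad index in %s\n", key_part.c_str());
}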
diff --git a/src/trace_processor/export_json_unittest.cc b/src/trace_processor/export_json_unittest.cc
index 6d149e6..14ca221 100644
--- a/src/trace_processor/export_json_unittest.cc
+++ b/src/trace_processor/export_json_unittest.cc
@@ -716,9 +716,12 @@
 
 TEST_F(ExportJsonTest, InstantEvent) {
   const int64_t kTimestamp = 10000000;
+  const int64_t kTimestamp2 = 10001000;
+  const int64_t kTimestamp3 = 10002000;
   const char* kCategory = "cat";
   const char* kName = "name";
 
+  // Global legacy track.
   TrackId track =
       context_.track_tracker->GetOrCreateLegacyChromeGlobalInstantTrack();
   context_.args_tracker->Flush();  // Flush track args.
@@ -727,6 +730,19 @@
   context_.storage->mutable_slice_table()->Insert(
       {kTimestamp, 0, track.value, cat_id, name_id, 0, 0, 0});
 
+  // Global track.
+  TrackId track2 = context_.track_tracker->GetOrCreateDefaultDescriptorTrack();
+  context_.args_tracker->Flush();  // Flush track args.
+  context_.storage->mutable_slice_table()->Insert(
+      {kTimestamp2, 0, track2.value, cat_id, name_id, 0, 0, 0});
+
+  // Async event track.
+  context_.track_tracker->ReserveDescriptorChildTrack(1234, 0);
+  TrackId track3 = *context_.track_tracker->GetDescriptorTrack(1234);
+  context_.args_tracker->Flush();  // Flush track args.
+  context_.storage->mutable_slice_table()->Insert(
+      {kTimestamp3, 0, track3.value, cat_id, name_id, 0, 0, 0});
+
   base::TempFile temp_file = base::TempFile::Create();
   FILE* output = fopen(temp_file.path().c_str(), "w+");
   util::Status status = ExportJson(context_.storage.get(), output);
@@ -734,7 +750,7 @@
   EXPECT_TRUE(status.ok());
 
   Json::Value result = ToJsonValue(ReadFile(output));
-  EXPECT_EQ(result["traceEvents"].size(), 1u);
+  EXPECT_EQ(result["traceEvents"].size(), 3u);
 
   Json::Value event = result["traceEvents"][0];
   EXPECT_EQ(event["ph"].asString(), "I");
@@ -742,6 +758,20 @@
   EXPECT_EQ(event["s"].asString(), "g");
   EXPECT_EQ(event["cat"].asString(), kCategory);
   EXPECT_EQ(event["name"].asString(), kName);
+
+  Json::Value event2 = result["traceEvents"][1];
+  EXPECT_EQ(event2["ph"].asString(), "I");
+  EXPECT_EQ(event2["ts"].asInt64(), kTimestamp2 / 1000);
+  EXPECT_EQ(event2["s"].asString(), "g");
+  EXPECT_EQ(event2["cat"].asString(), kCategory);
+  EXPECT_EQ(event2["name"].asString(), kName);
+
+  Json::Value event3 = result["traceEvents"][2];
+  EXPECT_EQ(event3["ph"].asString(), "n");
+  EXPECT_EQ(event3["ts"].asInt64(), kTimestamp3 / 1000);
+  EXPECT_EQ(event3["id"].asString(), "0x2");
+  EXPECT_EQ(event3["cat"].asString(), kCategory);
+  EXPECT_EQ(event3["name"].asString(), kName);
 }
 
 TEST_F(ExportJsonTest, InstantEventOnThread) {
diff --git a/src/trace_processor/heap_profile_tracker.cc b/src/trace_processor/heap_profile_tracker.cc
index a1869a4..67883fc 100644
--- a/src/trace_processor/heap_profile_tracker.cc
+++ b/src/trace_processor/heap_profile_tracker.cc
@@ -227,11 +227,27 @@
 void HeapProfileTracker::SetProfilePacketIndex(uint32_t seq_id,
                                                uint64_t index) {
   SequenceState& sequence_state = sequence_state_[seq_id];
-  if (sequence_state.last_profile_packet_index != 0 &&
-      sequence_state.last_profile_packet_index + 1 != index) {
+  bool dropped_packet = false;
+  // heapprofd starts counting at index = 0.
+  if (!sequence_state.prev_index && index != 0) {
+    dropped_packet = true;
+  }
+
+  if (sequence_state.prev_index && *sequence_state.prev_index + 1 != index) {
+    dropped_packet = true;
+  }
+
+  if (dropped_packet) {
+    if (sequence_state.prev_index) {
+      PERFETTO_ELOG("Missing packets between %" PRIu64 " and %" PRIu64,
+                    *sequence_state.prev_index, index);
+    } else {
+      PERFETTO_ELOG("Invalid first packet index %" PRIu64 " (!= 0)", index);
+    }
+
     context_->storage->IncrementStats(stats::heapprofd_missing_packet);
   }
-  sequence_state.last_profile_packet_index = index;
+  sequence_state.prev_index = index;
 }
 
 void HeapProfileTracker::AddAllocation(
@@ -240,6 +256,7 @@
     const SourceAllocation& alloc,
     const StackProfileTracker::InternLookup* intern_lookup) {
   SequenceState& sequence_state = sequence_state_[seq_id];
+
   auto maybe_callstack_id = stack_profile_tracker->FindOrInsertCallstack(
       alloc.callstack_id, intern_lookup);
   if (!maybe_callstack_id)
@@ -260,9 +277,6 @@
       -static_cast<int64_t>(alloc.free_count),
       -static_cast<int64_t>(alloc.self_freed)};
 
-  tables::HeapProfileAllocationTable::Row alloc_delta = alloc_row;
-  tables::HeapProfileAllocationTable::Row free_delta = free_row;
-
   auto prev_alloc_it = sequence_state.prev_alloc.find({upid, callstack_id});
   if (prev_alloc_it == sequence_state.prev_alloc.end()) {
     std::tie(prev_alloc_it, std::ignore) = sequence_state.prev_alloc.emplace(
@@ -271,8 +285,6 @@
   }
 
   tables::HeapProfileAllocationTable::Row& prev_alloc = prev_alloc_it->second;
-  alloc_delta.count -= prev_alloc.count;
-  alloc_delta.size -= prev_alloc.size;
 
   auto prev_free_it = sequence_state.prev_free.find({upid, callstack_id});
   if (prev_free_it == sequence_state.prev_free.end()) {
@@ -282,15 +294,59 @@
   }
 
   tables::HeapProfileAllocationTable::Row& prev_free = prev_free_it->second;
+
+  std::set<CallsiteId>& callstacks_for_source_callstack_id =
+      sequence_state.seen_callstacks[std::make_pair(upid, alloc.callstack_id)];
+  bool new_callstack;
+  std::tie(std::ignore, new_callstack) =
+      callstacks_for_source_callstack_id.emplace(callstack_id);
+
+  if (new_callstack) {
+    sequence_state.alloc_correction[alloc.callstack_id] = prev_alloc;
+    sequence_state.free_correction[alloc.callstack_id] = prev_free;
+  }
+
+  auto alloc_correction_it =
+      sequence_state.alloc_correction.find(alloc.callstack_id);
+  if (alloc_correction_it != sequence_state.alloc_correction.end()) {
+    const auto& alloc_correction = alloc_correction_it->second;
+    alloc_row.count += alloc_correction.count;
+    alloc_row.size += alloc_correction.size;
+  }
+
+  auto free_correction_it =
+      sequence_state.free_correction.find(alloc.callstack_id);
+  if (free_correction_it != sequence_state.free_correction.end()) {
+    const auto& free_correction = free_correction_it->second;
+    free_row.count += free_correction.count;
+    free_row.size += free_correction.size;
+  }
+
+  tables::HeapProfileAllocationTable::Row alloc_delta = alloc_row;
+  tables::HeapProfileAllocationTable::Row free_delta = free_row;
+
+  alloc_delta.count -= prev_alloc.count;
+  alloc_delta.size -= prev_alloc.size;
+
   free_delta.count -= prev_free.count;
   free_delta.size -= prev_free.size;
 
-  if (alloc_delta.count)
+  if (alloc_delta.count < 0 || alloc_delta.size < 0 || free_delta.count > 0 ||
+      free_delta.size > 0) {
+    PERFETTO_DLOG("Non-monotonic allocation.");
+    context_->storage->IncrementIndexedStats(stats::heapprofd_malformed_packet,
+                                             static_cast<int>(upid));
+    return;
+  }
+
+  if (alloc_delta.count) {
     context_->storage->mutable_heap_profile_allocation_table()->Insert(
         alloc_delta);
-  if (free_delta.count)
+  }
+  if (free_delta.count) {
     context_->storage->mutable_heap_profile_allocation_table()->Insert(
         free_delta);
+  }
 
   prev_alloc = alloc_row;
   prev_free = free_row;
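
A worked example (with invented numbers) of the correction bookkeeping above:
dump 1 records 5 allocations at some stack; heapprofd garbage-collects the
stack once everything is freed and later recreates it counting from zero;
dump 2 then reports 2 allocations. Without the correction the delta would be
2 - 5 = -3 and the packet would be flagged as non-monotonic; with the
previous value captured as a correction:

#include <cstdio>

int main() {
  int prev = 5;           // stored from dump 1
  int reported = 2;       // dump 2, counted from zero after the stack was GC'd
  int correction = prev;  // captured when the recreated stack is first seen
  int delta = (reported + correction) - prev;  // (2 + 5) - 5 = 2
  std::printf("stored delta = %d\n", delta);
}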
diff --git a/src/trace_processor/heap_profile_tracker.h b/src/trace_processor/heap_profile_tracker.h
index 53e7637..fe40a84 100644
--- a/src/trace_processor/heap_profile_tracker.h
+++ b/src/trace_processor/heap_profile_tracker.h
@@ -18,6 +18,7 @@
 #define SRC_TRACE_PROCESSOR_HEAP_PROFILE_TRACKER_H_
 
 #include <deque>
+#include <set>
 #include <unordered_map>
 
 #include "perfetto/ext/base/optional.h"
@@ -89,7 +90,27 @@
                        tables::HeapProfileAllocationTable::Row>
         prev_free;
 
-    uint64_t last_profile_packet_index = 0;
+    // For continuous dumps, we only store the delta in the database. To do
+    // this, we subtract the previous dump's value. Sometimes, we should not
+    // do that subtraction, because heapprofd garbage collects stacks that
+    // have no unfreed allocations. If the application then allocates again
+    // at that stack, it gets recreated and initialized to zero.
+    //
+    // To correct for this, we add the previous stack's value to the current
+    // one, and then handle it as normal. If it is the first time we see a
+    // CallsiteId for a SourceCallstackId, we put the previous value into
+    // the correction maps below.
+    std::map<std::pair<UniquePid, StackProfileTracker::SourceCallstackId>,
+             std::set<CallsiteId>>
+        seen_callstacks;
+    std::map<StackProfileTracker::SourceCallstackId,
+             tables::HeapProfileAllocationTable::Row>
+        alloc_correction;
+    std::map<StackProfileTracker::SourceCallstackId,
+             tables::HeapProfileAllocationTable::Row>
+        free_correction;
+
+    base::Optional<uint64_t> prev_index;
   };
   std::map<uint32_t, SequenceState> sequence_state_;
   TraceProcessorContext* const context_;
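
Both trackers in this change replace a plain `uint64_t prev_index = 0` with
an optional so that a sequence legitimately starting at index 0 can be told
apart from "no packet seen yet". A minimal sketch of the gap check, with
std::optional standing in for base::Optional:

#include <cstdint>
#include <cstdio>
#include <optional>

bool IsDropped(std::optional<uint64_t>& prev, uint64_t index) {
  // First packet must be index 0; every later packet must be consecutive.
  bool dropped = (!prev && index != 0) || (prev && *prev + 1 != index);
  prev = index;
  return dropped;
}

int main() {
  std::optional<uint64_t> prev;
  std::printf("%d\n", IsDropped(prev, 0));  // 0: valid first packet
  std::printf("%d\n", IsDropped(prev, 1));  // 0: consecutive
  std::printf("%d\n", IsDropped(prev, 3));  // 1: index 2 was dropped
}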
diff --git a/src/trace_processor/importers/proto/graphics_event_module.cc b/src/trace_processor/importers/proto/graphics_event_module.cc
index 38979f2..f003464 100644
--- a/src/trace_processor/importers/proto/graphics_event_module.cc
+++ b/src/trace_processor/importers/proto/graphics_event_module.cc
@@ -57,7 +57,7 @@
                                      decoder.vulkan_memory_event());
       return;
     case TracePacket::kVulkanApiEventFieldNumber:
-      parser_.ParseVulkanApiEvent(decoder.vulkan_api_event());
+      parser_.ParseVulkanApiEvent(ttp.timestamp, decoder.vulkan_api_event());
       return;
   }
 }
diff --git a/src/trace_processor/importers/proto/graphics_event_parser.cc b/src/trace_processor/importers/proto/graphics_event_parser.cc
index 5337e81..ed87c84 100644
--- a/src/trace_processor/importers/proto/graphics_event_parser.cc
+++ b/src/trace_processor/importers/proto/graphics_event_parser.cc
@@ -98,10 +98,6 @@
       description_id_(context->storage->InternString("description")),
       gpu_render_stage_scope_id_(
           context->storage->InternString("gpu_render_stage")),
-      command_buffer_handle_id_(
-          context->storage->InternString("VkCommandBuffer")),
-      render_target_handle_id_(context->storage->InternString("VkFramebuffer")),
-      render_pass_handle_id_(context->storage->InternString("VkRenderPass")),
       graphics_event_scope_id_(
           context->storage->InternString("graphics_frame_event")),
       unknown_event_name_id_(context->storage->InternString("unknown_event")),
@@ -144,7 +140,10 @@
                          context_->storage->InternString("WARNING"),
                          context_->storage->InternString("ERROR"),
                          context_->storage->InternString(
-                             "UNKNOWN_SEVERITY") /* must be last */}} {}
+                             "UNKNOWN_SEVERITY") /* must be last */}},
+      vk_event_track_id_(context->storage->InternString("Vulkan Events")),
+      vk_event_scope_id_(context->storage->InternString("vulkan_events")),
+      vk_queue_submit_id_(context->storage->InternString("vkQueueSubmit")) {}
 
 void GraphicsEventParser::ParseGpuCounterEvent(int64_t ts, ConstBytes blob) {
   protos::pbzero::GpuCounterEvent::Decoder event(blob.data, blob.size);
@@ -309,23 +308,6 @@
   }
 }
 
-void GraphicsEventParser::AddVulkanHandleArg(
-    ArgsTracker::BoundInserter* inserter,
-    StringId key,
-    int32_t vk_object_type,
-    uint64_t vk_handle) {
-  char buf[256];
-  auto debug_marker_name = FindDebugName(vk_object_type, vk_handle);
-  if (debug_marker_name.has_value()) {
-    snprintf(buf, base::ArraySize(buf), "0x%016" PRIx64 " (%s)", vk_handle,
-             debug_marker_name.value().c_str());
-  } else {
-    snprintf(buf, base::ArraySize(buf), "0x%016" PRIx64, vk_handle);
-  }
-  StringId value = context_->storage->InternString(buf);
-  inserter->AddArg(key, Variadic::String(value));
-}
-
 void GraphicsEventParser::ParseGpuRenderStageEvent(int64_t ts,
                                                    ConstBytes blob) {
   protos::pbzero::GpuRenderStageEvent::Decoder event(blob.data, blob.size);
@@ -359,21 +341,6 @@
         inserter->AddArg(description_id_, Variadic::String(description));
       }
     }
-    if (event.has_command_buffer_handle()) {
-      AddVulkanHandleArg(inserter, command_buffer_handle_id_,
-                         VK_OBJECT_TYPE_COMMAND_BUFFER,
-                         event.command_buffer_handle());
-    }
-    if (event.has_render_target_handle()) {
-      AddVulkanHandleArg(inserter, render_target_handle_id_,
-                         VK_OBJECT_TYPE_FRAMEBUFFER,
-                         event.render_target_handle());
-    }
-    if (event.has_render_pass_handle()) {
-      AddVulkanHandleArg(inserter, render_pass_handle_id_,
-                         VK_OBJECT_TYPE_RENDER_PASS,
-                         event.render_pass_handle());
-    }
     for (auto it = event.extra_data(); it; ++it) {
       protos::pbzero::GpuRenderStageEvent_ExtraData_Decoder datum(*it);
       StringId name_id = context_->storage->InternString(datum.name());
@@ -414,6 +381,22 @@
       gpu_hw_queue_ids_[hw_queue_id] = track_id;
     }
 
+    auto render_target_name = FindDebugName(VK_OBJECT_TYPE_FRAMEBUFFER, event.render_target_handle());
+    auto render_target_name_id = render_target_name.has_value()
+                                  ? context_->storage->InternString(
+                                      render_target_name.value().c_str())
+                                  : kNullStringId;
+    auto render_pass_name = FindDebugName(VK_OBJECT_TYPE_RENDER_PASS, event.render_pass_handle());
+    auto render_pass_name_id = render_pass_name.has_value()
+                                  ? context_->storage->InternString(
+                                      render_pass_name.value().c_str())
+                                  : kNullStringId;
+    auto command_buffer_name = FindDebugName(VK_OBJECT_TYPE_COMMAND_BUFFER, event.command_buffer_handle());
+    auto command_buffer_name_id = command_buffer_name.has_value()
+                                  ? context_->storage->InternString(
+                                      command_buffer_name.value().c_str())
+                                  : kNullStringId;
+
     tables::GpuSliceTable::Row row;
     row.ts = ts;
     row.track_id = track_id.value;
@@ -421,8 +404,11 @@
     row.dur = static_cast<int64_t>(event.duration());
     row.context_id = static_cast<int64_t>(event.context());
     row.render_target = static_cast<int64_t>(event.render_target_handle());
+    row.render_target_name = render_target_name_id;
     row.render_pass = static_cast<int64_t>(event.render_pass_handle());
+    row.render_pass_name = render_pass_name_id;
     row.command_buffer = static_cast<int64_t>(event.command_buffer_handle());
+    row.command_buffer_name = command_buffer_name_id;
     row.submission_id = event.submission_id();
     row.hw_queue_id = hw_queue_id;
 
@@ -791,7 +777,7 @@
   context_->slice_tracker->ScopedGpu(row, args_callback);
 }
 
-void GraphicsEventParser::ParseVulkanApiEvent(ConstBytes blob) {
+void GraphicsEventParser::ParseVulkanApiEvent(int64_t ts, ConstBytes blob) {
   protos::pbzero::VulkanApiEvent::Decoder vk_event(blob.data, blob.size);
   if (vk_event.has_vk_debug_utils_object_name()) {
     protos::pbzero::VulkanApiEvent_VkDebugUtilsObjectName::Decoder event(
@@ -799,6 +785,32 @@
     debug_marker_names_[event.object_type()][event.object()] =
         event.object_name().ToStdString();
   }
+  if (vk_event.has_vk_queue_submit()) {
+    protos::pbzero::VulkanApiEvent_VkQueueSubmit::Decoder event(
+        vk_event.vk_queue_submit());
+    // Once the flow table is implemented, we can create a nice UI that links
+    // the vkQueueSubmit to the GpuRenderStageEvent. For now, just add it in a
+    // GPU track so that it appears close to the render stage slices.
+    tables::GpuTrackTable::Row track(vk_event_track_id_);
+    track.scope = vk_event_scope_id_;
+    TrackId track_id = context_->track_tracker->InternGpuTrack(track);
+    tables::GpuSliceTable::Row row;
+    row.ts = ts;
+    row.dur = static_cast<int64_t>(event.duration_ns());
+    row.track_id = track_id.value;
+    row.name = vk_queue_submit_id_;
+    if (event.has_vk_command_buffers()) {
+      row.command_buffer = static_cast<int64_t>(*event.vk_command_buffers());
+    }
+    row.submission_id = event.submission_id();
+    auto args_callback = [this, &event](ArgsTracker::BoundInserter* inserter) {
+      inserter->AddArg(context_->storage->InternString("pid"),
+                       Variadic::Integer(event.pid()));
+      inserter->AddArg(context_->storage->InternString("tid"),
+                       Variadic::Integer(event.tid()));
+    };
+    context_->slice_tracker->ScopedGpu(row, args_callback);
+  }
 }
 
 }  // namespace trace_processor
diff --git a/src/trace_processor/importers/proto/graphics_event_parser.h b/src/trace_processor/importers/proto/graphics_event_parser.h
index c2bae51..10bf5e5 100644
--- a/src/trace_processor/importers/proto/graphics_event_parser.h
+++ b/src/trace_processor/importers/proto/graphics_event_parser.h
@@ -67,7 +67,7 @@
   void UpdateVulkanMemoryAllocationCounters(UniquePid,
                                             const VulkanMemoryEvent::Decoder&);
 
-  void ParseVulkanApiEvent(ConstBytes);
+  void ParseVulkanApiEvent(int64_t, ConstBytes);
 
  private:
   const StringId GetFullStageName(
@@ -77,10 +77,6 @@
           GpuRenderStageEvent_Specifications_Description_Decoder& hw_queue);
   base::Optional<std::string> FindDebugName(int32_t vk_object_type,
                                             uint64_t vk_handle) const;
-  void AddVulkanHandleArg(ArgsTracker::BoundInserter* inserter,
-                          StringId key,
-                          int32_t vk_object_type,
-                          uint64_t vk_handle);
 
   TraceProcessorContext* const context_;
   VulkanMemoryTracker vulkan_memory_tracker_;
@@ -91,9 +87,6 @@
   const StringId gpu_render_stage_scope_id_;
   std::vector<perfetto::base::Optional<TrackId>> gpu_hw_queue_ids_;
   size_t gpu_hw_queue_counter_ = 0;
-  const StringId command_buffer_handle_id_;
-  const StringId render_target_handle_id_;
-  const StringId render_pass_handle_id_;
   // Map of stage ID -> pair(stage name, stage description)
   std::vector<std::pair<StringId, StringId>> gpu_render_stage_ids_;
   // For GraphicsFrameEvent
@@ -125,10 +118,15 @@
   const StringId log_message_id_;
   std::array<StringId, 7> log_severity_ids_;
   // For Vulkan events.
+  // For VulkanApiEvent.VkDebugUtilsObjectName.
   // Map of vk handle -> vk object name.
   using DebugMarkerMap = std::unordered_map<uint64_t, std::string>;
   // Map of VkObjectType -> DebugMarkerMap.
   std::unordered_map<int32_t, DebugMarkerMap> debug_marker_names_;
+  // For VulkanApiEvent.VkQueueSubmit.
+  StringId vk_event_track_id_;
+  StringId vk_event_scope_id_;
+  StringId vk_queue_submit_id_;
 };
 }  // namespace trace_processor
 }  // namespace perfetto
diff --git a/src/trace_processor/importers/proto/heap_graph_module.cc b/src/trace_processor/importers/proto/heap_graph_module.cc
index c75a340..8da6932 100644
--- a/src/trace_processor/importers/proto/heap_graph_module.cc
+++ b/src/trace_processor/importers/proto/heap_graph_module.cc
@@ -217,12 +217,22 @@
           heap_graph_tracker->RowsForType(*obfuscated_class_name_id);
 
       if (cls_objects) {
-        auto interned_deobfuscated_name =
-            context_->storage->InternString(cls.deobfuscated_name());
         for (int64_t row : *cls_objects) {
+          const base::StringView obfuscated_type_name =
+              context_->storage->GetString(
+                  context_->storage->mutable_heap_graph_object_table()
+                      ->type_name()[static_cast<uint32_t>(row)]);
+          size_t array_count = NumberOfArrays(obfuscated_type_name);
+          std::string arrayed_deobfuscated_name =
+              cls.deobfuscated_name().ToStdString();
+          for (size_t i = 0; i < array_count; ++i)
+            arrayed_deobfuscated_name += "[]";
+          const StringId arrayed_deobfuscated_name_id =
+              context_->storage->InternString(
+                  base::StringView(arrayed_deobfuscated_name));
           context_->storage->mutable_heap_graph_object_table()
               ->mutable_deobfuscated_type_name()
-              ->Set(static_cast<uint32_t>(row), interned_deobfuscated_name);
+              ->Set(static_cast<uint32_t>(row), arrayed_deobfuscated_name_id);
         }
       } else {
         PERFETTO_DLOG("Class %s not found",
diff --git a/src/trace_processor/importers/proto/heap_graph_tracker.cc b/src/trace_processor/importers/proto/heap_graph_tracker.cc
index 9eb166b..e2c1fa2 100644
--- a/src/trace_processor/importers/proto/heap_graph_tracker.cc
+++ b/src/trace_processor/importers/proto/heap_graph_tracker.cc
@@ -19,6 +19,23 @@
 namespace perfetto {
 namespace trace_processor {
 
+size_t NumberOfArrays(base::StringView type) {
+  if (type.size() < 2)
+    return 0;
+
+  size_t arrays = 0;
+  while (type.size() >= 2 * (arrays + 1) &&
+         memcmp(type.end() - 2 * (arrays + 1), "[]", 2) == 0) {
+    arrays++;
+  }
+
+  return arrays;
+}
+
+base::StringView NormalizeTypeName(base::StringView type) {
+  return base::StringView(type.data(), type.size() - NumberOfArrays(type) * 2);
+}
+
 HeapGraphTracker::HeapGraphTracker(TraceProcessorContext* context)
     : context_(context) {}
 
@@ -87,10 +104,24 @@
 
 void HeapGraphTracker::SetPacketIndex(uint32_t seq_id, uint64_t index) {
   SequenceState& sequence_state = GetOrCreateSequence(seq_id);
-  if (sequence_state.prev_index != 0 &&
-      sequence_state.prev_index + 1 != index) {
-    PERFETTO_ELOG("Missing packets between %" PRIu64 " and %" PRIu64,
-                  sequence_state.prev_index, index);
+  bool dropped_packet = false;
+  // perfetto_hprof starts counting at index = 0.
+  if (!sequence_state.prev_index && index != 0) {
+    dropped_packet = true;
+  }
+
+  if (sequence_state.prev_index && *sequence_state.prev_index + 1 != index) {
+    dropped_packet = true;
+  }
+
+  if (dropped_packet) {
+    if (sequence_state.prev_index) {
+      PERFETTO_ELOG("Missing packets between %" PRIu64 " and %" PRIu64,
+                    *sequence_state.prev_index, index);
+    } else {
+      PERFETTO_ELOG("Invalid first packet index %" PRIu64 " (!= 0)", index);
+    }
+
     context_->storage->IncrementIndexedStats(
         stats::heap_graph_missing_packet,
         static_cast<int>(sequence_state.current_upid));
@@ -119,7 +150,10 @@
          /*root_type=*/base::nullopt});
     int64_t row = context_->storage->heap_graph_object_table().row_count() - 1;
     sequence_state.object_id_to_row.emplace(obj.object_id, row);
-    class_to_rows_[type_name].emplace_back(row);
+    base::StringView normalized_type =
+        NormalizeTypeName(context_->storage->GetString(type_name));
+    class_to_rows_[context_->storage->InternString(normalized_type)]
+        .emplace_back(row);
     sequence_state.walker.AddNode(row, obj.self_size,
                                   static_cast<int32_t>(type_name.raw_id()));
   }
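
NumberOfArrays() above exists because the deobfuscation map in
heap_graph_module.cc only covers the scalar class name, so the array suffix
of the obfuscated type has to be counted and re-applied. A standalone sketch
with invented class names (std::string used for brevity in place of
base::StringView):

#include <cstdio>
#include <string>

// Same suffix scan as NumberOfArrays() above.
size_t NumberOfArrays(const std::string& type) {
  size_t arrays = 0;
  while (type.size() >= 2 * (arrays + 1) &&
         type.compare(type.size() - 2 * (arrays + 1), 2, "[]") == 0) {
    arrays++;
  }
  return arrays;
}

int main() {
  std::string obfuscated = "a[][]";              // stored object type name
  std::string deobfuscated = "com.example.Foo";  // mapping for class "a"
  for (size_t i = 0; i < NumberOfArrays(obfuscated); ++i)
    deobfuscated += "[]";
  std::printf("%s\n", deobfuscated.c_str());     // com.example.Foo[][]
}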
diff --git a/src/trace_processor/importers/proto/heap_graph_tracker.h b/src/trace_processor/importers/proto/heap_graph_tracker.h
index 211b924..0699311 100644
--- a/src/trace_processor/importers/proto/heap_graph_tracker.h
+++ b/src/trace_processor/importers/proto/heap_graph_tracker.h
@@ -32,6 +32,9 @@
 
 class TraceProcessorContext;
 
+size_t NumberOfArrays(base::StringView type);
+base::StringView NormalizeTypeName(base::StringView type);
+
 class HeapGraphTracker : public HeapGraphWalker::Delegate, public Destructible {
  public:
   struct SourceObject {
@@ -108,7 +111,7 @@
     std::map<uint64_t, StringPool::Id> interned_type_names;
     std::map<uint64_t, StringPool::Id> interned_field_names;
     std::map<uint64_t, int64_t> object_id_to_row;
-    uint64_t prev_index = 0;
+    base::Optional<uint64_t> prev_index;
     HeapGraphWalker walker;
   };
 
diff --git a/src/trace_processor/importers/proto/heap_graph_tracker_unittest.cc b/src/trace_processor/importers/proto/heap_graph_tracker_unittest.cc
index 3cb5c2a..aa4e64d 100644
--- a/src/trace_processor/importers/proto/heap_graph_tracker_unittest.cc
+++ b/src/trace_processor/importers/proto/heap_graph_tracker_unittest.cc
@@ -142,6 +142,42 @@
   EXPECT_THAT(counts, UnorderedElementsAre(1, 2, 1, 1));
 }
 
+static const char kArray[] = "X[]";
+static const char kDoubleArray[] = "X[][]";
+static const char kNoArray[] = "X";
+static const char kLongNoArray[] = "ABCDE";
+
+TEST(HeapGraphTrackerTest, NormalizeTypeName) {
+  // sizeof(...) - 1 below to get rid of the null-byte.
+  EXPECT_EQ(NormalizeTypeName(base::StringView(kArray, sizeof(kArray) - 1))
+                .ToStdString(),
+            "X");
+  EXPECT_EQ(NormalizeTypeName(
+                base::StringView(kDoubleArray, sizeof(kDoubleArray) - 1))
+                .ToStdString(),
+            "X");
+  EXPECT_EQ(NormalizeTypeName(base::StringView(kNoArray, sizeof(kNoArray) - 1))
+                .ToStdString(),
+            "X");
+  EXPECT_EQ(NormalizeTypeName(
+                base::StringView(kLongNoArray, sizeof(kLongNoArray) - 1))
+                .ToStdString(),
+            "ABCDE");
+}
+
+TEST(HeapGraphTrackerTest, NumberOfArray) {
+  // sizeof(...) - 1 below to get rid of the null-byte.
+  EXPECT_EQ(NumberOfArrays(base::StringView(kArray, sizeof(kArray) - 1)), 1u);
+  EXPECT_EQ(
+      NumberOfArrays(base::StringView(kDoubleArray, sizeof(kDoubleArray) - 1)),
+      2u);
+  EXPECT_EQ(NumberOfArrays(base::StringView(kNoArray, sizeof(kNoArray) - 1)),
+            0u);
+  EXPECT_EQ(
+      NumberOfArrays(base::StringView(kLongNoArray, sizeof(kLongNoArray) - 1)),
+      0u);
+}
+
 }  // namespace
 }  // namespace trace_processor
 }  // namespace perfetto
diff --git a/src/trace_processor/importers/proto/proto_trace_parser.cc b/src/trace_processor/importers/proto/proto_trace_parser.cc
index 9d6a97b..5b36057 100644
--- a/src/trace_processor/importers/proto/proto_trace_parser.cc
+++ b/src/trace_processor/importers/proto/proto_trace_parser.cc
@@ -382,6 +382,9 @@
 
     int pid = static_cast<int>(entry.pid());
 
+    if (entry.disconnected())
+      context_->storage->IncrementIndexedStats(
+          stats::heapprofd_client_disconnected, pid);
     if (entry.buffer_corrupted())
       context_->storage->IncrementIndexedStats(
           stats::heapprofd_buffer_corrupted, pid);
diff --git a/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc b/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc
index 5fd1d74..d490f9a 100644
--- a/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc
+++ b/src/trace_processor/importers/proto/proto_trace_parser_unittest.cc
@@ -1328,7 +1328,7 @@
   EXPECT_CALL(*slice_, Scoped(1015000, TrackId{0}, cat_2, ev_2, 0, _))
       .WillOnce(Return(1u));
 
-  EXPECT_CALL(*slice_, Scoped(1016000, TrackId{2}, cat_3, ev_3, 0, _))
+  EXPECT_CALL(*slice_, Scoped(1016000, TrackId{3}, cat_3, ev_3, 0, _))
       .WillOnce(Return(2u));
 
   EXPECT_CALL(*slice_,
@@ -1337,12 +1337,13 @@
 
   context_.sorter->ExtractEventsForced();
 
-  // First track is "Thread track 1"; second is "Async track 1", third is
-  // "Thread track 2".
-  EXPECT_EQ(storage_->track_table().row_count(), 3u);
+  // First track is "Thread track 1"; second is "Async track 1"; third is the
+  // global default track (parent of the async track); fourth is "Thread track 2".
+  EXPECT_EQ(storage_->track_table().row_count(), 4u);
   EXPECT_EQ(storage_->track_table().name().GetString(0), "Thread track 1");
   EXPECT_EQ(storage_->track_table().name().GetString(1), "Async track 1");
-  EXPECT_EQ(storage_->track_table().name().GetString(2), "Thread track 2");
+  EXPECT_EQ(storage_->track_table().name().GetString(2), "Default Track");
+  EXPECT_EQ(storage_->track_table().name().GetString(3), "Thread track 2");
   EXPECT_EQ(storage_->thread_track_table().row_count(), 2u);
   EXPECT_EQ(storage_->thread_track_table().utid()[0], 1u);
   EXPECT_EQ(storage_->thread_track_table().utid()[1], 2u);
diff --git a/src/trace_processor/metrics/android/unmapped_java_symbols.sql b/src/trace_processor/metrics/android/unmapped_java_symbols.sql
index d13d948..e25745c 100644
--- a/src/trace_processor/metrics/android/unmapped_java_symbols.sql
+++ b/src/trace_processor/metrics/android/unmapped_java_symbols.sql
@@ -19,11 +19,21 @@
 CREATE TABLE IF NOT EXISTS types_per_upid AS
 WITH distinct_unmapped_type_names AS (
   SELECT DISTINCT upid, type_name
-  FROM heap_graph_object
-  WHERE deobfuscated_type_name IS NULL
-  AND INSTR(type_name, '.') = 0
-  AND RTRIM(type_name, '[]') NOT IN ('byte', 'char', 'short', 'int', 'long', 'boolean', 'float', 'double')
+  FROM (
+    SELECT upid, RTRIM(type_name, '[]') AS type_name
+    FROM heap_graph_object
+    WHERE deobfuscated_type_name IS NULL
+  )
+  WHERE type_name NOT IN ('byte', 'char', 'short', 'int', 'long', 'boolean', 'float', 'double')
   AND type_name NOT LIKE '$Proxy%'
+  AND type_name NOT LIKE 'java.%'
+  AND type_name NOT LIKE 'javax.%'
+  AND type_name NOT LIKE 'j$.%'
+  AND type_name NOT LIKE 'android.%'
+  AND type_name NOT LIKE 'com.android.%'
+  AND type_name NOT LIKE 'sun.%'
+  AND type_name NOT LIKE 'dalvik.%'
+  AND type_name NOT LIKE 'libcore.%'
   AND LENGTH(type_name) > 0
 )
 SELECT upid, RepeatedField(type_name) AS types
@@ -34,8 +44,15 @@
   SELECT DISTINCT upid, field_name
   FROM heap_graph_object JOIN heap_graph_reference USING (reference_set_id)
   WHERE deobfuscated_type_name IS NULL
-  AND field_name NOT LIKE '%.%.%'
   AND field_name NOT LIKE '$Proxy%'
+  AND field_name NOT LIKE 'java.%'
+  AND field_name NOT LIKE 'javax.%'
+  AND field_name NOT LIKE 'j$.%'
+  AND field_name NOT LIKE 'android.%'
+  AND field_name NOT LIKE 'com.android.%'
+  AND field_name NOT LIKE 'sun.%'
+  AND field_name NOT LIKE 'dalvik.%'
+  AND field_name NOT LIKE 'libcore.%'
   AND LENGTH(field_name) > 0
 )
 SELECT upid, RepeatedField(field_name) AS fields
diff --git a/src/trace_processor/stack_profile_tracker.h b/src/trace_processor/stack_profile_tracker.h
index 575715c..00357ce 100644
--- a/src/trace_processor/stack_profile_tracker.h
+++ b/src/trace_processor/stack_profile_tracker.h
@@ -191,6 +191,7 @@
       const InternLookup* intern_lookup);
   base::Optional<FrameId> FindOrInsertFrame(SourceFrameId,
                                             const InternLookup* intern_lookup);
+
   base::Optional<CallsiteId> FindOrInsertCallstack(
       SourceCallstackId,
       const InternLookup* intern_lookup);
diff --git a/src/trace_processor/stats.h b/src/trace_processor/stats.h
index 8d52968..f0ba05f 100644
--- a/src/trace_processor/stats.h
+++ b/src/trace_processor/stats.h
@@ -108,14 +108,17 @@
   F(vmstat_unknown_keys,                      kSingle,  kError,    kAnalysis), \
   F(vulkan_allocations_invalid_string_id,     kSingle,  kError,    kTrace),    \
   F(clock_sync_failure,                       kSingle,  kError,    kAnalysis), \
+  F(clock_sync_cache_miss,                    kSingle,  kInfo,     kAnalysis), \
   F(process_tracker_errors,                   kSingle,  kError,    kAnalysis), \
   F(json_tokenizer_failure,                   kSingle,  kError,    kTrace),    \
   F(heap_graph_invalid_string_id,             kIndexed, kError,    kTrace),    \
   F(heap_graph_non_finalized_graph,           kSingle,  kError,    kTrace),    \
   F(heap_graph_malformed_packet,              kIndexed, kError,    kTrace),    \
-  F(heap_graph_missing_packet,                kIndexed, kDataLoss, kTrace),    \
+  F(heap_graph_missing_packet,                kIndexed, kError,    kTrace),    \
   F(heapprofd_buffer_corrupted,               kIndexed, kError,    kTrace),    \
   F(heapprofd_buffer_overran,                 kIndexed, kDataLoss, kTrace),    \
+  F(heapprofd_client_disconnected,            kIndexed, kInfo,     kTrace),    \
+  F(heapprofd_malformed_packet,               kIndexed, kError,    kTrace),    \
   F(heapprofd_missing_packet,                 kSingle,  kError,    kTrace),    \
   F(heapprofd_rejected_concurrent,            kIndexed, kError,    kTrace),    \
   F(metatrace_overruns,                       kSingle,  kError,    kTrace),    \
@@ -137,7 +140,9 @@
 enum Severity {
   kInfo,      // Diagnostic counters
   kDataLoss,  // Correct operation that still resulted in data loss
-  kError      // If any kError counter is > 0 the UI will raise an error
+  kError      // If any kError counter is > 0, trace_processor_shell will
+              // raise an error. This is *not* surfaced in the web UI.
+              // TODO(b/148587181): Surface these errors in the UI.
 };
 
 enum Source {
diff --git a/src/trace_processor/tables/macros_internal.h b/src/trace_processor/tables/macros_internal.h
index a4fbab7..1533f91 100644
--- a/src/trace_processor/tables/macros_internal.h
+++ b/src/trace_processor/tables/macros_internal.h
@@ -194,7 +194,7 @@
 
 // Defines the member variable in the Table.
 #define PERFETTO_TP_TABLE_MEMBER(type, name, ...) \
-  SparseVector<TypedColumn<type>::StoredType> name##_;
+  SparseVector<TypedColumn<type>::serialized_type> name##_;
 
 // Constructs the column in the Table constructor when flags are specified.
 #define PERFETTO_TP_TABLE_CONSTRUCTOR_COLUMN_FLAGS(type, name, flags)          \
diff --git a/src/trace_processor/tables/slice_tables.h b/src/trace_processor/tables/slice_tables.h
index 7d75e7d..7d041bb 100644
--- a/src/trace_processor/tables/slice_tables.h
+++ b/src/trace_processor/tables/slice_tables.h
@@ -54,8 +54,11 @@
   PARENT(PERFETTO_TP_SLICE_TABLE_DEF, C)            \
   C(base::Optional<int64_t>, context_id)            \
   C(base::Optional<int64_t>, render_target)         \
+  C(StringPool::Id, render_target_name)             \
   C(base::Optional<int64_t>, render_pass)           \
+  C(StringPool::Id, render_pass_name)               \
   C(base::Optional<int64_t>, command_buffer)        \
+  C(StringPool::Id, command_buffer_name)            \
   C(base::Optional<uint32_t>, frame_id)             \
   C(base::Optional<uint32_t>, submission_id)        \
   C(base::Optional<uint32_t>, hw_queue_id)
diff --git a/src/trace_processor/trace_processor_impl.cc b/src/trace_processor/trace_processor_impl.cc
index 154dd3e..9f14054 100644
--- a/src/trace_processor/trace_processor_impl.cc
+++ b/src/trace_processor/trace_processor_impl.cc
@@ -360,13 +360,22 @@
       PERFETTO_ELOG("Error initializing RepeatedField");
   }
 }
+
+void EnsureSqliteInitialized() {
+  // sqlite3_initialize isn't actually thread-safe despite being documented
+  // as such; we need to make sure that multiple TraceProcessorImpl instances
+  // don't call it concurrently and that it gets called only once per process.
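+  // A function-local static guarantees (per C++11) that the initializer runs
+  // exactly once, even with concurrent callers.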
+  static bool init_once = [] { return sqlite3_initialize() == SQLITE_OK; }();
+  PERFETTO_CHECK(init_once);
+}
+
 }  // namespace
 
 TraceProcessorImpl::TraceProcessorImpl(const Config& cfg)
     : TraceProcessorStorageImpl(cfg) {
   RegisterAdditionalModules(&context_);
   sqlite3* db = nullptr;
-  PERFETTO_CHECK(sqlite3_initialize() == SQLITE_OK);
+  EnsureSqliteInitialized();
   PERFETTO_CHECK(sqlite3_open(":memory:", &db) == SQLITE_OK);
   InitializeSqlite(db);
   CreateBuiltinTables(db);
diff --git a/src/trace_processor/track_tracker.cc b/src/trace_processor/track_tracker.cc
index 94661b8..0ff0bd3 100644
--- a/src/trace_processor/track_tracker.cc
+++ b/src/trace_processor/track_tracker.cc
@@ -31,6 +31,7 @@
       source_id_is_process_scoped_key_(
           context->storage->InternString("source_id_is_process_scoped")),
       source_scope_key_(context->storage->InternString("source_scope")),
+      parent_track_id_key_(context->storage->InternString("parent_track_id")),
       fuchsia_source_(context->storage->InternString("fuchsia")),
       chrome_source_(context->storage->InternString("chrome")),
       android_source_(context->storage->InternString("android")),
@@ -364,11 +365,20 @@
   if (!track_id) {
     tables::TrackTable::Row track;
     track_id = context_->storage->mutable_track_table()->Insert(track);
+    // The global track with no uuid is the default global track (e.g. for
+    // global instant events). Any other global tracks are considered children
+    // of the default track.
+    if (!parent_track_id && uuid)
+      parent_track_id = GetOrCreateDefaultDescriptorTrack();
   }
 
-  context_->args_tracker->AddArgsTo(*track_id)
-      .AddArg(source_key_, Variadic::String(descriptor_source_))
+  auto args = context_->args_tracker->AddArgsTo(*track_id);
+  args.AddArg(source_key_, Variadic::String(descriptor_source_))
       .AddArg(source_id_key_, Variadic::Integer(static_cast<int64_t>(uuid)));
+  if (parent_track_id) {
+    args.AddArg(parent_track_id_key_,
+                Variadic::Integer(parent_track_id->value));
+  }
   return *track_id;
 }
 
diff --git a/src/trace_processor/track_tracker.h b/src/trace_processor/track_tracker.h
index de6233a..ddd6f11 100644
--- a/src/trace_processor/track_tracker.h
+++ b/src/trace_processor/track_tracker.h
@@ -216,6 +216,7 @@
   const StringId source_id_key_ = kNullStringId;
   const StringId source_id_is_process_scoped_key_ = kNullStringId;
   const StringId source_scope_key_ = kNullStringId;
+  const StringId parent_track_id_key_ = kNullStringId;
 
   const StringId fuchsia_source_ = kNullStringId;
   const StringId chrome_source_ = kNullStringId;
diff --git a/src/traced/probes/ftrace/cpu_reader_benchmark.cc b/src/traced/probes/ftrace/cpu_reader_benchmark.cc
index 58b4e20..ee6d246 100644
--- a/src/traced/probes/ftrace/cpu_reader_benchmark.cc
+++ b/src/traced/probes/ftrace/cpu_reader_benchmark.cc
@@ -315,8 +315,8 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config{
+      EventFilter{}, DisabledCompactSchedConfigForTesting(), {}, {}};
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
 
diff --git a/src/traced/probes/ftrace/cpu_reader_fuzzer.cc b/src/traced/probes/ftrace/cpu_reader_fuzzer.cc
index 61d5b80..27e5db3 100644
--- a/src/traced/probes/ftrace/cpu_reader_fuzzer.cc
+++ b/src/traced/probes/ftrace/cpu_reader_fuzzer.cc
@@ -52,8 +52,8 @@
   memcpy(g_page, data, std::min(base::kPageSize, size));
 
   FtraceMetadata metadata{};
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config{
+      EventFilter{}, DisabledCompactSchedConfigForTesting(), {}, {}};
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
   ds_config.event_filter.AddEnabledEvent(
diff --git a/src/traced/probes/ftrace/cpu_reader_unittest.cc b/src/traced/probes/ftrace/cpu_reader_unittest.cc
index 61cdc23..2d10829 100644
--- a/src/traced/probes/ftrace/cpu_reader_unittest.cc
+++ b/src/traced/probes/ftrace/cpu_reader_unittest.cc
@@ -60,6 +60,11 @@
 
 namespace {
 
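+// Returns a data source config with no enabled ftrace events, the compact
+// sched encoding disabled, and empty atrace apps/categories lists.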
+FtraceDataSourceConfig EmptyConfig() {
+  return FtraceDataSourceConfig{
+      EventFilter{}, DisabledCompactSchedConfigForTesting(), {}, {}};
+}
+
 constexpr uint64_t kNanoInSecond = 1000 * 1000 * 1000;
 constexpr uint64_t kNanoInMicro = 1000;
 
@@ -378,8 +383,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("ftrace", "print")));
 
@@ -507,8 +511,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("ftrace", "print")));
 
@@ -557,8 +560,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("ftrace", "print")));
 
@@ -598,8 +600,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
 
   FtraceMetadata metadata{};
   CompactSchedBuffer compact_buffer;
@@ -662,8 +663,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("ftrace", "print")));
 
@@ -772,8 +772,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
 
@@ -819,8 +818,8 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   EnabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config{
+      EventFilter{}, EnabledCompactSchedConfigForTesting(), {}, {}};
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
 
@@ -1160,8 +1159,7 @@
   BundleProvider bundle_provider(base::kPageSize);
   ProtoTranslationTable* table = GetTable("synthetic");
   FtraceMetadata metadata{};
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
 
@@ -1618,8 +1616,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
 
@@ -2063,8 +2060,7 @@
   ProtoTranslationTable* table = GetTable(test_case->name);
   auto page = PageFromXxd(test_case->data);
 
-  FtraceDataSourceConfig ds_config{EventFilter{},
-                                   DisabledCompactSchedConfigForTesting()};
+  FtraceDataSourceConfig ds_config = EmptyConfig();
   ds_config.event_filter.AddEnabledEvent(
       table->EventToFtraceId(GroupAndName("sched", "sched_switch")));
 
diff --git a/src/traced/probes/ftrace/ftrace_config_muxer.cc b/src/traced/probes/ftrace/ftrace_config_muxer.cc
index f0cf926..5a93e5f 100644
--- a/src/traced/probes/ftrace/ftrace_config_muxer.cc
+++ b/src/traced/probes/ftrace/ftrace_config_muxer.cc
@@ -22,6 +22,7 @@
 #include <unistd.h>
 
 #include <algorithm>
+#include <iterator>
 
 #include "perfetto/ext/base/utils.h"
 #include "protos/perfetto/trace/ftrace/sched.pbzero.h"
@@ -67,6 +68,28 @@
                         event.substr(slash_pos + 1));
 }
 
+void UnionInPlace(const std::vector<std::string>& unsorted_a,
+                  std::vector<std::string>* out) {
+  std::vector<std::string> a = unsorted_a;
+  std::sort(a.begin(), a.end());
+  std::sort(out->begin(), out->end());
+  std::vector<std::string> v;
+  std::set_union(a.begin(), a.end(), out->begin(), out->end(),
+                 std::back_inserter(v));
+  *out = std::move(v);
+}
+
+void IntersectInPlace(const std::vector<std::string>& unsorted_a,
+                      std::vector<std::string>* out) {
+  std::vector<std::string> a = unsorted_a;
+  std::sort(a.begin(), a.end());
+  std::sort(out->begin(), out->end());
+  std::vector<std::string> v;
+  std::set_intersection(a.begin(), a.end(), out->begin(), out->end(),
+                        std::back_inserter(v));
+  *out = std::move(v);
+}
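+
+// Example: with v == {"a", "b"}, UnionInPlace({"b", "c"}, &v) leaves
+// v == {"a", "b", "c"} and IntersectInPlace({"b", "c"}, &v) leaves
+// v == {"b"}. Both helpers sort their inputs, so callers need not keep the
+// vectors ordered.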
+
 }  // namespace
 
 std::set<GroupAndName> FtraceConfigMuxer::GetFtraceEvents(
@@ -453,10 +476,14 @@
   auto compact_sched =
       CreateCompactSchedConfig(request, table_->compact_sched_format());
 
-  FtraceConfigId id = ++last_id_;
-  ds_configs_.emplace(std::piecewise_construct, std::forward_as_tuple(id),
-                      std::forward_as_tuple(std::move(filter), compact_sched));
+  std::vector<std::string> apps(request.atrace_apps());
+  std::vector<std::string> categories(request.atrace_categories());
 
+  FtraceConfigId id = ++last_id_;
+  ds_configs_.emplace(
+      std::piecewise_construct, std::forward_as_tuple(id),
+      std::forward_as_tuple(std::move(filter), compact_sched, std::move(apps),
+                            std::move(categories)));
   return id;
 }
 
@@ -487,9 +514,29 @@
   if (!config_id || !ds_configs_.erase(config_id))
     return false;
   EventFilter expected_ftrace_events;
-  for (const auto& ds_config : ds_configs_) {
-    expected_ftrace_events.EnableEventsFrom(ds_config.second.event_filter);
+  std::vector<std::string> expected_apps;
+  std::vector<std::string> expected_categories;
+  for (const auto& id_config : ds_configs_) {
+    const perfetto::FtraceDataSourceConfig& config = id_config.second;
+    expected_ftrace_events.EnableEventsFrom(config.event_filter);
+    UnionInPlace(config.atrace_apps, &expected_apps);
+    UnionInPlace(config.atrace_categories, &expected_categories);
   }
+  // At this point expected_{apps,categories} contains the union of the
+  // leftover configs (if any) that should still be on. However, we did not
+  // necessarily succeed in turning on atrace for each of those configs
+  // previously, so we now intersect the {apps,categories} that we *did*
+  // manage to turn on with those we want on, to determine the new state we
+  // should aim for:
+  IntersectInPlace(current_state_.atrace_apps, &expected_apps);
+  IntersectInPlace(current_state_.atrace_categories, &expected_categories);
+  // Work out if there is any difference between the current state and the
+  // desired state: it's sufficient to compare sizes here, since we know from
+  // above that expected_{apps,categories} is now a subset of
+  // current_state_.atrace_{apps,categories}:
+  bool atrace_changed =
+      (current_state_.atrace_apps.size() != expected_apps.size()) ||
+      (current_state_.atrace_categories.size() != expected_categories.size());
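+  // Example: if current_state_ has {cat_a, cat_b} enabled but the leftover
+  // configs only want {cat_a}, expected_categories becomes {cat_a}; the sizes
+  // differ (2 vs 1), so atrace_changed is true and atrace is restarted below
+  // with the narrower set.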
 
   // Disable any events that are currently enabled, but are not in any configs
   // anymore.
@@ -524,8 +571,22 @@
       current_state_.cpu_buffer_size_pages = 1;
     ftrace_->DisableAllEvents();
     ftrace_->ClearTrace();
-    if (current_state_.atrace_on)
+  }
+
+  if (current_state_.atrace_on) {
+    if (expected_apps.empty() && expected_categories.empty()) {
       DisableAtrace();
+    } else if (atrace_changed) {
+      // Update atrace to remove the categories/apps that are no longer
+      // wanted. For some categories this won't actually disable them (e.g.
+      // categories that just enable ftrace events); for those there is
+      // nothing we can do until the last ftrace config is removed.
+      if (StartAtrace(expected_apps, expected_categories)) {
+        // Update current_state_ to reflect this change.
+        current_state_.atrace_apps = expected_apps;
+        current_state_.atrace_categories = expected_categories;
+      }
+    }
   }
 
   return true;
@@ -564,29 +625,58 @@
 }
 
 void FtraceConfigMuxer::UpdateAtrace(const FtraceConfig& request) {
+  // We want to avoid poisoning current_state_.atrace_{categories, apps}
+  // if for some reason these args make atrace unhappy, so we stash the
+  // union into temps and only update current_state_ if we successfully
+  // run atrace.
+
+  std::vector<std::string> combined_categories = request.atrace_categories();
+  UnionInPlace(current_state_.atrace_categories, &combined_categories);
+
+  std::vector<std::string> combined_apps = request.atrace_apps();
+  UnionInPlace(current_state_.atrace_apps, &combined_apps);
+
+  if (current_state_.atrace_on &&
+      combined_apps.size() == current_state_.atrace_apps.size() &&
+      combined_categories.size() == current_state_.atrace_categories.size()) {
+    return;
+  }
+
+  if (StartAtrace(combined_apps, combined_categories)) {
+    current_state_.atrace_categories = combined_categories;
+    current_state_.atrace_apps = combined_apps;
+    current_state_.atrace_on = true;
+  }
+}
+
+// static
+bool FtraceConfigMuxer::StartAtrace(
+    const std::vector<std::string>& apps,
+    const std::vector<std::string>& categories) {
   PERFETTO_DLOG("Update atrace config...");
 
   std::vector<std::string> args;
   args.push_back("atrace");  // argv0 for exec()
   args.push_back("--async_start");
   args.push_back("--only_userspace");
-  for (const auto& category : request.atrace_categories())
+
+  for (const auto& category : categories)
     args.push_back(category);
-  if (!request.atrace_apps().empty()) {
+
+  if (!apps.empty()) {
     args.push_back("-a");
     std::string arg = "";
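+    // Build a comma-separated list, e.g. {"app_a", "app_b"} -> "app_a,app_b".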
-    for (const auto& app : request.atrace_apps()) {
+    for (const auto& app : apps) {
       arg += app;
-      if (app != request.atrace_apps().back())
-        arg += ",";
+      arg += ",";
     }
+    arg.resize(arg.size() - 1);
     args.push_back(arg);
   }
 
-  if (RunAtrace(args))
-    current_state_.atrace_on = true;
-
-  PERFETTO_DLOG("...done");
+  bool result = RunAtrace(args);
+  PERFETTO_DLOG("...done (%s)", result ? "success" : "fail");
+  return result;
 }
 
 void FtraceConfigMuxer::DisableAtrace() {
@@ -594,8 +684,11 @@
 
   PERFETTO_DLOG("Stop atrace...");
 
-  if (RunAtrace({"atrace", "--async_stop", "--only_userspace"}))
+  if (RunAtrace({"atrace", "--async_stop", "--only_userspace"})) {
+    current_state_.atrace_categories.clear();
+    current_state_.atrace_apps.clear();
     current_state_.atrace_on = false;
+  }
 
   PERFETTO_DLOG("...done");
 }
diff --git a/src/traced/probes/ftrace/ftrace_config_muxer.h b/src/traced/probes/ftrace/ftrace_config_muxer.h
index 416a38b..16883c4 100644
--- a/src/traced/probes/ftrace/ftrace_config_muxer.h
+++ b/src/traced/probes/ftrace/ftrace_config_muxer.h
@@ -32,8 +32,13 @@
 // that data source's config.
 struct FtraceDataSourceConfig {
   FtraceDataSourceConfig(EventFilter _event_filter,
-                         CompactSchedConfig _compact_sched)
-      : event_filter(std::move(_event_filter)), compact_sched(_compact_sched) {}
+                         CompactSchedConfig _compact_sched,
+                         std::vector<std::string> _atrace_apps,
+                         std::vector<std::string> _atrace_categories)
+      : event_filter(std::move(_event_filter)),
+        compact_sched(_compact_sched),
+        atrace_apps(std::move(_atrace_apps)),
+        atrace_categories(std::move(_atrace_categories)) {}
 
   // The event filter allows to quickly check if a certain ftrace event with id
   // x is enabled for this data source.
@@ -41,6 +46,10 @@
 
   // Configuration of the optional compact encoding of scheduling events.
   const CompactSchedConfig compact_sched;
+
+  // Used only in Android for ATRACE_EVENT/os.Trace() userspace annotations.
+  std::vector<std::string> atrace_apps;
+  std::vector<std::string> atrace_categories;
 };
 
 // Ftrace is a bunch of globally modifiable persistent state.
@@ -103,11 +112,17 @@
   }
 
  private:
+  static bool StartAtrace(const std::vector<std::string>& apps,
+                          const std::vector<std::string>& categories);
+
   struct FtraceState {
     EventFilter ftrace_events;
+    // Used only in Android for ATRACE_EVENT/os.Trace() userspace annotations.
+    std::vector<std::string> atrace_apps;
+    std::vector<std::string> atrace_categories;
+    size_t cpu_buffer_size_pages = 0;
     bool tracing_on = false;
     bool atrace_on = false;
-    size_t cpu_buffer_size_pages = 0;
   };
 
   FtraceConfigMuxer(const FtraceConfigMuxer&) = delete;
diff --git a/src/traced/probes/ftrace/ftrace_config_muxer_unittest.cc b/src/traced/probes/ftrace/ftrace_config_muxer_unittest.cc
index 890e07a..0b069c2 100644
--- a/src/traced/probes/ftrace/ftrace_config_muxer_unittest.cc
+++ b/src/traced/probes/ftrace/ftrace_config_muxer_unittest.cc
@@ -509,7 +509,7 @@
       atrace,
       RunAtrace(ElementsAreArray(
           {"atrace", "--async_start", "--only_userspace", "-a",
-           "com.google.android.gms.persistent,com.google.android.gms"})))
+           "com.google.android.gms,com.google.android.gms.persistent"})))
       .WillOnce(Return(true));
 
   FtraceConfigId id = model.SetupConfig(config);
@@ -526,6 +526,211 @@
   ASSERT_TRUE(model.RemoveConfig(id));
 }
 
+TEST_F(FtraceConfigMuxerTest, AtraceMultipleConfigs) {
+  NiceMock<MockFtraceProcfs> ftrace;
+  MockRunAtrace atrace;
+
+  FtraceConfig config_a = CreateFtraceConfig({});
+  *config_a.add_atrace_apps() = "app_a";
+  *config_a.add_atrace_categories() = "cat_a";
+
+  FtraceConfig config_b = CreateFtraceConfig({});
+  *config_b.add_atrace_apps() = "app_b";
+  *config_b.add_atrace_categories() = "cat_b";
+
+  FtraceConfig config_c = CreateFtraceConfig({});
+  *config_c.add_atrace_apps() = "app_c";
+  *config_c.add_atrace_categories() = "cat_c";
+
+  FtraceConfigMuxer model(&ftrace, table_.get());
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                                  "--only_userspace", "cat_a",
+                                                  "-a", "app_a"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_a = model.SetupConfig(config_a);
+  ASSERT_TRUE(id_a);
+
+  EXPECT_CALL(
+      atrace,
+      RunAtrace(ElementsAreArray({"atrace", "--async_start", "--only_userspace",
+                                  "cat_a", "cat_b", "-a", "app_a,app_b"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_b = model.SetupConfig(config_b);
+  ASSERT_TRUE(id_b);
+
+  EXPECT_CALL(atrace,
+              RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                          "--only_userspace", "cat_a", "cat_b",
+                                          "cat_c", "-a", "app_a,app_b,app_c"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_c = model.SetupConfig(config_c);
+  ASSERT_TRUE(id_c);
+
+  EXPECT_CALL(
+      atrace,
+      RunAtrace(ElementsAreArray({"atrace", "--async_start", "--only_userspace",
+                                  "cat_a", "cat_c", "-a", "app_a,app_c"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_b));
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                                  "--only_userspace", "cat_c",
+                                                  "-a", "app_c"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_a));
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray(
+                          {"atrace", "--async_stop", "--only_userspace"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_c));
+}
+
+TEST_F(FtraceConfigMuxerTest, AtraceFailedConfig) {
+  NiceMock<MockFtraceProcfs> ftrace;
+  MockRunAtrace atrace;
+
+  FtraceConfig config_a = CreateFtraceConfig({});
+  *config_a.add_atrace_apps() = "app_1";
+  *config_a.add_atrace_apps() = "app_2";
+  *config_a.add_atrace_categories() = "cat_1";
+  *config_a.add_atrace_categories() = "cat_2";
+
+  FtraceConfig config_b = CreateFtraceConfig({});
+  *config_b.add_atrace_apps() = "app_fail";
+  *config_b.add_atrace_categories() = "cat_fail";
+
+  FtraceConfig config_c = CreateFtraceConfig({});
+  *config_c.add_atrace_apps() = "app_1";
+  *config_c.add_atrace_apps() = "app_3";
+  *config_c.add_atrace_categories() = "cat_1";
+  *config_c.add_atrace_categories() = "cat_3";
+
+  FtraceConfigMuxer model(&ftrace, table_.get());
+
+  EXPECT_CALL(
+      atrace,
+      RunAtrace(ElementsAreArray({"atrace", "--async_start", "--only_userspace",
+                                  "cat_1", "cat_2", "-a", "app_1,app_2"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_a = model.SetupConfig(config_a);
+  ASSERT_TRUE(id_a);
+
+  EXPECT_CALL(atrace,
+              RunAtrace(ElementsAreArray(
+                  {"atrace", "--async_start", "--only_userspace", "cat_1",
+                   "cat_2", "cat_fail", "-a", "app_1,app_2,app_fail"})))
+      .WillOnce(Return(false));
+  FtraceConfigId id_b = model.SetupConfig(config_b);
+  ASSERT_TRUE(id_b);
+
+  EXPECT_CALL(atrace,
+              RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                          "--only_userspace", "cat_1", "cat_2",
+                                          "cat_3", "-a", "app_1,app_2,app_3"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_c = model.SetupConfig(config_c);
+  ASSERT_TRUE(id_c);
+
+  EXPECT_CALL(
+      atrace,
+      RunAtrace(ElementsAreArray({"atrace", "--async_start", "--only_userspace",
+                                  "cat_1", "cat_2", "-a", "app_1,app_2"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_c));
+
+  // Removing the config we failed to enable doesn't change the atrace state,
+  // so we don't expect a call here.
+  ASSERT_TRUE(model.RemoveConfig(id_b));
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray(
+                          {"atrace", "--async_stop", "--only_userspace"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_a));
+}
+
+TEST_F(FtraceConfigMuxerTest, AtraceDuplicateConfigs) {
+  NiceMock<MockFtraceProcfs> ftrace;
+  MockRunAtrace atrace;
+
+  FtraceConfig config_a = CreateFtraceConfig({});
+  *config_a.add_atrace_apps() = "app_1";
+  *config_a.add_atrace_categories() = "cat_1";
+
+  FtraceConfig config_b = CreateFtraceConfig({});
+  *config_b.add_atrace_apps() = "app_1";
+  *config_b.add_atrace_categories() = "cat_1";
+
+  FtraceConfigMuxer model(&ftrace, table_.get());
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                                  "--only_userspace", "cat_1",
+                                                  "-a", "app_1"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_a = model.SetupConfig(config_a);
+  ASSERT_TRUE(id_a);
+
+  FtraceConfigId id_b = model.SetupConfig(config_b);
+  ASSERT_TRUE(id_b);
+
+  ASSERT_TRUE(model.RemoveConfig(id_a));
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray(
+                          {"atrace", "--async_stop", "--only_userspace"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_b));
+}
+
+TEST_F(FtraceConfigMuxerTest, AtraceAndFtraceConfigs) {
+  NiceMock<MockFtraceProcfs> ftrace;
+  MockRunAtrace atrace;
+
+  FtraceConfig config_a = CreateFtraceConfig({"sched/sched_cpu_hotplug"});
+
+  FtraceConfig config_b = CreateFtraceConfig({"sched/sched_switch"});
+  *config_b.add_atrace_categories() = "b";
+
+  FtraceConfig config_c = CreateFtraceConfig({"sched/sched_switch"});
+
+  FtraceConfig config_d = CreateFtraceConfig({"sched/sched_cpu_hotplug"});
+  *config_d.add_atrace_categories() = "d";
+
+  FtraceConfigMuxer model(&ftrace, table_.get());
+
+  FtraceConfigId id_a = model.SetupConfig(config_a);
+  ASSERT_TRUE(id_a);
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                                  "--only_userspace", "b"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_b = model.SetupConfig(config_b);
+  ASSERT_TRUE(id_b);
+
+  FtraceConfigId id_c = model.SetupConfig(config_c);
+  ASSERT_TRUE(id_c);
+
+  EXPECT_CALL(atrace,
+              RunAtrace(ElementsAreArray(
+                  {"atrace", "--async_start", "--only_userspace", "b", "d"})))
+      .WillOnce(Return(true));
+  FtraceConfigId id_d = model.SetupConfig(config_d);
+  ASSERT_TRUE(id_d);
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray({"atrace", "--async_start",
+                                                  "--only_userspace", "b"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_d));
+
+  ASSERT_TRUE(model.RemoveConfig(id_c));
+
+  EXPECT_CALL(atrace, RunAtrace(ElementsAreArray(
+                          {"atrace", "--async_stop", "--only_userspace"})))
+      .WillOnce(Return(true));
+  ASSERT_TRUE(model.RemoveConfig(id_b));
+
+  ASSERT_TRUE(model.RemoveConfig(id_a));
+}
+
 TEST_F(FtraceConfigMuxerTest, SetupClockForTesting) {
   MockFtraceProcfs ftrace;
   FtraceConfig config;
diff --git a/src/tracing/BUILD.gn b/src/tracing/BUILD.gn
index c93d5a5..936eb2d 100644
--- a/src/tracing/BUILD.gn
+++ b/src/tracing/BUILD.gn
@@ -17,6 +17,55 @@
 import("../../gn/perfetto.gni")
 import("../../gn/test.gni")
 
+# Full version of the client API. Supports both the in-process backend and the
+# system backend (on posix systems, and only if the enable_perfetto_ipc build
+# flag is set). It has a larger binary footprint due to the service code for
+# the in-process backend.
+group("client_api") {
+  public_deps = [
+    ":client_api_base",
+    "../../gn:default_deps",
+  ]
+  deps = [ ":in_process_backend" ]
+  if (enable_perfetto_ipc) {
+    deps += [ ":system_process_backend" ]
+  } else {
+    deps += [ ":system_process_backend_fake" ]
+  }
+}
+
+# Slim version of the client API. Works only with the system backend (traced
+# connection over a UNIX socket). Has a lighter binary size impact.
+if (enable_perfetto_ipc) {
+  source_set("client_api_system_backend_only") {
+    public_deps = [
+      ":client_api_base",
+      "../../gn:default_deps",
+    ]
+    deps = [
+      ":in_process_backend_fake",
+      ":system_process_backend",
+    ]
+  }
+}
+
+# This target checks only that the "fake" backends compile. This is to detect
+# early if we break them with refactorings, without waiting for TreeHugger or
+# rolls into chromium.
+if (perfetto_build_standalone) {
+  shared_library("client_api_no_backends_compile_test") {
+    public_deps = [
+      ":client_api_base",
+      "../../gn:default_deps",
+    ]
+    deps = [
+      ":in_process_backend_fake",
+      ":platform_fake",
+      ":system_process_backend_fake",
+    ]
+  }
+}
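+
+# Embedders should depend on exactly one of ":client_api" (both backends) or
+# ":client_api_system_backend_only". The *_fake backend targets keep the
+# unused backend linkable; at runtime they just log an error and return
+# nullptr.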
+
 # Separate target because the embedder might not want this (e.g. on Windows).
 if (is_linux || is_mac || is_android) {
   source_set("platform_posix") {
@@ -51,13 +100,13 @@
   sources = [ "trace_writer_base.cc" ]
 }
 
-source_set("client_api") {
+# Base target for the client API. On its own doesn't provide any backend.
+source_set("client_api_base") {
   deps = [
     "../../include/perfetto/tracing/core",
     "../../protos/perfetto/config:cpp",
     "../base",
     "core",
-    "core:service",
   ]
   public_deps = [
     "../../gn:default_deps",
@@ -67,8 +116,8 @@
     "data_source.cc",
     "debug_annotation.cc",
     "event_context.cc",
-    "internal/in_process_tracing_backend.cc",
     "internal/in_process_tracing_backend.h",
+    "internal/system_tracing_backend.h",
     "internal/tracing_muxer_impl.cc",
     "internal/tracing_muxer_impl.h",
     "internal/track_event_internal.cc",
@@ -78,16 +127,59 @@
     "track_event_category_registry.cc",
     "virtual_destructors.cc",
   ]
+}
 
-  if (enable_perfetto_ipc) {
-    deps += [
+# System backend: connects to an external "traced" instance via a UNIX socket.
+# Requires the IPC layer and is supported only on posix systems.
+if (enable_perfetto_ipc) {
+  source_set("system_process_backend") {
+    public_deps = [ "../../include/perfetto/tracing" ]
+    deps = [
+      ":client_api_base",
+      "../../gn:default_deps",
+      "../../include/perfetto/tracing/core",
+      "../base",
       "ipc/consumer",
       "ipc/producer",
       "ipc/service",
     ]
-    sources += [
-      "internal/system_tracing_backend.cc",
-      "internal/system_tracing_backend.h",
-    ]
+    sources = [ "internal/system_tracing_backend.cc" ]
   }
 }
+
+# System backend fallback: it prints an error message and returns nullptr.
+source_set("system_process_backend_fake") {
+  public_deps = [ "../../include/perfetto/tracing" ]
+  deps = [
+    ":client_api_base",
+    "../../gn:default_deps",
+    "../base",
+  ]
+  sources = [ "internal/system_tracing_backend_fake.cc" ]
+}
+
+# In-process backend: starts the tracing service in-process on a dedicated
+# thread. It depends only on having a valid "platform" target. It has a larger
+# binary size cost because it links in all the service code.
+source_set("in_process_backend") {
+  public_deps = [ "../../include/perfetto/tracing" ]
+  deps = [
+    ":client_api_base",
+    "../../gn:default_deps",
+    "../../include/perfetto/tracing/core",
+    "../base",
+    "core:service",
+  ]
+  sources = [ "internal/in_process_tracing_backend.cc" ]
+}
+
+# In-process backend fallback: it prints an error message and returns nullptr.
+source_set("in_process_backend_fake") {
+  public_deps = [ "../../include/perfetto/tracing" ]
+  deps = [
+    ":client_api_base",
+    "../../gn:default_deps",
+    "../base",
+  ]
+  sources = [ "internal/in_process_tracing_backend_fake.cc" ]
+}
diff --git a/src/tracing/core/shared_memory_arbiter_impl_unittest.cc b/src/tracing/core/shared_memory_arbiter_impl_unittest.cc
index 91bd5db..036a8a3 100644
--- a/src/tracing/core/shared_memory_arbiter_impl_unittest.cc
+++ b/src/tracing/core/shared_memory_arbiter_impl_unittest.cc
@@ -50,7 +50,7 @@
       BufferExhaustedPolicy) override {
     return nullptr;
   }
-  SharedMemoryArbiter* GetInProcessShmemArbiter() override { return nullptr; }
+  SharedMemoryArbiter* MaybeSharedMemoryArbiter() override { return nullptr; }
 
   MOCK_METHOD2(CommitData, void(const CommitDataRequest&, CommitDataCallback));
   MOCK_METHOD2(RegisterTraceWriter, void(uint32_t, uint32_t));
diff --git a/src/tracing/core/tracing_service_impl.cc b/src/tracing/core/tracing_service_impl.cc
index 394b784..24e8a0b 100644
--- a/src/tracing/core/tracing_service_impl.cc
+++ b/src/tracing/core/tracing_service_impl.cc
@@ -2820,7 +2820,7 @@
 }
 
 SharedMemoryArbiter*
-TracingServiceImpl::ProducerEndpointImpl::GetInProcessShmemArbiter() {
+TracingServiceImpl::ProducerEndpointImpl::MaybeSharedMemoryArbiter() {
   if (!inproc_shmem_arbiter_) {
     PERFETTO_FATAL(
         "The in-process SharedMemoryArbiter can only be used when "
@@ -2837,14 +2837,16 @@
 TracingServiceImpl::ProducerEndpointImpl::CreateTraceWriter(
     BufferID buf_id,
     BufferExhaustedPolicy buffer_exhausted_policy) {
-  return GetInProcessShmemArbiter()->CreateTraceWriter(buf_id,
+  PERFETTO_DCHECK(MaybeSharedMemoryArbiter());
+  return MaybeSharedMemoryArbiter()->CreateTraceWriter(buf_id,
                                                        buffer_exhausted_policy);
 }
 
 void TracingServiceImpl::ProducerEndpointImpl::NotifyFlushComplete(
     FlushRequestID id) {
   PERFETTO_DCHECK_THREAD(thread_checker_);
-  return GetInProcessShmemArbiter()->NotifyFlushComplete(id);
+  PERFETTO_DCHECK(MaybeSharedMemoryArbiter());
+  return MaybeSharedMemoryArbiter()->NotifyFlushComplete(id);
 }
 
 void TracingServiceImpl::ProducerEndpointImpl::OnTracingSetup() {
diff --git a/src/tracing/core/tracing_service_impl.h b/src/tracing/core/tracing_service_impl.h
index 61c6ef6..eb53207 100644
--- a/src/tracing/core/tracing_service_impl.h
+++ b/src/tracing/core/tracing_service_impl.h
@@ -90,7 +90,7 @@
     std::unique_ptr<TraceWriter> CreateTraceWriter(
         BufferID,
         BufferExhaustedPolicy) override;
-    SharedMemoryArbiter* GetInProcessShmemArbiter() override;
+    SharedMemoryArbiter* MaybeSharedMemoryArbiter() override;
     void NotifyFlushComplete(FlushRequestID) override;
     void NotifyDataSourceStarted(DataSourceInstanceID) override;
     void NotifyDataSourceStopped(DataSourceInstanceID) override;
diff --git a/src/tracing/internal/in_process_tracing_backend_fake.cc b/src/tracing/internal/in_process_tracing_backend_fake.cc
new file mode 100644
index 0000000..7ffafa8
--- /dev/null
+++ b/src/tracing/internal/in_process_tracing_backend_fake.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/tracing/internal/in_process_tracing_backend.h"
+
+#include "perfetto/base/logging.h"
+
+namespace perfetto {
+namespace internal {
+
+// static
+InProcessTracingBackend* InProcessTracingBackend::GetInstance() {
+  PERFETTO_ELOG(
+      "In-process tracing backend not supported by the current build "
+      "configuration");
+  return nullptr;
+}
+
+}  // namespace internal
+}  // namespace perfetto
diff --git a/src/tracing/internal/system_tracing_backend_fake.cc b/src/tracing/internal/system_tracing_backend_fake.cc
new file mode 100644
index 0000000..6a940ae
--- /dev/null
+++ b/src/tracing/internal/system_tracing_backend_fake.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/tracing/internal/system_tracing_backend.h"
+
+#include "perfetto/base/logging.h"
+
+namespace perfetto {
+namespace internal {
+
+// static
+SystemTracingBackend* SystemTracingBackend::GetInstance() {
+  PERFETTO_ELOG(
+      "System tracing backend not supported by the current build "
+      "configuration");
+  return nullptr;
+}
+
+}  // namespace internal
+}  // namespace perfetto
diff --git a/src/tracing/internal/tracing_muxer_impl.cc b/src/tracing/internal/tracing_muxer_impl.cc
index a185f25..e63bc11 100644
--- a/src/tracing/internal/tracing_muxer_impl.cc
+++ b/src/tracing/internal/tracing_muxer_impl.cc
@@ -421,6 +421,12 @@
   PERFETTO_DCHECK_THREAD(thread_checker_);  // Rebind the thread checker.
 
   auto add_backend = [this, &args](TracingBackend* backend, BackendType type) {
+    if (!backend) {
+      // We skip the log in release builds because the *_backend_fake.cc code
+      // already has an ELOG before returning nullptr.
+      PERFETTO_DLOG("Backend creation failed, type %d", static_cast<int>(type));
+      return;
+    }
     TracingBackendId backend_id = backends_.size();
     backends_.emplace_back();
     RegisteredBackend& rb = backends_.back();
@@ -437,13 +443,12 @@
     rb.producer->Initialize(rb.backend->ConnectProducer(conn_args));
   };
 
-  if (args.backends & kSystemBackend) {
-#if (PERFETTO_BUILDFLAG(PERFETTO_IPC))
+  // Both the system and the in-process backends can be disabled at build time
+  // and replaced with the _fake.cc versions. The "fake" versions will just
+  // ELOG() and return nullptr.
+
+  if (args.backends & kSystemBackend)
     add_backend(SystemTracingBackend::GetInstance(), kSystemBackend);
-#else
-    PERFETTO_ELOG("System backend not supporteed in the current configuration");
-#endif
-  }
 
   if (args.backends & kInProcessBackend)
     add_backend(InProcessTracingBackend::GetInstance(), kInProcessBackend);
diff --git a/src/tracing/ipc/producer/producer_ipc_client_impl.cc b/src/tracing/ipc/producer/producer_ipc_client_impl.cc
index 396bf4c..f06801a 100644
--- a/src/tracing/ipc/producer/producer_ipc_client_impl.cc
+++ b/src/tracing/ipc/producer/producer_ipc_client_impl.cc
@@ -358,9 +358,8 @@
                                                    buffer_exhausted_policy);
 }
 
-SharedMemoryArbiter* ProducerIPCClientImpl::GetInProcessShmemArbiter() {
-  PERFETTO_DLOG("Cannot GetInProcessShmemArbiter() via the IPC layer.");
-  return nullptr;
+SharedMemoryArbiter* ProducerIPCClientImpl::MaybeSharedMemoryArbiter() {
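+  // May legitimately be null here, e.g. before the service has set up and
+  // shared the memory buffer with this producer.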
+  return shared_memory_arbiter_.get();
 }
 
 void ProducerIPCClientImpl::NotifyFlushComplete(FlushRequestID req_id) {
diff --git a/src/tracing/ipc/producer/producer_ipc_client_impl.h b/src/tracing/ipc/producer/producer_ipc_client_impl.h
index e556fbb..68feeea 100644
--- a/src/tracing/ipc/producer/producer_ipc_client_impl.h
+++ b/src/tracing/ipc/producer/producer_ipc_client_impl.h
@@ -76,7 +76,7 @@
   std::unique_ptr<TraceWriter> CreateTraceWriter(
       BufferID target_buffer,
       BufferExhaustedPolicy) override;
-  SharedMemoryArbiter* GetInProcessShmemArbiter() override;
+  SharedMemoryArbiter* MaybeSharedMemoryArbiter() override;
   void NotifyFlushComplete(FlushRequestID) override;
   SharedMemory* shared_memory() const override;
   size_t shared_buffer_page_size_kb() const override;
diff --git a/src/tracing/test/BUILD.gn b/src/tracing/test/BUILD.gn
index 06d3913..6a84532 100644
--- a/src/tracing/test/BUILD.gn
+++ b/src/tracing/test/BUILD.gn
@@ -53,24 +53,22 @@
   }
 }
 
-perfetto_unittest_source_set("tracing_integration_test") {
-  testonly = true
-  deps = [
-    ":test_support",
-    "../../../gn:default_deps",
-    "../../../gn:gtest_and_gmock",
-    "../../base",
-    "../../base:test_support",
-    "../core:service",
-  ]
-  if (enable_perfetto_ipc) {
-    deps += [
+if (enable_perfetto_ipc) {
+  perfetto_unittest_source_set("tracing_integration_test") {
+    testonly = true
+    deps = [
+      ":test_support",
+      "../../../gn:default_deps",
+      "../../../gn:gtest_and_gmock",
+      "../../base",
+      "../../base:test_support",
+      "../core:service",
       "../ipc/consumer",
       "../ipc/producer",
       "../ipc/service",
     ]
+    sources = [ "tracing_integration_test.cc" ]
   }
-  sources = [ "tracing_integration_test.cc" ]
 }
 
 if (enable_perfetto_integration_tests) {
diff --git a/src/tracing/test/fake_producer_endpoint.h b/src/tracing/test/fake_producer_endpoint.h
index fc364cf..c01ef91 100644
--- a/src/tracing/test/fake_producer_endpoint.h
+++ b/src/tracing/test/fake_producer_endpoint.h
@@ -44,7 +44,7 @@
       BufferExhaustedPolicy) override {
     return nullptr;
   }
-  SharedMemoryArbiter* GetInProcessShmemArbiter() override { return nullptr; }
+  SharedMemoryArbiter* MaybeSharedMemoryArbiter() override { return nullptr; }
 
   CommitDataRequest last_commit_data_request;
   CommitDataCallback last_commit_data_callback;
diff --git a/test/metrics/java_heap_stats.out b/test/metrics/java_heap_stats.out
index 8ba3b96..53b9fb6 100644
--- a/test/metrics/java_heap_stats.out
+++ b/test/metrics/java_heap_stats.out
@@ -7,7 +7,7 @@
     }
     samples {
       ts: 10
-      heap_size: 480
+      heap_size: 736
       reachable_heap_size: 96
     }
   }
diff --git a/test/synth_common.py b/test/synth_common.py
index 09bf07e..036f9b7 100644
--- a/test/synth_common.py
+++ b/test/synth_common.py
@@ -344,6 +344,18 @@
     debug_marker.object = obj
     debug_marker.object_name = obj_name
 
+  def add_vk_queue_submit(self, ts, dur, pid, tid, vk_queue, vk_command_buffers,
+                          submission_id):
+    packet = self.add_packet()
+    packet.timestamp = ts
+    submit = packet.vulkan_api_event.vk_queue_submit
+    submit.duration_ns = dur
+    submit.pid = pid
+    submit.tid = tid
+    for cmd in vk_command_buffers:
+      submit.vk_command_buffers.append(cmd)
+    submit.submission_id = submission_id
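+    # Note: vk_queue is currently not serialized into the packet; only the
+    # duration, pid/tid, command buffers and submission_id are written here.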
+
   def add_gpu_log(self, ts, severity, tag, message):
     packet = self.add_packet()
     packet.timestamp = ts
diff --git a/test/trace_processor/gpu_render_stages.out b/test/trace_processor/gpu_render_stages.out
index dfeb78d..0cfaaf6 100644
--- a/test/trace_processor/gpu_render_stages.out
+++ b/test/trace_processor/gpu_render_stages.out
@@ -1,26 +1,18 @@
-"track_name","track_desc","ts","dur","slice_name","depth","flat_key","string_value","context_id","render_target","render_pass","command_buffer","submission_id","hw_queue_id"
-"queue 1","queue desc 1",0,5,"render stage(1)",0,"[NULL]","[NULL]",0,0,0,0,0,1
-"queue 0","queue desc 0",0,5,"stage 0",0,"[NULL]","[NULL]",42,0,0,0,0,0
-"queue 1","queue desc 1",10,5,"stage 1",0,"description","stage desc 1",42,0,0,0,0,1
-"queue 2","[NULL]",20,5,"stage 2",0,"[NULL]","[NULL]",42,0,0,0,0,2
-"queue 0","queue desc 0",30,5,"stage 3",0,"[NULL]","[NULL]",42,0,0,0,0,0
-"Unknown GPU Queue 3","[NULL]",40,5,"render stage(4)",0,"[NULL]","[NULL]",42,0,0,0,0,3
-"queue 0","queue desc 0",50,5,"stage 0",0,"key1","value1",42,0,0,0,0,0
-"queue 0","queue desc 0",60,5,"stage 0",0,"key2","value2",42,0,0,0,0,0
-"queue 0","queue desc 0",60,5,"stage 0",0,"key1","value1",42,0,0,0,0,0
-"queue 0","queue desc 0",70,5,"stage 0",0,"key1","[NULL]",42,0,0,0,0,0
-"queue 0","queue desc 0",80,5,"stage 2",0,"[NULL]","[NULL]",42,0,0,0,0,0
-"queue 0","queue desc 0",90,5,"stage 0",0,"VkCommandBuffer","0x0000000000000030",42,16,32,48,0,0
-"queue 0","queue desc 0",90,5,"stage 0",0,"VkFramebuffer","0x0000000000000010",42,16,32,48,0,0
-"queue 0","queue desc 0",90,5,"stage 0",0,"VkRenderPass","0x0000000000000020",42,16,32,48,0,0
-"queue 0","queue desc 0",100,5,"stage 0",0,"VkCommandBuffer","0x0000000000000010 (command_buffer)",42,16,16,16,0,0
-"queue 0","queue desc 0",100,5,"stage 0",0,"VkFramebuffer","0x0000000000000010",42,16,16,16,0,0
-"queue 0","queue desc 0",100,5,"stage 0",0,"VkRenderPass","0x0000000000000010",42,16,16,16,0,0
-"queue 0","queue desc 0",110,5,"stage 0",0,"VkCommandBuffer","0x0000000000000010 (command_buffer)",42,16,16,16,0,0
-"queue 0","queue desc 0",110,5,"stage 0",0,"VkFramebuffer","0x0000000000000010",42,16,16,16,0,0
-"queue 0","queue desc 0",110,5,"stage 0",0,"VkRenderPass","0x0000000000000010 (render_pass)",42,16,16,16,0,0
-"queue 0","queue desc 0",120,5,"stage 0",0,"VkCommandBuffer","0x0000000000000010 (command_buffer)",42,16,16,16,0,0
-"queue 0","queue desc 0",120,5,"stage 0",0,"VkFramebuffer","0x0000000000000010 (framebuffer)",42,16,16,16,0,0
-"queue 0","queue desc 0",120,5,"stage 0",0,"VkRenderPass","0x0000000000000010 (render_pass)",42,16,16,16,0,0
-"queue 0","queue desc 0",130,5,"stage 0",0,"VkFramebuffer","0x0000000000000010 (renamed_buffer)",42,16,0,0,0,0
-"Unknown GPU Queue ","[NULL]",140,5,"render stage(18446744073709551615)",0,"[NULL]","[NULL]",42,0,0,0,0,1024
+"track_name","track_desc","ts","dur","slice_name","depth","flat_key","string_value","context_id","render_target","render_target_name","render_pass","render_pass_name","command_buffer","command_buffer_name","submission_id","hw_queue_id"
+"queue 1","queue desc 1",0,5,"render stage(1)",0,"[NULL]","[NULL]",0,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,1
+"queue 0","queue desc 0",0,5,"stage 0",0,"[NULL]","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"queue 1","queue desc 1",10,5,"stage 1",0,"description","stage desc 1",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,1
+"queue 2","[NULL]",20,5,"stage 2",0,"[NULL]","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,2
+"queue 0","queue desc 0",30,5,"stage 3",0,"[NULL]","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"Unknown GPU Queue 3","[NULL]",40,5,"render stage(4)",0,"[NULL]","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,3
+"queue 0","queue desc 0",50,5,"stage 0",0,"key1","value1",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"queue 0","queue desc 0",60,5,"stage 0",0,"key2","value2",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"queue 0","queue desc 0",60,5,"stage 0",0,"key1","value1",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"queue 0","queue desc 0",70,5,"stage 0",0,"key1","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"queue 0","queue desc 0",80,5,"stage 2",0,"[NULL]","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,0
+"queue 0","queue desc 0",90,5,"stage 0",0,"[NULL]","[NULL]",42,16,"[NULL]",32,"[NULL]",48,"[NULL]",0,0
+"queue 0","queue desc 0",100,5,"stage 0",0,"[NULL]","[NULL]",42,16,"[NULL]",16,"[NULL]",16,"command_buffer",0,0
+"queue 0","queue desc 0",110,5,"stage 0",0,"[NULL]","[NULL]",42,16,"[NULL]",16,"render_pass",16,"command_buffer",0,0
+"queue 0","queue desc 0",120,5,"stage 0",0,"[NULL]","[NULL]",42,16,"framebuffer",16,"render_pass",16,"command_buffer",0,0
+"queue 0","queue desc 0",130,5,"stage 0",0,"[NULL]","[NULL]",42,16,"renamed_buffer",0,"[NULL]",0,"[NULL]",0,0
+"Unknown GPU Queue ","[NULL]",140,5,"render stage(18446744073709551615)",0,"[NULL]","[NULL]",42,0,"[NULL]",0,"[NULL]",0,"[NULL]",0,1024
diff --git a/test/trace_processor/gpu_render_stages.sql b/test/trace_processor/gpu_render_stages.sql
index 1135262..c85d162 100644
--- a/test/trace_processor/gpu_render_stages.sql
+++ b/test/trace_processor/gpu_render_stages.sql
@@ -15,7 +15,8 @@
 --
 SELECT track.name AS track_name, gpu_track.description AS track_desc, ts, dur,
     gpu_slice.name AS slice_name, depth, flat_key, string_value,
-    gpu_slice.context_id, render_target, render_pass, command_buffer, submission_id, hw_queue_id
+    gpu_slice.context_id, render_target, render_target_name, render_pass, render_pass_name,
+    command_buffer, command_buffer_name, submission_id, hw_queue_id
 FROM gpu_track
 LEFT JOIN track USING (id)
 INNER JOIN gpu_slice ON gpu_track.id=gpu_slice.track_id
diff --git a/test/trace_processor/heap_graph.textproto b/test/trace_processor/heap_graph.textproto
index 295e284..b8407aa 100644
--- a/test/trace_processor/heap_graph.textproto
+++ b/test/trace_processor/heap_graph.textproto
@@ -47,6 +47,11 @@
       reference_field_id: 2
       reference_object_id: 0x01
     }
+    objects {
+      id: 0x05
+      type_id: 4
+      self_size: 256
+    }
     continued: true
     index: 1
   }
@@ -67,6 +72,10 @@
       iid: 3
       str: "a"
     }
+    type_names {
+      iid: 4
+      str: "a[]"
+    }
     field_names {
       iid: 1
       str: "FactoryProducerDelegateImplActor.foo"
diff --git a/test/trace_processor/heap_graph_object.out b/test/trace_processor/heap_graph_object.out
index a25b67b..1d3b6eb 100644
--- a/test/trace_processor/heap_graph_object.out
+++ b/test/trace_processor/heap_graph_object.out
@@ -3,3 +3,4 @@
 1,"heap_graph_object",2,10,2,32,-1,-1,1,1,"Foo","[NULL]","[NULL]"
 2,"heap_graph_object",2,10,3,128,-1,-1,1,0,"Foo","[NULL]","[NULL]"
 3,"heap_graph_object",2,10,4,256,-1,-1,1,0,"a","DeobfuscatedA","[NULL]"
+4,"heap_graph_object",2,10,5,256,-1,-1,2,0,"a[]","DeobfuscatedA[]","[NULL]"
diff --git a/test/trace_processor/heap_profile_tracker_new_stack.out b/test/trace_processor/heap_profile_tracker_new_stack.out
new file mode 100644
index 0000000..205768d
--- /dev/null
+++ b/test/trace_processor/heap_profile_tracker_new_stack.out
@@ -0,0 +1,5 @@
+"id","type","ts","upid","callsite_id","count","size"
+0,"heap_profile_allocation",0,0,0,1,1
+1,"heap_profile_allocation",0,0,0,-1,-1
+2,"heap_profile_allocation",1,0,0,1,1
+3,"heap_profile_allocation",1,0,0,-1,-1
diff --git a/test/trace_processor/heap_profile_tracker_new_stack.sql b/test/trace_processor/heap_profile_tracker_new_stack.sql
new file mode 100644
index 0000000..efed7da
--- /dev/null
+++ b/test/trace_processor/heap_profile_tracker_new_stack.sql
@@ -0,0 +1 @@
+select * from heap_profile_allocation;
diff --git a/test/trace_processor/heap_profile_tracker_new_stack.textproto b/test/trace_processor/heap_profile_tracker_new_stack.textproto
new file mode 100644
index 0000000..f84c30a
--- /dev/null
+++ b/test/trace_processor/heap_profile_tracker_new_stack.textproto
@@ -0,0 +1,76 @@
+packet {
+  clock_snapshot {
+    clocks: {
+      clock_id: 6 # BOOTTIME
+      timestamp: 0
+    }
+    clocks: {
+      clock_id: 4 # MONOTONIC_COARSE
+      timestamp: 0
+    }
+  }
+}
+
+packet {
+  previous_packet_dropped: true
+  incremental_state_cleared: true
+  trusted_packet_sequence_id: 1
+  timestamp: 0
+  interned_data {
+    mappings {
+      iid: 1
+    }
+    frames {
+      iid: 1
+      mapping_id: 1
+      rel_pc: 0x123
+    }
+    callstacks {
+      iid: 1
+      frame_ids: 1
+    }
+  }
+}
+
+packet {
+  trusted_packet_sequence_id: 1
+  timestamp: 0
+  profile_packet {
+    index: 0
+    continued: false
+    process_dumps {
+      samples {
+        callstack_id: 1
+        self_allocated: 1
+        alloc_count: 1
+        self_freed: 1
+        free_count: 1
+      }
+    }
+  }
+}
+
+packet {
+  trusted_packet_sequence_id: 1
+  timestamp: 1
+  interned_data {
+    callstacks {
+      iid: 2
+      frame_ids: 1
+    }
+  }
+  profile_packet {
+    index: 1
+    continued: false
+    process_dumps {
+      timestamp: 1
+      samples {
+        callstack_id: 2
+        self_allocated: 1
+        alloc_count: 1
+        self_freed: 1
+        free_count: 1
+      }
+    }
+  }
+}
diff --git a/test/trace_processor/index b/test/trace_processor/index
index 60711e9..f929928 100644
--- a/test/trace_processor/index
+++ b/test/trace_processor/index
@@ -131,6 +131,7 @@
 # GPU trace tests.
 gpu_counters.py gpu_counters.sql gpu_counters.out
 gpu_render_stages.py gpu_render_stages.sql gpu_render_stages.out
+vulkan_api_events.py vulkan_api_events.sql vulkan_api_events.out
 gpu_log.py gpu_log.sql gpu_log.out
 
 # Clock sync
@@ -155,6 +156,7 @@
 heap_graph_interleaved.textproto heap_graph_reference.sql heap_graph_interleaved_reference.out
 ../data/system-server-heap-graph.pftrace heap_graph_flamegraph.sql heap_graph_flamegraph_system-server-heap-graph.out
 ../data/system-server-native-profile heap_profile_flamegraph.sql heap_profile_flamegraph_system-server-native-profile.out
+heap_profile_tracker_new_stack.textproto heap_profile_tracker_new_stack.sql heap_profile_tracker_new_stack.out
 
 stack_profile_tracker_empty_callstack.textproto stack_profile_tracker_empty_callstack.sql stack_profile_tracker_empty_callstack.out
 
diff --git a/test/trace_processor/vulkan_api_events.out b/test/trace_processor/vulkan_api_events.out
new file mode 100644
index 0000000..de409f9
--- /dev/null
+++ b/test/trace_processor/vulkan_api_events.out
@@ -0,0 +1,5 @@
+"track_name","track_desc","ts","dur","slice_name","depth","flat_key","int_value","context_id","command_buffer","submission_id"
+"Vulkan Events","[NULL]",10,2,"vkQueueSubmit",0,"pid",42,"[NULL]",100,1
+"Vulkan Events","[NULL]",10,2,"vkQueueSubmit",0,"tid",43,"[NULL]",100,1
+"Vulkan Events","[NULL]",20,2,"vkQueueSubmit",0,"pid",44,"[NULL]",200,2
+"Vulkan Events","[NULL]",20,2,"vkQueueSubmit",0,"tid",45,"[NULL]",200,2
diff --git a/test/trace_processor/vulkan_api_events.py b/test/trace_processor/vulkan_api_events.py
new file mode 100644
index 0000000..b279f7a
--- /dev/null
+++ b/test/trace_processor/vulkan_api_events.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+# Copyright (C) 2020 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from os import sys, path
+sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+import synth_common
+
+trace = synth_common.create_trace()
+
+trace.add_vk_queue_submit(
+    ts=10,
+    dur=2,
+    pid=42,
+    tid=43,
+    vk_queue=10,
+    vk_command_buffers=[100],
+    submission_id=1)
+
+trace.add_vk_queue_submit(
+    ts=20,
+    dur=2,
+    pid=44,
+    tid=45,
+    vk_queue=11,
+    vk_command_buffers=[200, 300, 400],
+    submission_id=2)
+
+print(trace.trace.SerializeToString())
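
For context, add_vk_queue_submit() is the synth_common helper this test drives. A hedged sketch of roughly what it serializes, written as a textproto template in plain Python; the vulkan_api_event / vk_queue_submit packet layout is inferred from the parameter names above, not taken from the helper's source:

VK_QUEUE_SUBMIT_TEMPLATE = """\
packet {{
  timestamp: {ts}
  vulkan_api_event {{
    vk_queue_submit {{
      duration_ns: {dur}
      pid: {pid}
      tid: {tid}
      vk_queue: {vk_queue}
{command_buffers}      submission_id: {submission_id}
    }}
  }}
}}
"""

def vk_queue_submit_textproto(ts, dur, pid, tid, vk_queue, vk_command_buffers,
                              submission_id):
  # One repeated-field line per command buffer handle.
  cbs = ''.join(
      '      vk_command_buffers: %d\n' % cb for cb in vk_command_buffers)
  return VK_QUEUE_SUBMIT_TEMPLATE.format(
      ts=ts, dur=dur, pid=pid, tid=tid, vk_queue=vk_queue,
      command_buffers=cbs, submission_id=submission_id)

# Mirrors the first add_vk_queue_submit() call above.
print(vk_queue_submit_textproto(
    ts=10, dur=2, pid=42, tid=43, vk_queue=10, vk_command_buffers=[100],
    submission_id=1))
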
diff --git a/test/trace_processor/vulkan_api_events.sql b/test/trace_processor/vulkan_api_events.sql
new file mode 100644
index 0000000..89fccd9
--- /dev/null
+++ b/test/trace_processor/vulkan_api_events.sql
@@ -0,0 +1,23 @@
+--
+-- Copyright 2020 The Android Open Source Project
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     https://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+SELECT track.name AS track_name, gpu_track.description AS track_desc, ts, dur,
+    gpu_slice.name AS slice_name, depth, flat_key, int_value,
+    gpu_slice.context_id, command_buffer, submission_id
+FROM gpu_track
+LEFT JOIN track USING (id)
+INNER JOIN gpu_slice ON gpu_track.id=gpu_slice.track_id
+LEFT JOIN args ON gpu_slice.arg_set_id = args.arg_set_id
+ORDER BY ts;
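
Because the query joins gpu_slice against args on arg_set_id, each vkQueueSubmit slice comes back once per argument key, which is why vulkan_api_events.out above has separate pid and tid rows for the same ts. A small self-contained Python sketch that folds those flat rows back into one record per slice (rows copied from the expected output):

import csv
import io

# Two of the rows from vulkan_api_events.out: one line per (slice, arg) pair.
FLAT_ROWS = '''\
"track_name","track_desc","ts","dur","slice_name","depth","flat_key","int_value","context_id","command_buffer","submission_id"
"Vulkan Events","[NULL]",10,2,"vkQueueSubmit",0,"pid",42,"[NULL]",100,1
"Vulkan Events","[NULL]",10,2,"vkQueueSubmit",0,"tid",43,"[NULL]",100,1
'''

slices = {}
for row in csv.DictReader(io.StringIO(FLAT_ROWS)):
  # The slice identity repeats across rows; only flat_key/int_value vary.
  key = (row['ts'], row['slice_name'], row['submission_id'])
  slices.setdefault(key, {})[row['flat_key']] = row['int_value']

print(slices)
# {('10', 'vkQueueSubmit', '1'): {'pid': '42', 'tid': '43'}}
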
diff --git a/tools/diff_test_trace_processor.py b/tools/diff_test_trace_processor.py
index 2d92f39..cb1ba78 100755
--- a/tools/diff_test_trace_processor.py
+++ b/tools/diff_test_trace_processor.py
@@ -295,7 +295,7 @@
   elif args.test_type == 'metrics':
     index = os.path.join(test_dir, 'metrics', 'index')
   else:
-    print('Unknown test type {}. Supported: queries, metircs'.format(
+    print('Unknown test type {}. Supported: queries, metrics'.format(
         args.test_type))
     return 1
 
diff --git a/tools/install-build-deps b/tools/install-build-deps
index 01b27c8..e1e9c20 100755
--- a/tools/install-build-deps
+++ b/tools/install-build-deps
@@ -96,7 +96,7 @@
     # tools/clang/scripts/update.py.
     ('buildtools/clang.tgz',
      'https://commondatastorage.googleapis.com/chromium-browser-clang/Linux_x64/clang-n332890-c2443155-2.tgz',
-     'fe1b1e5bd7381ae655661cb9658487389561568d', 'linux'),
+     'd6501ffdb5dbb0ffe8a4b873cc092a9929e661ec', 'linux'),
 
     # Keep in sync with chromium DEPS.
     ('buildtools/libfuzzer',
diff --git a/tools/trace_to_text/pprof_builder.cc b/tools/trace_to_text/pprof_builder.cc
index 7d72434..4f57829 100644
--- a/tools/trace_to_text/pprof_builder.cc
+++ b/tools/trace_to_text/pprof_builder.cc
@@ -214,7 +214,8 @@
         max_symbol_id_(max_symbol_id) {
     // The pprof format expects the first entry in the string table to be the
     // empty string.
-    Intern("");
+    int64_t empty_id = Intern("");
+    PERFETTO_CHECK(empty_id == 0);
   }
 
   std::vector<Iterator> BuildViewIterators(trace_processor::TraceProcessor* tp,
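
The new PERFETTO_CHECK makes the comment's contract explicit: pprof requires string index 0 to be the empty string, because proto3 integer fields default to 0 and readers treat a zero string index as "no string". An illustrative Python interner asserting the same invariant (names here are hypothetical, not the C++ class above):

class StringTable(object):
  def __init__(self):
    self._index = {}
    self.strings = []
    # Same invariant as the PERFETTO_CHECK: '' must land at index 0.
    assert self.intern('') == 0

  def intern(self, s):
    # Returns a stable index; interning the same string twice is idempotent.
    if s not in self._index:
      self._index[s] = len(self.strings)
      self.strings.append(s)
    return self._index[s]

table = StringTable()
assert table.intern('main') == 1
assert table.intern('') == 0
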
diff --git a/ui/index.html b/ui/index.html
index fae544d..6fa32a6 100644
--- a/ui/index.html
+++ b/ui/index.html
@@ -6,6 +6,9 @@
   <!-- WebComponents V0 origin trial token for https://ui.perfetto.dev Expires 17 Dec 2020.
   See https://crbug.com/1021137. -->
   <meta http-equiv="origin-trial" content="AtzsILqIzNPGftktQTEYxI9GpnqFBuse5uB5n4JQO3Wa1ky4TCKmnXZli0A9g9p7Es7Il9pqarELntnfm0HriwkAAABreyJvcmlnaW4iOiJodHRwczovL3VpLnBlcmZldHRvLmRldjo0NDMiLCJmZWF0dXJlIjoiV2ViQ29tcG9uZW50c1YwIiwiZXhwaXJ5IjoxNjA4MjI2NDQzLCJpc1N1YmRvbWFpbiI6dHJ1ZX0=">
+  <!-- WebComponents V0 origin trial token for http://localhost:10000 Expires 28 Jan 2021.
+  See https://crbug.com/1021137. -->
+  <meta http-equiv="origin-trial" content="AicMEv5glMGL1lq6ZRsxFJj8xlhn3XDYZrHK0/2KreAD/r62vTFjUBOueeMTxWuU1IlRXqCugRFDD7rY45YEgwkAAABTeyJvcmlnaW4iOiJodHRwOi8vbG9jYWxob3N0OjEwMDAwIiwiZmVhdHVyZSI6IldlYkNvbXBvbmVudHNWMCIsImV4cGlyeSI6MTYxMTg0MDczNH0=">
   <link href="perfetto.css" rel="stylesheet">
   <link rel="icon" type="image/png" href="assets/logo.png">
 </head>