traced_perf: in-tree builds: arm register parsing basics

This adds libunwindstack as a dependency for traced_perf, and has the necessary
logic to sample the userspace register state of sampled processes. The
registers are converted to libunwindstack's representation and, for now,
printed.

Only arm32/arm64 are supported at the moment. x86 is coming soon; standalone
builds require figuring out where to source the uapi constant definitions from.

Sorry, no tests at this point; this is more of an "I've run this and it printed
something sensible according to /proc/pid/maps".

Also changes traced_perf to only build in the Android tree atm.

Change-Id: Id0df3b8c3901d733480d4289cf9b37562c3427d8
diff --git a/Android.bp b/Android.bp
index 8c4649f..3dff741 100644
--- a/Android.bp
+++ b/Android.bp
@@ -5416,6 +5416,14 @@
   ],
 }
 
+// GN: //src/profiling/perf:unwind_support
+filegroup {
+  name: "perfetto_src_profiling_perf_unwind_support",
+  srcs: [
+    "src/profiling/perf/unwind_support.cc",
+  ],
+}
+
 // GN: //src/profiling:unittests
 filegroup {
   name: "perfetto_src_profiling_unittests",
@@ -6589,6 +6597,7 @@
     ":perfetto_src_profiling_memory_wire_protocol",
     ":perfetto_src_profiling_perf_producer",
     ":perfetto_src_profiling_perf_producer_unittests",
+    ":perfetto_src_profiling_perf_unwind_support",
     ":perfetto_src_profiling_unittests",
     ":perfetto_src_protozero_protozero",
     ":perfetto_src_protozero_testing_messages_cpp_gen",
@@ -6728,6 +6737,9 @@
     "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
     "-DHAVE_HIDDEN",
   ],
+  include_dirs: [
+    "bionic/libc/kernel",
+  ],
   data: [
     "src/traced/probes/filesystem/testdata/**/*",
     "src/traced/probes/ftrace/test/data/**/*",
@@ -7055,6 +7067,7 @@
     ":perfetto_src_ipc_ipc",
     ":perfetto_src_profiling_perf_producer",
     ":perfetto_src_profiling_perf_traced_perf_main",
+    ":perfetto_src_profiling_perf_unwind_support",
     ":perfetto_src_protozero_protozero",
     ":perfetto_src_tracing_common",
     ":perfetto_src_tracing_ipc",
@@ -7062,7 +7075,10 @@
     "src/profiling/perf/main.cc",
   ],
   shared_libs: [
+    "libbase",
     "liblog",
+    "libprocinfo",
+    "libunwindstack",
   ],
   generated_headers: [
     "perfetto_protos_perfetto_common_cpp_gen_headers",
@@ -7110,6 +7126,9 @@
     "-DGOOGLE_PROTOBUF_NO_RTTI",
     "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
   ],
+  include_dirs: [
+    "bionic/libc/kernel",
+  ],
 }
 
 // GN: //src/traced/probes:traced_probes
diff --git a/gn/perfetto.gni b/gn/perfetto.gni
index 82d0665..cc1232f 100644
--- a/gn/perfetto.gni
+++ b/gn/perfetto.gni
@@ -149,9 +149,7 @@
 
   # Build the perf event profiler (traced_perf).
   # TODO(b/144281346): under development.
-  enable_perfetto_traced_perf =
-      perfetto_build_with_android ||
-      (perfetto_build_standalone && (is_linux || is_android))
+  enable_perfetto_traced_perf = perfetto_build_with_android
 
   # The Trace Processor: offline analytical engine to process traces and compute
   # metrics using a SQL engine.
diff --git a/src/profiling/perf/BUILD.gn b/src/profiling/perf/BUILD.gn
index dba1ca2..b864674 100644
--- a/src/profiling/perf/BUILD.gn
+++ b/src/profiling/perf/BUILD.gn
@@ -18,6 +18,9 @@
 
 assert(enable_perfetto_traced_perf)
 
+# TODO(rsavitski): only building in-tree at the moment (so this build file is
+# only used for gen_android_bp, expect bitrot).
+
 executable("traced_perf") {
   deps = [
     ":traced_perf_main",
@@ -43,6 +46,7 @@
 
 source_set("producer") {
   deps = [
+    ":unwind_support",
     "../../../gn:default_deps",
     "../../../protos/perfetto/config:cpp",
     "../../../protos/perfetto/config/profiling:zero",
@@ -63,6 +67,17 @@
   ]
 }
 
+source_set("unwind_support") {
+  deps = [
+    "../../../gn:default_deps",
+    "../../../gn:libunwindstack",
+    "../../../src/base",
+  ]
+  sources = [
+    "unwind_support.cc",
+    "unwind_support.h",
+  ]
+}
 source_set("producer_unittests") {
   testonly = true
   deps = [
diff --git a/src/profiling/perf/event_config.h b/src/profiling/perf/event_config.h
index da53a0b..cc750ff 100644
--- a/src/profiling/perf/event_config.h
+++ b/src/profiling/perf/event_config.h
@@ -23,6 +23,7 @@
 
 #include "perfetto/ext/base/optional.h"
 #include "perfetto/tracing/core/data_source_config.h"
+#include "src/profiling/perf/unwind_support.h"
 
 #include "protos/perfetto/config/profiling/perf_event_config.pbzero.h"
 
@@ -42,21 +43,17 @@
     protos::pbzero::PerfEventConfig::Decoder pb_config(
         ds_config.perf_event_config_raw());
 
-    if (!pb_config.has_tid())
-      return base::nullopt;
-
     return EventConfig(pb_config);
   }
 
-  int32_t target_tid() const { return target_tid_; }
+  uint32_t target_cpu() const { return target_cpu_; }
 
   perf_event_attr* perf_attr() const {
     return const_cast<perf_event_attr*>(&perf_event_attr_);
   }
 
  private:
-  EventConfig(const protos::pbzero::PerfEventConfig::Decoder& pb_config)
-      : target_tid_(pb_config.tid()) {
+  EventConfig(const protos::pbzero::PerfEventConfig::Decoder&) {
     auto& pe = perf_event_attr_;
     memset(&pe, 0, sizeof(perf_event_attr));
     pe.size = sizeof(perf_event_attr);
@@ -70,18 +67,20 @@
     pe.sample_freq = 100;
     pe.freq = true;
 
-    pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_STACK_USER;
+    pe.sample_type =
+        PERF_SAMPLE_TID | PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER;
     // Needs to be < ((u16)(~0u)), and have bottom 8 bits clear.
     pe.sample_stack_user = (1u << 15);
-
-    // Note: can't use inherit with task-scoped event mmap
-    pe.inherit = false;
+    pe.sample_regs_user = PerfUserRegsMaskForCurrentArch();
   }
 
-  // TODO(rsavitski): this will have to represent entire event groups, thus this
-  // class will represent N events. So we'll need N cpus/tids, but likely still
-  // a single perf_event_attr.
-  int32_t target_tid_ = 0;
+  // TODO(rsavitski): for now hardcode each session to be for a single cpu's
+  // scope. In general a config will correspond to N cpus and/or tids.
+  uint32_t target_cpu_ = 0;
+
+  // TODO(rsavitski): if we allow for event groups containing multiple sampled
+  // counters, we'll need to vary the .type & .config fields per
+  // perf_event_open.
   perf_event_attr perf_event_attr_;
 };
 
diff --git a/src/profiling/perf/event_config_unittest.cc b/src/profiling/perf/event_config_unittest.cc
index 6598eb4..0d7fa52 100644
--- a/src/profiling/perf/event_config_unittest.cc
+++ b/src/profiling/perf/event_config_unittest.cc
@@ -32,9 +32,8 @@
 namespace profiling {
 namespace {
 
-static DataSourceConfig ConfigForTid(int32_t tid) {
+static DataSourceConfig CreateEmptyConfig() {
   protozero::HeapBuffered<protos::pbzero::PerfEventConfig> pb_config;
-  pb_config->set_tid(tid);
   protozero::HeapBuffered<protos::pbzero::DataSourceConfig> ds_config;
   ds_config->set_perf_event_config_raw(pb_config.SerializeAsString());
   DataSourceConfig cfg;
@@ -42,17 +41,8 @@
   return cfg;
 }
 
-TEST(EventConfigTest, TidRequired) {
-  // Doesn't pass validation without a TID
-  DataSourceConfig cfg;
-  ASSERT_TRUE(cfg.ParseFromString(""));
-
-  base::Optional<EventConfig> event_config = EventConfig::Create(cfg);
-  ASSERT_FALSE(event_config.has_value());
-}
-
 TEST(EventConfigTest, AttrStructConstructed) {
-  auto cfg = ConfigForTid(42);
+  auto cfg = CreateEmptyConfig();
   base::Optional<EventConfig> event_config = EventConfig::Create(cfg);
 
   ASSERT_TRUE(event_config.has_value());
diff --git a/src/profiling/perf/event_reader.cc b/src/profiling/perf/event_reader.cc
index 7bc5ccf..a2d0e87 100644
--- a/src/profiling/perf/event_reader.cc
+++ b/src/profiling/perf/event_reader.cc
@@ -23,6 +23,7 @@
 #include <unistd.h>
 
 #include "perfetto/ext/base/utils.h"
+#include "src/profiling/perf/unwind_support.h"
 
 namespace perfetto {
 namespace profiling {
@@ -48,10 +49,15 @@
       syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags));
 }
 
+// TODO(rsavitski): one EventConfig will correspond to N perf_event_open calls
+// in the general case. Does it make sense to keep a single function which does
+// the N calls, and then returns the group leader's fd? What about cases where
+// we have >1 pid or >1 cpu to open for? Should the entire EventReader be
+// cpu-scoped?
 base::ScopedFile PerfEventOpen(const EventConfig& event_cfg) {
   base::ScopedFile perf_fd{
-      perf_event_open(event_cfg.perf_attr(), event_cfg.target_tid(),
-                      /*cpu=*/-1, /*group_fd=*/-1, PERF_FLAG_FD_CLOEXEC)};
+      perf_event_open(event_cfg.perf_attr(), /*pid=*/-1, event_cfg.target_cpu(),
+                      /*group_fd=*/-1, PERF_FLAG_FD_CLOEXEC)};
   return perf_fd;
 }
 
@@ -195,7 +201,7 @@
 void EventReader::ParseNextSampleBatch() {
   std::vector<char> data = ring_buffer_.ReadAvailable();
   if (data.size() == 0) {
-    PERFETTO_LOG("WIP: no samples");
+    PERFETTO_LOG("no samples (work in progress)");
     return;
   }
 
@@ -217,7 +223,7 @@
   if (event_hdr->type == PERF_RECORD_SAMPLE) {
     ParsePerfRecordSample(sample_start, event_hdr->size);
   } else {
-    PERFETTO_ELOG("WIP: unsupported event type");
+    PERFETTO_ELOG("Unsupported event type (work in progress)");
   }
 
   *ptr = sample_start + event_hdr->size;
@@ -229,9 +235,9 @@
                                         size_t sample_size) {
   const perf_event_attr* cfg = event_cfg_.perf_attr();
 
-  if (cfg->sample_type &
-      (~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_STACK_USER))) {
-    PERFETTO_ELOG("WIP: unsupported sampling option.");
+  if (cfg->sample_type & (~uint64_t(PERF_SAMPLE_TID | PERF_SAMPLE_STACK_USER |
+                                    PERF_SAMPLE_REGS_USER))) {
+    PERFETTO_ELOG("Unsupported sampling option (work in progress)");
     return;
   }
 
@@ -249,6 +255,16 @@
     PERFETTO_LOG("tid: %" PRIu32 "", tid);
   }
 
+  if (cfg->sample_type & PERF_SAMPLE_REGS_USER) {
+    auto parsed_regs = ReadPerfUserRegsData(&parse_pos);
+
+    if (parsed_regs) {
+      parsed_regs->IterateRegisters([](const char* name, uint64_t value) {
+        PERFETTO_LOG("reg[%s]: %" PRIx64 "", name, value);
+      });
+    }
+  }
+
   if (cfg->sample_type & PERF_SAMPLE_STACK_USER) {
     uint64_t max_stack_size;  // the requested size
     parse_pos = ReadValue(&max_stack_size, parse_pos);
diff --git a/src/profiling/perf/event_reader.h b/src/profiling/perf/event_reader.h
index 537f489..fa9b96b 100644
--- a/src/profiling/perf/event_reader.h
+++ b/src/profiling/perf/event_reader.h
@@ -86,7 +86,7 @@
   EventReader(EventReader&&) noexcept;
   EventReader& operator=(EventReader&&) noexcept;
 
-  // TODO(rsavitski): temporary.
+  // TODO(rsavitski): temporary one-shot parser for development purposes.
   void ParseNextSampleBatch();
 
  private:
diff --git a/src/profiling/perf/perf_producer.cc b/src/profiling/perf/perf_producer.cc
index dd5d642..e544f86 100644
--- a/src/profiling/perf/perf_producer.cc
+++ b/src/profiling/perf/perf_producer.cc
@@ -66,20 +66,6 @@
     return;
   }
 
-  std::string maps_path = std::string("/proc/") +
-                          std::to_string(event_config->target_tid()) +
-                          std::string("/maps");
-  auto maps_fd = base::OpenFile(maps_path, O_RDONLY);
-  if (!maps_fd)
-    PERFETTO_PLOG("failed /proc/pid/maps open (proceeding)");
-
-  std::string mem_path = std::string("/proc/") +
-                         std::to_string(event_config->target_tid()) +
-                         std::string("/mem");
-  auto mem_fd = base::OpenFile(mem_path, O_RDONLY);
-  if (!mem_fd)
-    PERFETTO_PLOG("failed /proc/pid/mem open (proceeding)");
-
   base::Optional<EventReader> event_reader =
       EventReader::ConfigureEvents(event_config.value());
   if (!event_reader.has_value()) {
@@ -90,8 +76,7 @@
   // Build the DataSource instance.
   auto it_inserted = data_sources_.emplace(
       std::piecewise_construct, std::forward_as_tuple(instance_id),
-      std::forward_as_tuple(std::move(event_reader.value()), std::move(maps_fd),
-                            std::move(mem_fd)));
+      std::forward_as_tuple(std::move(event_reader.value())));
 
   PERFETTO_DCHECK(it_inserted.second);
 }
diff --git a/src/profiling/perf/perf_producer.h b/src/profiling/perf/perf_producer.h
index d028c75..e373836 100644
--- a/src/profiling/perf/perf_producer.h
+++ b/src/profiling/perf/perf_producer.h
@@ -66,22 +66,13 @@
     kConnected,
   };
 
-  // TODO(rsavitski): proc-fds need to live elsewhere, as they can be shared
-  // across data sources. We might also have arbitrarily many tasks handled
-  // by one data source (if scoping events to a cpu).
   struct DataSource {
-    DataSource(EventReader _event_reader,
-               base::ScopedFile _maps_fd,
-               base::ScopedFile _mem_fd)
-        : event_reader(std::move(_event_reader)),
-          maps_fd(std::move(_maps_fd)),
-          mem_fd(std::move(_mem_fd)) {}
+    DataSource(EventReader _event_reader)
+        : event_reader(std::move(_event_reader)) {}
 
+    // TODO(rsavitski): current thinking is an EventReader per cpu-scoped ring
+    // buffer. And a central bookkeeper.
     EventReader event_reader;
-
-    // note: currently populated, but unused.
-    base::ScopedFile maps_fd;
-    base::ScopedFile mem_fd;
   };
 
   void ConnectService();
diff --git a/src/profiling/perf/unwind_support.cc b/src/profiling/perf/unwind_support.cc
new file mode 100644
index 0000000..47dc3c0
--- /dev/null
+++ b/src/profiling/perf/unwind_support.cc
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/profiling/perf/unwind_support.h"
+
+#include <inttypes.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <memory>
+
+#include <unwindstack/Elf.h>
+#include <unwindstack/MachineArm.h>
+#include <unwindstack/MachineArm64.h>
+#include <unwindstack/Regs.h>
+#include <unwindstack/RegsArm.h>
+#include <unwindstack/RegsArm64.h>
+#include <unwindstack/UserArm.h>
+#include <unwindstack/UserArm64.h>
+
+// TODO(rsavitski): this includes the kernel uapi constant definitions (for
+// register sampling). For now hardcoded for in-tree builds (specifically,
+// bionic/libc/kernel/). Standalone builds will need to source the headers
+// from elsewhere (without depending on the host machine's system headers).
+#include <uapi/asm-arm/asm/perf_regs.h>
+#include <uapi/asm-x86/asm/perf_regs.h>
+#define perf_event_arm_regs perf_event_arm64_regs
+#include <uapi/asm-arm64/asm/perf_regs.h>
+#undef perf_event_arm_regs
+
+namespace perfetto {
+namespace profiling {
+
+namespace {
+
+template <typename T>
+const char* ReadValue(T* value_out, const char* ptr) {
+  memcpy(value_out, reinterpret_cast<const void*>(ptr), sizeof(T));
+  return ptr + sizeof(T);
+}
+
+// Supported configurations:
+// * 32 bit daemon, 32 bit userspace
+// * 64 bit daemon, mixed bitness userspace
+// Hence the kernel is handed the mask matching our own build architecture,
+// while the mixed userspace ABI cases are dealt with when parsing registers.
+// TODO(rsavitski): cleanly detect 32 bit builds being side-loaded onto a system
+// with 64 bit userspace processes.
+uint64_t PerfUserRegsMask(unwindstack::ArchEnum arch) {
+  // One bit per register, covering the arch's whole PERF_REG_* range.
+  // TODO(rsavitski): support the rest of the architectures.
+  if (arch == unwindstack::ARCH_ARM64) {
+    return (1ULL << PERF_REG_ARM64_MAX) - 1;
+  }
+  if (arch == unwindstack::ARCH_ARM) {
+    return (1ULL << PERF_REG_ARM_MAX) - 1;
+  }
+  PERFETTO_FATAL("Unsupported architecture (work in progress)");
+}
+
+// Maps the daemon's build architecture to the architecture of the sampled
+// process, given the ABI recorded in the perf sample. 64 bit samples on a 32
+// bit daemon build are unsupported, so only 64 -> 32 bit narrowing happens.
+unwindstack::ArchEnum ArchForAbi(unwindstack::ArchEnum arch, uint64_t abi) {
+  const bool is_32bit_sample = (abi == PERF_SAMPLE_REGS_ABI_32);
+  if (is_32bit_sample && arch == unwindstack::ARCH_ARM64)
+    return unwindstack::ARCH_ARM;
+  if (is_32bit_sample && arch == unwindstack::ARCH_X86_64)
+    return unwindstack::ARCH_X86;
+  // Anything else keeps the requested architecture.
+  return arch;
+}
+
+// Register values as an array, indexed using the kernel uapi asm/perf_regs.h
+// enum values (PERF_REG_*). Unsampled values will be left as zeroes.
+// TODO(rsavitski): support all relevant architectures (allocate enough space
+// for the widest register bank).
+struct RawRegisterData {
+  static constexpr uint64_t kMaxSize = PERF_REG_ARM64_MAX;
+  uint64_t regs[kMaxSize] = {};
+};
+
+std::unique_ptr<unwindstack::Regs> ToLibUnwindstackRegs(
+    const RawRegisterData& raw_regs,
+    unwindstack::ArchEnum arch) {
+  // First converts the |RawRegisterData| array to libunwindstack's raw register
+  // format, then constructs the relevant unwindstack::Regs subclass out of the
+  // latter.
+  if (arch == unwindstack::ARCH_ARM64) {
+    static_assert(static_cast<int>(unwindstack::ARM64_REG_R0) ==
+                      static_cast<int>(PERF_REG_ARM64_X0),
+                  "register layout mismatch");
+    static_assert(static_cast<int>(unwindstack::ARM64_REG_R30) ==
+                      static_cast<int>(PERF_REG_ARM64_LR),
+                  "register layout mismatch");
+
+    unwindstack::arm64_user_regs arm64_user_regs;
+    memset(&arm64_user_regs, 0, sizeof(arm64_user_regs));
+    memcpy(&arm64_user_regs.regs[unwindstack::ARM64_REG_R0],
+           &raw_regs.regs[PERF_REG_ARM64_X0],
+           sizeof(uint64_t) * (PERF_REG_ARM64_LR - PERF_REG_ARM64_X0 + 1));
+    arm64_user_regs.sp = raw_regs.regs[PERF_REG_ARM64_SP];
+    arm64_user_regs.pc = raw_regs.regs[PERF_REG_ARM64_PC];
+
+    return std::unique_ptr<unwindstack::Regs>(
+        unwindstack::RegsArm64::Read(&arm64_user_regs));
+  }
+
+  if (arch == unwindstack::ARCH_ARM) {
+    static_assert(static_cast<int>(unwindstack::ARM_REG_R0) ==
+                      static_cast<int>(PERF_REG_ARM_R0),
+                  "register layout mismatch");
+    static_assert(static_cast<int>(unwindstack::ARM_REG_LAST) ==
+                      static_cast<int>(PERF_REG_ARM_MAX),
+                  "register layout mismatch");
+
+    unwindstack::arm_user_regs arm_user_regs;
+    memset(&arm_user_regs, 0, sizeof(arm_user_regs));
+    for (size_t i = unwindstack::ARM_REG_R0; i < unwindstack::ARM_REG_LAST;
+         i++) {
+      arm_user_regs.regs[i] = static_cast<uint32_t>(raw_regs.regs[i]);
+    }
+
+    return std::unique_ptr<unwindstack::Regs>(
+        unwindstack::RegsArm::Read(&arm_user_regs));
+  }
+
+  PERFETTO_FATAL("Unsupported architecture (work in progress)");
+}
+
+}  // namespace
+
+uint64_t PerfUserRegsMaskForCurrentArch() {
+  return PerfUserRegsMask(unwindstack::Regs::CurrentArch());
+}
+
+// Parses one PERF_SAMPLE_REGS_USER payload at |*data| and advances |*data| past
+// it. Assumes sampling was configured with |PerfUserRegsMaskForCurrentArch|.
+std::unique_ptr<unwindstack::Regs> ReadPerfUserRegsData(const char** data) {
+  unwindstack::ArchEnum requested_arch = unwindstack::Regs::CurrentArch();
+
+  // Layout, assuming a sparse bitmask requesting r1 and r15:
+  // [u64 abi] [u64 r1] [u64 r15]
+  const char* parse_pos = *data;
+  uint64_t sampled_abi;
+  parse_pos = ReadValue(&sampled_abi, parse_pos);
+  PERFETTO_LOG("WIP: abi: %" PRIu64 "", sampled_abi);
+
+  // ABI_NONE means there were no registers (e.g. we've sampled a kernel thread,
+  // which doesn't have userspace registers). The kernel then emits only the abi
+  // word, so skip just that and return an empty result to the caller.
+  if (sampled_abi == PERF_SAMPLE_REGS_ABI_NONE) {
+    *data = parse_pos;
+    return nullptr;
+  }
+
+  // Unpack the densely-packed register values into |RawRegisterData| (unsampled
+  // registers stay zero). Mask probe is 64 bit wide: arm64's PC is register 32.
+  RawRegisterData raw_regs{};
+  uint64_t regs_mask = PerfUserRegsMaskForCurrentArch();
+  for (size_t i = 0; regs_mask && (i < RawRegisterData::kMaxSize); i++) {
+    if (regs_mask & (1ULL << i)) {
+      parse_pos = ReadValue(&raw_regs.regs[i], parse_pos);
+    }
+  }
+
+  // Special case: we've requested arm64 registers from a 64 bit kernel, but
+  // ended up sampling a 32 bit arm userspace process. The 32 bit execution
+  // state of the target process was saved by the exception entry in an
+  // ISA-specific way. The userspace R0-R14 end up saved as arm64 W0-W14, but
+  // the program counter (R15 on arm32) is still in PERF_REG_ARM64_PC (the 33rd
+  // register). So we can take the kernel-dumped 64 bit register state, reassign
+  // the PC into the R15 slot, and treat the resulting RawRegisterData as an
+  // arm32 register bank. See "Fundamentals of ARMv8-A" (ARM DOC
+  // 100878_0100_en), page 28.
+  if (requested_arch == unwindstack::ARCH_ARM64 &&
+      sampled_abi == PERF_SAMPLE_REGS_ABI_32) {
+    raw_regs.regs[PERF_REG_ARM_PC] = raw_regs.regs[PERF_REG_ARM64_PC];
+  }
+
+  // Adjust caller's parsing position.
+  *data = parse_pos;
+
+  unwindstack::ArchEnum sampled_arch = ArchForAbi(requested_arch, sampled_abi);
+  return ToLibUnwindstackRegs(raw_regs, sampled_arch);
+}
+
+}  // namespace profiling
+}  // namespace perfetto
diff --git a/src/profiling/perf/unwind_support.h b/src/profiling/perf/unwind_support.h
new file mode 100644
index 0000000..f0764d2
--- /dev/null
+++ b/src/profiling/perf/unwind_support.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_PROFILING_PERF_UNWIND_SUPPORT_H_
+#define SRC_PROFILING_PERF_UNWIND_SUPPORT_H_
+
+#include <stdint.h>
+#include <unwindstack/Regs.h>
+
+#include <memory>
+
+#include "perfetto/ext/base/scoped_file.h"
+
+namespace perfetto {
+namespace profiling {
+
+// Returns a bitmask for sampling the userspace register set, used when
+// configuring perf events.
+uint64_t PerfUserRegsMaskForCurrentArch();
+
+// Converts the raw sampled register bytes to libunwindstack's representation
+// (correct arch-dependent subclass). Advances |data| pointer to past the
+// register data. The unique_ptr can be empty, if there were no userspace
+// registers to sample (i.e. we've sampled a kernel thread).
+// TODO(rsavitski): come up with a better signature (also consider how much to
+// isolate libunwindstack types).
+std::unique_ptr<unwindstack::Regs> ReadPerfUserRegsData(const char** data);
+
+}  // namespace profiling
+}  // namespace perfetto
+
+#endif  // SRC_PROFILING_PERF_UNWIND_SUPPORT_H_
diff --git a/tools/gen_android_bp b/tools/gen_android_bp
index c00e697..ebc440a 100755
--- a/tools/gen_android_bp
+++ b/tools/gen_android_bp
@@ -158,11 +158,17 @@
         ('static_libs', {'libasync_safe'}),
         ('header_libs', {'bionic_libc_platform_headers'}),
     ],
-    'perfetto_unittests': [('data', set(enumerate_data_deps())),],
+    'perfetto_unittests': [
+        ('data', set(enumerate_data_deps())),
+        ('include_dirs', {'bionic/libc/kernel'}),
+    ],
     'traced_probes': [
         ('required', {'libperfetto_android_internal', 'trigger_perfetto'}),
     ],
     'libperfetto_android_internal': [('static_libs', {'libhealthhalutils'}),],
+    'traced_perf': [
+        ('include_dirs', {'bionic/libc/kernel'}),
+    ],
     'trace_processor_shell': [
       ('dist', {'targets': ['sdk_repo']}),
       ('stl', 'libc++_static'),