heapprofd_client: avoid destruction re-entrancy issues, and global destructor

(1) use unhooked malloc functions for the g_client shared_ptr.
(2) wrap g_client in a base::NoDestructor to avoid the global destructor
  (and therefore the associated at-exit issues).

See bug & the contents of the patch for a detailed description of the issue.
Plus new e2e test that would've failed without this fix.

Bug: 129749217
Change-Id: I3d1c34ae133e9b9a43d2a6b6be2ceddbc90ba307
diff --git a/src/profiling/memory/client.cc b/src/profiling/memory/client.cc
index 161499b..eba68db 100644
--- a/src/profiling/memory/client.cc
+++ b/src/profiling/memory/client.cc
@@ -116,7 +116,9 @@
 }
 
 // static
-std::shared_ptr<Client> Client::CreateAndHandshake(base::UnixSocketRaw sock) {
+std::shared_ptr<Client> Client::CreateAndHandshake(
+    base::UnixSocketRaw sock,
+    UnhookedAllocator<Client> unhooked_allocator) {
   if (!sock) {
     PERFETTO_DFATAL("Socket not connected.");
     return nullptr;
@@ -197,9 +199,10 @@
 
   PERFETTO_DCHECK(client_config.interval >= 1);
   Sampler sampler{client_config.interval};
-  return std::make_shared<Client>(std::move(sock), client_config,
-                                  std::move(shmem.value()), std::move(sampler),
-                                  FindMainThreadStack());
+  // note: the shared_ptr will retain a copy of the unhooked_allocator
+  return std::allocate_shared<Client>(
+      unhooked_allocator, std::move(sock), client_config,
+      std::move(shmem.value()), std::move(sampler), FindMainThreadStack());
 }
 
 Client::Client(base::UnixSocketRaw sock,
diff --git a/src/profiling/memory/client.h b/src/profiling/memory/client.h
index 9de433c..f39af55 100644
--- a/src/profiling/memory/client.h
+++ b/src/profiling/memory/client.h
@@ -27,6 +27,7 @@
 #include "perfetto/base/unix_socket.h"
 #include "src/profiling/memory/sampler.h"
 #include "src/profiling/memory/shared_ring_buffer.h"
+#include "src/profiling/memory/unhooked_allocator.h"
 #include "src/profiling/memory/wire_protocol.h"
 
 namespace perfetto {
@@ -43,15 +44,23 @@
 //
 // Methods of this class are thread-safe unless otherwise stated, in which case
 // the caller needs to synchronize calls behind a mutex or similar.
+//
+// Implementation warning: this class should not use any heap, as otherwise its
+// destruction would enter the possibly-hooked |free|, which can reference the
+// Client itself. If avoiding the heap is not possible, then look at using
+// UnhookedAllocator.
 class Client {
  public:
   // Returns a client that is ready for sampling allocations, using the given
   // socket (which should already be connected to heapprofd).
   //
   // Returns a shared_ptr since that is how the client will ultimately be used,
-  // and to take advantage of std::make_shared putting the object & the control
-  // block in one block of memory.
-  static std::shared_ptr<Client> CreateAndHandshake(base::UnixSocketRaw sock);
+  // and to take advantage of std::allocate_shared putting the object & the
+  // control block in one block of memory.
+  static std::shared_ptr<Client> CreateAndHandshake(
+      base::UnixSocketRaw sock,
+      UnhookedAllocator<Client> unhooked_allocator);
+
   static base::Optional<base::UnixSocketRaw> ConnectToHeapprofd(
       const std::string& sock_name);
 
@@ -72,8 +81,8 @@
     return sampler_.SampleSize(alloc_size);
   }
 
-  // Public for std::make_shared. Use CreateAndHandshake() to create instances
-  // instead.
+  // Public for std::allocate_shared. Use CreateAndHandshake() to create
+  // instances instead.
   Client(base::UnixSocketRaw sock,
          ClientConfiguration client_config,
          SharedRingBuffer shmem,
diff --git a/src/profiling/memory/heapprofd_end_to_end_test.cc b/src/profiling/memory/heapprofd_end_to_end_test.cc
index 59352f1..525d132 100644
--- a/src/profiling/memory/heapprofd_end_to_end_test.cc
+++ b/src/profiling/memory/heapprofd_end_to_end_test.cc
@@ -27,6 +27,8 @@
 #include <sys/system_properties.h>
 
 #include <fcntl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
 
 // This test only works when run on Android using an Android Q version of
 // Bionic.
@@ -46,8 +48,8 @@
 
 constexpr useconds_t kMsToUs = 1000;
 
-using ::testing::Eq;
 using ::testing::AnyOf;
+using ::testing::Eq;
 
 class HeapprofdDelegate : public ThreadDelegate {
  public:
@@ -779,9 +781,8 @@
     ASSERT_EQ(read(*ack_pipe.rd, buf, sizeof(buf)), 0);
     ack_pipe.rd.reset();
 
-    // TODO(rsavitski): this sleep is to compensate for the heapprofd delaying
-    // in closing the sockets (and therefore the client noticing that the
-    // session is over). Clarify where the delays are coming from.
+    // A brief sleep to allow the client to notice that the profiling session is
+    // to be torn down (as it rejects concurrent sessions).
     usleep(100 * kMsToUs);
 
     PERFETTO_LOG("HeapprofdEndToEnd::Reinit: Starting second");
@@ -845,6 +846,93 @@
     PERFETTO_CHECK(kill(pid, SIGKILL) == 0);
     PERFETTO_CHECK(PERFETTO_EINTR(waitpid(pid, nullptr, 0)) == pid);
   }
+
+  // TODO(rsavitski): fold exit status assertions into existing tests where
+  // possible.
+  void NativeProfilingActiveAtProcessExit() {
+    constexpr uint64_t kTestAllocSize = 128;
+    base::Pipe start_pipe = base::Pipe::Create(base::Pipe::kBothBlock);
+
+    pid_t pid = fork();
+    if (pid == 0) {  // child
+      start_pipe.rd.reset();
+      start_pipe.wr.reset();
+      for (int i = 0; i < 200; i++) {
+        // malloc and leak, otherwise the free batching will cause us to filter
+        // out the allocations (as we don't see the interleaved frees).
+        volatile char* x = static_cast<char*>(malloc(kTestAllocSize));
+        if (x) {
+          x[0] = 'x';
+        }
+        usleep(10 * kMsToUs);
+      }
+      exit(0);
+    }
+
+    ASSERT_NE(pid, -1) << "Failed to fork.";
+    start_pipe.wr.reset();
+
+    // Construct tracing config (without starting profiling).
+    auto helper = GetHelper(&task_runner);
+    TraceConfig trace_config;
+    trace_config.add_buffers()->set_size_kb(10 * 1024);
+    trace_config.set_duration_ms(5000);
+    trace_config.set_flush_timeout_ms(10000);
+
+    auto* ds_config = trace_config.add_data_sources()->mutable_config();
+    ds_config->set_name("android.heapprofd");
+
+    auto* heapprofd_config = ds_config->mutable_heapprofd_config();
+    heapprofd_config->set_sampling_interval_bytes(1);
+    *heapprofd_config->add_pid() = static_cast<uint64_t>(pid);
+
+    // Wait for child to have been scheduled at least once.
+    char buf[1] = {};
+    ASSERT_EQ(PERFETTO_EINTR(read(*start_pipe.rd, buf, sizeof(buf))), 0);
+    start_pipe.rd.reset();
+
+    // Trace until child exits.
+    helper->StartTracing(trace_config);
+
+    siginfo_t siginfo = {};
+    int wait_ret = PERFETTO_EINTR(
+        waitid(P_PID, static_cast<id_t>(pid), &siginfo, WEXITED));
+    ASSERT_FALSE(wait_ret) << "Failed to waitid.";
+
+    // Assert that the child exited successfully.
+    EXPECT_EQ(siginfo.si_code, CLD_EXITED) << "Child did not exit by itself.";
+    EXPECT_EQ(siginfo.si_status, 0) << "Child's exit status not successful.";
+
+    // Assert that we did profile the process.
+    helper->FlushAndWait(2000);
+    helper->DisableTracing();
+    helper->WaitForTracingDisabled(10000);
+    helper->ReadData();
+    helper->WaitForReadData();
+
+    const auto& packets = helper->trace();
+    ASSERT_GT(packets.size(), 0u);
+    size_t profile_packets = 0;
+    size_t samples = 0;
+    uint64_t total_allocated = 0;
+    for (const protos::TracePacket& packet : packets) {
+      if (packet.has_profile_packet() &&
+          packet.profile_packet().process_dumps().size() > 0) {
+        const auto& dumps = packet.profile_packet().process_dumps();
+        ASSERT_EQ(dumps.size(), 1);
+        const protos::ProfilePacket_ProcessHeapSamples& dump = dumps.Get(0);
+        EXPECT_EQ(dump.pid(), pid);
+        profile_packets++;
+        for (const auto& sample : dump.samples()) {
+          samples++;
+          total_allocated += sample.self_allocated();
+        }
+      }
+    }
+    EXPECT_EQ(profile_packets, 1);
+    EXPECT_GT(samples, 0);
+    EXPECT_GT(total_allocated, 0);
+  }
 };
 
 // TODO(b/118428762): unwinding is broken at least x86 emulators, blanket-skip
@@ -1024,6 +1112,25 @@
   ConcurrentSession();
 }
 
+TEST_F(HeapprofdEndToEnd, NativeProfilingActiveAtProcessExit_Central) {
+  if (IsX86())
+    return;
+
+  auto prop = DisableFork();
+  ASSERT_EQ(ReadProperty(kHeapprofdModeProperty, ""), "");
+  NativeProfilingActiveAtProcessExit();
+}
+
+TEST_F(HeapprofdEndToEnd, NativeProfilingActiveAtProcessExit_Fork) {
+  if (IsX86())
+    return;
+
+  // RAII handle that resets to central mode when out of scope.
+  auto prop = EnableFork();
+  ASSERT_EQ(ReadProperty(kHeapprofdModeProperty, ""), "fork");
+  NativeProfilingActiveAtProcessExit();
+}
+
 }  // namespace
 }  // namespace profiling
 }  // namespace perfetto
diff --git a/src/profiling/memory/malloc_hooks.cc b/src/profiling/memory/malloc_hooks.cc
index d8e6b64..4178610 100644
--- a/src/profiling/memory/malloc_hooks.cc
+++ b/src/profiling/memory/malloc_hooks.cc
@@ -32,14 +32,17 @@
 
 #include "perfetto/base/build_config.h"
 #include "perfetto/base/logging.h"
+#include "perfetto/base/no_destructor.h"
 #include "perfetto/base/unix_socket.h"
 #include "perfetto/base/utils.h"
 #include "src/profiling/memory/client.h"
 #include "src/profiling/memory/proc_utils.h"
 #include "src/profiling/memory/scoped_spinlock.h"
+#include "src/profiling/memory/unhooked_allocator.h"
 #include "src/profiling/memory/wire_protocol.h"
 
 using perfetto::profiling::ScopedSpinlock;
+using perfetto::profiling::UnhookedAllocator;
 
 // This is so we can make an so that we can swap out with the existing
 // libc_malloc_hooks.so
@@ -114,14 +117,26 @@
 // This shared_ptr itself is protected by g_client_lock. Note that shared_ptr
 // handles are not thread-safe by themselves:
 // https://en.cppreference.com/w/cpp/memory/shared_ptr/atomic
-std::shared_ptr<perfetto::profiling::Client> g_client;
+//
+// To avoid on-destruction re-entrancy issues, this shared_ptr needs to be
+// constructed with an allocator that uses the unhooked malloc & free functions.
+// See UnhookedAllocator.
+//
+// NoDestructor<> wrapper is used to avoid destructing the shared_ptr at program
+// exit. The rationale is:
+// * Avoiding the atexit destructor racing against other threads that are
+//   possibly running within the hooks.
+// * Making sure that atexit handlers running after this global's destructor
+//   can still safely enter the hooks.
+perfetto::base::NoDestructor<std::shared_ptr<perfetto::profiling::Client>>
+    g_client;
 
 // Protects g_client, and serves as an external lock for sampling decisions (see
 // perfetto::profiling::Sampler).
 //
-// TODO(rsavitski): consider lifting Sampler into this global scope. Nesting
-// under client is not necessary (though it does highlight that their validity
-// is tied together).
+// We rely on this atomic's destuction being a nop, as it is possible for the
+// hooks to attempt to acquire the spinlock after its destructor should have run
+// (technically a use-after-destruct scenario).
 std::atomic<bool> g_client_lock{false};
 
 constexpr char kHeapprofdBinPath[] = "/system/bin/heapprofd";
@@ -134,11 +149,11 @@
 // (concurrent calls get discarded).
 void ShutdownLazy() {
   ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
-  if (!g_client)  // other invocation already initiated shutdown
+  if (!g_client.ref())  // other invocation already initiated shutdown
     return;
 
   // Clear primary shared pointer, such that later hook invocations become nops.
-  g_client.reset();
+  g_client.ref().reset();
 
   if (!android_mallopt(M_RESET_HOOKS, nullptr, 0))
     PERFETTO_PLOG("Unpatching heapprofd hooks failed.");
@@ -180,7 +195,8 @@
   return true;
 }
 
-std::shared_ptr<perfetto::profiling::Client> CreateClientForCentralDaemon() {
+std::shared_ptr<perfetto::profiling::Client> CreateClientForCentralDaemon(
+    UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator) {
   PERFETTO_DLOG("Constructing client for central daemon.");
   using perfetto::profiling::Client;
 
@@ -188,10 +204,12 @@
       Client::ConnectToHeapprofd(perfetto::profiling::kHeapprofdSocketFile);
   if (!sock)
     return nullptr;
-  return Client::CreateAndHandshake(std::move(sock.value()));
+  return Client::CreateAndHandshake(std::move(sock.value()),
+                                    unhooked_allocator);
 }
 
-std::shared_ptr<perfetto::profiling::Client> CreateClientAndPrivateDaemon() {
+std::shared_ptr<perfetto::profiling::Client> CreateClientAndPrivateDaemon(
+    UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator) {
   PERFETTO_DLOG("Setting up fork mode profiling.");
   perfetto::base::UnixSocketRaw parent_sock;
   perfetto::base::UnixSocketRaw child_sock;
@@ -260,8 +278,8 @@
     return nullptr;
   }
 
-  return perfetto::profiling::Client::CreateAndHandshake(
-      std::move(parent_sock));
+  return perfetto::profiling::Client::CreateAndHandshake(std::move(parent_sock),
+                                                         unhooked_allocator);
 }
 
 }  // namespace
@@ -284,14 +302,20 @@
 
   ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
 
-  if (g_client) {
+  if (g_client.ref()) {
     PERFETTO_LOG("Rejecting concurrent profiling initialization.");
     return true;  // success as we're in a valid state
   }
 
+  // The dispatch table never changes, so let the custom allocator retain the
+  // function pointers directly.
+  UnhookedAllocator<perfetto::profiling::Client> unhooked_allocator(
+      malloc_dispatch->malloc, malloc_dispatch->free);
+
   std::shared_ptr<perfetto::profiling::Client> client =
-      ShouldForkPrivateDaemon() ? CreateClientAndPrivateDaemon()
-                                : CreateClientForCentralDaemon();
+      ShouldForkPrivateDaemon()
+          ? CreateClientAndPrivateDaemon(unhooked_allocator)
+          : CreateClientForCentralDaemon(unhooked_allocator);
 
   if (!client) {
     PERFETTO_LOG("Client not initialized, not installing hooks.");
@@ -299,7 +323,7 @@
   }
   PERFETTO_DLOG("Client initialized.");
 
-  g_client = std::move(client);
+  g_client.ref() = std::move(client);
   return true;
 }
 
@@ -324,15 +348,15 @@
   std::shared_ptr<perfetto::profiling::Client> client;
   {
     ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
-    if (!g_client)  // no active client (most likely shutting down)
+    if (!g_client.ref())  // no active client (most likely shutting down)
       return;
 
-    sampled_alloc_sz = g_client->GetSampleSizeLocked(size);
+    sampled_alloc_sz = g_client.ref()->GetSampleSizeLocked(size);
     if (sampled_alloc_sz == 0)  // not sampling
       return;
 
-    client = g_client;  // owning copy
-  }                     // unlock
+    client = g_client.ref();  // owning copy
+  }                           // unlock
 
   if (!client->RecordMalloc(size, sampled_alloc_sz,
                             reinterpret_cast<uint64_t>(addr))) {
@@ -388,7 +412,7 @@
   std::shared_ptr<perfetto::profiling::Client> client;
   {
     ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
-    client = g_client;  // owning copy (or empty)
+    client = g_client.ref();  // owning copy (or empty)
   }
 
   if (client) {
@@ -415,9 +439,9 @@
     ScopedSpinlock s(&g_client_lock, ScopedSpinlock::Mode::Blocking);
     // If there is no active client, we still want to reach the backing realloc,
     // so keep going.
-    if (g_client) {
-      client = g_client;  // owning copy
-      sampled_alloc_sz = g_client->GetSampleSizeLocked(size);
+    if (g_client.ref()) {
+      client = g_client.ref();  // owning copy
+      sampled_alloc_sz = g_client.ref()->GetSampleSizeLocked(size);
     }
   }  // unlock
 
diff --git a/src/profiling/memory/unhooked_allocator.h b/src/profiling/memory/unhooked_allocator.h
new file mode 100644
index 0000000..45a6262
--- /dev/null
+++ b/src/profiling/memory/unhooked_allocator.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_PROFILING_MEMORY_UNHOOKED_ALLOCATOR_H_
+#define SRC_PROFILING_MEMORY_UNHOOKED_ALLOCATOR_H_
+
+#include <stdlib.h>
+
+#include <atomic>
+#include <exception>
+
+namespace perfetto {
+namespace profiling {
+
+// An allocator that uses the given |malloc| & |free| function pointers to
+// allocate/deallocate memory. Used in the heapprofd_client library, which hooks
+// the malloc functions in a target process, and therefore needs to use this
+// allocator to not have its own heap usage enter the hooks.
+//
+// See https://en.cppreference.com/w/cpp/named_req/Allocator.
+template <typename T>
+class UnhookedAllocator {
+ public:
+  template <typename U>
+  friend class UnhookedAllocator;
+
+  // to satisfy the Allocator interface
+  using value_type = T;
+
+  // for implementation convenience
+  using unhooked_malloc_t = void* (*)(size_t);
+  using unhooked_free_t = void (*)(void*);
+
+  UnhookedAllocator(unhooked_malloc_t unhooked_malloc,
+                    unhooked_free_t unhooked_free)
+      : unhooked_malloc_(unhooked_malloc), unhooked_free_(unhooked_free) {}
+
+  template <typename U>
+  constexpr UnhookedAllocator(const UnhookedAllocator<U>& other) noexcept
+      : unhooked_malloc_(other.unhooked_malloc_),
+        unhooked_free_(other.unhooked_free_) {}
+
+  T* allocate(size_t n) {
+    if (n > size_t(-1) / sizeof(T))
+      std::terminate();
+    if (T* p = static_cast<T*>(unhooked_malloc_(n * sizeof(T))))
+      return p;
+    std::terminate();
+  }
+
+  void deallocate(T* p, size_t) noexcept { unhooked_free_(p); }
+
+ private:
+  unhooked_malloc_t unhooked_malloc_ = nullptr;
+  unhooked_free_t unhooked_free_ = nullptr;
+};
+
+template <typename T, typename U>
+bool operator==(const UnhookedAllocator<T>& first,
+                const UnhookedAllocator<U>& second) {
+  return first.unhooked_malloc_ == second.unhooked_malloc_ &&
+         first.unhooked_free_ == second.unhooked_free_;
+}
+
+template <typename T, typename U>
+bool operator!=(const UnhookedAllocator<T>& first,
+                const UnhookedAllocator<U>& second) {
+  return !(first == second);
+}
+
+}  // namespace profiling
+}  // namespace perfetto
+
+#endif  // SRC_PROFILING_MEMORY_UNHOOKED_ALLOCATOR_H_