memunreachable: prevent forking when main thread times out am: 0b1e8d31d2 am: b796351312

Original change: https://googleplex-android-review.googlesource.com/c/platform/system/memory/libmemunreachable/+/18358671

Change-Id: Ie82cf8ccb85010c57dfb6636d5e387a20f4bc083
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/Android.bp b/Android.bp
index 2a3fa1f..68457e5 100644
--- a/Android.bp
+++ b/Android.bp
@@ -48,6 +48,9 @@
         "libc_malloc_debug_backtrace",
         "libprocinfo",
     ],
+    header_libs: [
+        "libgtest_prod_headers",
+    ],
     export_include_dirs: ["include"],
     local_include_dirs: ["include"],
     version_script: "libmemunreachable.map",
@@ -73,6 +76,7 @@
     host_supported: true,
     srcs: [
         "tests/Allocator_test.cpp",
+        "tests/AtomicState_test.cpp",
         "tests/HeapWalker_test.cpp",
         "tests/LeakFolding_test.cpp",
     ],
diff --git a/AtomicState.h b/AtomicState.h
new file mode 100644
index 0000000..365eab9
--- /dev/null
+++ b/AtomicState.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <functional>
+#include <mutex>
+#include <unordered_set>
+
+#include <gtest/gtest_prod.h>
+
+#include "android-base/macros.h"
+
+namespace android {
+
+/*
+ * AtomicState manages updating or waiting on a state enum between multiple threads.
+ */
+template <typename T>
+class AtomicState {
+ public:
+  explicit AtomicState(T state) : state_(state) {}
+  ~AtomicState() = default;
+
+  /*
+   * Set the state to `to`.  Wakes up any waiters that are waiting on the new state.
+   */
+  void set(T to) {
+    std::lock_guard<std::mutex> lock(m_);
+    state_ = to;
+    cv_.notify_all();
+  }
+
+  /*
+   * If the state is `from`, change it to `to` and return true.  Otherwise don't change
+   * it and return false.  If the state is changed, wakes up any waiters that are waiting
+   * on the new state.
+   */
+  bool transition(T from, T to) {
+    return transition_or(from, to, [&] { return state_; });
+  }
+
+  /*
+   * If the state is `from`, change it to `to` and return true.  Otherwise, call `or_func`,
+   * set the state to the value it returns and return false.  Wakes up any waiters that are
+   * waiting on the new state.
+   */
+  bool transition_or(T from, T to, const std::function<T()>& orFunc) {
+    std::lock_guard<std::mutex> lock(m_);
+
+    bool failed = false;
+    if (state_ == from) {
+      state_ = to;
+    } else {
+      failed = true;
+      state_ = orFunc();
+    }
+    cv_.notify_all();
+
+    return !failed;
+  }
+
+  /*
+   * Block until the state is either `state1` or `state2`, or the time limit is reached.
+   * Returns true if the time limit was not reached, false if it was reached.
+   */
+  bool wait_for_either_of(T state1, T state2, std::chrono::milliseconds ms) {
+    std::unique_lock<std::mutex> lock(m_);
+    bool success = cv_.wait_for(lock, ms, [&] { return state_ == state1 || state_ == state2; });
+    return success;
+  }
+
+ private:
+  T state_;
+  std::mutex m_;
+  std::condition_variable cv_;
+
+  FRIEND_TEST(AtomicStateTest, transition);
+  FRIEND_TEST(AtomicStateTest, wait);
+
+  DISALLOW_COPY_AND_ASSIGN(AtomicState);
+};
+
+}  // namespace android
diff --git a/MemUnreachable.cpp b/MemUnreachable.cpp
index b02a77e..d37819a 100644
--- a/MemUnreachable.cpp
+++ b/MemUnreachable.cpp
@@ -29,6 +29,7 @@
 #include <backtrace.h>
 
 #include "Allocator.h"
+#include "AtomicState.h"
 #include "Binder.h"
 #include "HeapWalker.h"
 #include "Leak.h"
@@ -37,7 +38,6 @@
 #include "ProcessMappings.h"
 #include "PtracerThread.h"
 #include "ScopedDisableMalloc.h"
-#include "Semaphore.h"
 #include "ThreadCapture.h"
 
 #include "bionic.h"
@@ -282,6 +282,13 @@
   return (val == 1) ? "" : "s";
 }
 
+enum State {
+  STARTING = 0,
+  PAUSING,
+  COLLECTING,
+  ABORT,
+};
+
 bool GetUnreachableMemory(UnreachableMemoryInfo& info, size_t limit) {
   if (info.version > 0) {
     MEM_ALOGE("unsupported UnreachableMemoryInfo.version %zu in GetUnreachableMemory",
@@ -294,7 +301,7 @@
 
   Heap heap;
 
-  Semaphore continue_parent_sem;
+  AtomicState<State> state(STARTING);
   LeakPipe pipe;
 
   PtracerThread thread{[&]() -> int {
@@ -303,6 +310,13 @@
     /////////////////////////////////////////////
     MEM_ALOGI("collecting thread info for process %d...", parent_pid);
 
+    if (!state.transition_or(STARTING, PAUSING, [&] {
+          MEM_ALOGI("collecting thread expected state STARTING, aborting");
+          return ABORT;
+        })) {
+      return 1;
+    }
+
     ThreadCapture thread_capture(parent_pid, heap);
     allocator::vector<ThreadInfo> thread_info(heap);
     allocator::vector<Mapping> mappings(heap);
@@ -310,24 +324,34 @@
 
     // ptrace all the threads
     if (!thread_capture.CaptureThreads()) {
-      continue_parent_sem.Post();
+      state.set(ABORT);
       return 1;
     }
 
     // collect register contents and stacks
     if (!thread_capture.CapturedThreadInfo(thread_info)) {
-      continue_parent_sem.Post();
+      state.set(ABORT);
       return 1;
     }
 
     // snapshot /proc/pid/maps
     if (!ProcessMappings(parent_pid, mappings)) {
-      continue_parent_sem.Post();
+      state.set(ABORT);
       return 1;
     }
 
     if (!BinderReferences(refs)) {
-      continue_parent_sem.Post();
+      state.set(ABORT);
+      return 1;
+    }
+
+    // Atomically update the state from PAUSING to COLLECTING.
+    // The main thread may have given up waiting for this thread to finish
+    // pausing, in which case it will have changed the state to ABORT.
+    if (!state.transition_or(PAUSING, COLLECTING, [&] {
+          MEM_ALOGI("collecting thread aborting");
+          return ABORT;
+        })) {
       return 1;
     }
 
@@ -337,7 +361,6 @@
     // can drop the malloc locks, it will block until the collection thread
     // exits.
     thread_capture.ReleaseThread(parent_tid);
-    continue_parent_sem.Post();
 
     // fork a process to do the heap walking
     int ret = fork();
@@ -400,7 +423,15 @@
 
     // Wait for the collection thread to signal that it is ready to fork the
     // heap walker process.
-    continue_parent_sem.Wait(30s);
+    if (!state.wait_for_either_of(COLLECTING, ABORT, 30s)) {
+      // The pausing didn't finish within 30 seconds, attempt to atomically
+      // update the state from PAUSING to ABORT.  The collecting thread
+      // may have raced with the timeout and already updated the state to
+      // COLLECTING, in which case aborting is not necessary.
+      if (state.transition(PAUSING, ABORT)) {
+        MEM_ALOGI("main thread timed out waiting for collecting thread");
+      }
+    }
 
     // Re-enable malloc so the collection thread can fork.
   }
diff --git a/Semaphore.h b/Semaphore.h
deleted file mode 100644
index cd73972..0000000
--- a/Semaphore.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBMEMUNREACHABLE_SEMAPHORE_H_
-#define LIBMEMUNREACHABLE_SEMAPHORE_H_
-
-#include <chrono>
-#include <mutex>
-
-#include "android-base/macros.h"
-
-namespace android {
-
-class Semaphore {
- public:
-  explicit Semaphore(int count = 0) : count_(count) {}
-  ~Semaphore() = default;
-
-  void Wait(std::chrono::milliseconds ms) {
-    std::unique_lock<std::mutex> lk(m_);
-    cv_.wait_for(lk, ms, [&] {
-      if (count_ > 0) {
-        count_--;
-        return true;
-      }
-      return false;
-    });
-  }
-  void Post() {
-    {
-      std::lock_guard<std::mutex> lk(m_);
-      count_++;
-    }
-    cv_.notify_one();
-  }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(Semaphore);
-
-  int count_;
-  std::mutex m_;
-  std::condition_variable cv_;
-};
-
-}  // namespace android
-
-#endif  // LIBMEMUNREACHABLE_SEMAPHORE_H_
diff --git a/tests/AtomicState_test.cpp b/tests/AtomicState_test.cpp
new file mode 100644
index 0000000..7fb2806
--- /dev/null
+++ b/tests/AtomicState_test.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <AtomicState.h>
+
+#include <chrono>
+#include <thread>
+
+#include <gtest/gtest.h>
+
+using namespace std::chrono_literals;
+
+namespace android {
+
+enum AtomicStateTestEnum {
+  A,
+  B,
+  C,
+  D,
+  E,
+};
+
+class AtomicStateTest : public testing::Test {
+ protected:
+  AtomicStateTest() : state_(A) {}
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  AtomicState<AtomicStateTestEnum> state_;
+};
+
+TEST_F(AtomicStateTest, transition) {
+  ASSERT_EQ(A, state_.state_);
+
+  // Starts as A, transition from B fails
+  ASSERT_FALSE(state_.transition(B, C));
+  ASSERT_EQ(A, state_.state_);
+
+  // transition from A to B
+  ASSERT_TRUE(state_.transition(A, B));
+  ASSERT_EQ(B, state_.state_);
+
+  // State is B, transition from A fails
+  ASSERT_FALSE(state_.transition(A, B));
+  ASSERT_EQ(B, state_.state_);
+
+  // State is B, transition_or from A calls the lambda
+  bool lambda = false;
+  bool already_locked = false;
+  state_.transition_or(A, B, [&] {
+    // The lock should be held in the lambda
+    if (state_.m_.try_lock()) {
+      state_.m_.unlock();
+    } else {
+      already_locked = true;
+    }
+    lambda = true;
+    return B;
+  });
+  ASSERT_TRUE(lambda);
+  ASSERT_TRUE(already_locked);
+  ASSERT_EQ(B, state_.state_);
+
+  // State is C, transition_or from B to C does not call the lambda
+  lambda = false;
+  state_.transition_or(B, C, [&] {
+    lambda = true;
+    return C;
+  });
+  ASSERT_FALSE(lambda);
+  ASSERT_EQ(C, state_.state_);
+}
+
+TEST_F(AtomicStateTest, wait) {
+  ASSERT_EQ(A, state_.state_);
+
+  // Starts as A, wait_for_either_of B, C returns false
+  ASSERT_FALSE(state_.wait_for_either_of(B, C, 10ms));
+
+  // Starts as A, wait_for_either_of A, B returns true
+  ASSERT_TRUE(state_.wait_for_either_of(A, B, 1s));
+
+  {
+    std::thread t([&] {
+      usleep(10000);
+      state_.set(B);
+    });
+
+    // Wait ing for B or C returns true after state is set to B
+    ASSERT_TRUE(state_.wait_for_either_of(B, C, 1s));
+
+    t.join();
+  }
+
+  ASSERT_EQ(B, state_.state_);
+  {
+    std::thread t([&] {
+      usleep(10000);
+      state_.transition(B, C);
+    });
+
+    // Waiting for A or C returns true after state is transitioned to C
+    ASSERT_TRUE(state_.wait_for_either_of(A, C, 1s));
+
+    t.join();
+  }
+
+  ASSERT_EQ(C, state_.state_);
+  {
+    std::thread t([&] {
+      usleep(10000);
+      state_.transition(C, D);
+    });
+
+    // Waiting for A or B returns false after state is transitioned to D
+    ASSERT_FALSE(state_.wait_for_either_of(A, B, 100ms));
+
+    t.join();
+  }
+}
+
+}  // namespace android