Improve scoped spinlock implementations

Both ScopedAllMutexesLock and ScopedExpectedMutexesOnWeakRefAccessLock
really implement simple spinlocks, but they did so in an unorthodox
way, with a CAS loop on unlock. I see no reason for that: the unlocking
thread is the only possible writer at that point, so a plain release
store suffices. Use the standard (and faster) idiom instead.
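
For reference, a minimal sketch of that idiom in terms of std::atomic
(illustrative only; the actual change below uses ART's Atomic<> wrapper
on a pointer-sized guard word, and the SpinLockSketch name is made up):

  #include <atomic>

  class SpinLockSketch {
   public:
    void Lock() {
      int expected = 0;
      // Acquire CAS loop: only the locking side has to win a race.
      while (!word_.compare_exchange_weak(expected, 1, std::memory_order_acquire)) {
        expected = 0;  // compare_exchange_weak rewrites 'expected' on failure.
      }
    }
    void Unlock() {
      // The lock holder is the sole writer here, so a plain release store
      // is enough; no CAS (and no retry loop) is needed on unlock.
      word_.store(0, std::memory_order_release);
    }
   private:
    std::atomic<int> word_{0};
  };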

The NanoSleep(100) waiting logic was probably suboptimal and definitely
misleading. I timed NanoSleep(100) on a Linux 4.4 host, and it takes
about 60 usecs, i.e. 60,000 nsecs. By comparison, a no-op sched_yield()
takes about 1 usec. This change replaces it with back-off logic that
should be generically usable. That is no doubt overkill here, but the
hope is that we can eventually reuse it where it matters more.
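
A minimal sketch of the escalation policy, written against standard C++
facilities rather than ART's NanoSleep(); the constants mirror the ones
added below, but the BackOffSketch name is hypothetical:

  #include <chrono>
  #include <cstdint>
  #include <thread>

  static void BackOffSketch(uint32_t i) {
    constexpr uint32_t kSpinMax = 10;   // short contention: just spin
    constexpr uint32_t kYieldMax = 20;  // medium contention: yield (~1 usec)
    if (i <= kSpinMax) {
      // Busy-wait briefly; 'volatile' keeps the loop from being optimized away.
      for (volatile uint32_t x = 0; x < 10 * i; ++x) {
      }
    } else if (i <= kYieldMax) {
      std::this_thread::yield();
    } else {
      // Long contention: sleep, with the sleep time growing in i.
      std::this_thread::sleep_for(std::chrono::nanoseconds(1000u * (i - kYieldMax)));
    }
  }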

Test: Built and booted AOSP.

Change-Id: I6e47508ecb8d5e5d0b4f08c8e8f073ad7b1d192e
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 3adebe7..7324dff 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -102,18 +102,40 @@
 }
 #endif
 
+// Wait for an amount of time that roughly increases in the argument i.
+// Spin for small arguments and yield/sleep for longer ones.
+static void BackOff(uint32_t i) {
+  static constexpr uint32_t kSpinMax = 10;
+  static constexpr uint32_t kYieldMax = 20;
+  if (i <= kSpinMax) {
+    // TODO: Esp. in very latency-sensitive cases, consider replacing this with an explicit
+    // test-and-test-and-set loop in the caller.  Possibly skip entirely on a uniprocessor.
+    volatile uint32_t x = 0;
+    const uint32_t spin_count = 10 * i;
+    for (uint32_t spin = 0; spin < spin_count; ++spin) {
+      ++x;  // Volatile; hence should not be optimized away.
+    }
+    // TODO: Consider adding x86 PAUSE and/or ARM YIELD here.
+  } else if (i <= kYieldMax) {
+    sched_yield();
+  } else {
+    NanoSleep(1000ull * (i - kYieldMax));
+  }
+}
+
 class ScopedAllMutexesLock FINAL {
  public:
   explicit ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) {
-    while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakAcquire(0, mutex)) {
-      NanoSleep(100);
+    for (uint32_t i = 0;
+         !gAllMutexData->all_mutexes_guard.CompareExchangeWeakAcquire(0, mutex);
+         ++i) {
+      BackOff(i);
     }
   }
 
   ~ScopedAllMutexesLock() {
-    while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakRelease(mutex_, 0)) {
-      NanoSleep(100);
-    }
+    DCHECK_EQ(gAllMutexData->all_mutexes_guard.LoadRelaxed(), mutex_);
+    gAllMutexData->all_mutexes_guard.StoreRelease(0);
   }
 
  private:
@@ -123,17 +145,16 @@
 class Locks::ScopedExpectedMutexesOnWeakRefAccessLock FINAL {
  public:
   explicit ScopedExpectedMutexesOnWeakRefAccessLock(const BaseMutex* mutex) : mutex_(mutex) {
-    while (!Locks::expected_mutexes_on_weak_ref_access_guard_.CompareExchangeWeakAcquire(0,
-                                                                                         mutex)) {
-      NanoSleep(100);
+    for (uint32_t i = 0;
+         !Locks::expected_mutexes_on_weak_ref_access_guard_.CompareExchangeWeakAcquire(0, mutex);
+         ++i) {
+      BackOff(i);
     }
   }
 
   ~ScopedExpectedMutexesOnWeakRefAccessLock() {
-    while (!Locks::expected_mutexes_on_weak_ref_access_guard_.CompareExchangeWeakRelease(mutex_,
-                                                                                         0)) {
-      NanoSleep(100);
-    }
+    DCHECK_EQ(Locks::expected_mutexes_on_weak_ref_access_guard_.LoadRelaxed(), mutex_);
+    Locks::expected_mutexes_on_weak_ref_access_guard_.StoreRelease(0);
   }
 
  private: