Implement Intel QuasiAtomics.

Don't use striped locks for 64-bit atomics on x86.
Replace QuasiAtomic::Swap with QuasiAtomic::Write, which fits our current use
of Swap and is closer to Intel's implementation.
Make AtomicLong report that MIPS doesn't support 64-bit compare-and-exchange.
Set the SSE2 flag for host and target Intel ART builds as our codegen assumes
it.

Change-Id: Ic1cd5c3b06838e42c6f94e0dd91e77a2d0bb5868
diff --git a/src/atomic.cc b/src/atomic.cc
index 5bbbb4f..e7bab09 100644
--- a/src/atomic.cc
+++ b/src/atomic.cc
@@ -16,84 +16,94 @@
 
 #include "atomic.h"
 
-#include <pthread.h>
+#define NEED_SWAP_MUTEXES !defined(__arm__) && !defined(__i386__)
 
+#if NEED_SWAP_MUTEXES
 #include <vector>
-
 #include "base/mutex.h"
 #include "base/stl_util.h"
 #include "base/stringprintf.h"
 #include "thread.h"
-
-#if defined(__APPLE__)
-#include <libkern/OSAtomic.h>
-#endif
-#if defined(__arm__)
-#include <machine/cpu-features.h>
 #endif
 
 namespace art {
 
-#if defined(HAVE_MACOSX_IPC)
-#define NEED_MAC_QUASI_ATOMICS 1
+#if NEED_SWAP_MUTEXES
+// We stripe across a bunch of different mutexes to reduce contention.
+static const size_t kSwapMutexCount = 32;
+static std::vector<Mutex*>* gSwapMutexes;
 
-#elif defined(__i386__) || defined(__x86_64__)
-#define NEED_PTHREADS_QUASI_ATOMICS 1
-
-#elif defined(__mips__)
-#define NEED_PTHREADS_QUASI_ATOMICS 1
-
-#elif defined(__arm__)
-
-#if defined(__ARM_HAVE_LDREXD)
-#define NEED_ARM_LDREXD_QUASI_ATOMICS 1
-#else
-#define NEED_PTHREADS_QUASI_ATOMICS 1
+static Mutex& GetSwapMutex(const volatile int64_t* addr) {
+  return *(*gSwapMutexes)[((unsigned)(void*)(addr) >> 3U) % kSwapMutexCount];
+}
 #endif
 
-#else
-#error "QuasiAtomic unsupported on this platform"
+void QuasiAtomic::Startup() {
+#if NEED_SWAP_MUTEXES
+  gSwapMutexes = new std::vector<Mutex*>;
+  for (size_t i = 0; i < kSwapMutexCount; ++i) {
+    gSwapMutexes->push_back(new Mutex(StringPrintf("QuasiAtomic stripe %d", i).c_str()));
+  }
 #endif
-
-// *****************************************************************************
-
-#if NEED_ARM_LDREXD_QUASI_ATOMICS
-
-static inline int64_t QuasiAtomicSwap64Impl(int64_t new_value, volatile int64_t* addr) {
-  int64_t prev;
-  int status;
-  do {
-    __asm__ __volatile__("@ QuasiAtomic::Swap64\n"
-        "ldrexd     %0, %H0, [%3]\n"
-        "strexd     %1, %4, %H4, [%3]"
-        : "=&r" (prev), "=&r" (status), "+m"(*addr)
-        : "r" (addr), "r" (new_value)
-        : "cc");
-  } while (__builtin_expect(status != 0, 0));
-  return prev;
 }
 
-int64_t QuasiAtomic::Swap64(int64_t new_value, volatile int64_t* addr) {
-  return QuasiAtomicSwap64Impl(new_value, addr);
-}
-
-int64_t QuasiAtomic::Swap64Sync(int64_t new_value, volatile int64_t* addr) {
-  ANDROID_MEMBAR_STORE();
-  int64_t old_value = QuasiAtomicSwap64Impl(new_value, addr);
-  ANDROID_MEMBAR_FULL();
-  return old_value;
+void QuasiAtomic::Shutdown() {
+#if NEED_SWAP_MUTEXES
+  STLDeleteElements(gSwapMutexes);
+  delete gSwapMutexes;
+#endif
 }
 
 int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
   int64_t value;
+#if defined(__arm__)
+  // Exclusive loads are defined not to tear, clearing the exclusive state isn't necessary. If we
+  // have LPAE (such as Cortex-A15) then ldrd would suffice.
   __asm__ __volatile__("@ QuasiAtomic::Read64\n"
       "ldrexd     %0, %H0, [%1]"
       : "=&r" (value)
       : "r" (addr));
+#elif defined(__i386__)
+  __asm__ __volatile__(
+      "movq     %1, %0\n"
+      : "=x" (value)
+      : "m" (*addr));
+#else
+  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+  return *addr;
+#endif
   return value;
 }
 
-int QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+void QuasiAtomic::Write64(volatile int64_t* addr, int64_t value) {
+#if defined(__arm__)
+  // The write is done as a swap so that the cache-line is in the exclusive state for the store. If
+  // we know that ARM architecture has LPAE (such as Cortex-A15) this isn't necessary and strd will
+  // suffice.
+  int64_t prev;
+  int status;
+  do {
+    __asm__ __volatile__("@ QuasiAtomic::Write64\n"
+        "ldrexd     %0, %H0, [%3]\n"
+        "strexd     %1, %4, %H4, [%3]"
+        : "=&r" (prev), "=&r" (status), "+m"(*addr)
+        : "r" (addr), "r" (value)
+        : "cc");
+  } while (__builtin_expect(status != 0, 0));
+#elif defined(__i386__)
+  __asm__ __volatile__(
+      "movq     %1, %0"
+      : "=m" (*addr)
+      : "x" (value));
+#else
+  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+  *addr = value;
+#endif
+}
+
+
+bool QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+#if defined(__arm__)
   int64_t prev;
   int status;
   do {
@@ -108,103 +118,37 @@
         : "cc");
   } while (__builtin_expect(status != 0, 0));
   return prev != old_value;
-}
-
-#endif
-
-// *****************************************************************************
-
-#if NEED_MAC_QUASI_ATOMICS
-
-static inline int64_t QuasiAtomicSwap64Impl(int64_t value, volatile int64_t* addr) {
-  int64_t old_value;
-  do {
-    old_value = *addr;
-  } while (QuasiAtomic::Cas64(old_value, value, addr));
-  return old_value;
-}
-
-int64_t QuasiAtomic::Swap64(int64_t value, volatile int64_t* addr) {
-  return QuasiAtomicSwap64Impl(value, addr);
-}
-
-int64_t QuasiAtomic::Swap64Sync(int64_t value, volatile int64_t* addr) {
-  ANDROID_MEMBAR_STORE();
-  int64_t old_value = QuasiAtomicSwap64Impl(value, addr);
-  // TUNING: barriers can be avoided on some architectures.
-  ANDROID_MEMBAR_FULL();
-  return old_value;
-}
-
-int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
-  return OSAtomicAdd64Barrier(0, const_cast<volatile int64_t*>(addr));
-}
-
-int QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
-  return OSAtomicCompareAndSwap64Barrier(old_value, new_value, const_cast<int64_t*>(addr)) == 0;
-}
-
-#endif
-
-// *****************************************************************************
-
-#if NEED_PTHREADS_QUASI_ATOMICS
-
-// In the absence of a better implementation, we implement the 64-bit atomic
-// operations through mutex locking.
-
-// We stripe across a bunch of different mutexes to reduce contention.
-static const size_t kSwapLockCount = 32;
-static std::vector<Mutex*>* gSwapLocks;
-
-void QuasiAtomic::Startup() {
-  gSwapLocks = new std::vector<Mutex*>;
-  for (size_t i = 0; i < kSwapLockCount; ++i) {
-    gSwapLocks->push_back(new Mutex(StringPrintf("QuasiAtomic stripe %d", i).c_str()));
-  }
-}
-
-void QuasiAtomic::Shutdown() {
-  STLDeleteElements(gSwapLocks);
-  delete gSwapLocks;
-}
-
-static inline Mutex& GetSwapLock(const volatile int64_t* addr) {
-  return *(*gSwapLocks)[((unsigned)(void*)(addr) >> 3U) % kSwapLockCount];
-}
-
-int64_t QuasiAtomic::Swap64(int64_t value, volatile int64_t* addr) {
-  MutexLock mu(Thread::Current(), GetSwapLock(addr));
-  int64_t old_value = *addr;
-  *addr = value;
-  return old_value;
-}
-
-int64_t QuasiAtomic::Swap64Sync(int64_t value, volatile int64_t* addr) {
-  // Same as QuasiAtomicSwap64 - mutex handles barrier.
-  return QuasiAtomic::Swap64(value, addr);
-}
-
-int QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
-  MutexLock mu(Thread::Current(), GetSwapLock(addr));
-  if (*addr == old_value) {
-    *addr  = new_value;
-    return 0;
-  }
-  return 1;
-}
-
-int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
-  MutexLock mu(Thread::Current(), GetSwapLock(addr));
-  return *addr;
-}
-
+#elif defined(__i386__)
+  // cmpxchg8b implicitly uses %ebx which is also the PIC register.
+  int8_t status;
+  __asm__ __volatile__ (
+      "pushl          %%ebx\n"
+      "movl           (%3), %%ebx\n"
+      "movl           4(%3), %%ecx\n"
+      "lock cmpxchg8b %1\n"
+      "sete           %0\n"
+      "popl           %%ebx"
+      : "=R" (status), "+m" (*addr)
+      : "A"(old_value), "D" (&new_value)
+      : "%ecx"
+      );
+  return status != 0;
 #else
-
-// The other implementations don't need any special setup.
-void QuasiAtomic::Startup() {}
-void QuasiAtomic::Shutdown() {}
-
+  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+  if (*addr == old_value) {
+    *addr = new_value;
+    return true;
+  }
+  return false;
 #endif
+}
+
+bool QuasiAtomic::LongAtomicsUseMutexes() {
+#if NEED_SWAP_MUTEXES
+  return true;
+#else
+  return false;
+#endif
+}
 
 }  // namespace art