Merge "Fix host-run-test-jar to be legal for /bin/sh"
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 35d04ae..5cc906f 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -203,10 +203,10 @@
 enum ConditionCode {
   kCondEq,  // equal
   kCondNe,  // not equal
-  kCondCs,  // carry set (unsigned less than)
-  kCondUlt = kCondCs,
-  kCondCc,  // carry clear (unsigned greater than or same)
-  kCondUge = kCondCc,
+  kCondCs,  // carry set
+  kCondCc,  // carry clear
+  kCondUlt,  // unsigned less than
+  kCondUge,  // unsigned greater than or same
   kCondMi,  // minus
   kCondPl,  // plus, positive or zero
   kCondVs,  // overflow
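
The backend changes that follow all stem from this enum split: kCondUlt and kCondUge now name the unsigned relation directly, and each target maps them onto its own flag encoding (ARM: Ult -> CC/LO, Uge -> CS/HS; x86: Ult -> C/B, Uge -> NC/AE). A minimal standalone sketch, not part of the patch and using a made-up Evaluate() helper, of why the unsigned codes are the natural ones for bounds checks:

#include <cassert>
#include <cstdint>

enum ConditionCode { kCondUlt, kCondUge };  // only the two unsigned codes, for illustration

static bool Evaluate(ConditionCode cc, uint32_t lhs, uint32_t rhs) {
  return (cc == kCondUlt) ? (lhs < rhs) : (lhs >= rhs);
}

int main() {
  // An array bounds check throws when index >= length compared as unsigned, so a
  // negative index (-1 reinterpreted as 0xffffffff) is rejected by the same test.
  uint32_t index = static_cast<uint32_t>(-1);
  uint32_t length = 10;
  assert(Evaluate(kCondUge, index, length));  // out of bounds -> kThrowArrayBounds
  return 0;
}
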
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 8226b24..661050f 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -608,7 +608,7 @@
   }
   if (!skip_overflow_check) {
     OpRegRegImm(kOpSub, rARM_LR, rARM_SP, frame_size_ - (spill_count * 4));
-    GenRegRegCheck(kCondCc, rARM_LR, r12, kThrowStackOverflow);
+    GenRegRegCheck(kCondUlt, rARM_LR, r12, kThrowStackOverflow);
     OpRegCopy(rARM_SP, rARM_LR);     // Establish stack
   } else {
     OpRegImm(kOpSub, rARM_SP, frame_size_ - (spill_count * 4));
diff --git a/compiler/dex/quick/arm/fp_arm.cc b/compiler/dex/quick/arm/fp_arm.cc
index dc2e0d0..8af9cdd 100644
--- a/compiler/dex/quick/arm/fp_arm.cc
+++ b/compiler/dex/quick/arm/fp_arm.cc
@@ -216,7 +216,7 @@
       break;
     case kCondGe:
       if (gt_bias) {
-        ccode = kCondCs;
+        ccode = kCondUge;
       }
       break;
     default:
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index e839fe5..86ae75e 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -128,31 +128,23 @@
   int32_t low_reg = rl_src1.low_reg;
   int32_t high_reg = rl_src1.high_reg;
 
+  if (val == 0 && (ccode == kCondEq || ccode == kCondNe)) {
+    int t_reg = AllocTemp();
+    NewLIR4(kThumb2OrrRRRs, t_reg, low_reg, high_reg, 0);
+    FreeTemp(t_reg);
+    OpCondBranch(ccode, taken);
+    return;
+  }
+
   switch (ccode) {
     case kCondEq:
     case kCondNe:
-      LIR* target;
-      ConditionCode condition;
-      if (ccode == kCondEq) {
-        target = not_taken;
-        condition = kCondEq;
-      } else {
-        target = taken;
-        condition = kCondNe;
-      }
-      if (val == 0) {
-        int t_reg = AllocTemp();
-        NewLIR4(kThumb2OrrRRRs, t_reg, low_reg, high_reg, 0);
-        FreeTemp(t_reg);
-        OpCondBranch(condition, taken);
-        return;
-      }
-      OpCmpImmBranch(kCondNe, high_reg, val_hi, target);
+      OpCmpImmBranch(kCondNe, high_reg, val_hi, (ccode == kCondEq) ? not_taken : taken);
       break;
     case kCondLt:
       OpCmpImmBranch(kCondLt, high_reg, val_hi, taken);
       OpCmpImmBranch(kCondGt, high_reg, val_hi, not_taken);
-      ccode = kCondCc;
+      ccode = kCondUlt;
       break;
     case kCondLe:
       OpCmpImmBranch(kCondLt, high_reg, val_hi, taken);
@@ -167,7 +159,7 @@
     case kCondGe:
       OpCmpImmBranch(kCondGt, high_reg, val_hi, taken);
       OpCmpImmBranch(kCondLt, high_reg, val_hi, not_taken);
-      ccode = kCondCs;
+      ccode = kCondUge;
       break;
     default:
       LOG(FATAL) << "Unexpected ccode: " << ccode;
@@ -187,7 +179,7 @@
     rl_result = EvalLoc(rl_dest, kCoreReg, true);
     if ((true_val == 1) && (false_val == 0)) {
       OpRegRegImm(kOpRsub, rl_result.low_reg, rl_src.low_reg, 1);
-      OpIT(kCondCc, "");
+      OpIT(kCondUlt, "");
       LoadConstant(rl_result.low_reg, 0);
       GenBarrier();  // Add a scheduling barrier to keep the IT shadow intact
     } else if (InexpensiveConstantInt(true_val) && InexpensiveConstantInt(false_val)) {
@@ -238,9 +230,7 @@
   // Normalize such that if either operand is constant, src2 will be constant.
   ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
   if (rl_src1.is_const) {
-    RegLocation rl_temp = rl_src1;
-    rl_src1 = rl_src2;
-    rl_src2 = rl_temp;
+    std::swap(rl_src1, rl_src2);
     ccode = FlipComparisonOrder(ccode);
   }
   if (rl_src2.is_const) {
@@ -268,7 +258,7 @@
     case kCondLt:
       OpCondBranch(kCondLt, taken);
       OpCondBranch(kCondGt, not_taken);
-      ccode = kCondCc;
+      ccode = kCondUlt;
       break;
     case kCondLe:
       OpCondBranch(kCondLt, taken);
@@ -283,7 +273,7 @@
     case kCondGe:
       OpCondBranch(kCondGt, taken);
       OpCondBranch(kCondLt, not_taken);
-      ccode = kCondCs;
+      ccode = kCondUge;
       break;
     default:
       LOG(FATAL) << "Unexpected ccode: " << ccode;
@@ -701,7 +691,7 @@
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   OpRegRegImm(kOpRsub, rl_result.low_reg, r_tmp, 1);
   DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
-  OpIT(kCondCc, "");
+  OpIT(kCondUlt, "");
   LoadConstant(rl_result.low_reg, 0); /* cc */
   FreeTemp(r_tmp);  // Now unneeded.
 
@@ -760,10 +750,10 @@
   int dmb_flavor;
   // TODO: revisit Arm barrier kinds
   switch (barrier_kind) {
-    case kLoadStore: dmb_flavor = kSY; break;
-    case kLoadLoad: dmb_flavor = kSY; break;
-    case kStoreStore: dmb_flavor = kST; break;
-    case kStoreLoad: dmb_flavor = kSY; break;
+    case kLoadStore: dmb_flavor = kISH; break;
+    case kLoadLoad: dmb_flavor = kISH; break;
+    case kStoreStore: dmb_flavor = kISHST; break;
+    case kStoreLoad: dmb_flavor = kISH; break;
     default:
       LOG(FATAL) << "Unexpected MemBarrierKind: " << barrier_kind;
       dmb_flavor = kSY;  // quiet gcc.
@@ -981,9 +971,7 @@
     rl_result = EvalLoc(rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      // TODO: change kCondCS to a more meaningful name, is the sense of
-      // carry-set/clear flipped?
-      GenRegRegCheck(kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      GenRegRegCheck(kCondUge, rl_index.low_reg, reg_len, kThrowArrayBounds);
       FreeTemp(reg_len);
     }
     LoadBaseIndexed(reg_ptr, rl_index.low_reg, rl_result.low_reg, scale, size);
@@ -1072,7 +1060,7 @@
     OpRegRegImm(kOpAdd, reg_ptr, rl_array.low_reg, data_offset);
     rl_src = LoadValue(rl_src, reg_class);
     if (needs_range_check) {
-      GenRegRegCheck(kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      GenRegRegCheck(kCondUge, rl_index.low_reg, reg_len, kThrowArrayBounds);
       FreeTemp(reg_len);
     }
     StoreBaseIndexed(reg_ptr, rl_index.low_reg, rl_src.low_reg,
@@ -1172,9 +1160,7 @@
     // Normalize
     if (!rl_src2.is_const) {
       DCHECK(rl_src1.is_const);
-      RegLocation rl_temp = rl_src1;
-      rl_src1 = rl_src2;
-      rl_src2 = rl_temp;
+      std::swap(rl_src1, rl_src2);
     }
   }
   if (BadOverlap(rl_src1, rl_dest)) {
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 48c9af5..d80ae3b 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -207,6 +207,8 @@
     case kCondNe: res = kArmCondNe; break;
     case kCondCs: res = kArmCondCs; break;
     case kCondCc: res = kArmCondCc; break;
+    case kCondUlt: res = kArmCondCc; break;
+    case kCondUge: res = kArmCondCs; break;
     case kCondMi: res = kArmCondMi; break;
     case kCondPl: res = kArmCondPl; break;
     case kCondVs: res = kArmCondVs; break;
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 8a8b168..fa05d6c 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -210,6 +210,9 @@
 }
 
 LIR* ArmMir2Lir::OpCondBranch(ConditionCode cc, LIR* target) {
+  // This uses kThumb2BCond rather than kThumbBCond for performance reasons: when a
+  // kThumbBCond branch has to be fixed up to kThumb2BCond, the extra assembly pass
+  // that the fix-up forces is substantial.
   LIR* branch = NewLIR2(kThumb2BCond, 0 /* offset to be patched */,
                         ArmConditionEncoding(cc));
   branch->target = target;
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 82a1932..d942a24 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -896,7 +896,7 @@
       intrinsic_launchpads_.Insert(launch_pad);
       OpRegReg(kOpCmp, rl_idx.low_reg, reg_max);
       FreeTemp(reg_max);
-      OpCondBranch(kCondCs, launch_pad);
+      OpCondBranch(kCondUge, launch_pad);
     }
   } else {
     if (range_check) {
@@ -907,7 +907,7 @@
       intrinsic_launchpads_.Insert(launch_pad);
       OpRegReg(kOpCmp, rl_idx.low_reg, reg_max);
       FreeTemp(reg_max);
-      OpCondBranch(kCondCc, launch_pad);
+      OpCondBranch(kCondUge, launch_pad);
     }
     reg_off = AllocTemp();
     reg_ptr = AllocTemp();
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index 14f49aa..2e385a3 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -318,7 +318,7 @@
   DCHECK_EQ(num_fp_spills_, 0);
   if (!skip_overflow_check) {
     OpRegRegImm(kOpSub, new_sp, rMIPS_SP, frame_size_ - (spill_count * 4));
-    GenRegRegCheck(kCondCc, new_sp, check_reg, kThrowStackOverflow);
+    GenRegRegCheck(kCondUlt, new_sp, check_reg, kThrowStackOverflow);
     OpRegCopy(rMIPS_SP, new_sp);     // Establish stack
   } else {
     OpRegImm(kOpSub, rMIPS_SP, frame_size_ - (spill_count * 4));
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index dfff260..180d56c 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -77,11 +77,11 @@
       br_op = kMipsBne;
       cmp_zero = true;
       break;
-    case kCondCc:
+    case kCondUlt:
       slt_op = kMipsSltu;
       br_op = kMipsBnez;
       break;
-    case kCondCs:
+    case kCondUge:
       slt_op = kMipsSltu;
       br_op = kMipsBeqz;
       break;
@@ -485,9 +485,7 @@
     rl_result = EvalLoc(rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      // TODO: change kCondCS to a more meaningful name, is the sense of
-      // carry-set/clear flipped?
-      GenRegRegCheck(kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      GenRegRegCheck(kCondUge, rl_index.low_reg, reg_len, kThrowArrayBounds);
       FreeTemp(reg_len);
     }
     LoadBaseDispWide(reg_ptr, 0, rl_result.low_reg, rl_result.high_reg, INVALID_SREG);
@@ -498,9 +496,7 @@
     rl_result = EvalLoc(rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      // TODO: change kCondCS to a more meaningful name, is the sense of
-      // carry-set/clear flipped?
-      GenRegRegCheck(kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      GenRegRegCheck(kCondUge, rl_index.low_reg, reg_len, kThrowArrayBounds);
       FreeTemp(reg_len);
     }
     LoadBaseIndexed(reg_ptr, rl_index.low_reg, rl_result.low_reg, scale, size);
@@ -566,7 +562,7 @@
     rl_src = LoadValueWide(rl_src, reg_class);
 
     if (needs_range_check) {
-      GenRegRegCheck(kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      GenRegRegCheck(kCondUge, rl_index.low_reg, reg_len, kThrowArrayBounds);
       FreeTemp(reg_len);
     }
 
@@ -574,7 +570,7 @@
   } else {
     rl_src = LoadValue(rl_src, reg_class);
     if (needs_range_check) {
-      GenRegRegCheck(kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      GenRegRegCheck(kCondUge, rl_index.low_reg, reg_len, kThrowArrayBounds);
       FreeTemp(reg_len);
     }
     StoreBaseIndexed(reg_ptr, rl_index.low_reg, rl_src.low_reg,
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 1731703..6272498 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -322,7 +322,7 @@
         branch = NewLIR2(kX86Jcc8, 0, kX86CondPE);
         branch->target = not_taken;
       }
-      ccode = kCondCs;
+      ccode = kCondUlt;
       break;
     case kCondLe:
       if (gt_bias) {
@@ -343,7 +343,7 @@
         branch = NewLIR2(kX86Jcc8, 0, kX86CondPE);
         branch->target = taken;
       }
-      ccode = kCondCc;
+      ccode = kCondUge;
       break;
     default:
       LOG(FATAL) << "Unexpected ccode: " << ccode;
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 56cf7e9..2c646d4 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -83,6 +83,8 @@
     case kCondNe: return kX86CondNe;
     case kCondCs: return kX86CondC;
     case kCondCc: return kX86CondNc;
+    case kCondUlt: return kX86CondC;
+    case kCondUge: return kX86CondNc;
     case kCondMi: return kX86CondS;
     case kCondPl: return kX86CondNs;
     case kCondVs: return kX86CondO;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 1124541..9cffb3c 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1404,7 +1404,7 @@
   }
 
   size_t NextIndex() {
-    return index_.fetch_add(1);
+    return index_.FetchAndAdd(1);
   }
 
  private:
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 90d84d5..71f70c4 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -81,6 +81,19 @@
   }
 }
 
+void DisassemblerArm::DumpMemoryDomain(std::ostream& os, uint32_t domain) {
+  switch (domain) {
+    case 0b1111: os << "sy"; break;
+    case 0b1110: os << "st"; break;
+    case 0b1011: os << "ish"; break;
+    case 0b1010: os << "ishst"; break;
+    case 0b0111: os << "nsh"; break;
+    case 0b0110: os << "nshst"; break;
+    case 0b0011: os << "osh"; break;
+    case 0b0010: os << "oshst"; break;
+  }
+}
+
 void DisassemblerArm::DumpBranchTarget(std::ostream& os, const uint8_t* instr_ptr, int32_t imm32) {
   os << StringPrintf("%+d (%p)", imm32, instr_ptr + imm32);
 }
@@ -996,9 +1009,9 @@
               // Miscellaneous control instructions
               uint32_t op5 = (instr >> 4) & 0xF;
               switch (op5) {
-                case 4: opcode << "dsb"; break;
-                case 5: opcode << "dmb"; break;
-                case 6: opcode << "isb"; break;
+                case 4: opcode << "dsb"; DumpMemoryDomain(args, instr & 0xF); break;
+                case 5: opcode << "dmb"; DumpMemoryDomain(args, instr & 0xF); break;
+                case 6: opcode << "isb"; DumpMemoryDomain(args, instr & 0xF); break;
               }
             }
             break;
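
As an aside, the option that DumpMemoryDomain() prints is just the low nibble of the Thumb2 DSB/DMB/ISB encoding. A standalone sketch, not part of the patch and using a made-up MemoryDomainName() helper, decoding the two barrier flavours this change starts emitting:

#include <cstdint>
#include <cstdio>

// Mirrors DumpMemoryDomain(): map the 4-bit barrier option field to its mnemonic.
static const char* MemoryDomainName(uint32_t option) {
  switch (option & 0xf) {
    case 0xf: return "sy";
    case 0xe: return "st";
    case 0xb: return "ish";
    case 0xa: return "ishst";
    case 0x7: return "nsh";
    case 0x6: return "nshst";
    case 0x3: return "osh";
    case 0x2: return "oshst";
    default:  return "";  // reserved encodings print nothing, as above
  }
}

int main() {
  const uint32_t dmb_ish = 0xf3bf8f5bu;    // Thumb2 "dmb ish"
  const uint32_t dmb_ishst = 0xf3bf8f5au;  // Thumb2 "dmb ishst"
  printf("dmb %s\n", MemoryDomainName(dmb_ish));
  printf("dmb %s\n", MemoryDomainName(dmb_ishst));
  return 0;
}
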
diff --git a/disassembler/disassembler_arm.h b/disassembler/disassembler_arm.h
index 2e699ff..e34274e 100644
--- a/disassembler/disassembler_arm.h
+++ b/disassembler/disassembler_arm.h
@@ -30,6 +30,7 @@
 
   virtual size_t Dump(std::ostream& os, const uint8_t* begin);
   virtual void Dump(std::ostream& os, const uint8_t* begin, const uint8_t* end);
+
  private:
   void DumpArm(std::ostream& os, const uint8_t* instr);
 
@@ -39,6 +40,7 @@
 
   void DumpBranchTarget(std::ostream& os, const uint8_t* instr_ptr, int32_t imm32);
   void DumpCond(std::ostream& os, uint32_t cond);
+  void DumpMemoryDomain(std::ostream& os, uint32_t domain);
 
   std::vector<const char*> it_conditions_;
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 61be14b..34de93f 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -359,6 +359,7 @@
     @ unlocked case - r2 holds thread id with count of 0
     strex  r3, r2, [r0, #LOCK_WORD_OFFSET]
     cbnz   r3, strex_fail             @ store failed, retry
+    dmb    ish                        @ full (LoadLoad) memory barrier
     bx lr
 strex_fail:
     b retry_lock                      @ unlikely forward branch, need to reload and recheck r1/r2
@@ -402,6 +403,7 @@
     bpl    recursive_thin_unlock
     @ transition to unlocked, r3 holds 0
+    dmb    ish                        @ full (StoreLoad) memory barrier
     str    r3, [r0, #LOCK_WORD_OFFSET]
     bx     lr
 recursive_thin_unlock:
     sub    r1, r1, #65536
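
A rough C++ analogue of the barrier placement in these two fast paths, illustrative only: the lock-word layout and function names are invented, and the QuasiAtomic helpers are the ones introduced in runtime/atomic.h later in this change. The acquiring CAS is followed by a barrier so later accesses cannot float above it; the releasing store is preceded by one so earlier accesses cannot sink below it.

#include <cstdint>
#include "atomic.h"  // ART's QuasiAtomic barrier helpers (assumed include path)

void ThinLockFastPath(volatile int32_t* lock_word, int32_t thread_id) {
  while (!__sync_bool_compare_and_swap(lock_word, 0, thread_id)) {
    // Contended or the store-exclusive failed: retry / take the slow path (elided).
  }
  art::QuasiAtomic::MembarLoadLoad();   // like the "dmb ish" after the strex above
}

void ThinUnlockFastPath(volatile int32_t* lock_word) {
  art::QuasiAtomic::MembarStoreLoad();  // like the "dmb ish" before the unlocking str
  *lock_word = 0;
}
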
diff --git a/runtime/atomic.cc b/runtime/atomic.cc
index 47cee6a..bac0a99 100644
--- a/runtime/atomic.cc
+++ b/runtime/atomic.cc
@@ -15,135 +15,52 @@
  */
 
 #include "atomic.h"
-
-#define NEED_SWAP_MUTEXES !defined(__arm__) && !defined(__i386__)
-
-#if NEED_SWAP_MUTEXES
-#include <vector>
 #include "base/mutex.h"
 #include "base/stl_util.h"
-#include "base/stringprintf.h"
 #include "thread-inl.h"
-#endif
 
 namespace art {
 
-#if NEED_SWAP_MUTEXES
-// We stripe across a bunch of different mutexes to reduce contention.
-static const size_t kSwapMutexCount = 32;
-static std::vector<Mutex*>* gSwapMutexes;
+std::vector<Mutex*>* QuasiAtomic::gSwapMutexes = nullptr;
 
-static Mutex& GetSwapMutex(const volatile int64_t* addr) {
-  return *(*gSwapMutexes)[(reinterpret_cast<unsigned>(addr) >> 3U) % kSwapMutexCount];
+Mutex* QuasiAtomic::GetSwapMutex(const volatile int64_t* addr) {
+  return (*gSwapMutexes)[(reinterpret_cast<unsigned>(addr) >> 3U) % kSwapMutexCount];
 }
-#endif
 
 void QuasiAtomic::Startup() {
-#if NEED_SWAP_MUTEXES
-  gSwapMutexes = new std::vector<Mutex*>;
-  for (size_t i = 0; i < kSwapMutexCount; ++i) {
-    gSwapMutexes->push_back(new Mutex("QuasiAtomic stripe"));
+  if (kNeedSwapMutexes) {
+    gSwapMutexes = new std::vector<Mutex*>;
+    for (size_t i = 0; i < kSwapMutexCount; ++i) {
+      gSwapMutexes->push_back(new Mutex("QuasiAtomic stripe"));
+    }
   }
-#endif
 }
 
 void QuasiAtomic::Shutdown() {
-#if NEED_SWAP_MUTEXES
-  STLDeleteElements(gSwapMutexes);
-  delete gSwapMutexes;
-#endif
+  if (kNeedSwapMutexes) {
+    STLDeleteElements(gSwapMutexes);
+    delete gSwapMutexes;
+  }
 }
 
-int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
-  int64_t value;
-#if NEED_SWAP_MUTEXES
-  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
-  value = *addr;
-#elif defined(__arm__)
-  // Exclusive loads are defined not to tear, clearing the exclusive state isn't necessary. If we
-  // have LPAE (such as Cortex-A15) then ldrd would suffice.
-  __asm__ __volatile__("@ QuasiAtomic::Read64\n"
-      "ldrexd     %0, %H0, [%1]"
-      : "=&r" (value)
-      : "r" (addr));
-#elif defined(__i386__)
-  __asm__ __volatile__(
-      "movq     %1, %0\n"
-      : "=x" (value)
-      : "m" (*addr));
-#else
-#error Unexpected architecture
-#endif
-  return value;
+int64_t QuasiAtomic::SwapMutexRead64(volatile const int64_t* addr) {
+  MutexLock mu(Thread::Current(), *GetSwapMutex(addr));
+  return *addr;
 }
 
-void QuasiAtomic::Write64(volatile int64_t* addr, int64_t value) {
-#if NEED_SWAP_MUTEXES
-  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+void QuasiAtomic::SwapMutexWrite64(volatile int64_t* addr, int64_t value) {
+  MutexLock mu(Thread::Current(), *GetSwapMutex(addr));
   *addr = value;
-#elif defined(__arm__)
-  // The write is done as a swap so that the cache-line is in the exclusive state for the store. If
-  // we know that ARM architecture has LPAE (such as Cortex-A15) this isn't necessary and strd will
-  // suffice.
-  int64_t prev;
-  int status;
-  do {
-    __asm__ __volatile__("@ QuasiAtomic::Write64\n"
-        "ldrexd     %0, %H0, [%3]\n"
-        "strexd     %1, %4, %H4, [%3]"
-        : "=&r" (prev), "=&r" (status), "+m"(*addr)
-        : "r" (addr), "r" (value)
-        : "cc");
-  } while (__builtin_expect(status != 0, 0));
-#elif defined(__i386__)
-  __asm__ __volatile__(
-      "movq     %1, %0"
-      : "=m" (*addr)
-      : "x" (value));
-#else
-#error Unexpected architecture
-#endif
 }
 
 
-bool QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
-#if NEED_SWAP_MUTEXES
-  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+bool QuasiAtomic::SwapMutexCas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+  MutexLock mu(Thread::Current(), *GetSwapMutex(addr));
   if (*addr == old_value) {
     *addr = new_value;
     return true;
   }
   return false;
-#elif defined(__arm__)
-  int64_t prev;
-  int status;
-  do {
-    __asm__ __volatile__("@ QuasiAtomic::Cas64\n"
-        "ldrexd     %0, %H0, [%3]\n"
-        "mov        %1, #0\n"
-        "teq        %0, %4\n"
-        "teqeq      %H0, %H4\n"
-        "strexdeq   %1, %5, %H5, [%3]"
-        : "=&r" (prev), "=&r" (status), "+m"(*addr)
-        : "r" (addr), "Ir" (old_value), "r" (new_value)
-        : "cc");
-  } while (__builtin_expect(status != 0, 0));
-  return prev == old_value;
-#elif defined(__i386__)
-  // The compiler does the right job and works better than inline assembly, especially with -O0
-  // compilation.
-  return __sync_bool_compare_and_swap(addr, old_value, new_value);
-#else
-#error Unexpected architecture
-#endif
-}
-
-bool QuasiAtomic::LongAtomicsUseMutexes() {
-#if NEED_SWAP_MUTEXES
-  return true;
-#else
-  return false;
-#endif
 }
 
 }  // namespace art
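
The striping that moves into QuasiAtomic here is simple: hash the address of the 64-bit location down to one of kSwapMutexCount stripes so that unrelated addresses rarely contend on the same lock. A standalone sketch of the index computation, not part of the patch; it restates GetSwapMutex()'s arithmetic using uintptr_t instead of the 32-bit unsigned cast above:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Drop the low 3 bits (64-bit slots are 8-byte aligned, so they carry no
// information) and spread the remaining bits across 32 stripes.
static size_t SwapMutexIndex(const volatile int64_t* addr) {
  const size_t kSwapMutexCount = 32;
  return (reinterpret_cast<uintptr_t>(addr) >> 3U) % kSwapMutexCount;
}

int main() {
  int64_t a = 0, b = 0;
  printf("&a -> stripe %zu, &b -> stripe %zu\n", SwapMutexIndex(&a), SwapMutexIndex(&b));
  return 0;
}
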
diff --git a/runtime/atomic.h b/runtime/atomic.h
index cb6f86b..b1e9870 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -18,11 +18,14 @@
 #define ART_RUNTIME_ATOMIC_H_
 
 #include <stdint.h>
+#include <vector>
 
 #include "base/macros.h"
 
 namespace art {
 
+class Mutex;
+
 // NOTE: Two "quasiatomic" operations on the exact same memory address
 // are guaranteed to operate atomically with respect to each other,
 // but no guarantees are made about quasiatomic operations mixed with
@@ -30,25 +33,108 @@
 // quasiatomic operations that are performed on partially-overlapping
 // memory.
 class QuasiAtomic {
+#if !defined(__arm__) && !defined(__i386__)
+  static constexpr bool kNeedSwapMutexes = true;
+#else
+  static constexpr bool kNeedSwapMutexes = false;
+#endif
+
  public:
   static void Startup();
 
   static void Shutdown();
 
   // Reads the 64-bit value at "addr" without tearing.
-  static int64_t Read64(volatile const int64_t* addr);
+  static int64_t Read64(volatile const int64_t* addr) {
+    if (!kNeedSwapMutexes) {
+      return *addr;
+    } else {
+      return SwapMutexRead64(addr);
+    }
+  }
 
   // Writes to the 64-bit value at "addr" without tearing.
-  static void Write64(volatile int64_t* addr, int64_t val);
+  static void Write64(volatile int64_t* addr, int64_t val) {
+    if (!kNeedSwapMutexes) {
+      *addr = val;
+    } else {
+      SwapMutexWrite64(addr, val);
+    }
+  }
 
   // Atomically compare the value at "addr" to "old_value", if equal replace it with "new_value"
   // and return true. Otherwise, don't swap, and return false.
-  static bool Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr);
+  static bool Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+    if (!kNeedSwapMutexes) {
+      return __sync_bool_compare_and_swap(addr, old_value, new_value);
+    } else {
+      return SwapMutexCas64(old_value, new_value, addr);
+    }
+  }
 
   // Does the architecture provide reasonable atomic long operations or do we fall back on mutexes?
-  static bool LongAtomicsUseMutexes();
+  static bool LongAtomicsUseMutexes() {
+    return kNeedSwapMutexes;
+  }
+
+  static void MembarLoadStore() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void MembarLoadLoad() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void MembarStoreStore() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ishst" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void MembarStoreLoad() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("mfence" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
 
  private:
+  static Mutex* GetSwapMutex(const volatile int64_t* addr);
+  static int64_t SwapMutexRead64(volatile const int64_t* addr);
+  static void SwapMutexWrite64(volatile int64_t* addr, int64_t val);
+  static bool SwapMutexCas64(int64_t old_value, int64_t new_value, volatile int64_t* addr);
+
+  // We stripe across a bunch of different mutexes to reduce contention.
+  static constexpr size_t kSwapMutexCount = 32;
+  static std::vector<Mutex*>* gSwapMutexes;
+
   DISALLOW_COPY_AND_ASSIGN(QuasiAtomic);
 };
 
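
With the 64-bit operations now inlined, callers only pay for a mutex on architectures where kNeedSwapMutexes is true. A hedged usage sketch, not part of the patch (the counter function is invented and assumes QuasiAtomic::Startup() has run), of a CAS loop built on the inlined Read64/Cas64 fast paths:

#include <cstdint>
#include "atomic.h"  // ART's QuasiAtomic (assumed include path)

// Atomically add delta to the 64-bit counter at addr and return the new value.
// On ARM and x86 this is a plain compare-and-swap loop; elsewhere Cas64 falls
// back to the striped swap mutexes set up by QuasiAtomic::Startup().
int64_t AddToCounter64(volatile int64_t* addr, int64_t delta) {
  int64_t old_value;
  int64_t new_value;
  do {
    old_value = art::QuasiAtomic::Read64(addr);
    new_value = old_value + delta;
  } while (!art::QuasiAtomic::Cas64(old_value, new_value, addr));
  return new_value;
}
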
diff --git a/runtime/atomic_integer.h b/runtime/atomic_integer.h
index 132f968..651ca4a 100644
--- a/runtime/atomic_integer.h
+++ b/runtime/atomic_integer.h
@@ -17,8 +17,7 @@
 #ifndef ART_RUNTIME_ATOMIC_INTEGER_H_
 #define ART_RUNTIME_ATOMIC_INTEGER_H_
 
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
+#include <stdint.h>
 
 namespace art {
 
@@ -28,53 +27,57 @@
 
   explicit AtomicInteger(int32_t value) : value_(value) { }
 
-  // Unsafe = operator for non atomic operations on the integer.
-  void store(int32_t desired) {
-    value_ = desired;
-  }
-
   AtomicInteger& operator=(int32_t desired) {
-    store(desired);
+    Store(desired);
     return *this;
   }
 
-  int32_t load() const {
+  int32_t Load() const {
     return value_;
   }
 
   operator int32_t() const {
-    return load();
+    return Load();
   }
 
-  int32_t fetch_add(const int32_t value) {
-    return android_atomic_add(value, &value_);
+  int32_t FetchAndAdd(const int32_t value) {
+    return __sync_fetch_and_add(&value_, value);  // Return old_value.
   }
 
-  int32_t fetch_sub(const int32_t value) {
-    return android_atomic_add(-value, &value_);
+  int32_t FetchAndSub(const int32_t value) {
+    return __sync_fetch_and_sub(&value_, value);  // Return old value.
   }
 
-  int32_t operator++() {
-    return android_atomic_inc(&value_) + 1;
+  int32_t operator++() {  // Prefix operator.
+    return __sync_add_and_fetch(&value_, 1);  // Return new value.
   }
 
-  int32_t operator++(int32_t) {
-    return android_atomic_inc(&value_);
+  int32_t operator++(int32_t) {  // Postfix operator.
+    return __sync_fetch_and_add(&value_, 1);  // Return old value.
   }
 
-  int32_t operator--() {
-    return android_atomic_dec(&value_) - 1;
+  int32_t operator--() {  // Prefix operator.
+    return __sync_sub_and_fetch(&value_, 1);  // Return new value.
   }
 
-  int32_t operator--(int32_t) {
-    return android_atomic_dec(&value_);
+  int32_t operator--(int32_t) {  // Postfix operator.
+    return __sync_fetch_and_sub(&value_, 1);  // Return old value.
   }
 
-  bool compare_and_swap(int32_t expected_value, int32_t desired_value) {
-    return android_atomic_cas(expected_value, desired_value, &value_) == 0;
+  bool CompareAndSwap(int32_t expected_value, int32_t desired_value) {
+    return __sync_bool_compare_and_swap(&value_, expected_value, desired_value);
+  }
+
+  volatile int32_t* Address() {
+    return &value_;
   }
 
  private:
+  // Unsafe, non-atomic store; used to implement operator=.
+  void Store(int32_t desired) {
+    value_ = desired;
+  }
+
   volatile int32_t value_;
 };
 
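
The renamed operations keep the return-value conventions of the __sync builtins they now wrap: FetchAndAdd/FetchAndSub and the postfix operators return the old value, while the prefix operators return the new one. A small sketch, not part of the patch (assumed include path), making that visible:

#include <cassert>
#include "atomic_integer.h"  // ART's AtomicInteger (assumed include path)

int main() {
  art::AtomicInteger counter(41);
  assert(counter.FetchAndAdd(1) == 41);   // returns the old value...
  assert(counter.Load() == 42);           // ...after the increment is applied
  assert(++counter == 43);                // prefix: new value
  assert(counter++ == 43);                // postfix: old value
  assert(counter.Load() == 44);
  assert(counter.CompareAndSwap(44, 0));  // succeeds only if the expected value matches
  assert(counter.Load() == 0);
  return 0;
}
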
diff --git a/runtime/base/bounded_fifo.h b/runtime/base/bounded_fifo.h
index cb92d40..d04840a 100644
--- a/runtime/base/bounded_fifo.h
+++ b/runtime/base/bounded_fifo.h
@@ -17,9 +17,6 @@
 #ifndef ART_RUNTIME_BASE_BOUNDED_FIFO_H_
 #define ART_RUNTIME_BASE_BOUNDED_FIFO_H_
 
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
-
 namespace art {
 
 // A bounded fifo is a fifo which has a bounded size. The power of two version uses a bit mask to
@@ -49,7 +46,7 @@
   void push_back(const T& value) {
     ++size_;
     DCHECK_LE(size_, MaxSize);
-    // Relies on integer overflow behaviour.
+    // Relies on integer overflow behavior.
     data_[back_index_++ & mask_] = value;
   }
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index ec79c55..05e3a83 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -21,8 +21,6 @@
 
 #include "atomic.h"
 #include "base/logging.h"
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
 #include "mutex-inl.h"
 #include "runtime.h"
 #include "scoped_thread_state_change.h"
@@ -59,12 +57,12 @@
 class ScopedAllMutexesLock {
  public:
   explicit ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) {
-    while (!gAllMutexData->all_mutexes_guard.compare_and_swap(0, reinterpret_cast<int32_t>(mutex))) {
+    while (!gAllMutexData->all_mutexes_guard.CompareAndSwap(0, reinterpret_cast<int32_t>(mutex))) {
       NanoSleep(100);
     }
   }
   ~ScopedAllMutexesLock() {
-    while (!gAllMutexData->all_mutexes_guard.compare_and_swap(reinterpret_cast<int32_t>(mutex_), 0)) {
+    while (!gAllMutexData->all_mutexes_guard.CompareAndSwap(reinterpret_cast<int32_t>(mutex_), 0)) {
       NanoSleep(100);
     }
   }
@@ -176,7 +174,7 @@
       do {
         slot = data->cur_content_log_entry;
         new_slot = (slot + 1) % kContentionLogSize;
-      } while (!data->cur_content_log_entry.compare_and_swap(slot, new_slot));
+      } while (!data->cur_content_log_entry.CompareAndSwap(slot, new_slot));
       log[new_slot].blocked_tid = blocked_tid;
       log[new_slot].owner_tid = owner_tid;
       log[new_slot].count = 1;
@@ -300,11 +298,11 @@
       int32_t cur_state = state_;
       if (LIKELY(cur_state == 0)) {
         // Change state from 0 to 1.
-        done = android_atomic_acquire_cas(0, 1, &state_) == 0;
+        done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, 1 /* new state */);
       } else {
         // Failed to acquire, hang up.
         ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-        android_atomic_inc(&num_contenders_);
+        num_contenders_++;
         if (futex(&state_, FUTEX_WAIT, 1, NULL, NULL, 0) != 0) {
           // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
           // We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
@@ -312,9 +310,10 @@
             PLOG(FATAL) << "futex wait failed for " << name_;
           }
         }
-        android_atomic_dec(&num_contenders_);
+        num_contenders_--;
       }
     } while (!done);
+    QuasiAtomic::MembarStoreLoad();
     DCHECK_EQ(state_, 1);
     exclusive_owner_ = SafeGetTid(self);
 #else
@@ -342,11 +341,12 @@
       int32_t cur_state = state_;
       if (cur_state == 0) {
         // Change state from 0 to 1.
-        done = android_atomic_acquire_cas(0, 1, &state_) == 0;
+        done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, 1 /* new state */);
       } else {
         return false;
       }
     } while (!done);
+    QuasiAtomic::MembarStoreLoad();
     DCHECK_EQ(state_, 1);
     exclusive_owner_ = SafeGetTid(self);
 #else
@@ -385,10 +385,11 @@
   do {
     int32_t cur_state = state_;
     if (LIKELY(cur_state == 1)) {
+      QuasiAtomic::MembarStoreStore();
       // We're no longer the owner.
       exclusive_owner_ = 0;
       // Change state to 0.
-      done = android_atomic_release_cas(cur_state, 0, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, cur_state, 0 /* new state */);
       if (LIKELY(done)) {  // Spurious fail?
         // Wake a contender
         if (UNLIKELY(num_contenders_ > 0)) {
@@ -407,6 +408,7 @@
       }
     }
   } while (!done);
+  QuasiAtomic::MembarStoreLoad();
 #else
     CHECK_MUTEX_CALL(pthread_mutex_unlock, (&mutex_));
 #endif
@@ -468,11 +470,11 @@
     int32_t cur_state = state_;
     if (LIKELY(cur_state == 0)) {
       // Change state from 0 to -1.
-      done = android_atomic_acquire_cas(0, -1, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, -1 /* new state */);
     } else {
       // Failed to acquire, hang up.
       ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-      android_atomic_inc(&num_pending_writers_);
+      num_pending_writers_++;
       if (futex(&state_, FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
         // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
         // We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
@@ -480,7 +482,7 @@
           PLOG(FATAL) << "futex wait failed for " << name_;
         }
       }
-      android_atomic_dec(&num_pending_writers_);
+      num_pending_writers_--;
     }
   } while (!done);
   DCHECK_EQ(state_, -1);
@@ -504,7 +506,7 @@
       // We're no longer the owner.
       exclusive_owner_ = 0;
       // Change state from -1 to 0.
-      done = android_atomic_release_cas(-1, 0, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, -1 /* cur_state */, 0 /* new state */);
       if (LIKELY(done)) {  // cmpxchg may fail due to noise?
         // Wake any waiters.
         if (UNLIKELY(num_pending_readers_ > 0 || num_pending_writers_ > 0)) {
@@ -531,7 +533,7 @@
     int32_t cur_state = state_;
     if (cur_state == 0) {
       // Change state from 0 to -1.
-      done = android_atomic_acquire_cas(0, -1, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, -1 /* new state */);
     } else {
       // Failed to acquire, hang up.
       timespec now_abs_ts;
@@ -541,10 +543,10 @@
         return false;  // Timed out.
       }
       ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-      android_atomic_inc(&num_pending_writers_);
+      num_pending_writers_++;
       if (futex(&state_, FUTEX_WAIT, cur_state, &rel_ts, NULL, 0) != 0) {
         if (errno == ETIMEDOUT) {
-          android_atomic_dec(&num_pending_writers_);
+          num_pending_writers_--;
           return false;  // Timed out.
         } else if ((errno != EAGAIN) && (errno != EINTR)) {
           // EAGAIN and EINTR both indicate a spurious failure,
@@ -553,7 +555,7 @@
           PLOG(FATAL) << "timed futex wait failed for " << name_;
         }
       }
-      android_atomic_dec(&num_pending_writers_);
+      num_pending_writers_--;
     }
   } while (!done);
   exclusive_owner_ = SafeGetTid(self);
@@ -583,7 +585,7 @@
     int32_t cur_state = state_;
     if (cur_state >= 0) {
       // Add as an extra reader.
-      done = android_atomic_acquire_cas(cur_state, cur_state + 1, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, cur_state, cur_state + 1);
     } else {
       // Owner holds it exclusively.
       return false;
@@ -666,13 +668,13 @@
   DCHECK_EQ(guard_.GetExclusiveOwnerTid(), SafeGetTid(self));
 #if ART_USE_FUTEXES
   if (num_waiters_ > 0) {
-    android_atomic_inc(&sequence_);  // Indicate the broadcast occurred.
+    sequence_++;  // Indicate the broadcast occurred.
     bool done = false;
     do {
       int32_t cur_sequence = sequence_;
       // Requeue waiters onto mutex. The waiter holds the contender count on the mutex high ensuring
       // mutex unlocks will awaken the requeued waiter thread.
-      done = futex(&sequence_, FUTEX_CMP_REQUEUE, 0,
+      done = futex(sequence_.Address(), FUTEX_CMP_REQUEUE, 0,
                    reinterpret_cast<const timespec*>(std::numeric_limits<int32_t>::max()),
                    &guard_.state_, cur_sequence) != -1;
       if (!done) {
@@ -692,10 +694,10 @@
   guard_.AssertExclusiveHeld(self);
 #if ART_USE_FUTEXES
   if (num_waiters_ > 0) {
-    android_atomic_inc(&sequence_);  // Indicate a signal occurred.
+    sequence_++;  // Indicate a signal occurred.
     // Futex wake 1 waiter who will then come and in contend on mutex. It'd be nice to requeue them
     // to avoid this, however, requeueing can only move all waiters.
-    int num_woken = futex(&sequence_, FUTEX_WAKE, 1, NULL, NULL, 0);
+    int num_woken = futex(sequence_.Address(), FUTEX_WAKE, 1, NULL, NULL, 0);
     // Check something was woken or else we changed sequence_ before they had chance to wait.
     CHECK((num_woken == 0) || (num_woken == 1));
   }
@@ -716,11 +718,11 @@
 #if ART_USE_FUTEXES
   num_waiters_++;
   // Ensure the Mutex is contended so that requeued threads are awoken.
-  android_atomic_inc(&guard_.num_contenders_);
+  guard_.num_contenders_++;
   guard_.recursion_count_ = 1;
   int32_t cur_sequence = sequence_;
   guard_.ExclusiveUnlock(self);
-  if (futex(&sequence_, FUTEX_WAIT, cur_sequence, NULL, NULL, 0) != 0) {
+  if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, NULL, NULL, 0) != 0) {
     // Futex failed, check it is an expected error.
     // EAGAIN == EWOULDBLK, so we let the caller try again.
     // EINTR implies a signal was sent to this thread.
@@ -733,7 +735,7 @@
   num_waiters_--;
   // We awoke and so no longer require awakes from the guard_'s unlock.
   CHECK_GE(guard_.num_contenders_, 0);
-  android_atomic_dec(&guard_.num_contenders_);
+  guard_.num_contenders_--;
 #else
   guard_.recursion_count_ = 0;
   CHECK_MUTEX_CALL(pthread_cond_wait, (&cond_, &guard_.mutex_));
@@ -751,11 +753,11 @@
   InitTimeSpec(false, CLOCK_REALTIME, ms, ns, &rel_ts);
   num_waiters_++;
   // Ensure the Mutex is contended so that requeued threads are awoken.
-  android_atomic_inc(&guard_.num_contenders_);
+  guard_.num_contenders_++;
   guard_.recursion_count_ = 1;
   int32_t cur_sequence = sequence_;
   guard_.ExclusiveUnlock(self);
-  if (futex(&sequence_, FUTEX_WAIT, cur_sequence, &rel_ts, NULL, 0) != 0) {
+  if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, &rel_ts, NULL, 0) != 0) {
     if (errno == ETIMEDOUT) {
       // Timed out we're done.
     } else if ((errno == EAGAIN) || (errno == EINTR)) {
@@ -769,7 +771,7 @@
   num_waiters_--;
   // We awoke and so no longer require awakes from the guard_'s unlock.
   CHECK_GE(guard_.num_contenders_, 0);
-  android_atomic_dec(&guard_.num_contenders_);
+  guard_.num_contenders_--;
 #else
 #ifdef HAVE_TIMEDWAIT_MONOTONIC
 #define TIMEDWAIT pthread_cond_timedwait_monotonic
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index b894c0a..1c1dcaf 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -191,7 +191,7 @@
   // Exclusive owner.
   volatile uint64_t exclusive_owner_;
   // Number of waiting contenders.
-  volatile int32_t num_contenders_;
+  AtomicInteger num_contenders_;
 #else
   pthread_mutex_t mutex_;
 #endif
@@ -304,7 +304,7 @@
   // Pending readers.
   volatile int32_t num_pending_readers_;
   // Pending writers.
-  volatile int32_t num_pending_writers_;
+  AtomicInteger num_pending_writers_;
 #else
   pthread_rwlock_t rwlock_;
 #endif
@@ -339,7 +339,7 @@
   // their Mutex and another thread takes it and signals, the waiting thread observes that sequence_
   // changed and doesn't enter the wait. Modified while holding guard_, but is read by futex wait
   // without guard_ held.
-  volatile int32_t sequence_;
+  AtomicInteger sequence_;
   // Number of threads that have come into to wait, not the length of the waiters on the futex as
   // waiters may have been requeued onto guard_. Guarded by guard_.
   volatile int32_t num_waiters_;
diff --git a/runtime/check_jni.cc b/runtime/check_jni.cc
index a84e18a..09c48b1 100644
--- a/runtime/check_jni.cc
+++ b/runtime/check_jni.cc
@@ -1754,8 +1754,8 @@
     if (address == NULL) {
       JniAbortF(__FUNCTION__, "non-nullable address is NULL");
     }
-    if (capacity <= 0) {
-      JniAbortF(__FUNCTION__, "capacity must be greater than 0: %lld", capacity);
+    if (capacity < 0) {
+      JniAbortF(__FUNCTION__, "capacity must be non-negative: %lld", capacity);
     }
     return CHECK_JNI_EXIT("L", baseEnv(env)->NewDirectByteBuffer(env, address, capacity));
   }
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index 8fa5b86..02e01b8 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -68,7 +68,7 @@
         // Stack overflow.
         return false;
       }
-    } while (!back_index_.compare_and_swap(index, index + 1));
+    } while (!back_index_.CompareAndSwap(index, index + 1));
     begin_[index] = value;
     return true;
   }
@@ -93,7 +93,7 @@
   // Take an item from the front of the stack.
   T PopFront() {
     int32_t index = front_index_;
-    DCHECK_LT(index, back_index_.load());
+    DCHECK_LT(index, back_index_.Load());
     front_index_ = front_index_ + 1;
     return begin_[index];
   }
@@ -101,7 +101,7 @@
   // Pop a number of elements.
   void PopBackCount(int32_t n) {
     DCHECK_GE(Size(), static_cast<size_t>(n));
-    back_index_.fetch_sub(n);
+    back_index_.FetchAndSub(n);
   }
 
   bool IsEmpty() const {
@@ -132,11 +132,11 @@
   }
 
   void Sort() {
-    int32_t start_back_index = back_index_.load();
-    int32_t start_front_index = front_index_.load();
+    int32_t start_back_index = back_index_.Load();
+    int32_t start_front_index = front_index_.Load();
     std::sort(Begin(), End());
-    CHECK_EQ(start_back_index, back_index_.load());
-    CHECK_EQ(start_front_index, front_index_.load());
+    CHECK_EQ(start_back_index, back_index_.Load());
+    CHECK_EQ(start_front_index, front_index_.Load());
     if (kIsDebugBuild) {
       debug_is_sorted_ = true;
     }
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 28cc510..cae2a54 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -1109,8 +1109,8 @@
   // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit
   size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs);
   heap->RecordFree(freed_objects, freed_bytes);
-  mark_sweep->freed_objects_.fetch_add(freed_objects);
-  mark_sweep->freed_bytes_.fetch_add(freed_bytes);
+  mark_sweep->freed_objects_.FetchAndAdd(freed_objects);
+  mark_sweep->freed_bytes_.FetchAndAdd(freed_bytes);
 }
 
 void MarkSweep::ZygoteSweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
@@ -1192,10 +1192,10 @@
   VLOG(heap) << "Freed " << freed_objects << "/" << count
              << " objects with size " << PrettySize(freed_bytes);
   heap_->RecordFree(freed_objects + freed_large_objects, freed_bytes + freed_large_object_bytes);
-  freed_objects_.fetch_add(freed_objects);
-  freed_large_objects_.fetch_add(freed_large_objects);
-  freed_bytes_.fetch_add(freed_bytes);
-  freed_large_object_bytes_.fetch_add(freed_large_object_bytes);
+  freed_objects_.FetchAndAdd(freed_objects);
+  freed_large_objects_.FetchAndAdd(freed_large_objects);
+  freed_bytes_.FetchAndAdd(freed_bytes);
+  freed_large_object_bytes_.FetchAndAdd(freed_large_object_bytes);
   timings_.EndSplit();
 
   timings_.StartSplit("ResetStack");
@@ -1267,8 +1267,8 @@
       ++freed_objects;
     }
   }
-  freed_large_objects_.fetch_add(freed_objects);
-  freed_large_object_bytes_.fetch_add(freed_bytes);
+  freed_large_objects_.FetchAndAdd(freed_objects);
+  freed_large_object_bytes_.FetchAndAdd(freed_bytes);
   GetHeap()->RecordFree(freed_objects, freed_bytes);
 }
 
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index f29eadb..a4f7121 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -236,8 +236,8 @@
   int freed_bytes = from_bytes - to_bytes;
   int freed_objects = from_objects - to_objects;
   CHECK_GE(freed_bytes, 0);
-  freed_bytes_.fetch_add(freed_bytes);
-  freed_objects_.fetch_add(freed_objects);
+  freed_bytes_.FetchAndAdd(freed_bytes);
+  freed_objects_.FetchAndAdd(freed_objects);
   heap_->RecordFree(static_cast<size_t>(freed_objects), static_cast<size_t>(freed_bytes));
 
   timings_.StartSplit("PreSweepingGcVerification");
@@ -332,7 +332,7 @@
             // If out of space, fall back to the to-space.
             forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated);
           } else {
-            GetHeap()->num_bytes_allocated_.fetch_add(bytes_promoted);
+            GetHeap()->num_bytes_allocated_.FetchAndAdd(bytes_promoted);
             bytes_promoted_ += bytes_promoted;
             // Mark forward_address on the live bit map.
             accounting::SpaceBitmap* live_bitmap = non_moving_space->GetLiveBitmap();
@@ -446,8 +446,8 @@
   Locks::heap_bitmap_lock_->AssertExclusiveHeld(self);
   size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs);
   heap->RecordFree(num_ptrs, freed_bytes);
-  gc->freed_objects_.fetch_add(num_ptrs);
-  gc->freed_bytes_.fetch_add(freed_bytes);
+  gc->freed_objects_.FetchAndAdd(num_ptrs);
+  gc->freed_bytes_.FetchAndAdd(freed_bytes);
 }
 
 void SemiSpace::ZygoteSweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
@@ -526,8 +526,8 @@
       ++freed_objects;
     }
   }
-  freed_large_objects_.fetch_add(freed_objects);
-  freed_large_object_bytes_.fetch_add(freed_bytes);
+  freed_large_objects_.FetchAndAdd(freed_objects);
+  freed_large_object_bytes_.FetchAndAdd(freed_bytes);
   GetHeap()->RecordFree(freed_objects, freed_bytes);
 }
 
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 9fb5760..af1b26b 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -61,7 +61,7 @@
   pre_fence_visitor(obj);
   DCHECK_GT(bytes_allocated, 0u);
   const size_t new_num_bytes_allocated =
-      static_cast<size_t>(num_bytes_allocated_.fetch_add(bytes_allocated)) + bytes_allocated;
+      static_cast<size_t>(num_bytes_allocated_.FetchAndAdd(bytes_allocated)) + bytes_allocated;
   // TODO: Deprecate.
   if (kInstrumented) {
     if (Runtime::Current()->HasStatsEnabled()) {
@@ -200,7 +200,7 @@
     // Only if the allocation succeeded, record the time.
     if (allocated_obj != nullptr) {
       uint64_t allocation_end_time = NanoTime() / kTimeAdjust;
-      heap_->total_allocation_time_.fetch_add(allocation_end_time - allocation_start_time_);
+      heap_->total_allocation_time_.FetchAndAdd(allocation_end_time - allocation_start_time_);
     }
   }
 };
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 61c66e7..e08106b 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -413,13 +413,13 @@
 
 void Heap::RegisterGCAllocation(size_t bytes) {
   if (this != nullptr) {
-    gc_memory_overhead_.fetch_add(bytes);
+    gc_memory_overhead_.FetchAndAdd(bytes);
   }
 }
 
 void Heap::RegisterGCDeAllocation(size_t bytes) {
   if (this != nullptr) {
-    gc_memory_overhead_.fetch_sub(bytes);
+    gc_memory_overhead_.FetchAndSub(bytes);
   }
 }
 
@@ -802,7 +802,7 @@
 void Heap::VerifyObjectBody(const mirror::Object* obj) {
   CHECK(IsAligned<kObjectAlignment>(obj)) << "Object isn't aligned: " << obj;
   // Ignore early dawn of the universe verifications.
-  if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.load()) < 10 * KB)) {
+  if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.Load()) < 10 * KB)) {
     return;
   }
   const byte* raw_addr = reinterpret_cast<const byte*>(obj) +
@@ -847,7 +847,8 @@
 
 void Heap::RecordFree(size_t freed_objects, size_t freed_bytes) {
   DCHECK_LE(freed_bytes, static_cast<size_t>(num_bytes_allocated_));
-  num_bytes_allocated_.fetch_sub(freed_bytes);
+  num_bytes_allocated_.FetchAndSub(freed_bytes);
+
   if (Runtime::Current()->HasStatsEnabled()) {
     RuntimeStats* thread_stats = Thread::Current()->GetStats();
     thread_stats->freed_objects += freed_objects;
@@ -2082,7 +2083,7 @@
     native_need_to_run_finalization_ = false;
   }
   // Total number of native bytes allocated.
-  native_bytes_allocated_.fetch_add(bytes);
+  native_bytes_allocated_.FetchAndAdd(bytes);
   if (static_cast<size_t>(native_bytes_allocated_) > native_footprint_gc_watermark_) {
     collector::GcType gc_type = have_zygote_space_ ? collector::kGcTypePartial :
         collector::kGcTypeFull;
@@ -2118,7 +2119,7 @@
 void Heap::RegisterNativeFree(JNIEnv* env, int bytes) {
   int expected_size, new_size;
   do {
-    expected_size = native_bytes_allocated_.load();
+    expected_size = native_bytes_allocated_.Load();
     new_size = expected_size - bytes;
     if (UNLIKELY(new_size < 0)) {
       ScopedObjectAccess soa(env);
@@ -2127,7 +2128,7 @@
                                  "registered as allocated", bytes, expected_size).c_str());
       break;
     }
-  } while (!native_bytes_allocated_.compare_and_swap(expected_size, new_size));
+  } while (!native_bytes_allocated_.CompareAndSwap(expected_size, new_size));
 }
 
 int64_t Heap::GetTotalMemory() const {
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 82e96a4..ac20972 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -44,8 +44,8 @@
 inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
   mirror::Object* ret = AllocNonvirtualWithoutAccounting(num_bytes);
   if (ret != nullptr) {
-    objects_allocated_.fetch_add(1);
-    bytes_allocated_.fetch_add(num_bytes);
+    objects_allocated_.FetchAndAdd(1);
+    bytes_allocated_.FetchAndAdd(num_bytes);
   }
   return ret;
 }
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 7ea202c..d5bc667 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -172,7 +172,7 @@
 
 uint64_t BumpPointerSpace::GetBytesAllocated() {
   // Start out pre-determined amount (blocks which are not being allocated into).
-  uint64_t total = static_cast<uint64_t>(bytes_allocated_.load());
+  uint64_t total = static_cast<uint64_t>(bytes_allocated_.Load());
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -190,7 +190,7 @@
 
 uint64_t BumpPointerSpace::GetObjectsAllocated() {
   // Start out pre-determined amount (blocks which are not being allocated into).
-  uint64_t total = static_cast<uint64_t>(objects_allocated_.load());
+  uint64_t total = static_cast<uint64_t>(objects_allocated_.Load());
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -207,8 +207,8 @@
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.fetch_add(thread->thread_local_objects_);
-  bytes_allocated_.fetch_add(thread->thread_local_pos_ - thread->thread_local_start_);
+  objects_allocated_.FetchAndAdd(thread->thread_local_objects_);
+  bytes_allocated_.FetchAndAdd(thread->thread_local_pos_ - thread->thread_local_start_);
   thread->SetTLAB(nullptr, nullptr);
 }
 
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index c6177bd..4777cc6 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -227,7 +227,7 @@
     *error_msg = StringPrintf("Failed to map image bitmap: %s", error_msg->c_str());
     return nullptr;
   }
-  size_t bitmap_index = bitmap_index_.fetch_add(1);
+  size_t bitmap_index = bitmap_index_.FetchAndAdd(1);
   std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u", image_file_name,
                                        bitmap_index));
   UniquePtr<accounting::SpaceBitmap> bitmap(
diff --git a/runtime/indirect_reference_table.cc b/runtime/indirect_reference_table.cc
index 2bd8353..8194a0d 100644
--- a/runtime/indirect_reference_table.cc
+++ b/runtime/indirect_reference_table.cc
@@ -80,7 +80,7 @@
   prevState.all = cookie;
   size_t topIndex = segment_state_.parts.topIndex;
 
-  DCHECK(obj != NULL);
+  CHECK(obj != NULL);
   // TODO: stronger sanity check on the object (such as in heap)
   DCHECK_ALIGNED(reinterpret_cast<uintptr_t>(obj), 8);
   DCHECK(table_ != NULL);
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 4ad9c63..47c1899 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -407,9 +407,9 @@
 void Instrumentation::InstrumentQuickAllocEntryPoints() {
   // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code
   //       should be guarded by a lock.
-  DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.load(), 0);
+  DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.Load(), 0);
   const bool enable_instrumentation =
-      quick_alloc_entry_points_instrumentation_counter_.fetch_add(1) == 0;
+      quick_alloc_entry_points_instrumentation_counter_.FetchAndAdd(1) == 0;
   if (enable_instrumentation) {
     // Instrumentation wasn't enabled so enable it.
     SetQuickAllocEntryPointsInstrumented(true);
@@ -420,9 +420,9 @@
 void Instrumentation::UninstrumentQuickAllocEntryPoints() {
   // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code
   //       should be guarded by a lock.
-  DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.load(), 0);
+  DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.Load(), 0);
   const bool disable_instrumentation =
-      quick_alloc_entry_points_instrumentation_counter_.fetch_sub(1) == 1;
+      quick_alloc_entry_points_instrumentation_counter_.FetchAndSub(1) == 1;
   if (disable_instrumentation) {
     SetQuickAllocEntryPointsInstrumented(false);
     ResetQuickAllocEntryPoints();
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 99c85bd..942c275 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -245,7 +245,7 @@
       // If access checks are required then the dex-to-dex compiler and analysis of
       // whether the class has final fields hasn't been performed. Conservatively
       // perform the memory barrier now.
-      ANDROID_MEMBAR_STORE();
+      QuasiAtomic::MembarStoreLoad();
     }
     if (UNLIKELY(self->TestAllFlags())) {
       CheckSuspend(self);
@@ -261,7 +261,7 @@
   HANDLE_INSTRUCTION_END();
 
   HANDLE_INSTRUCTION_START(RETURN_VOID_BARRIER) {
-    ANDROID_MEMBAR_STORE();
+    QuasiAtomic::MembarStoreLoad();
     JValue result;
     if (UNLIKELY(self->TestAllFlags())) {
       CheckSuspend(self);
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index 675095f..75041ea 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -169,7 +169,7 @@
           // If access checks are required then the dex-to-dex compiler and analysis of
           // whether the class has final fields hasn't been performed. Conservatively
           // perform the memory barrier now.
-          ANDROID_MEMBAR_STORE();
+          QuasiAtomic::MembarStoreLoad();
         }
         if (UNLIKELY(self->TestAllFlags())) {
           CheckSuspend(self);
@@ -183,7 +183,7 @@
       }
       case Instruction::RETURN_VOID_BARRIER: {
         PREAMBLE();
-        ANDROID_MEMBAR_STORE();
+        QuasiAtomic::MembarStoreLoad();
         JValue result;
         if (UNLIKELY(self->TestAllFlags())) {
           CheckSuspend(self);
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 6690519..bbe5fda 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -844,13 +844,14 @@
   }
 
   static jobject NewGlobalRef(JNIEnv* env, jobject obj) {
-    if (obj == NULL) {
-      return NULL;
-    }
     ScopedObjectAccess soa(env);
+    Object* decoded_obj = soa.Decode<Object*>(obj);
+    // Check for null after decoding the object to handle cleared weak globals.
+    if (decoded_obj == nullptr) {
+      return nullptr;
+    }
     JavaVMExt* vm = soa.Vm();
     IndirectReferenceTable& globals = vm->globals;
-    Object* decoded_obj = soa.Decode<Object*>(obj);
     WriterMutexLock mu(soa.Self(), vm->globals_lock);
     IndirectRef ref = globals.Add(IRT_FIRST_SEGMENT, decoded_obj);
     return reinterpret_cast<jobject>(ref);
@@ -884,11 +885,13 @@
   }
 
   static jobject NewLocalRef(JNIEnv* env, jobject obj) {
-    if (obj == NULL) {
-      return NULL;
-    }
     ScopedObjectAccess soa(env);
-    return soa.AddLocalReference<jobject>(soa.Decode<Object*>(obj));
+    mirror::Object* decoded_obj = soa.Decode<Object*>(obj);
+    // Check for null after decoding the object to handle cleared weak globals.
+    if (decoded_obj == nullptr) {
+      return nullptr;
+    }
+    return soa.AddLocalReference<jobject>(decoded_obj);
   }
 
   static void DeleteLocalRef(JNIEnv* env, jobject obj) {
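
The reordering in NewGlobalRef and NewLocalRef matters for weak globals: a cleared weak global is a non-null jobject handle that decodes to a null mirror::Object*, so the null check has to happen after Decode(). A hedged sketch of the standard JNI idiom this enables, not ART code and with an invented helper name:

#include <jni.h>

// Promote a weak global to a strong global, detecting a referent that the GC has
// already cleared. NewGlobalRef returns null for a cleared weak global, which is
// exactly the case the decode-then-check ordering above handles.
jobject PromoteWeakGlobal(JNIEnv* env, jweak weak) {
  jobject strong = env->NewGlobalRef(weak);
  if (strong == nullptr) {
    // The referent was collected; there is nothing left to promote.
    return nullptr;
  }
  return strong;
}
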
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 7ac2c8c..9161bc5 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -253,11 +253,40 @@
   return result;
 }
 
+inline uint32_t Object::GetField32(MemberOffset field_offset, bool is_volatile) const {
+  VerifyObject(this);
+  const byte* raw_addr = reinterpret_cast<const byte*>(this) + field_offset.Int32Value();
+  const int32_t* word_addr = reinterpret_cast<const int32_t*>(raw_addr);
+  if (UNLIKELY(is_volatile)) {
+    int32_t result = *(reinterpret_cast<volatile int32_t*>(const_cast<int32_t*>(word_addr)));
+    QuasiAtomic::MembarLoadLoad();
+    return result;
+  } else {
+    return *word_addr;
+  }
+}
+
+inline void Object::SetField32(MemberOffset field_offset, uint32_t new_value, bool is_volatile,
+                               bool this_is_valid) {
+  if (this_is_valid) {
+    VerifyObject(this);
+  }
+  byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
+  uint32_t* word_addr = reinterpret_cast<uint32_t*>(raw_addr);
+  if (UNLIKELY(is_volatile)) {
+    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
+    *word_addr = new_value;
+    QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any loads.
+  } else {
+    *word_addr = new_value;
+  }
+}
+
 inline bool Object::CasField32(MemberOffset field_offset, uint32_t old_value, uint32_t new_value) {
   VerifyObject(this);
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
-  int32_t* addr = reinterpret_cast<int32_t*>(raw_addr);
-  return android_atomic_release_cas(old_value, new_value, addr) == 0;
+  volatile uint32_t* addr = reinterpret_cast<volatile uint32_t*>(raw_addr);
+  return __sync_bool_compare_and_swap(addr, old_value, new_value);
 }
 
 inline uint64_t Object::GetField64(MemberOffset field_offset, bool is_volatile) const {
@@ -266,7 +295,7 @@
   const int64_t* addr = reinterpret_cast<const int64_t*>(raw_addr);
   if (UNLIKELY(is_volatile)) {
     uint64_t result = QuasiAtomic::Read64(addr);
-    ANDROID_MEMBAR_FULL();
+    QuasiAtomic::MembarLoadLoad();
     return result;
   } else {
     return *addr;
@@ -278,9 +307,13 @@
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   int64_t* addr = reinterpret_cast<int64_t*>(raw_addr);
   if (UNLIKELY(is_volatile)) {
-    ANDROID_MEMBAR_STORE();
+    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
     QuasiAtomic::Write64(addr, new_value);
-    // Post-store barrier not required due to use of atomic op or mutex.
+    if (!QuasiAtomic::LongAtomicsUseMutexes()) {
+      QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any loads.
+    } else {
+      // Fence from the mutex is enough.
+    }
   } else {
     *addr = new_value;
   }
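
CasField32 now relies on the __sync_bool_compare_and_swap builtin, a full-barrier compare-and-swap that returns true only when the swap actually happened. A stand-alone illustration of that contract, independent of the Object field machinery:

    #include <cassert>
    #include <cstdint>

    int main() {
      volatile uint32_t word = 7;
      // Succeeds: the value at &word equals the expected 7, so 42 is stored.
      bool swapped = __sync_bool_compare_and_swap(&word, 7u, 42u);
      assert(swapped && word == 42u);
      // Fails: the value is now 42, not 7, so nothing is written.
      swapped = __sync_bool_compare_and_swap(&word, 7u, 99u);
      assert(!swapped && word == 42u);
      return 0;
    }
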
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 008a173..bdb3250 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -89,10 +89,10 @@
   static AtomicInteger seed(987654321 + std::time(nullptr));
   int32_t expected_value, new_value;
   do {
-    expected_value = static_cast<uint32_t>(seed.load());
+    expected_value = static_cast<uint32_t>(seed.Load());
     new_value = expected_value * 1103515245 + 12345;
   } while ((expected_value & LockWord::kHashMask) == 0 ||
-      !seed.compare_and_swap(expected_value, new_value));
+      !seed.CompareAndSwap(expected_value, new_value));
   return expected_value & LockWord::kHashMask;
 }
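
The identity-hash generator above is a linear congruential generator advanced with a CAS, so concurrent callers never hand out the same seed value, and it rejects any value whose hash bits are all zero. A sketch of the same loop with std::atomic (the mask value is an assumption for illustration; the real constant is LockWord::kHashMask):

    #include <atomic>
    #include <cstdint>
    #include <ctime>

    constexpr uint32_t kHashMaskSketch = 0x0fffffff;  // assumed width, for illustration only

    uint32_t GenerateIdentityHashCodeSketch() {
      static std::atomic<uint32_t> seed(987654321u + static_cast<uint32_t>(std::time(nullptr)));
      uint32_t expected, next;
      do {
        expected = seed.load();
        next = expected * 1103515245u + 12345u;      // classic LCG step
      } while ((expected & kHashMaskSketch) == 0 ||  // never return a zero hash
               !seed.compare_exchange_weak(expected, next));
      return expected & kHashMaskSketch;
    }
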
 
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index fe89b7e..058aee7 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -187,37 +187,10 @@
     return reinterpret_cast<Object**>(reinterpret_cast<byte*>(this) + field_offset.Int32Value());
   }
 
-  uint32_t GetField32(MemberOffset field_offset, bool is_volatile) const {
-    VerifyObject(this);
-    const byte* raw_addr = reinterpret_cast<const byte*>(this) + field_offset.Int32Value();
-    const int32_t* word_addr = reinterpret_cast<const int32_t*>(raw_addr);
-    if (UNLIKELY(is_volatile)) {
-      return android_atomic_acquire_load(word_addr);
-    } else {
-      return *word_addr;
-    }
-  }
+  uint32_t GetField32(MemberOffset field_offset, bool is_volatile) const;
 
   void SetField32(MemberOffset field_offset, uint32_t new_value, bool is_volatile,
-                  bool this_is_valid = true) {
-    if (this_is_valid) {
-      VerifyObject(this);
-    }
-    byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
-    uint32_t* word_addr = reinterpret_cast<uint32_t*>(raw_addr);
-    if (UNLIKELY(is_volatile)) {
-      /*
-       * TODO: add an android_atomic_synchronization_store() function and
-       * use it in the 32-bit volatile set handlers.  On some platforms we
-       * can use a fast atomic instruction and avoid the barriers.
-       */
-      ANDROID_MEMBAR_STORE();
-      *word_addr = new_value;
-      ANDROID_MEMBAR_FULL();
-    } else {
-      *word_addr = new_value;
-    }
-  }
+                  bool this_is_valid = true);
 
   bool CasField32(MemberOffset field_offset, uint32_t old_value, uint32_t new_value);
 
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index ef9a9ce..4186693 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -98,12 +98,12 @@
 
 int32_t Monitor::GetHashCode() {
   while (!HasHashCode()) {
-    if (hash_code_.compare_and_swap(0, mirror::Object::GenerateIdentityHashCode())) {
+    if (hash_code_.CompareAndSwap(0, mirror::Object::GenerateIdentityHashCode())) {
       break;
     }
   }
   DCHECK(HasHashCode());
-  return hash_code_.load();
+  return hash_code_.Load();
 }
 
 bool Monitor::Install(Thread* self) {
@@ -660,6 +660,7 @@
       case LockWord::kUnlocked: {
         LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0));
         if (sirt_obj->CasLockWord(lock_word, thin_locked)) {
+          QuasiAtomic::MembarLoadLoad();
           return;  // Success!
         }
         continue;  // Go again.
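
The MembarLoadLoad inserted after a successful thin-lock CAS gives lock acquisition acquire semantics: loads inside the critical section cannot be hoisted above the point where the lock word is taken. The same idea sketched with std::atomic (illustrative only; this is not ART's LockWord encoding):

    #include <atomic>
    #include <cstdint>

    std::atomic<uint32_t> lock_word{0};  // 0 stands in for the unlocked state

    bool TryThinLock(uint32_t thin_locked_value) {
      uint32_t expected = 0;
      // memory_order_acquire on success plays the role of a successful
      // CasLockWord(...) followed by QuasiAtomic::MembarLoadLoad().
      return lock_word.compare_exchange_strong(expected, thin_locked_value,
                                               std::memory_order_acquire);
    }
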
diff --git a/runtime/monitor.h b/runtime/monitor.h
index bfd8545..16e9410 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -105,7 +105,7 @@
   bool IsLocked() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   bool HasHashCode() const {
-    return hash_code_.load() != 0;
+    return hash_code_.Load() != 0;
   }
 
   static void InflateThinLocked(Thread* self, SirtRef<mirror::Object>& obj, LockWord lock_word,
diff --git a/runtime/native/java_lang_reflect_Field.cc b/runtime/native/java_lang_reflect_Field.cc
index 553aeb8..269a4a3 100644
--- a/runtime/native/java_lang_reflect_Field.cc
+++ b/runtime/native/java_lang_reflect_Field.cc
@@ -222,7 +222,7 @@
   // Special handling for final fields on SMP systems.
   // We need a store/store barrier here (JMM requirement).
   if (f->IsFinal()) {
-    ANDROID_MEMBAR_STORE();
+    QuasiAtomic::MembarStoreLoad();
   }
 }
 
diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc
index 2c6d281..b5fc7e7 100644
--- a/runtime/native/sun_misc_Unsafe.cc
+++ b/runtime/native/sun_misc_Unsafe.cc
@@ -86,7 +86,7 @@
 static void Unsafe_putOrderedInt(JNIEnv* env, jobject, jobject javaObj, jlong offset, jint newValue) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
-  ANDROID_MEMBAR_STORE();
+  QuasiAtomic::MembarStoreStore();
   obj->SetField32(MemberOffset(offset), newValue, false);
 }
 
@@ -117,7 +117,7 @@
 static void Unsafe_putOrderedLong(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong newValue) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
-  ANDROID_MEMBAR_STORE();
+  QuasiAtomic::MembarStoreStore();
   obj->SetField64(MemberOffset(offset), newValue, false);
 }
 
@@ -153,7 +153,7 @@
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
   mirror::Object* newValue = soa.Decode<mirror::Object*>(javaNewValue);
-  ANDROID_MEMBAR_STORE();
+  QuasiAtomic::MembarStoreStore();
   obj->SetFieldObject(MemberOffset(offset), newValue, false);
 }
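
The Unsafe putOrdered* entry points only promise lazySet semantics: earlier stores may not be reordered past the ordered store, but no store/load fence is required, which is why the weaker MembarStoreStore now replaces the old macro. A sketch of that contract with std::atomic:

    #include <atomic>
    #include <cstdint>

    // An "ordered" (lazySet-style) store: release ordering keeps prior writes
    // from being reordered after this store, but issues no store/load fence,
    // making it cheaper than a volatile write.
    void OrderedPutInt(std::atomic<int32_t>& field, int32_t new_value) {
      field.store(new_value, std::memory_order_release);
    }
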
 
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 6f3c117..b87a8ec 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -87,18 +87,22 @@
   DCHECK_EQ(GetState(), kRunnable);
   union StateAndFlags old_state_and_flags;
   union StateAndFlags new_state_and_flags;
-  do {
+  while (true) {
     old_state_and_flags.as_int = state_and_flags_.as_int;
     if (UNLIKELY((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0)) {
       RunCheckpointFunction();
       continue;
     }
-    // Copy over flags and try to clear the checkpoint bit if it is set.
-    new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags & ~kCheckpointRequest;
+    // Change the state but keep the current flags (kCheckpointRequest is clear).
+    DCHECK_EQ((old_state_and_flags.as_struct.flags & kCheckpointRequest), 0);
+    new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
     new_state_and_flags.as_struct.state = new_state;
-    // CAS the value without a memory barrier, that will occur in the unlock below.
-  } while (UNLIKELY(android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
-                                       &state_and_flags_.as_int) != 0));
+    int status = android_atomic_cas(old_state_and_flags.as_int, new_state_and_flags.as_int,
+                                       &state_and_flags_.as_int);
+    if (LIKELY(status == 0)) {
+      break;
+    }
+  }
   // Release share on mutator_lock_.
   Locks::mutator_lock_->SharedUnlock(this);
 }
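
The rewritten transition loop makes the control flow explicit: after RunCheckpointFunction() the checkpoint bit is guaranteed clear, so the flags are copied unchanged and only the state is replaced before the CAS. A sketch of the same retry shape with std::atomic (the packed word layout and the flag constant below are assumptions for illustration; the real code uses a StateAndFlags union and android_atomic_cas):

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t kCheckpointRequestSketch = 0x2;  // assumed flag bit, for illustration

    void TransitionFromRunnableSketch(std::atomic<uint32_t>& state_and_flags, uint16_t new_state,
                                      void (*run_checkpoint)()) {
      while (true) {
        uint32_t old_word = state_and_flags.load();
        uint32_t flags = old_word & 0xffffu;  // assume the flags occupy the low half
        if ((flags & kCheckpointRequestSketch) != 0) {
          run_checkpoint();  // expected to clear the request bit, like RunCheckpointFunction()
          continue;
        }
        // Keep the flags, replace only the state half of the word.
        uint32_t new_word = (static_cast<uint32_t>(new_state) << 16) | flags;
        if (state_and_flags.compare_exchange_weak(old_word, new_word)) {
          break;  // Success; the caller then releases its share of mutator_lock_.
        }
      }
    }
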
diff --git a/runtime/thread.cc b/runtime/thread.cc
index bc252de..9faa60d 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1147,7 +1147,7 @@
     result = Runtime::Current()->GetJavaVM()->DecodeWeakGlobal(const_cast<Thread*>(this), ref);
     if (result == kClearedJniWeakGlobal) {
       // This is a special case where it's okay to return NULL.
-      return NULL;
+      return nullptr;
     }
   }
 
diff --git a/runtime/thread_pool_test.cc b/runtime/thread_pool_test.cc
index 1b22361..2029d4b 100644
--- a/runtime/thread_pool_test.cc
+++ b/runtime/thread_pool_test.cc
@@ -94,7 +94,7 @@
   EXPECT_EQ(0, bad_count);
   // Allow tasks to finish up and delete themselves.
   thread_pool.StartWorkers(self);
-  while (count.load() != num_tasks && bad_count.load() != 1) {
+  while (count.Load() != num_tasks && bad_count.Load() != 1) {
     usleep(200);
   }
   thread_pool.StopWorkers(self);
diff --git a/test/JniTest/JniTest.java b/test/JniTest/JniTest.java
index 9194da5..d53cf5e 100644
--- a/test/JniTest/JniTest.java
+++ b/test/JniTest/JniTest.java
@@ -23,6 +23,7 @@
         testFindFieldOnAttachedNativeThread();
         testCallStaticVoidMethodOnSubClass();
         testGetMirandaMethod();
+        testZeroLengthByteBuffers();
     }
 
     private static native void testFindClassOnAttachedNativeThread();
@@ -67,6 +68,8 @@
         }
     }
 
+    private static native void testZeroLengthByteBuffers();
+
     private static abstract class testGetMirandaMethod_MirandaAbstract implements testGetMirandaMethod_MirandaInterface {
         public boolean inAbstract() {
             return true;
diff --git a/test/JniTest/jni_test.cc b/test/JniTest/jni_test.cc
index d15e180..33af94b 100644
--- a/test/JniTest/jni_test.cc
+++ b/test/JniTest/jni_test.cc
@@ -17,6 +17,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <pthread.h>
+#include <vector>
 
 #include "jni.h"
 
@@ -125,3 +126,14 @@
   assert(miranda_method != NULL);
   return env->ToReflectedMethod(abstract_class, miranda_method, JNI_FALSE);
 }
+
+// https://code.google.com/p/android/issues/detail?id=63055
+extern "C" void JNICALL Java_JniTest_testZeroLengthByteBuffers(JNIEnv* env, jclass) {
+  std::vector<uint8_t> buffer(1);
+  jobject byte_buffer = env->NewDirectByteBuffer(&buffer[0], 0);
+  assert(byte_buffer != NULL);
+  assert(!env->ExceptionCheck());
+
+  assert(env->GetDirectBufferAddress(byte_buffer) == &buffer[0]);
+  assert(env->GetDirectBufferCapacity(byte_buffer) == 0);
+}