Merge changes I295c7876,Ib4b84b7b

* changes:
  ART: Remove PACKED from ArtMethod's ptr_sized_fields_
  ART: Rename ArtMethod JNI field
diff --git a/build/Android.common_path.mk b/build/Android.common_path.mk
index b1644df..e213dc4 100644
--- a/build/Android.common_path.mk
+++ b/build/Android.common_path.mk
@@ -38,7 +38,7 @@
 ifneq ($(TMPDIR),)
 ART_HOST_TEST_DIR := $(TMPDIR)/test-art-$(shell echo $$PPID)
 else
-ART_HOST_TEST_DIR := /tmp/test-art-$(shell echo $$PPID)
+ART_HOST_TEST_DIR := /tmp/$(USER)/test-art-$(shell echo $$PPID)
 endif
 
 # core.oat location on the device.
diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h
index 9b4042c..f05648c 100644
--- a/cmdline/cmdline_types.h
+++ b/cmdline/cmdline_types.h
@@ -462,7 +462,7 @@
 struct XGcOption {
   // These defaults are used when the command line arguments for -Xgc:
   // are either omitted completely or partially.
-  gc::CollectorType collector_type_ =  kUseReadBarrier ?
+  gc::CollectorType collector_type_ = kUseReadBarrier ?
                                            // If RB is enabled (currently a build-time decision),
                                            // use CC as the default GC.
                                            gc::kCollectorTypeCC :
@@ -473,6 +473,7 @@
   bool verify_pre_gc_rosalloc_ = kIsDebugBuild;
   bool verify_pre_sweeping_rosalloc_ = false;
   bool verify_post_gc_rosalloc_ = false;
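+  // Whether to measure GC performance (the "measure" option of -Xgc).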
+  bool measure_ = kIsDebugBuild;
   bool gcstress_ = false;
 };
 
@@ -515,6 +516,8 @@
         xgc.gcstress_ = true;
       } else if (gc_option == "nogcstress") {
         xgc.gcstress_ = false;
+      } else if (gc_option == "measure") {
+        xgc.measure_ = true;
       } else if ((gc_option == "precise") ||
                  (gc_option == "noprecise") ||
                  (gc_option == "verifycardtable") ||
diff --git a/compiler/Android.mk b/compiler/Android.mk
index f310565..e3f8a5c 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -67,6 +67,7 @@
 	optimizing/parallel_move_resolver.cc \
 	optimizing/prepare_for_register_allocation.cc \
 	optimizing/reference_type_propagation.cc \
 	optimizing/register_allocation_resolver.cc \
+	optimizing/register_allocator.cc \
 	optimizing/register_allocator_linear_scan.cc \
 	optimizing/select_generator.cc \
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 47e6625..5e6e175 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -80,7 +80,11 @@
 
   virtual void EmitNativeCode(CodeGenerator* codegen) = 0;
 
+  // Save live core and floating-point caller-save registers and
+  // update the stack mask in `locations` for registers holding object
+  // references.
   virtual void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
+  // Restore live core and floating-point caller-save registers.
   virtual void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
 
   bool IsCoreRegisterSaved(int reg) const {
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index ad0a4f47..236ed20 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -437,11 +437,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // R0 (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     DCHECK_NE(reg, SP);
@@ -469,8 +467,6 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ b(GetExitLabel());
   }
 
@@ -1937,7 +1933,7 @@
   __ LoadFromOffset(kLoadWord, temp, temp,
         mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value());
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-      invoke->GetImtIndex() % ImTable::kSize, kArmPointerSize));
+      invoke->GetImtIndex(), kArmPointerSize));
   // temp = temp->GetImtEntryAt(method_offset);
   __ LoadFromOffset(kLoadWord, temp, temp, method_offset);
   uint32_t entry_point =
@@ -4437,6 +4433,10 @@
   Location out_loc = locations->Out();
   uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
   Primitive::Type type = instruction->GetType();
+  HInstruction* array_instr = instruction->GetArray();
+  bool has_intermediate_address = array_instr->IsIntermediateAddress();
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -4451,8 +4451,21 @@
         LoadOperandType load_type = GetLoadOperandType(type);
         __ LoadFromOffset(load_type, out_loc.AsRegister<Register>(), obj, full_offset);
       } else {
-        __ add(IP, obj, ShifterOperand(data_offset));
-        codegen_->LoadFromShiftedRegOffset(type, out_loc, IP, index.AsRegister<Register>());
+        Register temp = IP;
+
+        if (has_intermediate_address) {
+          // We do not need to compute the intermediate address from the array: the
+          // input instruction has done it already. See the comment in
+          // `TryExtractArrayAccessAddress()`.
+          if (kIsDebugBuild) {
+            HIntermediateAddress* tmp = array_instr->AsIntermediateAddress();
+            DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset);
+          }
+          temp = obj;
+        } else {
+          __ add(temp, obj, ShifterOperand(data_offset));
+        }
+        codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
       }
       break;
     }
@@ -4481,8 +4494,21 @@
           // reference, if heap poisoning is enabled).
           codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset);
         } else {
-          __ add(IP, obj, ShifterOperand(data_offset));
-          codegen_->LoadFromShiftedRegOffset(type, out_loc, IP, index.AsRegister<Register>());
+          Register temp = IP;
+
+          if (has_intermediate_address) {
+            // We do not need to compute the intermediate address from the array: the
+            // input instruction has done it already. See the comment in
+            // `TryExtractArrayAccessAddress()`.
+            if (kIsDebugBuild) {
+              HIntermediateAddress* tmp = array_instr->AsIntermediateAddress();
+              DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset);
+            }
+            temp = obj;
+          } else {
+            __ add(temp, obj, ShifterOperand(data_offset));
+          }
+          codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
 
           codegen_->MaybeRecordImplicitNullCheck(instruction);
           // If read barriers are enabled, emit read barriers other than
@@ -4585,6 +4611,10 @@
   uint32_t data_offset =
       mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value();
   Location value_loc = locations->InAt(2);
+  HInstruction* array_instr = instruction->GetArray();
+  bool has_intermediate_address = array_instr->IsIntermediateAddress();
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
@@ -4599,10 +4629,23 @@
         StoreOperandType store_type = GetStoreOperandType(value_type);
         __ StoreToOffset(store_type, value_loc.AsRegister<Register>(), array, full_offset);
       } else {
-        __ add(IP, array, ShifterOperand(data_offset));
+        Register temp = IP;
+
+        if (has_intermediate_address) {
+          // We do not need to compute the intermediate address from the array: the
+          // input instruction has done it already. See the comment in
+          // `TryExtractArrayAccessAddress()`.
+          if (kIsDebugBuild) {
+            HIntermediateAddress* tmp = array_instr->AsIntermediateAddress();
+            DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset);
+          }
+          temp = array;
+        } else {
+          __ add(temp, array, ShifterOperand(data_offset));
+        }
         codegen_->StoreToShiftedRegOffset(value_type,
                                           value_loc,
-                                          IP,
+                                          temp,
                                           index.AsRegister<Register>());
       }
       break;
@@ -4610,6 +4653,9 @@
 
     case Primitive::kPrimNot: {
       Register value = value_loc.AsRegister<Register>();
+      // The TryExtractArrayAccessAddress optimization is never applied to a
+      // non-primitive ArraySet. See the comment in instruction_simplifier_shared.cc.
+      DCHECK(!has_intermediate_address);
 
       if (instruction->InputAt(2)->IsNullConstant()) {
         // Just setting null.
@@ -4832,6 +4878,37 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
+void LocationsBuilderARM::VisitIntermediateAddress(HIntermediateAddress* instruction) {
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!kEmitCompilerReadBarrier);
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RegisterOrConstant(instruction->GetOffset()));
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  Location out = locations->Out();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!kEmitCompilerReadBarrier);
+
+  if (second.IsRegister()) {
+    __ add(out.AsRegister<Register>(),
+           first.AsRegister<Register>(),
+           ShifterOperand(second.AsRegister<Register>()));
+  } else {
+    __ AddConstant(out.AsRegister<Register>(),
+                   first.AsRegister<Register>(),
+                   second.GetConstant()->AsIntConstant()->GetValue());
+  }
+}
+
 void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) {
   LocationSummary::CallKind call_kind = instruction->CanThrowIntoCatchBlock()
       ? LocationSummary::kCallOnSlowPath
@@ -7033,7 +7110,7 @@
                       method_offset);
   } else {
     uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-        instruction->GetIndex() % ImTable::kSize, kArmPointerSize));
+        instruction->GetIndex(), kArmPointerSize));
     __ LoadFromOffset(kLoadWord,
                       locations->Out().AsRegister<Register>(),
                       locations->InAt(0).AsRegister<Register>(),
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index bf2c598..76b0797 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -603,11 +603,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // W0 (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     DCHECK_NE(obj_.reg(), LR);
@@ -635,8 +633,6 @@
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ B(GetExitLabel());
   }
 
@@ -690,10 +686,9 @@
             instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for heap reference slow path: "
         << instruction_->DebugName();
-    // The read barrier instrumentation does not support the
-    // HArm64IntermediateAddress instruction yet.
+    // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
     DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsArm64IntermediateAddress()));
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
 
     __ Bind(GetEntryLabel());
 
@@ -1983,9 +1978,8 @@
   }
 }
 
-void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddress* instruction) {
-  // The read barrier instrumentation does not support the
-  // HArm64IntermediateAddress instruction yet.
+void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) {
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
   DCHECK(!kEmitCompilerReadBarrier);
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
@@ -1994,10 +1988,9 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
-void InstructionCodeGeneratorARM64::VisitArm64IntermediateAddress(
-    HArm64IntermediateAddress* instruction) {
-  // The read barrier instrumentation does not support the
-  // HArm64IntermediateAddress instruction yet.
+void InstructionCodeGeneratorARM64::VisitIntermediateAddress(
+    HIntermediateAddress* instruction) {
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
   DCHECK(!kEmitCompilerReadBarrier);
   __ Add(OutputRegister(instruction),
          InputRegisterAt(instruction, 0),
@@ -2097,9 +2090,8 @@
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // Object ArrayGet with Baker's read barrier case.
     Register temp = temps.AcquireW();
-    // The read barrier instrumentation does not support the
-    // HArm64IntermediateAddress instruction yet.
-    DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress());
+    // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+    DCHECK(!instruction->GetArray()->IsIntermediateAddress());
     // Note that a potential implicit null check is handled in the
     // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call.
     codegen_->GenerateArrayLoadWithBakerReadBarrier(
@@ -2112,15 +2104,15 @@
       source = HeapOperand(obj, offset);
     } else {
       Register temp = temps.AcquireSameSizeAs(obj);
-      if (instruction->GetArray()->IsArm64IntermediateAddress()) {
+      if (instruction->GetArray()->IsIntermediateAddress()) {
         // The read barrier instrumentation does not support the
-        // HArm64IntermediateAddress instruction yet.
+        // HIntermediateAddress instruction yet.
         DCHECK(!kEmitCompilerReadBarrier);
         // We do not need to compute the intermediate address from the array: the
         // input instruction has done it already. See the comment in
-        // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`.
+        // `TryExtractArrayAccessAddress()`.
         if (kIsDebugBuild) {
-          HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress();
+          HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress();
           DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), offset);
         }
         temp = obj;
@@ -2204,15 +2196,15 @@
     } else {
       UseScratchRegisterScope temps(masm);
       Register temp = temps.AcquireSameSizeAs(array);
-      if (instruction->GetArray()->IsArm64IntermediateAddress()) {
+      if (instruction->GetArray()->IsIntermediateAddress()) {
         // The read barrier instrumentation does not support the
-        // HArm64IntermediateAddress instruction yet.
+        // HIntermediateAddress instruction yet.
         DCHECK(!kEmitCompilerReadBarrier);
         // We do not need to compute the intermediate address from the array: the
         // input instruction has done it already. See the comment in
-        // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`.
+        // `TryExtractArrayAccessAddress()`.
         if (kIsDebugBuild) {
-          HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress();
+          HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress();
           DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == offset);
         }
         temp = array;
@@ -2228,7 +2220,7 @@
     codegen_->MaybeRecordImplicitNullCheck(instruction);
   } else {
     DCHECK(needs_write_barrier);
-    DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress());
+    DCHECK(!instruction->GetArray()->IsIntermediateAddress());
     vixl::aarch64::Label done;
     SlowPathCodeARM64* slow_path = nullptr;
     {
@@ -3561,7 +3553,7 @@
   __ Ldr(temp,
       MemOperand(temp, mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value()));
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-      invoke->GetImtIndex() % ImTable::kSize, kArm64PointerSize));
+      invoke->GetImtIndex(), kArm64PointerSize));
   // temp = temp->GetImtEntryAt(method_offset);
   __ Ldr(temp, MemOperand(temp, method_offset));
   // lr = temp->GetEntryPoint();
@@ -5382,7 +5374,7 @@
            MemOperand(XRegisterFrom(locations->InAt(0)), method_offset));
   } else {
     uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-        instruction->GetIndex() % ImTable::kSize, kArm64PointerSize));
+        instruction->GetIndex(), kArm64PointerSize));
     __ Ldr(XRegisterFrom(locations->Out()), MemOperand(XRegisterFrom(locations->InAt(0)),
         mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value()));
     __ Ldr(XRegisterFrom(locations->Out()),
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index cf8928f..39248aa 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -3791,7 +3791,7 @@
   __ LoadFromOffset(kLoadWord, temp, temp,
       mirror::Class::ImtPtrOffset(kMipsPointerSize).Uint32Value());
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-      invoke->GetImtIndex() % ImTable::kSize, kMipsPointerSize));
+      invoke->GetImtIndex(), kMipsPointerSize));
   // temp = temp->GetImtEntryAt(method_offset);
   __ LoadFromOffset(kLoadWord, temp, temp, method_offset);
   // T9 = temp->GetEntryPoint();
@@ -5389,7 +5389,7 @@
                       method_offset);
   } else {
     uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-        instruction->GetIndex() % ImTable::kSize, kMipsPointerSize));
+        instruction->GetIndex(), kMipsPointerSize));
     __ LoadFromOffset(kLoadWord,
                       locations->Out().AsRegister<Register>(),
                       locations->InAt(0).AsRegister<Register>(),
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index cf3c42e..29b8c20 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -2951,7 +2951,7 @@
   __ LoadFromOffset(kLoadDoubleword, temp, temp,
       mirror::Class::ImtPtrOffset(kMips64PointerSize).Uint32Value());
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-      invoke->GetImtIndex() % ImTable::kSize, kMips64PointerSize));
+      invoke->GetImtIndex(), kMips64PointerSize));
   // temp = temp->GetImtEntryAt(method_offset);
   __ LoadFromOffset(kLoadDoubleword, temp, temp, method_offset);
   // T9 = temp->GetEntryPoint();
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 425f31c..82baaa0 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -472,11 +472,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // EAX (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     DCHECK_NE(reg, ESP);
@@ -502,8 +500,6 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
@@ -2093,7 +2089,7 @@
       Address(temp, mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value()));
   // temp = temp->GetImtEntryAt(method_offset);
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-      invoke->GetImtIndex() % ImTable::kSize, kX86PointerSize));
+      invoke->GetImtIndex(), kX86PointerSize));
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
   __ call(Address(temp,
@@ -4115,7 +4111,7 @@
             Address(locations->InAt(0).AsRegister<Register>(), method_offset));
   } else {
     uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-        instruction->GetIndex() % ImTable::kSize, kX86PointerSize));
+        instruction->GetIndex(), kX86PointerSize));
     __ movl(locations->Out().AsRegister<Register>(),
             Address(locations->InAt(0).AsRegister<Register>(),
                     mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value()));
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index bd4ded1..b6ba30e 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -493,11 +493,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // RDI and/or RAX (if they are live), as they are clobbered by
-    // functions art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     DCHECK_NE(reg, RSP);
@@ -523,8 +521,6 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
@@ -2322,7 +2318,7 @@
       Address(temp, mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value()));
   // temp = temp->GetImtEntryAt(method_offset);
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-      invoke->GetImtIndex() % ImTable::kSize, kX86_64PointerSize));
+      invoke->GetImtIndex(), kX86_64PointerSize));
   // temp = temp->GetImtEntryAt(method_offset);
   __ movq(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
@@ -4048,7 +4044,7 @@
             Address(locations->InAt(0).AsRegister<CpuRegister>(), method_offset));
   } else {
     uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
-        instruction->GetIndex() % ImTable::kSize, kX86_64PointerSize));
+        instruction->GetIndex(), kX86_64PointerSize));
     __ movq(locations->Out().AsRegister<CpuRegister>(),
             Address(locations->InAt(0).AsRegister<CpuRegister>(),
             mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value()));
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 6f487c5..fe9a7af 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -219,7 +219,7 @@
 
   PrepareForRegisterAllocation(graph).Run();
   liveness.Analyze();
-  RegisterAllocator(graph->GetArena(), codegen, liveness).AllocateRegisters();
+  RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters();
   hook_before_codegen(graph);
 
   InternalCodeAllocator allocator;
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index d2afa5b..af0ee4e 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -227,7 +227,7 @@
     return vixl::aarch64::Assembler::IsImmMovn(value, vixl::aarch64::kXRegSize);
   } else {
     DCHECK(instr->IsAdd() ||
-           instr->IsArm64IntermediateAddress() ||
+           instr->IsIntermediateAddress() ||
            instr->IsBoundsCheck() ||
            instr->IsCompare() ||
            instr->IsCondition() ||
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 790751f..a592162 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -661,7 +661,7 @@
     ArtMethod* new_method = nullptr;
     if (invoke_instruction->IsInvokeInterface()) {
       new_method = ic.GetTypeAt(i)->GetImt(pointer_size)->Get(
-          method_index % ImTable::kSize, pointer_size);
+          method_index, pointer_size);
       if (new_method->IsRuntimeMethod()) {
         // Bail out as soon as we see a conflict trampoline in one of the target's
         // interface table.
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index b412529..afac5f9 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -16,6 +16,7 @@
 
 #include "instruction_builder.h"
 
+#include "art_method-inl.h"
 #include "bytecode_utils.h"
 #include "class_linker.h"
 #include "driver/compiler_options.h"
@@ -890,7 +891,7 @@
                                            return_type,
                                            dex_pc,
                                            method_idx,
-                                           resolved_method->GetDexMethodIndex());
+                                           resolved_method->GetImtIndex());
   }
 
   return HandleInvoke(invoke,
diff --git a/compiler/optimizing/instruction_simplifier_arm.cc b/compiler/optimizing/instruction_simplifier_arm.cc
index cd026b8..495f3fd 100644
--- a/compiler/optimizing/instruction_simplifier_arm.cc
+++ b/compiler/optimizing/instruction_simplifier_arm.cc
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 
+#include "code_generator.h"
 #include "instruction_simplifier_arm.h"
 #include "instruction_simplifier_shared.h"
+#include "mirror/array-inl.h"
 
 namespace art {
 namespace arm {
@@ -38,6 +40,46 @@
   }
 }
 
+void InstructionSimplifierArmVisitor::VisitArrayGet(HArrayGet* instruction) {
+  size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
+  Primitive::Type type = instruction->GetType();
+
+  if (type == Primitive::kPrimLong ||
+      type == Primitive::kPrimFloat ||
+      type == Primitive::kPrimDouble) {
+    // T32 doesn't support the ShiftedRegOffset memory addressing mode for
+    // these types, so the optimization cannot be applied.
+    return;
+  }
+
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
+}
+
+void InstructionSimplifierArmVisitor::VisitArraySet(HArraySet* instruction) {
+  size_t access_size = Primitive::ComponentSize(instruction->GetComponentType());
+  size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value();
+  Primitive::Type type = instruction->GetComponentType();
+
+  if (type == Primitive::kPrimLong ||
+      type == Primitive::kPrimFloat ||
+      type == Primitive::kPrimDouble) {
+    // T32 doesn't support the ShiftedRegOffset memory addressing mode for
+    // these types, so the optimization cannot be applied.
+    return;
+  }
+
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
+}
 
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm.h b/compiler/optimizing/instruction_simplifier_arm.h
index 14c940e..3d297da 100644
--- a/compiler/optimizing/instruction_simplifier_arm.h
+++ b/compiler/optimizing/instruction_simplifier_arm.h
@@ -38,6 +38,8 @@
   void VisitMul(HMul* instruction) OVERRIDE;
   void VisitOr(HOr* instruction) OVERRIDE;
   void VisitAnd(HAnd* instruction) OVERRIDE;
+  void VisitArrayGet(HArrayGet* instruction) OVERRIDE;
+  void VisitArraySet(HArraySet* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 983d31d..6d107d5 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -28,56 +28,6 @@
 using helpers::HasShifterOperand;
 using helpers::ShifterOperandSupportsExtension;
 
-void InstructionSimplifierArm64Visitor::TryExtractArrayAccessAddress(HInstruction* access,
-                                                                     HInstruction* array,
-                                                                     HInstruction* index,
-                                                                     size_t data_offset) {
-  if (kEmitCompilerReadBarrier) {
-    // The read barrier instrumentation does not support the
-    // HArm64IntermediateAddress instruction yet.
-    //
-    // TODO: Handle this case properly in the ARM64 code generator and
-    // re-enable this optimization; otherwise, remove this TODO.
-    // b/26601270
-    return;
-  }
-  if (index->IsConstant() ||
-      (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) {
-    // When the index is a constant all the addressing can be fitted in the
-    // memory access instruction, so do not split the access.
-    return;
-  }
-  if (access->IsArraySet() &&
-      access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) {
-    // The access may require a runtime call or the original array pointer.
-    return;
-  }
-
-  // Proceed to extract the base address computation.
-  ArenaAllocator* arena = GetGraph()->GetArena();
-
-  HIntConstant* offset = GetGraph()->GetIntConstant(data_offset);
-  HArm64IntermediateAddress* address =
-      new (arena) HArm64IntermediateAddress(array, offset, kNoDexPc);
-  address->SetReferenceTypeInfo(array->GetReferenceTypeInfo());
-  access->GetBlock()->InsertInstructionBefore(address, access);
-  access->ReplaceInput(address, 0);
-  // Both instructions must depend on GC to prevent any instruction that can
-  // trigger GC to be inserted between the two.
-  access->AddSideEffects(SideEffects::DependsOnGC());
-  DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC()));
-  DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC()));
-  // TODO: Code generation for HArrayGet and HArraySet will check whether the input address
-  // is an HArm64IntermediateAddress and generate appropriate code.
-  // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe
-  // `HArm64Load` and `HArm64Store`). We defer these changes because these new instructions would
-  // not bring any advantages yet.
-  // Also see the comments in
-  // `InstructionCodeGeneratorARM64::VisitArrayGet()` and
-  // `InstructionCodeGeneratorARM64::VisitArraySet()`.
-  RecordSimplification();
-}
-
 bool InstructionSimplifierArm64Visitor::TryMergeIntoShifterOperand(HInstruction* use,
                                                                    HInstruction* bitfield_op,
                                                                    bool do_merge) {
@@ -190,19 +140,23 @@
 
 void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) {
   size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
-  TryExtractArrayAccessAddress(instruction,
-                               instruction->GetArray(),
-                               instruction->GetIndex(),
-                               data_offset);
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
 }
 
 void InstructionSimplifierArm64Visitor::VisitArraySet(HArraySet* instruction) {
   size_t access_size = Primitive::ComponentSize(instruction->GetComponentType());
   size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value();
-  TryExtractArrayAccessAddress(instruction,
-                               instruction->GetArray(),
-                               instruction->GetIndex(),
-                               data_offset);
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
 }
 
 void InstructionSimplifierArm64Visitor::VisitMul(HMul* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index 4735f85..28648b3 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -35,10 +35,6 @@
     }
   }
 
-  void TryExtractArrayAccessAddress(HInstruction* access,
-                                    HInstruction* array,
-                                    HInstruction* index,
-                                    size_t data_offset);
   bool TryMergeIntoUsersShifterOperand(HInstruction* instruction);
   bool TryMergeIntoShifterOperand(HInstruction* use,
                                   HInstruction* bitfield_op,
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index dab1ebc..8f7778f 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -226,4 +226,59 @@
   return false;
 }
 
+bool TryExtractArrayAccessAddress(HInstruction* access,
+                                  HInstruction* array,
+                                  HInstruction* index,
+                                  size_t data_offset) {
+  if (kEmitCompilerReadBarrier) {
+    // The read barrier instrumentation does not support the
+    // HIntermediateAddress instruction yet.
+    //
+    // TODO: Handle this case properly in the ARM64 and ARM code generator and
+    // re-enable this optimization; otherwise, remove this TODO.
+    // b/26601270
+    return false;
+  }
+  if (index->IsConstant() ||
+      (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) {
+    // When the index is a constant all the addressing can be fitted in the
+    // memory access instruction, so do not split the access.
+    return false;
+  }
+  if (access->IsArraySet() &&
+      access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) {
+    // The access may require a runtime call or the original array pointer.
+    return false;
+  }
+
+  // Proceed to extract the base address computation.
+  HGraph* graph = access->GetBlock()->GetGraph();
+  ArenaAllocator* arena = graph->GetArena();
+
+  HIntConstant* offset = graph->GetIntConstant(data_offset);
+  HIntermediateAddress* address =
+      new (arena) HIntermediateAddress(array, offset, kNoDexPc);
+  address->SetReferenceTypeInfo(array->GetReferenceTypeInfo());
+  access->GetBlock()->InsertInstructionBefore(address, access);
+  access->ReplaceInput(address, 0);
+  // Both instructions must depend on GC to prevent any instruction that can
+  // trigger GC to be inserted between the two.
+  access->AddSideEffects(SideEffects::DependsOnGC());
+  DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC()));
+  DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC()));
+  // TODO: Code generation for HArrayGet and HArraySet will check whether the input address
+  // is an HIntermediateAddress and generate appropriate code.
+  // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe
+  // `HArm64Load` and `HArm64Store`, `HArmLoad` and `HArmStore`). We defer these changes
+  // because these new instructions would not bring any advantages yet.
+  // Also see the comments in
+  // `InstructionCodeGeneratorARM::VisitArrayGet()`
+  // `InstructionCodeGeneratorARM::VisitArraySet()`
+  // `InstructionCodeGeneratorARM64::VisitArrayGet()`
+  // `InstructionCodeGeneratorARM64::VisitArraySet()`.
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index b1fe8f4..56804f5 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -26,6 +26,11 @@
 // a negated bitwise instruction.
 bool TryMergeNegatedInput(HBinaryOperation* op);
 
+bool TryExtractArrayAccessAddress(HInstruction* access,
+                                  HInstruction* array,
+                                  HInstruction* index,
+                                  size_t data_offset);
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 0f0ef26..23ac457 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1289,7 +1289,8 @@
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M)                         \
   M(BitwiseNegatedRight, Instruction)                                   \
-  M(MultiplyAccumulate, Instruction)
+  M(MultiplyAccumulate, Instruction)                                    \
+  M(IntermediateAddress, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_arm
@@ -1303,8 +1304,7 @@
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)                          \
-  M(Arm64DataProcWithShifterOp, Instruction)                            \
-  M(Arm64IntermediateAddress, Instruction)
+  M(Arm64DataProcWithShifterOp, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_mips
diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h
index 06b073c..3f88717 100644
--- a/compiler/optimizing/nodes_arm64.h
+++ b/compiler/optimizing/nodes_arm64.h
@@ -94,32 +94,6 @@
 
 std::ostream& operator<<(std::ostream& os, const HArm64DataProcWithShifterOp::OpKind op);
 
-// This instruction computes an intermediate address pointing in the 'middle' of an object. The
-// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is
-// never used across anything that can trigger GC.
-class HArm64IntermediateAddress FINAL : public HExpression<2> {
- public:
-  HArm64IntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc)
-      : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) {
-    SetRawInputAt(0, base_address);
-    SetRawInputAt(1, offset);
-  }
-
-  bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
-    return true;
-  }
-  bool IsActualObject() const OVERRIDE { return false; }
-
-  HInstruction* GetBaseAddress() const { return InputAt(0); }
-  HInstruction* GetOffset() const { return InputAt(1); }
-
-  DECLARE_INSTRUCTION(Arm64IntermediateAddress);
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(HArm64IntermediateAddress);
-};
-
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_NODES_ARM64_H_
diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h
index f2d5cf3..8bd8667 100644
--- a/compiler/optimizing/nodes_shared.h
+++ b/compiler/optimizing/nodes_shared.h
@@ -113,6 +113,34 @@
   DISALLOW_COPY_AND_ASSIGN(HBitwiseNegatedRight);
 };
 
+// This instruction computes an intermediate address pointing in the 'middle' of an object. The
+// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is
+// never used across anything that can trigger GC.
+class HIntermediateAddress FINAL : public HExpression<2> {
+ public:
+  HIntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc)
+      : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) {
+    SetRawInputAt(0, base_address);
+    SetRawInputAt(1, offset);
+  }
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
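+  // The computed address points into an object but is not itself an object reference.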
+  bool IsActualObject() const OVERRIDE { return false; }
+
+  HInstruction* GetBaseAddress() const { return InputAt(0); }
+  HInstruction* GetOffset() const { return InputAt(1); }
+
+  DECLARE_INSTRUCTION(IntermediateAddress);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress);
+};
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_NODES_SHARED_H_
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 80affc3..0bca186 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -448,8 +448,12 @@
       arm::DexCacheArrayFixups* fixups = new (arena) arm::DexCacheArrayFixups(graph, stats);
       arm::InstructionSimplifierArm* simplifier =
           new (arena) arm::InstructionSimplifierArm(graph, stats);
+      SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
+      GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN_after_arch");
       HOptimization* arm_optimizations[] = {
         simplifier,
+        side_effects,
+        gvn,
         fixups
       };
       RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer);
@@ -531,7 +535,7 @@
   }
   {
     PassScope scope(RegisterAllocator::kRegisterAllocatorPassName, pass_observer);
-    RegisterAllocator(graph->GetArena(), codegen, liveness).AllocateRegisters();
+    RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters();
   }
 }
 
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
new file mode 100644
index 0000000..2367ce1
--- /dev/null
+++ b/compiler/optimizing/register_allocator.cc
@@ -0,0 +1,266 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "register_allocator.h"
+
+#include <iostream>
+#include <sstream>
+
+#include "base/bit_vector-inl.h"
+#include "code_generator.h"
+#include "register_allocator_linear_scan.h"
+#include "ssa_liveness_analysis.h"
+
+namespace art {
+
+RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator,
+                                     CodeGenerator* codegen,
+                                     const SsaLivenessAnalysis& liveness)
+    : allocator_(allocator),
+      codegen_(codegen),
+      liveness_(liveness) {}
+
+RegisterAllocator* RegisterAllocator::Create(ArenaAllocator* allocator,
+                                             CodeGenerator* codegen,
+                                             const SsaLivenessAnalysis& analysis,
+                                             Strategy strategy) {
+  switch (strategy) {
+    case kRegisterAllocatorLinearScan:
+      return new (allocator) RegisterAllocatorLinearScan(allocator, codegen, analysis);
+    default:
+      LOG(FATAL) << "Invalid register allocation strategy: " << strategy;
+      UNREACHABLE();
+  }
+}
+
+bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UNUSED,
+                                                InstructionSet instruction_set) {
+  return instruction_set == kArm
+      || instruction_set == kArm64
+      || instruction_set == kMips
+      || instruction_set == kMips64
+      || instruction_set == kThumb2
+      || instruction_set == kX86
+      || instruction_set == kX86_64;
+}
+
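+// Iterates over all live ranges of an interval and its split siblings: the
+// ranges of `interval` come first, then those of each sibling in turn.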
+class AllRangesIterator : public ValueObject {
+ public:
+  explicit AllRangesIterator(LiveInterval* interval)
+      : current_interval_(interval),
+        current_range_(interval->GetFirstRange()) {}
+
+  bool Done() const { return current_interval_ == nullptr; }
+  LiveRange* CurrentRange() const { return current_range_; }
+  LiveInterval* CurrentInterval() const { return current_interval_; }
+
+  void Advance() {
+    current_range_ = current_range_->GetNext();
+    if (current_range_ == nullptr) {
+      current_interval_ = current_interval_->GetNextSibling();
+      if (current_interval_ != nullptr) {
+        current_range_ = current_interval_->GetFirstRange();
+      }
+    }
+  }
+
+ private:
+  LiveInterval* current_interval_;
+  LiveRange* current_range_;
+
+  DISALLOW_COPY_AND_ASSIGN(AllRangesIterator);
+};
+
+bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& intervals,
+                                          size_t number_of_spill_slots,
+                                          size_t number_of_out_slots,
+                                          const CodeGenerator& codegen,
+                                          ArenaAllocator* allocator,
+                                          bool processing_core_registers,
+                                          bool log_fatal_on_failure) {
+  size_t number_of_registers = processing_core_registers
+      ? codegen.GetNumberOfCoreRegisters()
+      : codegen.GetNumberOfFloatingPointRegisters();
+  ArenaVector<ArenaBitVector*> liveness_of_values(
+      allocator->Adapter(kArenaAllocRegisterAllocatorValidate));
+  liveness_of_values.reserve(number_of_registers + number_of_spill_slots);
+
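+  // Compute the last live position, so the bit vectors can be sized accordingly.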
+  size_t max_end = 0u;
+  for (LiveInterval* start_interval : intervals) {
+    for (AllRangesIterator it(start_interval); !it.Done(); it.Advance()) {
+      max_end = std::max(max_end, it.CurrentRange()->GetEnd());
+    }
+  }
+
+  // Allocate a bit vector per register. A live interval that has a register
+  // allocated will populate the associated bit vector based on its live ranges.
+  for (size_t i = 0; i < number_of_registers + number_of_spill_slots; ++i) {
+    liveness_of_values.push_back(
+        ArenaBitVector::Create(allocator, max_end, false, kArenaAllocRegisterAllocatorValidate));
+  }
+
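+  // Mark every covered position in the bit vector of the value's register or
+  // spill slot; finding a bit already set indicates a conflict.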
+  for (LiveInterval* start_interval : intervals) {
+    for (AllRangesIterator it(start_interval); !it.Done(); it.Advance()) {
+      LiveInterval* current = it.CurrentInterval();
+      HInstruction* defined_by = current->GetParent()->GetDefinedBy();
+      if (current->GetParent()->HasSpillSlot()
+           // Parameters and current method have their own stack slot.
+           && !(defined_by != nullptr && (defined_by->IsParameterValue()
+                                          || defined_by->IsCurrentMethod()))) {
+        BitVector* liveness_of_spill_slot = liveness_of_values[number_of_registers
+            + current->GetParent()->GetSpillSlot() / kVRegSize
+            - number_of_out_slots];
+        for (size_t j = it.CurrentRange()->GetStart(); j < it.CurrentRange()->GetEnd(); ++j) {
+          if (liveness_of_spill_slot->IsBitSet(j)) {
+            if (log_fatal_on_failure) {
+              std::ostringstream message;
+              message << "Spill slot conflict at " << j;
+              LOG(FATAL) << message.str();
+            } else {
+              return false;
+            }
+          } else {
+            liveness_of_spill_slot->SetBit(j);
+          }
+        }
+      }
+
+      if (current->HasRegister()) {
+        if (kIsDebugBuild && log_fatal_on_failure && !current->IsFixed()) {
+          // Only check when an error is fatal. Only test code asks for non-fatal failures,
+          // and tests may not properly fill in the information the code generator needs.
+          CHECK(codegen.HasAllocatedRegister(processing_core_registers, current->GetRegister()));
+        }
+        BitVector* liveness_of_register = liveness_of_values[current->GetRegister()];
+        for (size_t j = it.CurrentRange()->GetStart(); j < it.CurrentRange()->GetEnd(); ++j) {
+          if (liveness_of_register->IsBitSet(j)) {
+            if (current->IsUsingInputRegister() && current->CanUseInputRegister()) {
+              continue;
+            }
+            if (log_fatal_on_failure) {
+              std::ostringstream message;
+              message << "Register conflict at " << j << " ";
+              if (defined_by != nullptr) {
+                message << "(" << defined_by->DebugName() << ")";
+              }
+              message << "for ";
+              if (processing_core_registers) {
+                codegen.DumpCoreRegister(message, current->GetRegister());
+              } else {
+                codegen.DumpFloatingPointRegister(message, current->GetRegister());
+              }
+              LOG(FATAL) << message.str();
+            } else {
+              return false;
+            }
+          } else {
+            liveness_of_register->SetBit(j);
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) {
+  DCHECK_GE(position, interval->GetStart());
+  DCHECK(!interval->IsDeadAt(position));
+  if (position == interval->GetStart()) {
+    // Spill slot will be allocated when handling `interval` again.
+    interval->ClearRegister();
+    if (interval->HasHighInterval()) {
+      interval->GetHighInterval()->ClearRegister();
+    } else if (interval->HasLowInterval()) {
+      interval->GetLowInterval()->ClearRegister();
+    }
+    return interval;
+  } else {
+    LiveInterval* new_interval = interval->SplitAt(position);
+    if (interval->HasHighInterval()) {
+      LiveInterval* high = interval->GetHighInterval()->SplitAt(position);
+      new_interval->SetHighInterval(high);
+      high->SetLowInterval(new_interval);
+    } else if (interval->HasLowInterval()) {
+      LiveInterval* low = interval->GetLowInterval()->SplitAt(position);
+      new_interval->SetLowInterval(low);
+      low->SetHighInterval(new_interval);
+    }
+    return new_interval;
+  }
+}
+
+LiveInterval* RegisterAllocator::SplitBetween(LiveInterval* interval, size_t from, size_t to) {
+  HBasicBlock* block_from = liveness_.GetBlockFromPosition(from / 2);
+  HBasicBlock* block_to = liveness_.GetBlockFromPosition(to / 2);
+  DCHECK(block_from != nullptr);
+  DCHECK(block_to != nullptr);
+
+  // Both locations are in the same block. We split at the given location.
+  if (block_from == block_to) {
+    return Split(interval, to);
+  }
+
+  /*
+   * Non-linear control flow will force moves at every branch instruction to the new location.
+   * To avoid having all branches doing the moves, we find the next non-linear position and
+   * split the interval at this position. Take the following example (block number is the linear
+   * order position):
+   *
+   *     B1
+   *    /  \
+   *   B2  B3
+   *    \  /
+   *     B4
+   *
+   * B2 needs to split an interval, whose next use is in B4. If we were to split at the
+   * beginning of B4, B3 would need to do a move between B3 and B4 to ensure the interval
+   * is now in the correct location. It makes performance worse if the interval is spilled
+   * and both B2 and B3 need to reload it before entering B4.
+   *
+   * By splitting at B3, we give a chance to the register allocator to allocate the
+   * interval to the same register as in B1, and therefore avoid doing any
+   * moves in B3.
+   */
+  if (block_from->GetDominator() != nullptr) {
+    for (HBasicBlock* dominated : block_from->GetDominator()->GetDominatedBlocks()) {
+      size_t position = dominated->GetLifetimeStart();
+      if ((position > from) && (block_to->GetLifetimeStart() > position)) {
+        // Even if we found a better block, we continue iterating in case
+        // a dominated block is closer.
+        // Note that dominated blocks are not sorted in liveness order.
+        block_to = dominated;
+        DCHECK_NE(block_to, block_from);
+      }
+    }
+  }
+
+  // If `to` is in a loop, find the outermost loop header which does not contain `from`.
+  for (HLoopInformationOutwardIterator it(*block_to); !it.Done(); it.Advance()) {
+    HBasicBlock* header = it.Current()->GetHeader();
+    if (block_from->GetLifetimeStart() >= header->GetLifetimeStart()) {
+      break;
+    }
+    block_to = header;
+  }
+
+  // Split at the start of the found block, to piggy back on existing moves
+  // due to resolution of non-linear control flow (see `ConnectSplitSiblings`).
+  return Split(interval, block_to->GetLifetimeStart());
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
new file mode 100644
index 0000000..729eede
--- /dev/null
+++ b/compiler/optimizing/register_allocator.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_H_
+#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_H_
+
+#include "arch/instruction_set.h"
+#include "base/arena_containers.h"
+#include "base/arena_object.h"
+#include "base/macros.h"
+#include "primitive.h"
+
+namespace art {
+
+class CodeGenerator;
+class HBasicBlock;
+class HGraph;
+class HInstruction;
+class HParallelMove;
+class LiveInterval;
+class Location;
+class SsaLivenessAnalysis;
+
+/**
+ * Base class for any register allocator.
+ */
+class RegisterAllocator : public ArenaObject<kArenaAllocRegisterAllocator> {
+ public:
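+  // The register allocation strategies that Create() can instantiate.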
+  enum Strategy {
+    kRegisterAllocatorLinearScan
+  };
+
+  static constexpr Strategy kRegisterAllocatorDefault = kRegisterAllocatorLinearScan;
+
+  static RegisterAllocator* Create(ArenaAllocator* allocator,
+                                   CodeGenerator* codegen,
+                                   const SsaLivenessAnalysis& analysis,
+                                   Strategy strategy = kRegisterAllocatorDefault);
+
+  virtual ~RegisterAllocator() = default;
+
+  // Main entry point for the register allocator. Given the liveness analysis,
+  // allocates registers to live intervals.
+  virtual void AllocateRegisters() = 0;
+
+  // Validate that the register allocator did not allocate the same register to
+  // intervals that intersect each other. Returns false if validation fails.
+  virtual bool Validate(bool log_fatal_on_failure) = 0;
+
+  static bool CanAllocateRegistersFor(const HGraph& graph,
+                                      InstructionSet instruction_set);
+
+  // Verifies that live intervals do not conflict. Used by unit tests.
+  static bool ValidateIntervals(const ArenaVector<LiveInterval*>& intervals,
+                                size_t number_of_spill_slots,
+                                size_t number_of_out_slots,
+                                const CodeGenerator& codegen,
+                                ArenaAllocator* allocator,
+                                bool processing_core_registers,
+                                bool log_fatal_on_failure);
+
+  static constexpr const char* kRegisterAllocatorPassName = "register";
+
+ protected:
+  RegisterAllocator(ArenaAllocator* allocator,
+                    CodeGenerator* codegen,
+                    const SsaLivenessAnalysis& analysis);
+
+  // Split `interval` at the position `position`. The new interval starts at `position`.
+  // If `position` is at the start of `interval`, returns `interval` with its
+  // register location(s) cleared.
+  static LiveInterval* Split(LiveInterval* interval, size_t position);
+
+  // Split `interval` at a position between `from` and `to`. The method will try
+  // to find an optimal split position.
+  LiveInterval* SplitBetween(LiveInterval* interval, size_t from, size_t to);
+
+  ArenaAllocator* const allocator_;
+  CodeGenerator* const codegen_;
+  const SsaLivenessAnalysis& liveness_;
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_H_
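For reference, the intended call sequence through the new factory, as exercised by register_allocator_test.cc later in this change (a sketch: `graph`, `allocator` and `codegen` are assumed to be an already prepared HGraph, ArenaAllocator and CodeGenerator):

    SsaLivenessAnalysis liveness(graph, &codegen);
    liveness.Analyze();
    // kRegisterAllocatorDefault currently resolves to kRegisterAllocatorLinearScan.
    RegisterAllocator* register_allocator =
        RegisterAllocator::Create(&allocator, &codegen, liveness);
    register_allocator->AllocateRegisters();
    bool valid = register_allocator->Validate(/* log_fatal_on_failure */ false);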
diff --git a/compiler/optimizing/register_allocator_linear_scan.cc b/compiler/optimizing/register_allocator_linear_scan.cc
index c1797b0..a9151ba 100644
--- a/compiler/optimizing/register_allocator_linear_scan.cc
+++ b/compiler/optimizing/register_allocator_linear_scan.cc
@@ -38,12 +38,10 @@
   return GetHighForLowRegister(low->GetRegister()) != low->GetHighInterval()->GetRegister();
 }
 
-RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator,
-                                     CodeGenerator* codegen,
-                                     const SsaLivenessAnalysis& liveness)
-      : allocator_(allocator),
-        codegen_(codegen),
-        liveness_(liveness),
+RegisterAllocatorLinearScan::RegisterAllocatorLinearScan(ArenaAllocator* allocator,
+                                                         CodeGenerator* codegen,
+                                                         const SsaLivenessAnalysis& liveness)
+      : RegisterAllocator(allocator, codegen, liveness),
         unhandled_core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)),
         unhandled_fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)),
         unhandled_(nullptr),
@@ -83,17 +81,6 @@
       codegen->GetGraph()->GetMaximumNumberOfOutVRegs();
 }
 
-bool RegisterAllocator::CanAllocateRegistersFor(const HGraph& graph ATTRIBUTE_UNUSED,
-                                                InstructionSet instruction_set) {
-  return instruction_set == kArm
-      || instruction_set == kArm64
-      || instruction_set == kMips
-      || instruction_set == kMips64
-      || instruction_set == kThumb2
-      || instruction_set == kX86
-      || instruction_set == kX86_64;
-}
-
 static bool ShouldProcess(bool processing_core_registers, LiveInterval* interval) {
   if (interval == nullptr) return false;
   bool is_core_register = (interval->GetType() != Primitive::kPrimDouble)
@@ -101,7 +88,7 @@
   return processing_core_registers == is_core_register;
 }
 
-void RegisterAllocator::AllocateRegisters() {
+void RegisterAllocatorLinearScan::AllocateRegisters() {
   AllocateRegistersInternal();
   RegisterAllocationResolver(allocator_, codegen_, liveness_)
       .Resolve(maximum_number_of_live_core_registers_,
@@ -141,7 +128,7 @@
   }
 }
 
-void RegisterAllocator::BlockRegister(Location location, size_t start, size_t end) {
+void RegisterAllocatorLinearScan::BlockRegister(Location location, size_t start, size_t end) {
   int reg = location.reg();
   DCHECK(location.IsRegister() || location.IsFpuRegister());
   LiveInterval* interval = location.IsRegister()
@@ -162,7 +149,7 @@
   interval->AddRange(start, end);
 }
 
-void RegisterAllocator::BlockRegisters(size_t start, size_t end, bool caller_save_only) {
+void RegisterAllocatorLinearScan::BlockRegisters(size_t start, size_t end, bool caller_save_only) {
   for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) {
     if (!caller_save_only || !codegen_->IsCoreCalleeSaveRegister(i)) {
       BlockRegister(Location::RegisterLocation(i), start, end);
@@ -175,7 +162,7 @@
   }
 }
 
-void RegisterAllocator::AllocateRegistersInternal() {
+void RegisterAllocatorLinearScan::AllocateRegistersInternal() {
   // Iterate post-order, to ensure the list is sorted, and the last added interval
   // is the one with the lowest start position.
   for (HLinearPostOrderIterator it(*codegen_->GetGraph()); !it.Done(); it.Advance()) {
@@ -235,7 +222,7 @@
   LinearScan();
 }
 
-void RegisterAllocator::ProcessInstruction(HInstruction* instruction) {
+void RegisterAllocatorLinearScan::ProcessInstruction(HInstruction* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   size_t position = instruction->GetLifetimePosition();
 
@@ -452,7 +439,7 @@
   DISALLOW_COPY_AND_ASSIGN(AllRangesIterator);
 };
 
-bool RegisterAllocator::ValidateInternal(bool log_fatal_on_failure) const {
+bool RegisterAllocatorLinearScan::ValidateInternal(bool log_fatal_on_failure) const {
   // To simplify unit testing, we eagerly create the array of intervals, and
   // call the helper method.
   ArenaVector<LiveInterval*> intervals(allocator_->Adapter(kArenaAllocRegisterAllocatorValidate));
@@ -482,99 +469,7 @@
                            allocator_, processing_core_registers_, log_fatal_on_failure);
 }
 
-bool RegisterAllocator::ValidateIntervals(const ArenaVector<LiveInterval*>& intervals,
-                                          size_t number_of_spill_slots,
-                                          size_t number_of_out_slots,
-                                          const CodeGenerator& codegen,
-                                          ArenaAllocator* allocator,
-                                          bool processing_core_registers,
-                                          bool log_fatal_on_failure) {
-  size_t number_of_registers = processing_core_registers
-      ? codegen.GetNumberOfCoreRegisters()
-      : codegen.GetNumberOfFloatingPointRegisters();
-  ArenaVector<ArenaBitVector*> liveness_of_values(
-      allocator->Adapter(kArenaAllocRegisterAllocatorValidate));
-  liveness_of_values.reserve(number_of_registers + number_of_spill_slots);
-
-  size_t max_end = 0u;
-  for (LiveInterval* start_interval : intervals) {
-    for (AllRangesIterator it(start_interval); !it.Done(); it.Advance()) {
-      max_end = std::max(max_end, it.CurrentRange()->GetEnd());
-    }
-  }
-
-  // Allocate a bit vector per register. A live interval that has a register
-  // allocated will populate the associated bit vector based on its live ranges.
-  for (size_t i = 0; i < number_of_registers + number_of_spill_slots; ++i) {
-    liveness_of_values.push_back(
-        ArenaBitVector::Create(allocator, max_end, false, kArenaAllocRegisterAllocatorValidate));
-  }
-
-  for (LiveInterval* start_interval : intervals) {
-    for (AllRangesIterator it(start_interval); !it.Done(); it.Advance()) {
-      LiveInterval* current = it.CurrentInterval();
-      HInstruction* defined_by = current->GetParent()->GetDefinedBy();
-      if (current->GetParent()->HasSpillSlot()
-           // Parameters and current method have their own stack slot.
-           && !(defined_by != nullptr && (defined_by->IsParameterValue()
-                                          || defined_by->IsCurrentMethod()))) {
-        BitVector* liveness_of_spill_slot = liveness_of_values[number_of_registers
-            + current->GetParent()->GetSpillSlot() / kVRegSize
-            - number_of_out_slots];
-        for (size_t j = it.CurrentRange()->GetStart(); j < it.CurrentRange()->GetEnd(); ++j) {
-          if (liveness_of_spill_slot->IsBitSet(j)) {
-            if (log_fatal_on_failure) {
-              std::ostringstream message;
-              message << "Spill slot conflict at " << j;
-              LOG(FATAL) << message.str();
-            } else {
-              return false;
-            }
-          } else {
-            liveness_of_spill_slot->SetBit(j);
-          }
-        }
-      }
-
-      if (current->HasRegister()) {
-        if (kIsDebugBuild && log_fatal_on_failure && !current->IsFixed()) {
-          // Only check when an error is fatal. Only tests code ask for non-fatal failures
-          // and test code may not properly fill the right information to the code generator.
-          CHECK(codegen.HasAllocatedRegister(processing_core_registers, current->GetRegister()));
-        }
-        BitVector* liveness_of_register = liveness_of_values[current->GetRegister()];
-        for (size_t j = it.CurrentRange()->GetStart(); j < it.CurrentRange()->GetEnd(); ++j) {
-          if (liveness_of_register->IsBitSet(j)) {
-            if (current->IsUsingInputRegister() && current->CanUseInputRegister()) {
-              continue;
-            }
-            if (log_fatal_on_failure) {
-              std::ostringstream message;
-              message << "Register conflict at " << j << " ";
-              if (defined_by != nullptr) {
-                message << "(" << defined_by->DebugName() << ")";
-              }
-              message << "for ";
-              if (processing_core_registers) {
-                codegen.DumpCoreRegister(message, current->GetRegister());
-              } else {
-                codegen.DumpFloatingPointRegister(message, current->GetRegister());
-              }
-              LOG(FATAL) << message.str();
-            } else {
-              return false;
-            }
-          } else {
-            liveness_of_register->SetBit(j);
-          }
-        }
-      }
-    }
-  }
-  return true;
-}
-
-void RegisterAllocator::DumpInterval(std::ostream& stream, LiveInterval* interval) const {
+void RegisterAllocatorLinearScan::DumpInterval(std::ostream& stream, LiveInterval* interval) const {
   interval->Dump(stream);
   stream << ": ";
   if (interval->HasRegister()) {
@@ -589,7 +484,7 @@
   stream << std::endl;
 }
 
-void RegisterAllocator::DumpAllIntervals(std::ostream& stream) const {
+void RegisterAllocatorLinearScan::DumpAllIntervals(std::ostream& stream) const {
   stream << "inactive: " << std::endl;
   for (LiveInterval* inactive_interval : inactive_) {
     DumpInterval(stream, inactive_interval);
@@ -611,7 +506,7 @@
 }
 
 // By the book implementation of a linear scan register allocator.
-void RegisterAllocator::LinearScan() {
+void RegisterAllocatorLinearScan::LinearScan() {
   while (!unhandled_->empty()) {
     // (1) Remove interval with the lowest start position from unhandled.
     LiveInterval* current = unhandled_->back();
@@ -742,7 +637,7 @@
 
 // Find a free register. If multiple are found, pick the register that
 // is free the longest.
-bool RegisterAllocator::TryAllocateFreeReg(LiveInterval* current) {
+bool RegisterAllocatorLinearScan::TryAllocateFreeReg(LiveInterval* current) {
   size_t* free_until = registers_array_;
 
   // First set all registers to be free.
@@ -865,13 +760,13 @@
   return true;
 }
 
-bool RegisterAllocator::IsBlocked(int reg) const {
+bool RegisterAllocatorLinearScan::IsBlocked(int reg) const {
   return processing_core_registers_
       ? blocked_core_registers_[reg]
       : blocked_fp_registers_[reg];
 }
 
-int RegisterAllocator::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const {
+int RegisterAllocatorLinearScan::FindAvailableRegisterPair(size_t* next_use, size_t starting_at) const {
   int reg = kNoRegister;
   // Pick the register pair that is used the last.
   for (size_t i = 0; i < number_of_registers_; ++i) {
@@ -896,13 +791,13 @@
   return reg;
 }
 
-bool RegisterAllocator::IsCallerSaveRegister(int reg) const {
+bool RegisterAllocatorLinearScan::IsCallerSaveRegister(int reg) const {
   return processing_core_registers_
       ? !codegen_->IsCoreCalleeSaveRegister(reg)
       : !codegen_->IsFloatingPointCalleeSaveRegister(reg);
 }
 
-int RegisterAllocator::FindAvailableRegister(size_t* next_use, LiveInterval* current) const {
+int RegisterAllocatorLinearScan::FindAvailableRegister(size_t* next_use, LiveInterval* current) const {
   // We special case intervals that do not span a safepoint to try to find a caller-save
   // register if one is available. We iterate from 0 to the number of registers,
   // so if there are caller-save registers available at the end, we continue the iteration.
@@ -965,9 +860,9 @@
   }
 }
 
-bool RegisterAllocator::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position,
-                                                                 size_t first_register_use,
-                                                                 size_t* next_use) {
+bool RegisterAllocatorLinearScan::TrySplitNonPairOrUnalignedPairIntervalAt(size_t position,
+                                                                           size_t first_register_use,
+                                                                           size_t* next_use) {
   for (auto it = active_.begin(), end = active_.end(); it != end; ++it) {
     LiveInterval* active = *it;
     DCHECK(active->HasRegister());
@@ -997,7 +892,7 @@
 // Find the register that is used the last, and spill the interval
 // that holds it. If the first use of `current` is after that register
 // we spill `current` instead.
-bool RegisterAllocator::AllocateBlockedReg(LiveInterval* current) {
+bool RegisterAllocatorLinearScan::AllocateBlockedReg(LiveInterval* current) {
   size_t first_register_use = current->FirstRegisterUse();
   if (current->HasRegister()) {
     DCHECK(current->IsHighInterval());
@@ -1180,7 +1075,7 @@
   }
 }
 
-void RegisterAllocator::AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval) {
+void RegisterAllocatorLinearScan::AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval) {
   DCHECK(!interval->IsFixed() && !interval->HasSpillSlot());
   size_t insert_at = 0;
   for (size_t i = array->size(); i > 0; --i) {
@@ -1209,93 +1104,7 @@
   }
 }
 
-LiveInterval* RegisterAllocator::SplitBetween(LiveInterval* interval, size_t from, size_t to) {
-  HBasicBlock* block_from = liveness_.GetBlockFromPosition(from / 2);
-  HBasicBlock* block_to = liveness_.GetBlockFromPosition(to / 2);
-  DCHECK(block_from != nullptr);
-  DCHECK(block_to != nullptr);
-
-  // Both locations are in the same block. We split at the given location.
-  if (block_from == block_to) {
-    return Split(interval, to);
-  }
-
-  /*
-   * Non-linear control flow will force moves at every branch instruction to the new location.
-   * To avoid having all branches doing the moves, we find the next non-linear position and
-   * split the interval at this position. Take the following example (block number is the linear
-   * order position):
-   *
-   *     B1
-   *    /  \
-   *   B2  B3
-   *    \  /
-   *     B4
-   *
-   * B2 needs to split an interval, whose next use is in B4. If we were to split at the
-   * beginning of B4, B3 would need to do a move between B3 and B4 to ensure the interval
-   * is now in the correct location. It makes performance worst if the interval is spilled
-   * and both B2 and B3 need to reload it before entering B4.
-   *
-   * By splitting at B3, we give a chance to the register allocator to allocate the
-   * interval to the same register as in B1, and therefore avoid doing any
-   * moves in B3.
-   */
-  if (block_from->GetDominator() != nullptr) {
-    for (HBasicBlock* dominated : block_from->GetDominator()->GetDominatedBlocks()) {
-      size_t position = dominated->GetLifetimeStart();
-      if ((position > from) && (block_to->GetLifetimeStart() > position)) {
-        // Even if we found a better block, we continue iterating in case
-        // a dominated block is closer.
-        // Note that dominated blocks are not sorted in liveness order.
-        block_to = dominated;
-        DCHECK_NE(block_to, block_from);
-      }
-    }
-  }
-
-  // If `to` is in a loop, find the outermost loop header which does not contain `from`.
-  for (HLoopInformationOutwardIterator it(*block_to); !it.Done(); it.Advance()) {
-    HBasicBlock* header = it.Current()->GetHeader();
-    if (block_from->GetLifetimeStart() >= header->GetLifetimeStart()) {
-      break;
-    }
-    block_to = header;
-  }
-
-  // Split at the start of the found block, to piggy back on existing moves
-  // due to resolution if non-linear control flow (see `ConnectSplitSiblings`).
-  return Split(interval, block_to->GetLifetimeStart());
-}
-
-LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) {
-  DCHECK_GE(position, interval->GetStart());
-  DCHECK(!interval->IsDeadAt(position));
-  if (position == interval->GetStart()) {
-    // Spill slot will be allocated when handling `interval` again.
-    interval->ClearRegister();
-    if (interval->HasHighInterval()) {
-      interval->GetHighInterval()->ClearRegister();
-    } else if (interval->HasLowInterval()) {
-      interval->GetLowInterval()->ClearRegister();
-    }
-    return interval;
-  } else {
-    LiveInterval* new_interval = interval->SplitAt(position);
-    if (interval->HasHighInterval()) {
-      LiveInterval* high = interval->GetHighInterval()->SplitAt(position);
-      new_interval->SetHighInterval(high);
-      high->SetLowInterval(new_interval);
-    } else if (interval->HasLowInterval()) {
-      LiveInterval* low = interval->GetLowInterval()->SplitAt(position);
-      new_interval->SetLowInterval(low);
-      low->SetHighInterval(new_interval);
-    }
-    return new_interval;
-  }
-}
-
-void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) {
+void RegisterAllocatorLinearScan::AllocateSpillSlotFor(LiveInterval* interval) {
   if (interval->IsHighInterval()) {
     // The low interval already took care of allocating the spill slot.
     DCHECK(!interval->GetLowInterval()->HasRegister());
@@ -1390,7 +1199,7 @@
   parent->SetSpillSlot(slot);
 }
 
-void RegisterAllocator::AllocateSpillSlotForCatchPhi(HPhi* phi) {
+void RegisterAllocatorLinearScan::AllocateSpillSlotForCatchPhi(HPhi* phi) {
   LiveInterval* interval = phi->GetLiveInterval();
 
   HInstruction* previous_phi = phi->GetPrevious();
diff --git a/compiler/optimizing/register_allocator_linear_scan.h b/compiler/optimizing/register_allocator_linear_scan.h
index f32a4db..b6e4f92 100644
--- a/compiler/optimizing/register_allocator_linear_scan.h
+++ b/compiler/optimizing/register_allocator_linear_scan.h
@@ -21,6 +21,7 @@
 #include "base/arena_containers.h"
 #include "base/macros.h"
 #include "primitive.h"
+#include "register_allocator.h"
 
 namespace art {
 
@@ -37,19 +38,15 @@
 /**
  * An implementation of a linear scan register allocator on an `HGraph` with SSA form.
  */
-class RegisterAllocator {
+class RegisterAllocatorLinearScan : public RegisterAllocator {
  public:
-  RegisterAllocator(ArenaAllocator* allocator,
-                    CodeGenerator* codegen,
-                    const SsaLivenessAnalysis& analysis);
+  RegisterAllocatorLinearScan(ArenaAllocator* allocator,
+                              CodeGenerator* codegen,
+                              const SsaLivenessAnalysis& analysis);
 
-  // Main entry point for the register allocator. Given the liveness analysis,
-  // allocates registers to live intervals.
-  void AllocateRegisters();
+  void AllocateRegisters() OVERRIDE;
 
-  // Validate that the register allocator did not allocate the same register to
-  // intervals that intersect each other. Returns false if it did not.
-  bool Validate(bool log_fatal_on_failure) {
+  bool Validate(bool log_fatal_on_failure) OVERRIDE {
     processing_core_registers_ = true;
     if (!ValidateInternal(log_fatal_on_failure)) {
       return false;
@@ -58,17 +55,6 @@
     return ValidateInternal(log_fatal_on_failure);
   }
 
-  // Helper method for validation. Used by unit testing.
-  static bool ValidateIntervals(const ArenaVector<LiveInterval*>& intervals,
-                                size_t number_of_spill_slots,
-                                size_t number_of_out_slots,
-                                const CodeGenerator& codegen,
-                                ArenaAllocator* allocator,
-                                bool processing_core_registers,
-                                bool log_fatal_on_failure);
-
-  static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set);
-
   size_t GetNumberOfSpillSlots() const {
     return int_spill_slots_.size()
         + long_spill_slots_.size()
@@ -77,8 +63,6 @@
         + catch_phi_spill_slots_;
   }
 
-  static constexpr const char* kRegisterAllocatorPassName = "register";
-
  private:
   // Main methods of the allocator.
   void LinearScan();
@@ -88,13 +72,6 @@
   // Add `interval` in the given sorted list.
   static void AddSorted(ArenaVector<LiveInterval*>* array, LiveInterval* interval);
 
-  // Split `interval` at the position `position`. The new interval starts at `position`.
-  LiveInterval* Split(LiveInterval* interval, size_t position);
-
-  // Split `interval` at a position between `from` and `to`. The method will try
-  // to find an optimal split position.
-  LiveInterval* SplitBetween(LiveInterval* interval, size_t from, size_t to);
-
   // Returns whether `reg` is blocked by the code generator.
   bool IsBlocked(int reg) const;
 
@@ -127,10 +104,6 @@
                                                 size_t first_register_use,
                                                 size_t* next_use);
 
-  ArenaAllocator* const allocator_;
-  CodeGenerator* const codegen_;
-  const SsaLivenessAnalysis& liveness_;
-
   // List of intervals for core registers that must be processed, ordered by start
   // position. Last entry is the interval that has the lowest start position.
   // This list is initially populated before doing the linear scan.
@@ -206,7 +179,7 @@
   ART_FRIEND_TEST(RegisterAllocatorTest, FreeUntil);
   ART_FRIEND_TEST(RegisterAllocatorTest, SpillInactive);
 
-  DISALLOW_COPY_AND_ASSIGN(RegisterAllocator);
+  DISALLOW_COPY_AND_ASSIGN(RegisterAllocatorLinearScan);
 };
 
 }  // namespace art
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
index 602a14c..cbb7b2f 100644
--- a/compiler/optimizing/register_allocator_test.cc
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -24,6 +24,7 @@
 #include "driver/compiler_options.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
+#include "register_allocator.h"
 #include "register_allocator_linear_scan.h"
 #include "ssa_liveness_analysis.h"
 #include "ssa_phi_elimination.h"
@@ -44,9 +45,9 @@
   x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(graph, &codegen);
   liveness.Analyze();
-  RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-  register_allocator.AllocateRegisters();
-  return register_allocator.Validate(false);
+  RegisterAllocator* register_allocator = RegisterAllocator::Create(&allocator, &codegen, liveness);
+  register_allocator->AllocateRegisters();
+  return register_allocator->Validate(false);
 }
 
 /**
@@ -295,9 +296,9 @@
   x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(graph, &codegen);
   liveness.Analyze();
-  RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-  register_allocator.AllocateRegisters();
-  ASSERT_TRUE(register_allocator.Validate(false));
+  RegisterAllocator* register_allocator = RegisterAllocator::Create(&allocator, &codegen, liveness);
+  register_allocator->AllocateRegisters();
+  ASSERT_TRUE(register_allocator->Validate(false));
 
   HBasicBlock* loop_header = graph->GetBlocks()[2];
   HPhi* phi = loop_header->GetFirstPhi()->AsPhi();
@@ -384,9 +385,9 @@
   x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(graph, &codegen);
   liveness.Analyze();
-  RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-  register_allocator.AllocateRegisters();
-  ASSERT_TRUE(register_allocator.Validate(false));
+  RegisterAllocator* register_allocator = RegisterAllocator::Create(&allocator, &codegen, liveness);
+  register_allocator->AllocateRegisters();
+  ASSERT_TRUE(register_allocator->Validate(false));
 }
 
 /**
@@ -408,7 +409,7 @@
   x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(graph, &codegen);
   liveness.Analyze();
-  RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+  RegisterAllocatorLinearScan register_allocator(&allocator, &codegen, liveness);
 
   // Add an artificial range to cover the temps that will be put in the unhandled list.
   LiveInterval* unhandled = graph->GetEntryBlock()->GetFirstInstruction()->GetLiveInterval();
@@ -541,8 +542,9 @@
     liveness.Analyze();
 
     // Check that the register allocator is deterministic.
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 0);
     ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 0);
@@ -560,8 +562,9 @@
     // Set the phi to a specific register, and check that the inputs get allocated
     // the same register.
     phi->GetLocations()->UpdateOut(Location::RegisterLocation(2));
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2);
     ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2);
@@ -579,8 +582,9 @@
     // Set input1 to a specific register, and check that the phi and other input get allocated
     // the same register.
     input1->GetLocations()->UpdateOut(Location::RegisterLocation(2));
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2);
     ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2);
@@ -598,8 +602,9 @@
     // Set input2 to a specific register, and check that the phi and other input get allocated
     // the same register.
     input2->GetLocations()->UpdateOut(Location::RegisterLocation(2));
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     ASSERT_EQ(input1->GetLiveInterval()->GetRegister(), 2);
     ASSERT_EQ(input2->GetLiveInterval()->GetRegister(), 2);
@@ -658,8 +663,9 @@
     SsaLivenessAnalysis liveness(graph, &codegen);
     liveness.Analyze();
 
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     // Sanity check that in normal conditions, the register should be hinted to 0 (EAX).
     ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 0);
@@ -677,8 +683,9 @@
     // Don't use SetInAt because we are overriding an already allocated location.
     ret->GetLocations()->inputs_[0] = Location::RegisterLocation(2);
 
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     ASSERT_EQ(field->GetLiveInterval()->GetRegister(), 2);
   }
@@ -726,8 +733,9 @@
     SsaLivenessAnalysis liveness(graph, &codegen);
     liveness.Analyze();
 
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     // Sanity check that in normal conditions, the registers are the same.
     ASSERT_EQ(first_sub->GetLiveInterval()->GetRegister(), 1);
@@ -748,8 +756,9 @@
     ASSERT_EQ(first_sub->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput);
     ASSERT_EQ(second_sub->GetLocations()->Out().GetPolicy(), Location::kSameAsFirstInput);
 
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     ASSERT_EQ(first_sub->GetLiveInterval()->GetRegister(), 2);
     ASSERT_EQ(second_sub->GetLiveInterval()->GetRegister(), 2);
@@ -795,8 +804,9 @@
     SsaLivenessAnalysis liveness(graph, &codegen);
     liveness.Analyze();
 
-    RegisterAllocator register_allocator(&allocator, &codegen, liveness);
-    register_allocator.AllocateRegisters();
+    RegisterAllocator* register_allocator =
+        RegisterAllocator::Create(&allocator, &codegen, liveness);
+    register_allocator->AllocateRegisters();
 
     // div on x86 requires its first input in eax and the output be the same as the first input.
     ASSERT_EQ(div->GetLiveInterval()->GetRegister(), 0);
@@ -892,7 +902,7 @@
     liveness.instructions_from_lifetime_position_.push_back(user);
   }
 
-  RegisterAllocator register_allocator(&allocator, &codegen, liveness);
+  RegisterAllocatorLinearScan register_allocator(&allocator, &codegen, liveness);
   register_allocator.unhandled_core_intervals_.push_back(fourth);
   register_allocator.unhandled_core_intervals_.push_back(third);
   register_allocator.unhandled_core_intervals_.push_back(second);
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index e48a164..966587d 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -33,7 +33,9 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
+extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg03(mirror::Object*);
@@ -119,7 +121,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = artReadBarrierMark;
+  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
   qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
   qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
   qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index f9c34f5..34d3158 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -672,6 +672,12 @@
     .endif
 .endm
 
+// Save rReg's value to [sp, #offset].
+.macro PUSH_REG rReg, offset
+    str \rReg, [sp, #\offset]       @ save rReg
+    .cfi_rel_offset \rReg, \offset
+.endm
+
     /*
      * Macro to insert read barrier, only used in art_quick_aput_obj.
      * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
@@ -1752,30 +1758,83 @@
     /*
      * Create a function `name` calling the ReadBarrier::Mark routine,
      * getting its argument and returning its result through register
-     * `reg`, thus following a non-standard runtime calling convention:
-     * - `reg` is used to pass the (sole) argument of this function
+     * `reg`, saving and restoring all caller-save registers.
+     *
+     * If `reg` is different from `r0`, the generated function follows a
+     * non-standard runtime calling convention:
+     * - register `reg` is used to pass the (sole) argument of this
+     *   function (instead of R0);
+     * - register `reg` is used to return the result of this function
      *   (instead of R0);
-     * - `reg` is used to return the result of this function (instead of R0);
      * - R0 is treated like a normal (non-argument) caller-save register;
      * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. same callee-save registers).
+     *   convention (e.g. standard callee-save registers are preserved).
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
-    push  {lr}                          @ save return address
-    .cfi_adjust_cfa_offset 4
-    .cfi_rel_offset lr, 0
-    sub   sp, #4                        @ push padding (native calling convention 8-byte alignment)
-    .cfi_adjust_cfa_offset 4
-    mov   r0, \reg                      @ pass arg1 - obj from `reg`
-    bl    artReadBarrierMark            @ artReadBarrierMark(obj)
-    mov   \reg, r0                      @ return result into `reg`
-    add   sp, #4                        @ pop padding
-    .cfi_adjust_cfa_offset -4
-    pop   {pc}                          @ return
+    push  {r0-r4, r9, r12, lr}          @ save return address and core caller-save registers
+    .cfi_adjust_cfa_offset 32
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r4, 16
+    .cfi_rel_offset r9, 20
+    .cfi_rel_offset r12, 24
+    .cfi_rel_offset lr, 28
+    vpush {s0-s15}                      @ save floating-point caller-save registers
+    .cfi_adjust_cfa_offset 64
+
+    .ifnc \reg, r0
+      mov   r0, \reg                    @ pass arg1 - obj from `reg`
+    .endif
+    bl    artReadBarrierMark            @ r0 <- artReadBarrierMark(obj)
+
+    vpop {s0-s15}                       @ restore floating-point registers
+    .cfi_adjust_cfa_offset -64
+    @ If `reg` is one of the core registers saved above, write the
+    @ result to its corresponding stack slot; the "pop" instruction
+    @ below will then restore it into `reg`. Otherwise, move the result
+    @ into `reg` directly.
+    @
+    @ (Note that writing the result to `reg`'s stack slot overwrites the
+    @ value stored there by the "push" instruction above. That is fine:
+    @ in that case `reg` holds no live value, since it is used to pass
+    @ the argument and return the result of this function.)
+    .ifc \reg, r0
+      PUSH_REG r0, 0                    @ copy result to r0's stack location
+    .else
+      .ifc \reg, r1
+        PUSH_REG r0, 4                  @ copy result to r1's stack location
+      .else
+        .ifc \reg, r2
+          PUSH_REG r0, 8                @ copy result to r2's stack location
+        .else
+          .ifc \reg, r3
+            PUSH_REG r0, 12             @ copy result to r3's stack location
+          .else
+            .ifc \reg, r4
+              PUSH_REG r0, 16           @ copy result to r4's stack location
+            .else
+              .ifc \reg, r9
+                PUSH_REG r0, 20         @ copy result to r9's stack location
+              .else
+                .ifc \reg, r12
+                  PUSH_REG r0, 24       @ copy result to r12's stack location
+                .else
+                  mov   \reg, r0        @ return result into `reg`
+                .endif
+              .endif
+            .endif
+          .endif
+        .endif
+      .endif
+    .endif
+    pop   {r0-r4, r9, r12, pc}          @ restore caller-save registers and return
 END \name
 .endm
 
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, r0
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, r1
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, r2
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, r3
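The nested .ifc blocks above are resolved at assembly time; expressed as a plain C++ function (a sketch, hypothetical name), they select the stack slot that the closing "pop {r0-r4, r9, r12, pc}" will reload into `reg`:

    #include <string>

    // Offsets follow the order of "push {r0-r4, r9, r12, lr}": r0 at
    // [sp, #0], r1 at [sp, #4], ..., r12 at [sp, #24], lr at [sp, #28].
    // -1 means `reg` is not in the pushed set, so the macro falls back
    // to a plain "mov reg, r0".
    int ResultSlotOffset(const std::string& reg) {
      if (reg == "r0")  return 0;
      if (reg == "r1")  return 4;
      if (reg == "r2")  return 8;
      if (reg == "r3")  return 12;
      if (reg == "r4")  return 16;
      if (reg == "r9")  return 20;
      if (reg == "r12") return 24;
      return -1;
    }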
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 5385a2f..2e5f5ad 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -33,7 +33,9 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
+extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg03(mirror::Object*);
@@ -122,7 +124,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = artReadBarrierMark;
+  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
   qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
   qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
   qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index c893e77..6173ae7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1253,6 +1253,22 @@
     .endif
 .endm
 
+// Restore xReg1's value from [sp, #offset] if xReg1 is not the same as xExclude.
+// Restore xReg2's value from [sp, #(offset + 8)] if xReg2 is not the same as xExclude.
+.macro POP_REGS_NE xReg1, xReg2, offset, xExclude
+    .ifc \xReg1, \xExclude
+        ldr \xReg2, [sp, #(\offset + 8)]        // restore xReg2
+    .else
+        .ifc \xReg2, \xExclude
+            ldr \xReg1, [sp, #\offset]          // restore xReg1
+        .else
+            ldp \xReg1, \xReg2, [sp, #\offset]  // restore xReg1 and xReg2
+        .endif
+    .endif
+    .cfi_restore \xReg1
+    .cfi_restore \xReg2
+.endm
+
     /*
      * Macro to insert read barrier, only used in art_quick_aput_obj.
      * xDest, wDest and xObj are registers, offset is a defined literal such as
@@ -2222,56 +2238,148 @@
 
     /*
      * Create a function `name` calling the ReadBarrier::Mark routine,
-     * getting its argument and returning its result through register
-     * `reg`, thus following a non-standard runtime calling convention:
-     * - `reg` is used to pass the (sole) argument of this function
+     * getting its argument and returning its result through W register
+     * `wreg` (corresponding to X register `xreg`), saving and restoring
+     * all caller-save registers.
+     *
+     * If `wreg` is different from `w0`, the generated function follows a
+     * non-standard runtime calling convention:
+     * - register `wreg` is used to pass the (sole) argument of this
+     *   function (instead of W0);
+     * - register `wreg` is used to return the result of this function
      *   (instead of W0);
-     * - `reg` is used to return the result of this function (instead of W0);
      * - W0 is treated like a normal (non-argument) caller-save register;
      * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. same callee-save registers).
+     *   convention (e.g. standard callee-save registers are preserved).
      */
-.macro READ_BARRIER_MARK_REG name, reg
+.macro READ_BARRIER_MARK_REG name, wreg, xreg
 ENTRY \name
-    str   xLR, [sp, #-16]!              // Save return address and add padding (16B align stack).
-    .cfi_adjust_cfa_offset 16
-    .cfi_rel_offset x30, 0
-    mov   w0, \reg                      // Pass arg1 - obj from `reg`
+    /*
+     * Allocate 46 stack slots * 8 = 368 bytes:
+     * - 20 slots for core registers X0-X19
+     * - 24 slots for floating-point registers D0-D7 and D16-D31
+     * -  1 slot for return address register XLR
+     * -  1 padding slot for 16-byte stack alignment
+     */
+    // Save all potentially live caller-save core registers.
+    stp   x0, x1,   [sp, #-368]!
+    .cfi_adjust_cfa_offset 368
+    .cfi_rel_offset x0, 0
+    .cfi_rel_offset x1, 8
+    stp   x2, x3,   [sp, #16]
+    .cfi_rel_offset x2, 16
+    .cfi_rel_offset x3, 24
+    stp   x4, x5,   [sp, #32]
+    .cfi_rel_offset x4, 32
+    .cfi_rel_offset x5, 40
+    stp   x6, x7,   [sp, #48]
+    .cfi_rel_offset x6, 48
+    .cfi_rel_offset x7, 56
+    stp   x8, x9,   [sp, #64]
+    .cfi_rel_offset x8, 64
+    .cfi_rel_offset x9, 72
+    stp   x10, x11, [sp, #80]
+    .cfi_rel_offset x10, 80
+    .cfi_rel_offset x11, 88
+    stp   x12, x13, [sp, #96]
+    .cfi_rel_offset x12, 96
+    .cfi_rel_offset x13, 104
+    stp   x14, x15, [sp, #112]
+    .cfi_rel_offset x14, 112
+    .cfi_rel_offset x15, 120
+    stp   x16, x17, [sp, #128]
+    .cfi_rel_offset x16, 128
+    .cfi_rel_offset x17, 136
+    stp   x18, x19, [sp, #144]
+    .cfi_rel_offset x18, 144
+    .cfi_rel_offset x19, 152
+    // Save all potentially live caller-save floating-point registers.
+    stp   d0, d1,   [sp, #160]
+    stp   d2, d3,   [sp, #176]
+    stp   d4, d5,   [sp, #192]
+    stp   d6, d7,   [sp, #208]
+    stp   d16, d17, [sp, #224]
+    stp   d18, d19, [sp, #240]
+    stp   d20, d21, [sp, #256]
+    stp   d22, d23, [sp, #272]
+    stp   d24, d25, [sp, #288]
+    stp   d26, d27, [sp, #304]
+    stp   d28, d29, [sp, #320]
+    stp   d30, d31, [sp, #336]
+    // Save return address.
+    str   xLR,      [sp, #352]
+    .cfi_rel_offset x30, 352
+    // (sp + #360 is a padding slot)
+
+    .ifnc \wreg, w0
+      mov   w0, \wreg                   // Pass arg1 - obj from `wreg`
+    .endif
     bl    artReadBarrierMark            // artReadBarrierMark(obj)
-    mov   \reg, w0                      // Return result into `reg`
-    ldr   xLR, [sp], #16                // Restore return address and remove padding.
+    .ifnc \wreg, w0
+      mov   \wreg, w0                   // Return result into `wreg`
+    .endif
+
+    // Restore core regs, except `xreg`, as `wreg` is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REGS_NE x0, x1,   0,   \xreg
+    POP_REGS_NE x2, x3,   16,  \xreg
+    POP_REGS_NE x4, x5,   32,  \xreg
+    POP_REGS_NE x6, x7,   48,  \xreg
+    POP_REGS_NE x8, x9,   64,  \xreg
+    POP_REGS_NE x10, x11, 80,  \xreg
+    POP_REGS_NE x12, x13, 96,  \xreg
+    POP_REGS_NE x14, x15, 112, \xreg
+    POP_REGS_NE x16, x17, 128, \xreg
+    POP_REGS_NE x18, x19, 144, \xreg
+    // Restore floating-point registers.
+    ldp   d0, d1,   [sp, #160]
+    ldp   d2, d3,   [sp, #176]
+    ldp   d4, d5,   [sp, #192]
+    ldp   d6, d7,   [sp, #208]
+    ldp   d16, d17, [sp, #224]
+    ldp   d18, d19, [sp, #240]
+    ldp   d20, d21, [sp, #256]
+    ldp   d22, d23, [sp, #272]
+    ldp   d24, d25, [sp, #288]
+    ldp   d26, d27, [sp, #304]
+    ldp   d28, d29, [sp, #320]
+    ldp   d30, d31, [sp, #336]
+    // Restore return address and remove padding.
+    ldr   xLR,      [sp, #352]
     .cfi_restore x30
-    .cfi_adjust_cfa_offset -16
+    add sp, sp, #368
+    .cfi_adjust_cfa_offset -368
     ret
 END \name
 .endm
 
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, w1
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, w2
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, w3
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg04, w4
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, w5
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, w6
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, w7
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, w8
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, w9
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, w10
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, w11
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg20, w20
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg21, w21
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg22, w22
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg23, w23
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg24, w24
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg25, w25
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg26, w26
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg27, w27
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, w0,  x0
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, w1,  x1
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, w2,  x2
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, w3,  x3
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg04, w4,  x4
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, w5,  x5
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, w6,  x6
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, w7,  x7
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, w8,  x8
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, w9,  x9
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, w10, x10
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, w11, x11
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12, x12
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13, x13
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14, x14
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15, x15
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16, x16
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17, x17
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18, x18
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19, x19
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg20, w20, x20
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg21, w21, x21
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg22, w22, x22
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg23, w23, x23
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg24, w24, x24
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg25, w25, x25
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg26, w26, x26
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg27, w27, x27
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28, x28
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29, x29
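The frame-size arithmetic in the layout comment above can be checked mechanically (a C++ sketch restating the counts from that comment):

    #include <cstddef>

    // 20 core slots (X0-X19), 24 FP slots (D0-D7, D16-D31), one slot
    // for XLR and one padding slot, 8 bytes each.
    constexpr size_t kSlots = 20 + 24 + 1 + 1;
    static_assert(kSlots * 8 == 368, "READ_BARRIER_MARK_REG frame size");
    static_assert((kSlots * 8) % 16 == 0, "16-byte stack alignment");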
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index b19aa01..b02edb6 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -28,8 +28,8 @@
 namespace art {
 
 // Cast entrypoints.
-extern "C" uint32_t artIsAssignableFromCode(const mirror::Class* klass,
-                                            const mirror::Class* ref_class);
+extern "C" size_t artIsAssignableFromCode(const mirror::Class* klass,
+                                          const mirror::Class* ref_class);
 // Math entrypoints.
 extern int32_t CmpgDouble(double a, double b);
 extern int32_t CmplDouble(double a, double b);
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 8f13d58..4e9756c 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -31,7 +31,8 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index e75fecb..77e04e7 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1908,41 +1908,73 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
-// Call the ReadBarrierMark entry point, getting input and returning
-// result through EAX (register 0), following the standard runtime
-// calling convention.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg00
-    subl LITERAL(8), %esp            // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
-    PUSH eax                         // pass arg1 - obj
-    call SYMBOL(artReadBarrierMark)  // artReadBarrierMark(obj)
-    addl LITERAL(12), %esp           // pop argument and remove padding
-    CFI_ADJUST_CFA_OFFSET(-12)
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg00
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
-// `reg`, thus following a non-standard runtime calling convention:
-// - `reg` is used to pass the (sole) argument of this function
+// `reg`, saving and restoring all caller-save registers.
+//
+// If `reg` is different from `eax`, the generated function follows a
+// non-standard runtime calling convention:
+// - register `reg` is used to pass the (sole) argument of this function
 //   (instead of EAX);
-// - `reg` is used to return the result of this function (instead of EAX);
+// - register `reg` is used to return the result of this function
+//   (instead of EAX);
 // - EAX is treated like a normal (non-argument) caller-save register;
 // - everything else is the same as in the standard runtime calling
-//   convention (e.g. same callee-save registers).
+//   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
-    subl LITERAL(8), %esp            // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
+    // Save all potentially live caller-save core registers.
+    PUSH eax
+    PUSH ecx
+    PUSH edx
+    PUSH ebx
+    // 8-byte align the stack to improve (8-byte) XMM register saving and restoring,
+    // and create space for caller-save floating-point registers.
+    subl MACRO_LITERAL(4 + 8 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(4 + 8 * 8)
+    // Save all potentially live caller-save floating-point registers.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+    movsd %xmm4, 32(%esp)
+    movsd %xmm5, 40(%esp)
+    movsd %xmm6, 48(%esp)
+    movsd %xmm7, 56(%esp)
+
+    subl LITERAL(4), %esp            // alignment padding
+    CFI_ADJUST_CFA_OFFSET(4)
     PUSH RAW_VAR(reg)                // pass arg1 - obj from `reg`
     call SYMBOL(artReadBarrierMark)  // artReadBarrierMark(obj)
-    movl %eax, REG_VAR(reg)          // return result into `reg`
-    addl LITERAL(12), %esp           // pop argument and remove padding
-    CFI_ADJUST_CFA_OFFSET(-12)
+    .ifnc RAW_VAR(reg), eax
+      movl %eax, REG_VAR(reg)        // return result into `reg`
+    .endif
+    addl LITERAL(8), %esp            // pop argument and remove padding
+    CFI_ADJUST_CFA_OFFSET(-8)
+
+    // Restore floating-point registers.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+    movsd 32(%esp), %xmm4
+    movsd 40(%esp), %xmm5
+    movsd 48(%esp), %xmm6
+    movsd 56(%esp), %xmm7
+    // Remove floating-point registers and padding.
+    addl MACRO_LITERAL(8 * 8 + 4), %esp
+    CFI_ADJUST_CFA_OFFSET(-(8 * 8 + 4))
+    // Restore core regs, except `reg`, as it is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REG_NE ebx, RAW_VAR(reg)
+    POP_REG_NE edx, RAW_VAR(reg)
+    POP_REG_NE ecx, RAW_VAR(reg)
+    POP_REG_NE eax, RAW_VAR(reg)
     ret
     END_FUNCTION VAR(name)
 END_MACRO
 
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, eax
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, ecx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, edx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, ebx
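The x86 macro's stack bookkeeping admits a similar check (a sketch; it assumes ESP was 16-byte aligned before the call, per the native calling convention, which is what makes the XMM save area 8-byte aligned):

    #include <cstddef>

    // Depth below the pre-call stack pointer when the XMM stores execute:
    // return address + four core-register pushes + (padding + 8 XMM slots).
    constexpr size_t kRetAddr  = 4;
    constexpr size_t kCorePush = 4 * 4;      // PUSH eax/ecx/edx/ebx
    constexpr size_t kFpArea   = 4 + 8 * 8;  // subl MACRO_LITERAL(4 + 8 * 8)
    static_assert((kRetAddr + kCorePush + kFpArea) % 8 == 0,
                  "XMM save area is 8-byte aligned");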
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index cf0039c..c4e723c 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -52,7 +52,7 @@
 
 #define LITERAL(value) $value
 #if defined(__APPLE__)
-    #define MACRO_LITERAL(value) $$(value)
+    #define MACRO_LITERAL(value) $(value)
 #else
     #define MACRO_LITERAL(value) $value
 #endif
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index b566fb1..c2e3023 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -34,7 +34,8 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 496e6a8..784ec39 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1815,73 +1815,93 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
-// Call the ReadBarrier::Mark routine, getting argument and returning
-// result through RAX (register 0), thus following a non-standard
-// runtime calling convention:
-// - RAX is used to pass the (sole) argument of this function (instead
-//   of RDI);
-// - RDI is treated like a normal (non-argument) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention; in particular, RAX is still used to return the result
-//   of this function.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg00
-    SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq %rax, %rdi                 // Pass arg1 - obj from RAX.
-    call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg00
-
-// Call the ReadBarrier::Mark routine, getting argument and returning
-// result through RDI (register 7), thus following a non-standard
-// runtime calling convention:
-// - RDI is used to return the result of this function (instead of RAX);
-// - RAX is treated like a normal (non-result) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention; in particular, RDI is still used to pass the (sole)
-//   argument of this function.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg07
-    SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    movq %rax, %rdi                 // Return result into RDI.
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg07
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
-// `reg`, thus following a non-standard runtime calling convention:
-// - `reg` is used to pass the (sole) argument of this function (instead
-//   of RDI);
-// - `reg` is used to return the result of this function (instead of RAX);
-// - RDI is treated like a normal (non-argument) caller-save register;
-// - RAX is treated like a normal (non-result) caller-save register;
+// `reg`, saving and restoring all caller-save registers.
+//
+// The generated function follows a non-standard runtime calling
+// convention:
+// - register `reg` (which may be different from RDI) is used to pass
+//   the (sole) argument of this function;
+// - register `reg` (which may be different from RAX) is used to return
+//   the result of this function;
+// - if `reg` is different from `rdi`, RDI is treated like a normal
+//   (non-argument) caller-save register;
+// - if `reg` is different from `rax`, RAX is treated like a normal
+//   (non-result) caller-save register;
 // - everything else is the same as in the standard runtime calling
-//   convention (e.g. same callee-save registers).
+//   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Save all potentially live caller-save core registers.
+    PUSH rax
+    PUSH rcx
+    PUSH rdx
+    PUSH rsi
+    PUSH rdi
+    PUSH r8
+    PUSH r9
+    PUSH r10
+    PUSH r11
+    // Create space for caller-save floating-point registers.
+    subq MACRO_LITERAL(12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(12 * 8)
+    // Save all potentially live caller-save floating-point registers.
+    movq %xmm0, 0(%rsp)
+    movq %xmm1, 8(%rsp)
+    movq %xmm2, 16(%rsp)
+    movq %xmm3, 24(%rsp)
+    movq %xmm4, 32(%rsp)
+    movq %xmm5, 40(%rsp)
+    movq %xmm6, 48(%rsp)
+    movq %xmm7, 56(%rsp)
+    movq %xmm8, 64(%rsp)
+    movq %xmm9, 72(%rsp)
+    movq %xmm10, 80(%rsp)
+    movq %xmm11, 88(%rsp)
     SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq REG_VAR(reg), %rdi         // Pass arg1 - obj from `reg`.
+
+    .ifnc RAW_VAR(reg), rdi
+      movq REG_VAR(reg), %rdi       // Pass arg1 - obj from `reg`.
+    .endif
     call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    movq %rax, REG_VAR(reg)         // Return result into `reg`.
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
+    .ifnc RAW_VAR(reg), rax
+      movq %rax, REG_VAR(reg)       // Return result into `reg`.
+    .endif
+
     RESTORE_FP_CALLEE_SAVE_FRAME
+    // Restore floating-point registers.
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    movq 64(%rsp), %xmm8
+    movq 72(%rsp), %xmm9
+    movq 80(%rsp), %xmm10
+    movq 88(%rsp), %xmm11
+    // Remove floating-point registers.
+    addq MACRO_LITERAL(12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(12 * 8))
+    // Restore core regs, except `reg`, as it is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REG_NE r11, RAW_VAR(reg)
+    POP_REG_NE r10, RAW_VAR(reg)
+    POP_REG_NE r9, RAW_VAR(reg)
+    POP_REG_NE r8, RAW_VAR(reg)
+    POP_REG_NE rdi, RAW_VAR(reg)
+    POP_REG_NE rsi, RAW_VAR(reg)
+    POP_REG_NE rdx, RAW_VAR(reg)
+    POP_REG_NE rcx, RAW_VAR(reg)
+    POP_REG_NE rax, RAW_VAR(reg)
     ret
     END_FUNCTION VAR(name)
 END_MACRO
 
-// Note: art_quick_read_barrier_mark_reg00 is implemented above.
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, rax
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, rcx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, rdx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, rbx
@@ -1889,7 +1909,7 @@
 // cannot be used to pass arguments.
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, rbp
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, rsi
-// Note: art_quick_read_barrier_mark_reg07 is implemented above.
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, rdi
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, r8
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, r9
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, r10
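
The POP_REG_NE sequences in the epilogues above restore every saved core register except the one carrying the result, whose stack slot is simply discarded so the return value survives the restore. A minimal C++ model of that behaviour (illustrative names, not ART's real macros):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Model of `POP_REG_NE dst, skipped`: pop into dst unless dst is the
// register already holding the result; in that case only drop the slot.
void PopRegNe(std::map<std::string, uint64_t>& regs,
              std::vector<uint64_t>& stack,
              const std::string& dst,
              const std::string& skipped) {
  uint64_t slot = stack.back();
  stack.pop_back();
  if (dst != skipped) {
    regs[dst] = slot;
  }
}
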
diff --git a/runtime/art_method-inl.h b/runtime/art_method-inl.h
index 91258c7..32425d8 100644
--- a/runtime/art_method-inl.h
+++ b/runtime/art_method-inl.h
@@ -120,6 +120,10 @@
   return dex_method_index_;
 }
 
+inline uint32_t ArtMethod::GetImtIndex() {
+  return GetDexMethodIndex() % ImTable::kSize;
+}
+
 inline ArtMethod** ArtMethod::GetDexCacheResolvedMethods(size_t pointer_size) {
   return GetNativePointer<ArtMethod**>(DexCacheResolvedMethodsOffset(pointer_size),
                                        pointer_size);
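
GetImtIndex simply folds the dex method index into the fixed-size IMT. A standalone sketch of the computation (kImtSize is a stand-in for ImTable::kSize, a build-time constant in ART):

#include <cassert>
#include <cstdint>

constexpr uint32_t kImtSize = 64;  // stand-in for ImTable::kSize

uint32_t GetImtIndex(uint32_t dex_method_index) {
  return dex_method_index % kImtSize;
}

int main() {
  // Distinct methods can land in the same IMT slot; such collisions are
  // what the conflict tables below resolve.
  assert(GetImtIndex(7) == GetImtIndex(7 + kImtSize));
}
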
diff --git a/runtime/art_method.h b/runtime/art_method.h
index d75113e..1d14203 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -421,6 +421,8 @@
 
   ALWAYS_INLINE uint32_t GetDexMethodIndex() SHARED_REQUIRES(Locks::mutator_lock_);
 
+  ALWAYS_INLINE uint32_t GetImtIndex() SHARED_REQUIRES(Locks::mutator_lock_);
+
   void SetDexMethodIndex(uint32_t new_idx) {
     // Not called within a transaction.
     dex_method_index_ = new_idx;
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index cb97faa..d0dad64 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -6159,11 +6159,6 @@
   }
 }
 
-static inline uint32_t GetIMTIndex(ArtMethod* interface_method)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
-  return interface_method->GetDexMethodIndex() % ImTable::kSize;
-}
-
 ImtConflictTable* ClassLinker::CreateImtConflictTable(size_t count,
                                                       LinearAlloc* linear_alloc,
                                                       size_t image_pointer_size) {
@@ -6215,7 +6210,7 @@
       // or interface methods in the IMT here they will not create extra conflicts since we compare
       // names and signatures in SetIMTRef.
       ArtMethod* interface_method = interface->GetVirtualMethod(j, image_pointer_size_);
-      const uint32_t imt_index = GetIMTIndex(interface_method);
+      const uint32_t imt_index = interface_method->GetImtIndex();
 
       // There are only conflicts if the interface methods for an IMT slot don't all have
       // the same implementation method; keep track of this to avoid creating a conflict table in
@@ -6269,7 +6264,7 @@
         }
         DCHECK(implementation_method != nullptr);
         ArtMethod* interface_method = interface->GetVirtualMethod(j, image_pointer_size_);
-        const uint32_t imt_index = GetIMTIndex(interface_method);
+        const uint32_t imt_index = interface_method->GetImtIndex();
         if (!imt[imt_index]->IsRuntimeMethod() ||
             imt[imt_index] == unimplemented_method ||
             imt[imt_index] == imt_conflict_method) {
@@ -6675,7 +6670,7 @@
         auto* interface_method = iftable->GetInterface(i)->GetVirtualMethod(j, image_pointer_size_);
         MethodNameAndSignatureComparator interface_name_comparator(
             interface_method->GetInterfaceMethodIfProxy(image_pointer_size_));
-        uint32_t imt_index = GetIMTIndex(interface_method);
+        uint32_t imt_index = interface_method->GetImtIndex();
         ArtMethod** imt_ptr = &out_imt[imt_index];
         // For each method listed in the interface's method list, find the
         // matching method in our class's method list.  We want to favor the
@@ -7700,7 +7695,7 @@
   }
 
   if (is_static) {
-    resolved = mirror::Class::FindStaticField(self, klass, dex_cache.Get(), field_idx);
+    resolved = mirror::Class::FindStaticField(self, klass.Get(), dex_cache.Get(), field_idx);
   } else {
     resolved = klass->FindInstanceField(dex_cache.Get(), field_idx);
   }
diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h
index ab14655..7ecd595 100644
--- a/runtime/entrypoints/entrypoint_utils-inl.h
+++ b/runtime/entrypoints/entrypoint_utils-inl.h
@@ -19,7 +19,7 @@
 
 #include "entrypoint_utils.h"
 
-#include "art_method.h"
+#include "art_method-inl.h"
 #include "class_linker-inl.h"
 #include "common_throws.h"
 #include "dex_file.h"
@@ -600,7 +600,7 @@
       }
     }
     case kInterface: {
-      uint32_t imt_index = resolved_method->GetDexMethodIndex() % ImTable::kSize;
+      uint32_t imt_index = resolved_method->GetImtIndex();
       size_t pointer_size = class_linker->GetImagePointerSize();
       ArtMethod* imt_method = (*this_object)->GetClass()->GetImt(pointer_size)->
           Get(imt_index, pointer_size);
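
Switching the include from art_method.h to art_method-inl.h follows ART's -inl.h convention: declarations stay in the header, inline bodies live in the -inl header, and any translation unit that calls the inline function (here, GetImtIndex) must include the latter. A generic sketch of the pattern, with a hypothetical Foo rather than ART code:

// foo.h: declaration only.
struct Foo {
  int value;
  int Twice() const;  // ALWAYS_INLINE in ART headers
};

// foo-inl.h: the inline definition.
inline int Foo::Twice() const { return 2 * value; }

// A caller of Twice() must include foo-inl.h (as entrypoint_utils-inl.h
// now includes art_method-inl.h), or the call has no visible definition.
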
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 1152b94..49043f6 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -2170,8 +2170,7 @@
   if (LIKELY(interface_method->GetDexMethodIndex() != DexFile::kDexNoIndex)) {
     // If the dex cache already resolved the interface method, look whether we have
     // a match in the ImtConflictTable.
-    uint32_t imt_index = interface_method->GetDexMethodIndex();
-    ArtMethod* conflict_method = imt->Get(imt_index % ImTable::kSize, sizeof(void*));
+    ArtMethod* conflict_method = imt->Get(interface_method->GetImtIndex(), sizeof(void*));
     if (LIKELY(conflict_method->IsRuntimeMethod())) {
       ImtConflictTable* current_table = conflict_method->GetImtConflictTable(sizeof(void*));
       DCHECK(current_table != nullptr);
@@ -2222,8 +2221,8 @@
 
   // We arrive here if we have found an implementation, and it is not in the ImtConflictTable.
   // We create a new table with the new pair { interface_method, method }.
-  uint32_t imt_index = interface_method->GetDexMethodIndex();
-  ArtMethod* conflict_method = imt->Get(imt_index % ImTable::kSize, sizeof(void*));
+  uint32_t imt_index = interface_method->GetImtIndex();
+  ArtMethod* conflict_method = imt->Get(imt_index, sizeof(void*));
   if (conflict_method->IsRuntimeMethod()) {
     ArtMethod* new_conflict_method = Runtime::Current()->GetClassLinker()->AddMethodToConflictTable(
         cls.Get(),
@@ -2234,7 +2233,7 @@
     if (new_conflict_method != conflict_method) {
       // Update the IMT if we create a new conflict method. No fence needed here, as the
       // data is consistent.
-      imt->Set(imt_index % ImTable::kSize,
+      imt->Set(imt_index,
                new_conflict_method,
                sizeof(void*));
     }
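
The trampoline logic above indexes into the IMT once and consults a conflict table only when the slot holds the runtime conflict method. Sketched with hypothetical types (not ART's real API):

#include <cstdint>
#include <unordered_map>

struct Method {
  bool is_runtime_conflict_method;
  // For conflict methods: interface method -> implementation sharing this slot.
  std::unordered_map<const Method*, Method*> conflict_table;
};

Method* LookupImt(Method* imt[], uint32_t imt_size,
                  const Method* interface_method, uint32_t dex_method_index) {
  Method* slot = imt[dex_method_index % imt_size];  // GetImtIndex()
  if (!slot->is_runtime_conflict_method) {
    return slot;  // unambiguous slot: a single implementation
  }
  auto it = slot->conflict_table.find(interface_method);
  return it != slot->conflict_table.end() ? it->second : nullptr;  // slow path
}
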
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 3011112..4019a5b 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -153,6 +153,14 @@
   }
 }
 
+inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
+  // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
+  if (UNLIKELY(mark_from_read_barrier_measurements_)) {
+    return MarkFromReadBarrierWithMeasurements(from_ref);
+  }
+  return Mark(from_ref);
+}
+
 inline mirror::Object* ConcurrentCopying::GetFwdPtr(mirror::Object* from_ref) {
   DCHECK(region_space_->IsInFromSpace(from_ref));
   LockWord lw = from_ref->GetLockWord(false);
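
MarkFromReadBarrier keeps the common case cheap: the measurement flag is tested with UNLIKELY so the untimed path remains a direct call to Mark. The shape of the dispatch, as a self-contained sketch (Object and both bodies are placeholders):

struct Object {};

Object* Mark(Object* ref) { return ref; }                         // placeholder
Object* MarkWithMeasurements(Object* ref) { return Mark(ref); }   // placeholder

bool g_measure_slow_path = false;  // set once per GC cycle

Object* MarkFromReadBarrier(Object* ref) {
  if (__builtin_expect(g_measure_slow_path, false)) {  // UNLIKELY(...) in ART
    return MarkWithMeasurements(ref);
  }
  return Mark(ref);
}
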
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index b7b5aa0..d2d2f23 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -17,7 +17,9 @@
 #include "concurrent_copying.h"
 
 #include "art_field-inl.h"
+#include "base/histogram-inl.h"
 #include "base/stl_util.h"
+#include "base/systrace.h"
 #include "debugger.h"
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/space_bitmap-inl.h"
@@ -39,7 +41,9 @@
 
 static constexpr size_t kDefaultGcMarkStackSize = 2 * MB;
 
-ConcurrentCopying::ConcurrentCopying(Heap* heap, const std::string& name_prefix)
+ConcurrentCopying::ConcurrentCopying(Heap* heap,
+                                     const std::string& name_prefix,
+                                     bool measure_read_barrier_slow_path)
     : GarbageCollector(heap,
                        name_prefix + (name_prefix.empty() ? "" : " ") +
                        "concurrent copying + mark sweep"),
@@ -54,6 +58,14 @@
       heap_mark_bitmap_(nullptr), live_stack_freeze_size_(0), mark_stack_mode_(kMarkStackModeOff),
       weak_ref_access_enabled_(true),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
+      measure_read_barrier_slow_path_(measure_read_barrier_slow_path),
+      rb_slow_path_ns_(0),
+      rb_slow_path_count_(0),
+      rb_slow_path_count_gc_(0),
+      rb_slow_path_histogram_lock_("Read barrier histogram lock"),
+      rb_slow_path_time_histogram_("Mutator time in read barrier slow path", 500, 32),
+      rb_slow_path_count_total_(0),
+      rb_slow_path_count_gc_total_(0),
       rb_table_(heap_->GetReadBarrierTable()),
       force_evacuate_all_(false),
       immune_gray_stack_lock_("concurrent copying immune gray stack lock",
@@ -162,6 +174,14 @@
     MutexLock mu(Thread::Current(), mark_stack_lock_);
     CHECK(false_gray_stack_.empty());
   }
+
+  mark_from_read_barrier_measurements_ = measure_read_barrier_slow_path_;
+  if (measure_read_barrier_slow_path_) {
+    rb_slow_path_ns_.StoreRelaxed(0);
+    rb_slow_path_count_.StoreRelaxed(0);
+    rb_slow_path_count_gc_.StoreRelaxed(0);
+  }
+
   immune_spaces_.Reset();
   bytes_moved_.StoreRelaxed(0);
   objects_moved_.StoreRelaxed(0);
@@ -194,7 +214,7 @@
 }
 
 // Used to switch the thread roots of a thread from from-space refs to to-space refs.
-class ConcurrentCopying::ThreadFlipVisitor : public Closure {
+class ConcurrentCopying::ThreadFlipVisitor : public Closure, public RootVisitor {
  public:
   ThreadFlipVisitor(ConcurrentCopying* concurrent_copying, bool use_tlab)
       : concurrent_copying_(concurrent_copying), use_tlab_(use_tlab) {
@@ -221,10 +241,44 @@
       thread->RevokeThreadLocalAllocationStack();
     }
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    thread->VisitRoots(concurrent_copying_);
+    // We can use the non-CAS VisitRoots functions below because we update thread-local GC roots
+    // only.
+    thread->VisitRoots(this);
     concurrent_copying_->GetBarrier().Pass(self);
   }
 
+  void VisitRoots(mirror::Object*** roots,
+                  size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      SHARED_REQUIRES(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      mirror::Object** root = roots[i];
+      mirror::Object* ref = *root;
+      if (ref != nullptr) {
+        mirror::Object* to_ref = concurrent_copying_->Mark(ref);
+        if (to_ref != ref) {
+          *root = to_ref;
+        }
+      }
+    }
+  }
+
+  void VisitRoots(mirror::CompressedReference<mirror::Object>** roots,
+                  size_t count,
+                  const RootInfo& info ATTRIBUTE_UNUSED)
+      SHARED_REQUIRES(Locks::mutator_lock_) {
+    for (size_t i = 0; i < count; ++i) {
+      mirror::CompressedReference<mirror::Object>* const root = roots[i];
+      if (!root->IsNull()) {
+        mirror::Object* ref = root->AsMirrorPtr();
+        mirror::Object* to_ref = concurrent_copying_->Mark(ref);
+        if (to_ref != ref) {
+          root->Assign(to_ref);
+        }
+      }
+    }
+  }
+
  private:
   ConcurrentCopying* const concurrent_copying_;
   const bool use_tlab_;
@@ -1996,9 +2050,17 @@
     MutexLock mu(Thread::Current(), skipped_blocks_lock_);
     skipped_blocks_map_.clear();
   }
-  ReaderMutexLock mu(self, *Locks::mutator_lock_);
-  WriterMutexLock mu2(self, *Locks::heap_bitmap_lock_);
-  heap_->ClearMarkedObjects();
+  {
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    WriterMutexLock mu2(self, *Locks::heap_bitmap_lock_);
+    heap_->ClearMarkedObjects();
+  }
+  if (measure_read_barrier_slow_path_) {
+    MutexLock mu(self, rb_slow_path_histogram_lock_);
+    rb_slow_path_time_histogram_.AdjustAndAddValue(rb_slow_path_ns_.LoadRelaxed());
+    rb_slow_path_count_total_ += rb_slow_path_count_.LoadRelaxed();
+    rb_slow_path_count_gc_total_ += rb_slow_path_count_gc_.LoadRelaxed();
+  }
 }
 
 bool ConcurrentCopying::IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* field) {
@@ -2036,6 +2098,37 @@
   region_space_->RevokeAllThreadLocalBuffers();
 }
 
+mirror::Object* ConcurrentCopying::MarkFromReadBarrierWithMeasurements(mirror::Object* from_ref) {
+  if (Thread::Current() != thread_running_gc_) {
+    rb_slow_path_count_.FetchAndAddRelaxed(1u);
+  } else {
+    rb_slow_path_count_gc_.FetchAndAddRelaxed(1u);
+  }
+  ScopedTrace tr(__FUNCTION__);
+  const uint64_t start_time = measure_read_barrier_slow_path_ ? NanoTime() : 0u;
+  mirror::Object* ret = Mark(from_ref);
+  if (measure_read_barrier_slow_path_) {
+    rb_slow_path_ns_.FetchAndAddRelaxed(NanoTime() - start_time);
+  }
+  return ret;
+}
+
+void ConcurrentCopying::DumpPerformanceInfo(std::ostream& os) {
+  GarbageCollector::DumpPerformanceInfo(os);
+  MutexLock mu(Thread::Current(), rb_slow_path_histogram_lock_);
+  if (rb_slow_path_time_histogram_.SampleSize() > 0) {
+    Histogram<uint64_t>::CumulativeData cumulative_data;
+    rb_slow_path_time_histogram_.CreateHistogram(&cumulative_data);
+    rb_slow_path_time_histogram_.PrintConfidenceIntervals(os, 0.99, cumulative_data);
+  }
+  if (rb_slow_path_count_total_ > 0) {
+    os << "Slow path count " << rb_slow_path_count_total_ << "\n";
+  }
+  if (rb_slow_path_count_gc_total_ > 0) {
+    os << "GC slow path count " << rb_slow_path_count_gc_total_ << "\n";
+  }
+}
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
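
The measurement plumbing above is a two-level scheme: mutators bump relaxed atomics on the slow path, and FinishPhase folds the per-GC values into lock-protected lifetime totals (and the histogram). A condensed sketch of that flow, with std::atomic and std::mutex standing in for ART's Atomic<> and Mutex:

#include <atomic>
#include <chrono>
#include <cstdint>
#include <mutex>

std::atomic<uint64_t> rb_slow_path_ns{0};     // reset in InitializePhase
std::atomic<uint64_t> rb_slow_path_count{0};  // reset in InitializePhase

std::mutex histogram_lock;
uint64_t rb_slow_path_count_total = 0;        // GUARDED_BY(histogram_lock)

void OnSlowPath() {
  auto start = std::chrono::steady_clock::now();
  // ... Mark(from_ref) ...
  auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                std::chrono::steady_clock::now() - start).count();
  rb_slow_path_ns.fetch_add(ns, std::memory_order_relaxed);
  rb_slow_path_count.fetch_add(1, std::memory_order_relaxed);
}

void FinishPhase() {
  std::lock_guard<std::mutex> lock(histogram_lock);
  // Fold this GC cycle's samples into the lifetime totals.
  rb_slow_path_count_total +=
      rb_slow_path_count.load(std::memory_order_relaxed);
}
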
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 166a1f0..6a8d052 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -58,17 +58,24 @@
   // Enable verbose mode.
   static constexpr bool kVerboseMode = false;
 
-  ConcurrentCopying(Heap* heap, const std::string& name_prefix = "");
+  ConcurrentCopying(Heap* heap,
+                    const std::string& name_prefix = "",
+                    bool measure_read_barrier_slow_path = false);
   ~ConcurrentCopying();
 
   virtual void RunPhases() OVERRIDE
-      REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
+      REQUIRES(!immune_gray_stack_lock_,
+               !mark_stack_lock_,
+               !rb_slow_path_histogram_lock_,
+               !skipped_blocks_lock_);
   void InitializePhase() SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !immune_gray_stack_lock_);
   void MarkingPhase() SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   void ReclaimPhase() SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!mark_stack_lock_);
-  void FinishPhase() REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_);
+  void FinishPhase() REQUIRES(!mark_stack_lock_,
+                              !rb_slow_path_histogram_lock_,
+                              !skipped_blocks_lock_);
 
   void BindBitmaps() SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!Locks::heap_bitmap_lock_);
@@ -95,7 +102,11 @@
     return IsMarked(ref) == ref;
   }
   template<bool kGrayImmuneObject = true>
-  ALWAYS_INLINE mirror::Object* Mark(mirror::Object* from_ref) SHARED_REQUIRES(Locks::mutator_lock_)
+  ALWAYS_INLINE mirror::Object* Mark(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
+  ALWAYS_INLINE mirror::Object* MarkFromReadBarrier(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   bool IsMarking() const {
     return is_marking_;
@@ -203,6 +214,10 @@
       REQUIRES(!mark_stack_lock_);
   void ScanImmuneObject(mirror::Object* obj)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!mark_stack_lock_);
+  mirror::Object* MarkFromReadBarrierWithMeasurements(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
+  void DumpPerformanceInfo(std::ostream& os) OVERRIDE REQUIRES(!rb_slow_path_histogram_lock_);
 
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
@@ -251,6 +266,20 @@
   Atomic<size_t> to_space_bytes_skipped_;
   Atomic<size_t> to_space_objects_skipped_;
 
+  // If measure_read_barrier_slow_path_ is true, we measure the time spent in MarkFromReadBarrier
+  // and log the measurements.
+  bool measure_read_barrier_slow_path_;
+  // mark_from_read_barrier_measurements_ is set from measure_read_barrier_slow_path_ at the
+  // start of each GC cycle (see InitializePhase).
+  bool mark_from_read_barrier_measurements_;
+  Atomic<uint64_t> rb_slow_path_ns_;
+  Atomic<uint64_t> rb_slow_path_count_;
+  Atomic<uint64_t> rb_slow_path_count_gc_;
+  mutable Mutex rb_slow_path_histogram_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  Histogram<uint64_t> rb_slow_path_time_histogram_ GUARDED_BY(rb_slow_path_histogram_lock_);
+  uint64_t rb_slow_path_count_total_ GUARDED_BY(rb_slow_path_histogram_lock_);
+  uint64_t rb_slow_path_count_gc_total_ GUARDED_BY(rb_slow_path_histogram_lock_);
+
   accounting::ReadBarrierTable* rb_table_;
   bool force_evacuate_all_;  // True if all regions are evacuated.
   Atomic<bool> updated_all_immune_objects_;
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 580486a..e0b71a7 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -181,7 +181,7 @@
   void RecordFree(const ObjectBytePair& freed);
   // Record a free of large objects.
   void RecordFreeLOS(const ObjectBytePair& freed);
-  void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
+  virtual void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
 
   // Helper functions for querying if objects are marked. These are used for processing references,
   // and will be used for reading system weaks while the GC is running.
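
Making DumpPerformanceInfo virtual lets ConcurrentCopying extend the base report rather than replace it: its override (shown earlier in this change) calls the base implementation first, then appends the read-barrier statistics. In miniature, with placeholder output:

#include <ostream>

struct GarbageCollector {
  virtual ~GarbageCollector() {}
  virtual void DumpPerformanceInfo(std::ostream& os) { os << "pauses...\n"; }
};

struct ConcurrentCopying : GarbageCollector {
  void DumpPerformanceInfo(std::ostream& os) override {  // OVERRIDE in ART
    GarbageCollector::DumpPerformanceInfo(os);  // base stats first
    os << "Slow path count ...\n";              // then read-barrier extras
  }
};
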
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a6d62a9..6f4767e 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -157,6 +157,7 @@
            bool verify_pre_sweeping_rosalloc,
            bool verify_post_gc_rosalloc,
            bool gc_stress_mode,
+           bool measure_gc_performance,
            bool use_homogeneous_space_compaction_for_oom,
            uint64_t min_interval_homogeneous_space_compaction_by_oom)
     : non_moving_space_(nullptr),
@@ -599,7 +600,9 @@
       garbage_collectors_.push_back(semi_space_collector_);
     }
     if (MayUseCollector(kCollectorTypeCC)) {
-      concurrent_copying_collector_ = new collector::ConcurrentCopying(this);
+      concurrent_copying_collector_ = new collector::ConcurrentCopying(this,
+                                                                       "",
+                                                                       measure_gc_performance);
       garbage_collectors_.push_back(concurrent_copying_collector_);
     }
     if (MayUseCollector(kCollectorTypeMC)) {
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 6fb048a..bb0d11a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -182,6 +182,7 @@
        bool verify_pre_sweeping_rosalloc,
        bool verify_post_gc_rosalloc,
        bool gc_stress_mode,
+       bool measure_gc_performance,
        bool use_homogeneous_space_compaction,
        uint64_t min_interval_homogeneous_space_compaction_by_oom);
 
diff --git a/runtime/interpreter/mterp/arm64/fbinop2addr.S b/runtime/interpreter/mterp/arm64/fbinop2addr.S
index 0d57cbf..04236ad 100644
--- a/runtime/interpreter/mterp/arm64/fbinop2addr.S
+++ b/runtime/interpreter/mterp/arm64/fbinop2addr.S
@@ -7,8 +7,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     $instr                              // s2<- op
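
This and the following mterp changes replace the two-instruction lsr-then-and sequence with a single ubfx, which extracts an unsigned bitfield in one step. Its effect, modeled in C++:

#include <cassert>
#include <cstdint>

// What `ubfx wD, wS, #lsb, #width` computes.
uint32_t Ubfx(uint32_t src, unsigned lsb, unsigned width) {
  return (src >> lsb) & ((1u << width) - 1u);
}

int main() {
  uint32_t winst = 0x1A2B;             // hypothetical instruction word
  uint32_t a_old = (winst >> 8) & 15;  // lsr w9, wINST, #8 ; and w9, w9, #15
  uint32_t a_new = Ubfx(winst, 8, 4);  // ubfx w9, wINST, #8, #4
  assert(a_old == a_new);
}
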
diff --git a/runtime/interpreter/mterp/arm64/footer.S b/runtime/interpreter/mterp/arm64/footer.S
index 2d3a11e..7628ed3 100644
--- a/runtime/interpreter/mterp/arm64/footer.S
+++ b/runtime/interpreter/mterp/arm64/footer.S
@@ -234,7 +234,7 @@
 #if MTERP_LOGGING
     mov  x0, xSELF
     add  x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm x2, xINST, 0, 31
+    sxtw x2, wINST
     bl MterpLogOSR
 #endif
     mov  x0, #1                         // Signal normal return
diff --git a/runtime/interpreter/mterp/arm64/funopNarrow.S b/runtime/interpreter/mterp/arm64/funopNarrow.S
index 9f5ad1e..aed830b 100644
--- a/runtime/interpreter/mterp/arm64/funopNarrow.S
+++ b/runtime/interpreter/mterp/arm64/funopNarrow.S
@@ -8,10 +8,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG $tgtreg, w4                // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/funopNarrower.S b/runtime/interpreter/mterp/arm64/funopNarrower.S
index 411396b..6fddfea 100644
--- a/runtime/interpreter/mterp/arm64/funopNarrower.S
+++ b/runtime/interpreter/mterp/arm64/funopNarrower.S
@@ -7,10 +7,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG $tgtreg, w4                // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/funopWide.S b/runtime/interpreter/mterp/arm64/funopWide.S
index d83b39c..409e26b 100644
--- a/runtime/interpreter/mterp/arm64/funopWide.S
+++ b/runtime/interpreter/mterp/arm64/funopWide.S
@@ -7,10 +7,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE $tgtreg, w4           // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/funopWider.S b/runtime/interpreter/mterp/arm64/funopWider.S
index 50a73f1..4c91ebc 100644
--- a/runtime/interpreter/mterp/arm64/funopWider.S
+++ b/runtime/interpreter/mterp/arm64/funopWider.S
@@ -7,10 +7,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE $tgtreg, w4           // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/op_const_wide_16.S b/runtime/interpreter/mterp/arm64/op_const_wide_16.S
index e43628b..553d481 100644
--- a/runtime/interpreter/mterp/arm64/op_const_wide_16.S
+++ b/runtime/interpreter/mterp/arm64/op_const_wide_16.S
@@ -1,8 +1,7 @@
     /* const-wide/16 vAA, #+BBBB */
-    FETCH_S w0, 1                       // w0<- ssssBBBB (sign-extended
+    FETCH_S x0, 1                       // x0<- ssssssssssssBBBB (sign-extended)
     lsr     w3, wINST, #8               // w3<- AA
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    sbfm    x0, x0, 0, 31
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/op_const_wide_32.S b/runtime/interpreter/mterp/arm64/op_const_wide_32.S
index 527f7d8..9dc4fc3 100644
--- a/runtime/interpreter/mterp/arm64/op_const_wide_32.S
+++ b/runtime/interpreter/mterp/arm64/op_const_wide_32.S
@@ -1,10 +1,9 @@
     /* const-wide/32 vAA, #+BBBBbbbb */
-    FETCH w0, 1                         // w0<- 0000bbbb (low)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (low)
     lsr     w3, wINST, #8               // w3<- AA
-    FETCH_S w2, 2                       // w2<- ssssBBBB (high)
+    FETCH_S x2, 2                       // x2<- ssssssssssssBBBB (high)
     FETCH_ADVANCE_INST 3                // advance rPC, load wINST
     GET_INST_OPCODE ip                  // extract opcode from wINST
-    orr     w0, w0, w2, lsl #16         // w0<- BBBBbbbb
-    sbfm    x0, x0, 0, 31
+    orr     x0, x0, x2, lsl #16         // x0<- ssssssssBBBBbbbb
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
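
The rewritten const-wide/32 avoids a separate sign-extension step: the low half is fetched zero-extended, the high half sign-extended to 64 bits, and one 64-bit orr assembles the value. The equivalent arithmetic, as a sketch:

#include <cassert>
#include <cstdint>

// bbbb: low 16 bits (zero-extended); BBBB: high 16 bits (sign-extended).
int64_t ConstWide32(uint16_t bbbb, int16_t BBBB) {
  return static_cast<int64_t>(bbbb) |
         (static_cast<int64_t>(BBBB) << 16);  // orr x0, x0, x2, lsl #16
}

int main() {
  assert(ConstWide32(0x5678, 0x1234) == 0x12345678);
  assert(ConstWide32(0x5678, -1) == -0xA988);  // high half sign-extends
}
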
diff --git a/runtime/interpreter/mterp/arm64/op_iget_quick.S b/runtime/interpreter/mterp/arm64/op_iget_quick.S
index 45c68a3..699b2c4 100644
--- a/runtime/interpreter/mterp/arm64/op_iget_quick.S
+++ b/runtime/interpreter/mterp/arm64/op_iget_quick.S
@@ -5,8 +5,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     $load   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     $extend
diff --git a/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S b/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S
index 2480d2d..30b30c2 100644
--- a/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S
+++ b/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S
@@ -3,7 +3,7 @@
     FETCH w4, 1                         // w4<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cbz     w3, common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     add     x4, x3, x4                  // create direct pointer
     ldr     x0, [x4]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
diff --git a/runtime/interpreter/mterp/arm64/op_instance_of.S b/runtime/interpreter/mterp/arm64/op_instance_of.S
index 647bc75..a56705a 100644
--- a/runtime/interpreter/mterp/arm64/op_instance_of.S
+++ b/runtime/interpreter/mterp/arm64/op_instance_of.S
@@ -13,8 +13,7 @@
     mov       x3, xSELF                 // w3<- self
     bl        MterpInstanceOf           // (index, &obj, method, self)
     ldr       x1, [xSELF, #THREAD_EXCEPTION_OFFSET]
-    lsr       w2, wINST, #8             // w2<- A+
-    and       w2, w2, #15               // w2<- A
+    ubfx      w2, wINST, #8, #4         // w2<- A
     PREFETCH_INST 2
     cbnz      x1, MterpException
     ADVANCE 2                           // advance rPC
diff --git a/runtime/interpreter/mterp/arm64/op_int_to_long.S b/runtime/interpreter/mterp/arm64/op_int_to_long.S
index 13d2120..35830f3 100644
--- a/runtime/interpreter/mterp/arm64/op_int_to_long.S
+++ b/runtime/interpreter/mterp/arm64/op_int_to_long.S
@@ -1 +1 @@
-%include "arm64/funopWider.S" {"instr":"sbfm x0, x0, 0, 31", "srcreg":"w0", "tgtreg":"x0"}
+%include "arm64/funopWider.S" {"instr":"sxtw x0, w0", "srcreg":"w0", "tgtreg":"x0"}
diff --git a/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S b/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S
index 27b5dc5..566e2bf 100644
--- a/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S
+++ b/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S
@@ -3,8 +3,7 @@
     FETCH w3, 1                         // w3<- field byte offset
     GET_VREG w2, w2                     // w2<- fp[B], the object pointer
     ubfx    w0, wINST, #8, #4           // w0<- A
-    cmp     w2, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w2, common_errNullObject    // object was null
     GET_VREG_WIDE x0, w0                // x0<- fp[A]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
     add     x1, x2, x3                  // create a direct pointer
diff --git a/runtime/interpreter/mterp/arm64/op_packed_switch.S b/runtime/interpreter/mterp/arm64/op_packed_switch.S
index 1456f1a..4faa6d2 100644
--- a/runtime/interpreter/mterp/arm64/op_packed_switch.S
+++ b/runtime/interpreter/mterp/arm64/op_packed_switch.S
@@ -9,12 +9,12 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-    FETCH w0, 1                         // w0<- bbbb (lo)
-    FETCH w1, 2                         // w1<- BBBB (hi)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (lo)
+    FETCH_S x1, 2                       // x1<- ssssssssssssBBBB (hi)
     lsr     w3, wINST, #8               // w3<- AA
-    orr     w0, w0, w1, lsl #16         // w0<- BBBBbbbb
+    orr     x0, x0, x1, lsl #16         // x0<- ssssssssBBBBbbbb
     GET_VREG w1, w3                     // w1<- vAA
-    add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
+    add     x0, xPC, x0, lsl #1         // x0<- PC + BBBBbbbb*2
     bl      $func                       // w0<- code-unit branch offset
-    sbfm    xINST, x0, 0, 31
+    sxtw    xINST, w0
     b       MterpCommonTakenBranchNoFlags
diff --git a/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S b/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S
index 0b91891..95f81c5 100644
--- a/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S
@@ -1,12 +1,10 @@
     /* rem vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     bl  fmodf
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG s0, w9
diff --git a/runtime/interpreter/mterp/arm64/op_shl_int.S b/runtime/interpreter/mterp/arm64/op_shl_int.S
index bd0f237..3062a3f 100644
--- a/runtime/interpreter/mterp/arm64/op_shl_int.S
+++ b/runtime/interpreter/mterp/arm64/op_shl_int.S
@@ -1 +1 @@
-%include "arm64/binop.S" {"preinstr":"and     w1, w1, #31", "instr":"lsl     w0, w0, w1"}
+%include "arm64/binop.S" {"instr":"lsl     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S b/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S
index b4671d2..9a7e09f 100644
--- a/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S
@@ -1 +1 @@
-%include "arm64/binop2addr.S" {"preinstr":"and     w1, w1, #31", "instr":"lsl     w0, w0, w1"}
+%include "arm64/binop2addr.S" {"instr":"lsl     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S b/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S
index 4dd32e0..17f57f9 100644
--- a/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S
+++ b/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S
@@ -1 +1 @@
-%include "arm64/binopLit8.S" {"preinstr":"and     w1, w1, #31", "instr":"lsl     w0, w0, w1"}
+%include "arm64/binopLit8.S" {"instr":"lsl     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shr_int.S b/runtime/interpreter/mterp/arm64/op_shr_int.S
index c214a18..493b740 100644
--- a/runtime/interpreter/mterp/arm64/op_shr_int.S
+++ b/runtime/interpreter/mterp/arm64/op_shr_int.S
@@ -1 +1 @@
-%include "arm64/binop.S" {"preinstr":"and     w1, w1, #31", "instr":"asr     w0, w0, w1"}
+%include "arm64/binop.S" {"instr":"asr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S b/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S
index 3c1484b..6efe8ee 100644
--- a/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S
@@ -1 +1 @@
-%include "arm64/binop2addr.S" {"preinstr":"and     w1, w1, #31", "instr":"asr     w0, w0, w1"}
+%include "arm64/binop2addr.S" {"instr":"asr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S b/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S
index 26d5024..274080c 100644
--- a/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S
+++ b/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S
@@ -1 +1 @@
-%include "arm64/binopLit8.S" {"preinstr":"and     w1, w1, #31", "instr":"asr     w0, w0, w1"}
+%include "arm64/binopLit8.S" {"instr":"asr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_ushr_int.S b/runtime/interpreter/mterp/arm64/op_ushr_int.S
index bb8382b..005452b 100644
--- a/runtime/interpreter/mterp/arm64/op_ushr_int.S
+++ b/runtime/interpreter/mterp/arm64/op_ushr_int.S
@@ -1 +1 @@
-%include "arm64/binop.S" {"preinstr":"and     w1, w1, #31", "instr":"lsr     w0, w0, w1"}
+%include "arm64/binop.S" {"instr":"lsr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S b/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S
index dbccb99..1cb8cb7 100644
--- a/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S
@@ -1 +1 @@
-%include "arm64/binop2addr.S" {"preinstr":"and     w1, w1, #31", "instr":"lsr     w0, w0, w1"}
+%include "arm64/binop2addr.S" {"instr":"lsr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S b/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S
index 35090c4..ff30e1f 100644
--- a/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S
+++ b/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S
@@ -1 +1 @@
-%include "arm64/binopLit8.S" {"preinstr":"and     w1, w1, #31", "instr":"lsr     w0, w0, w1"}
+%include "arm64/binopLit8.S" {"instr":"lsr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/shiftWide.S b/runtime/interpreter/mterp/arm64/shiftWide.S
index 6306fca..dcb2fb7 100644
--- a/runtime/interpreter/mterp/arm64/shiftWide.S
+++ b/runtime/interpreter/mterp/arm64/shiftWide.S
@@ -12,8 +12,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    $opcode  x0, x1, x2                 // Do the shift.
+    $opcode  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/shiftWide2addr.S b/runtime/interpreter/mterp/arm64/shiftWide2addr.S
index 77d104a..b860dfd 100644
--- a/runtime/interpreter/mterp/arm64/shiftWide2addr.S
+++ b/runtime/interpreter/mterp/arm64/shiftWide2addr.S
@@ -8,8 +8,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    $opcode x0, x0, x1
+    $opcode x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
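
The explicit `and ... #63` can be dropped because AArch64's variable shifts (lslv/asrv/lsrv) already use only the low 6 bits of the count register for 64-bit operands, which is exactly the masking Java's long shifts require. In C++ terms:

#include <cassert>
#include <cstdint>

// Java `<<` on long uses only the low 6 bits of the count -- the same
// behaviour the A64 lslv instruction provides for free.
int64_t ShlLong(int64_t value, int32_t count) {
  return static_cast<int64_t>(static_cast<uint64_t>(value) << (count & 63));
}

int main() {
  assert(ShlLong(1, 65) == ShlLong(1, 1));  // counts are taken mod 64
}
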
diff --git a/runtime/interpreter/mterp/out/mterp_arm64.S b/runtime/interpreter/mterp/out/mterp_arm64.S
index df0b686..d470551 100644
--- a/runtime/interpreter/mterp/out/mterp_arm64.S
+++ b/runtime/interpreter/mterp/out/mterp_arm64.S
@@ -747,10 +747,9 @@
 .L_op_const_wide_16: /* 0x16 */
 /* File: arm64/op_const_wide_16.S */
     /* const-wide/16 vAA, #+BBBB */
-    FETCH_S w0, 1                       // w0<- ssssBBBB (sign-extended
+    FETCH_S x0, 1                       // x0<- ssssssssssssBBBB (sign-extended)
     lsr     w3, wINST, #8               // w3<- AA
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    sbfm    x0, x0, 0, 31
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
@@ -760,13 +759,12 @@
 .L_op_const_wide_32: /* 0x17 */
 /* File: arm64/op_const_wide_32.S */
     /* const-wide/32 vAA, #+BBBBbbbb */
-    FETCH w0, 1                         // w0<- 0000bbbb (low)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (low)
     lsr     w3, wINST, #8               // w3<- AA
-    FETCH_S w2, 2                       // w2<- ssssBBBB (high)
+    FETCH_S x2, 2                       // x2<- ssssssssssssBBBB (high)
     FETCH_ADVANCE_INST 3                // advance rPC, load wINST
     GET_INST_OPCODE ip                  // extract opcode from wINST
-    orr     w0, w0, w2, lsl #16         // w0<- BBBBbbbb
-    sbfm    x0, x0, 0, 31
+    orr     x0, x0, x2, lsl #16         // x0<- ssssssssBBBBbbbb
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -934,8 +932,7 @@
     mov       x3, xSELF                 // w3<- self
     bl        MterpInstanceOf           // (index, &obj, method, self)
     ldr       x1, [xSELF, #THREAD_EXCEPTION_OFFSET]
-    lsr       w2, wINST, #8             // w2<- A+
-    and       w2, w2, #15               // w2<- A
+    ubfx      w2, wINST, #8, #4         // w2<- A
     PREFETCH_INST 2
     cbnz      x1, MterpException
     ADVANCE 2                           // advance rPC
@@ -1143,14 +1140,14 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-    FETCH w0, 1                         // w0<- bbbb (lo)
-    FETCH w1, 2                         // w1<- BBBB (hi)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (lo)
+    FETCH_S x1, 2                       // x1<- ssssssssssssBBBB (hi)
     lsr     w3, wINST, #8               // w3<- AA
-    orr     w0, w0, w1, lsl #16         // w0<- BBBBbbbb
+    orr     x0, x0, x1, lsl #16         // x0<- ssssssssBBBBbbbb
     GET_VREG w1, w3                     // w1<- vAA
-    add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
+    add     x0, xPC, x0, lsl #1         // x0<- PC + BBBBbbbb*2
     bl      MterpDoPackedSwitch                       // w0<- code-unit branch offset
-    sbfm    xINST, x0, 0, 31
+    sxtw    xINST, w0
     b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
@@ -1168,14 +1165,14 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-    FETCH w0, 1                         // w0<- bbbb (lo)
-    FETCH w1, 2                         // w1<- BBBB (hi)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (lo)
+    FETCH_S x1, 2                       // x1<- ssssssssssssBBBB (hi)
     lsr     w3, wINST, #8               // w3<- AA
-    orr     w0, w0, w1, lsl #16         // w0<- BBBBbbbb
+    orr     x0, x0, x1, lsl #16         // x0<- ssssssssBBBBbbbb
     GET_VREG w1, w3                     // w1<- vAA
-    add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
+    add     x0, xPC, x0, lsl #1         // x0<- PC + BBBBbbbb*2
     bl      MterpDoSparseSwitch                       // w0<- code-unit branch offset
-    sbfm    xINST, x0, 0, 31
+    sxtw    xINST, w0
     b       MterpCommonTakenBranchNoFlags
 
 
@@ -3345,11 +3342,10 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG w0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
-    sbfm x0, x0, 0, 31                              // d0<- op
+    sxtw x0, w0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE x0, w4           // vA<- d0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -3369,10 +3365,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG w0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf s0, w0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG s0, w4                // vA<- d0
@@ -3392,10 +3387,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG w0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf d0, w0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE d0, w4           // vA<- d0
@@ -3415,10 +3409,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE x0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
                                   // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG w0, w4                // vA<- d0
@@ -3438,10 +3431,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE x0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf s0, x0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG s0, w4                // vA<- d0
@@ -3461,10 +3453,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE x0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf d0, x0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE d0, w4           // vA<- d0
@@ -3485,10 +3476,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG s0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs w0, s0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG w0, w4                // vA<- d0
@@ -3508,10 +3498,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG s0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs x0, s0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE x0, w4           // vA<- d0
@@ -3531,10 +3520,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG s0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvt  d0, s0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE d0, w4           // vA<- d0
@@ -3554,10 +3542,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE d0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs w0, d0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG w0, w4                // vA<- d0
@@ -3577,10 +3564,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE d0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs x0, d0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE x0, w4           // vA<- d0
@@ -3600,10 +3586,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE d0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvt s0, d0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG s0, w4                // vA<- d0
@@ -4032,7 +4017,7 @@
     cbz     w1, common_errDivideByZero  // is second operand zero?
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsl     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -4071,7 +4056,7 @@
     cbz     w1, common_errDivideByZero  // is second operand zero?
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     asr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -4110,7 +4095,7 @@
     cbz     w1, common_errDivideByZero  // is second operand zero?
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -4424,8 +4409,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    lsl  x0, x1, x2                 // Do the shift.
+    lsl  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -4450,8 +4434,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    asr  x0, x1, x2                 // Do the shift.
+    asr  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -4476,8 +4459,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    lsr  x0, x1, x2                 // Do the shift.
+    lsr  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5089,7 +5071,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsl     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -5125,7 +5107,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     asr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -5161,7 +5143,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // masking dropped: the shift uses only the low 5 bits of w1
     lsr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -5463,8 +5445,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    lsl x0, x0, x1
+    lsl x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5485,8 +5466,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    asr x0, x0, x1
+    asr x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5507,8 +5487,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    lsr x0, x0, x1
+    lsr x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5529,8 +5508,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fadd   s2, s0, s1                              // s2<- op
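
The `ubfx w9, wINST, #8, #4` rewrite in the hunks above and below folds the old `lsr` + `and` pair into a single unsigned-bitfield-extract: take four bits starting at bit 8 of wINST and zero-extend them, yielding register index A of the 2addr instruction. A small sketch of the equivalence (Ubfx is an illustrative helper, not ART code):

    #include <cassert>
    #include <cstdint>

    // ubfx dst, src, #lsb, #width: extract <width> bits at <lsb>, zero-extended.
    uint32_t Ubfx(uint32_t value, int lsb, int width) {
      return (value >> lsb) & ((1u << width) - 1u);
    }

    int main() {
      uint32_t wINST = 0x5A21;                           // hypothetical code unit
      assert(Ubfx(wINST, 8, 4) == 0xA);                  // the A nibble
      assert(Ubfx(wINST, 8, 4) == ((wINST >> 8) & 15));  // old lsr+and pair
      return 0;
    }
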
@@ -5554,8 +5532,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fsub   s2, s0, s1                              // s2<- op
@@ -5579,8 +5556,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fmul   s2, s0, s1                              // s2<- op
@@ -5604,8 +5580,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fdiv   s2, s0, s1                              // s2<- op
@@ -5621,13 +5596,11 @@
 /* File: arm64/op_rem_float_2addr.S */
     /* rem vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     bl  fmodf
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG s0, w9
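
In op_rem_float_2addr, rem-float is computed via the C library's fmodf, and w9 is re-derived with `ubfx` after the call because it is a caller-saved temporary that `bl fmodf` may clobber. For reference, a minimal check of the fmodf semantics the handler relies on:

    #include <cassert>
    #include <cmath>

    int main() {
      // vA <- fmodf(vA, vB): remainder carrying the sign of the dividend.
      assert(std::fmod(7.5f, 2.0f) == 1.5f);    // values chosen to be exact
      assert(std::fmod(-7.5f, 2.0f) == -1.5f);
      return 0;
    }
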
@@ -6381,7 +6354,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // masking dropped: the shift uses only the low 5 bits of w1
     lsl     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -6417,7 +6390,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // masking dropped: the shift uses only the low 5 bits of w1
     asr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -6453,7 +6426,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // masking dropped: the shift uses only the low 5 bits of w1
     lsr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -6471,8 +6444,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldr   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6489,7 +6461,7 @@
     FETCH w4, 1                         // w4<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cbz     w3, common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     add     x4, x3, x4                  // create direct pointer
     ldr     x0, [x4]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
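
The quick-field hunks here replace the two-instruction `cmp`/`beq` null check with a single `cbz`, testing the 32-bit view `w3` that `GET_VREG` actually wrote. A C++ model of the guarded field load, with illustrative names (not ART code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Models "cbz w3, common_errNullObject" followed by "ldr w0, [x3, x1]".
    uint32_t LoadFieldOrFail(const void* object, uint32_t byte_offset) {
      assert(object != nullptr && "common_errNullObject");  // cbz w3, ...
      uint32_t field;
      std::memcpy(&field,
                  static_cast<const char*>(object) + byte_offset,
                  sizeof(field));                           // ldr w0, [x3, x1]
      return field;
    }

    int main() {
      uint32_t obj[2] = {0u, 42u};
      assert(LoadFieldOrFail(obj, 4) == 42u);
      return 0;
    }
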
@@ -6544,8 +6516,7 @@
     FETCH w3, 1                         // w3<- field byte offset
     GET_VREG w2, w2                     // w2<- fp[B], the object pointer
     ubfx    w0, wINST, #8, #4           // w0<- A
-    cmp     w2, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w2, common_errNullObject    // object was null
    GET_VREG_WIDE x0, w0                // x0<- fp[A]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
     add     x1, x2, x3                  // create a direct pointer
@@ -6710,8 +6681,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrb   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6731,8 +6701,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrsb   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6752,8 +6721,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrh   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6773,8 +6741,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrsh   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -11521,7 +11488,7 @@
 #if MTERP_LOGGING
     mov  x0, xSELF
     add  x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm x2, xINST, 0, 31
+    sxtw x2, wINST                      // x2<- sign-extended wINST
     bl MterpLogOSR
 #endif
     mov  x0, #1                         // Signal normal return
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 9c77d38..1c31c57 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -748,21 +748,24 @@
   return nullptr;
 }
 
-ArtField* Class::FindStaticField(Thread* self, Handle<Class> klass, const DexCache* dex_cache,
+ArtField* Class::FindStaticField(Thread* self,
+                                 Class* klass,
+                                 const DexCache* dex_cache,
                                  uint32_t dex_field_idx) {
-  for (Class* k = klass.Get(); k != nullptr; k = k->GetSuperClass()) {
+  for (Class* k = klass; k != nullptr; k = k->GetSuperClass()) {
     // Is the field in this class?
     ArtField* f = k->FindDeclaredStaticField(dex_cache, dex_field_idx);
     if (f != nullptr) {
       return f;
     }
-    // Wrap k incase it moves during GetDirectInterface.
+    // Though GetDirectInterface() should not cause thread suspension when called
+    // from here, it takes a Handle as an argument, so we need to wrap `k`.
+    ScopedAssertNoThreadSuspension ants(self, __FUNCTION__);
     StackHandleScope<1> hs(self);
-    HandleWrapper<mirror::Class> h_k(hs.NewHandleWrapper(&k));
+    Handle<mirror::Class> h_k(hs.NewHandle(k));
     // Is this field in any of this class' interfaces?
     for (uint32_t i = 0; i < h_k->NumDirectInterfaces(); ++i) {
-      StackHandleScope<1> hs2(self);
-      Handle<mirror::Class> interface(hs2.NewHandle(GetDirectInterface(self, h_k, i)));
+      mirror::Class* interface = GetDirectInterface(self, h_k, i);
       f = FindStaticField(self, interface, dex_cache, dex_field_idx);
       if (f != nullptr) {
         return f;
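
The class.cc change swaps a HandleWrapper (which writes a possibly-moved pointer back into `k`) for a plain Handle, and records the justification with a ScopedAssertNoThreadSuspension: GetDirectInterface() takes a Handle argument, so `k` must still be wrapped even though no suspension, and hence no object movement, can occur on this path. A toy RAII model of such a scope assertion, with illustrative types rather than ART's:

    #include <cassert>

    struct ThreadState { bool suspension_allowed = true; };

    // On construction, forbid suspension for the enclosing scope; restore on exit.
    class ScopedAssertNoSuspension {
     public:
      explicit ScopedAssertNoSuspension(ThreadState* t)
          : t_(t), old_(t->suspension_allowed) { t_->suspension_allowed = false; }
      ~ScopedAssertNoSuspension() { t_->suspension_allowed = old_; }
     private:
      ThreadState* t_;
      bool old_;
    };

    void CheckpointThatMaySuspend(ThreadState* t) { assert(t->suspension_allowed); }

    int main() {
      ThreadState t;
      {
        ScopedAssertNoSuspension no_suspension(&t);
        // Calling CheckpointThatMaySuspend(&t) here would fire the assertion.
      }
      CheckpointThatMaySuspend(&t);  // legal again once the scope ends
      return 0;
    }
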
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index f044b59..9be9f01 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -1091,7 +1091,9 @@
 
   // Finds the given static field in this class or superclass, only searches classes that
   // have the same dex cache.
-  static ArtField* FindStaticField(Thread* self, Handle<Class> klass, const DexCache* dex_cache,
+  static ArtField* FindStaticField(Thread* self,
+                                   Class* klass,
+                                   const DexCache* dex_cache,
                                    uint32_t dex_field_idx)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index 79b18aa..d987f65 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -342,7 +342,7 @@
     return;
   }
   if (is_static) {
-    field = mirror::Class::FindStaticField(self, klass, dex_cache.Get(), field_idx);
+    field = mirror::Class::FindStaticField(self, klass.Get(), dex_cache.Get(), field_idx);
   } else {
     field = klass->FindInstanceField(dex_cache.Get(), field_idx);
   }
diff --git a/runtime/oat.h b/runtime/oat.h
index e506e3c..9b8f545 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '8', '3', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '8', '4', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index 0c3eb3b..92efa21 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -220,7 +220,7 @@
 }
 
 inline mirror::Object* ReadBarrier::Mark(mirror::Object* obj) {
-  return Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->Mark(obj);
+  return Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->MarkFromReadBarrier(obj);
 }
 
 inline bool ReadBarrier::HasGrayReadBarrierPointer(mirror::Object* obj,
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 21cd2aa..079c079 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -989,6 +989,7 @@
                        xgc_option.verify_pre_sweeping_rosalloc_,
                        xgc_option.verify_post_gc_rosalloc_,
                        xgc_option.gcstress_,
+                       xgc_option.measure_,
                        runtime_options.GetOrDefault(Opt::EnableHSpaceCompactForOOM),
                        runtime_options.GetOrDefault(Opt::HSpaceCompactForOOMMinIntervalsMs));
 
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 0acc54d..e77a11e 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -869,7 +869,7 @@
 bool Trace::RegisterThread(Thread* thread) {
   pid_t tid = thread->GetTid();
   CHECK_LT(0U, static_cast<uint32_t>(tid));
-  CHECK_LT(static_cast<uint32_t>(tid), 65536U);
+  CHECK_LT(static_cast<uint32_t>(tid), kMaxThreadIdNumber);
 
   if (!(*seen_threads_)[tid]) {
     seen_threads_->set(tid);
diff --git a/runtime/trace.h b/runtime/trace.h
index 80f1a4c..9b29fb9 100644
--- a/runtime/trace.h
+++ b/runtime/trace.h
@@ -41,7 +41,9 @@
 class Thread;
 
 using DexIndexBitSet = std::bitset<65536>;
-using ThreadIDBitSet = std::bitset<65536>;
+
+constexpr size_t kMaxThreadIdNumber = kIsTargetBuild ? 65536U : 1048576U;
+using ThreadIDBitSet = std::bitset<kMaxThreadIdNumber>;
 
 enum TracingMode {
   kTracingInactive,
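
The trace.h change derives the ThreadIDBitSet size from a named constant and widens it on the host, where the kernel's pid_max may exceed 65536. A self-contained sketch of the bound check and registration that Trace::RegisterThread performs (kIsTargetBuild is modeled as a plain constant here):

    #include <bitset>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    constexpr bool kIsTargetBuild = false;  // ART derives this from the build
    constexpr size_t kMaxThreadIdNumber = kIsTargetBuild ? 65536U : 1048576U;
    using ThreadIDBitSet = std::bitset<kMaxThreadIdNumber>;

    int main() {
      static ThreadIDBitSet seen_threads;  // 128 KiB on host; static, off the stack
      uint32_t tid = 70000u;               // plausible on hosts with a large pid_max
      assert(tid < kMaxThreadIdNumber);    // the CHECK_LT in RegisterThread
      bool first_time = !seen_threads[tid];
      seen_threads.set(tid);
      assert(first_time && seen_threads[tid]);
      return 0;
    }
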
diff --git a/test/501-regression-packed-switch/info.txt b/test/501-regression-packed-switch/info.txt
index fbd93fa..988b220 100644
--- a/test/501-regression-packed-switch/info.txt
+++ b/test/501-regression-packed-switch/info.txt
@@ -1,2 +1,4 @@
 Regression test for the interpreter and optimizing's builder which used
 to trip when compiled code contained a packed switch with no targets.
+Regression test for the arm64 mterp miscalculating the switch table
+address, zero-extending a register instead of sign-extending.
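
The new test method places the `.packed-switch` payload before the instruction that references it, so the payload offset is negative; forming the payload address therefore requires sign-extending the 32-bit offset, and zero-extending it instead was the mterp bug described above. A toy model of the corrected address computation (illustrative, not ART code):

    #include <cassert>
    #include <cstdint>

    // The dex offset is a signed count of 16-bit code units, relative to the
    // packed-switch instruction itself.
    const uint16_t* SwitchPayload(const uint16_t* insn, uint32_t raw_offset) {
      int64_t offset = static_cast<int32_t>(raw_offset);  // sign-extend, never zero-extend
      return insn + offset;
    }

    int main() {
      uint16_t code[8] = {};
      // A payload located 4 code units *before* the instruction:
      assert(SwitchPayload(&code[6], static_cast<uint32_t>(-4)) == &code[2]);
      return 0;
    }
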
diff --git a/test/501-regression-packed-switch/smali/Test.smali b/test/501-regression-packed-switch/smali/Test.smali
index 8756ed5..5a760c7 100644
--- a/test/501-regression-packed-switch/smali/Test.smali
+++ b/test/501-regression-packed-switch/smali/Test.smali
@@ -27,3 +27,28 @@
   .packed-switch 0x0
   .end packed-switch
 .end method
+
+.method public static PackedSwitchAfterData(I)I
+  .registers 1
+  goto :pswitch_instr
+
+  :case0
+  const/4 v0, 0x1
+  return v0
+
+  :pswitch_data
+  .packed-switch 0x0
+    :case0
+    :case1
+  .end packed-switch
+
+  :pswitch_instr
+  packed-switch v0, :pswitch_data
+  const/4 v0, 0x7
+  return v0
+
+  :case1
+  const/4 v0, 0x4
+  return v0
+
+.end method
diff --git a/test/501-regression-packed-switch/src/Main.java b/test/501-regression-packed-switch/src/Main.java
index b80bc62..12bc1a8 100644
--- a/test/501-regression-packed-switch/src/Main.java
+++ b/test/501-regression-packed-switch/src/Main.java
@@ -29,5 +29,10 @@
     if (result != 5) {
       throw new Error("Expected 5, got " + result);
     }
+    m = c.getMethod("PackedSwitchAfterData", new Class[] { int.class });
+    result = (Integer) m.invoke(null, new Integer(0));
+    if (result != 1) {
+      throw new Error("Expected 1, got " + result);
+    }
   }
 }
diff --git a/test/527-checker-array-access-split/src/Main.java b/test/527-checker-array-access-split/src/Main.java
index ead9446..3366f20 100644
--- a/test/527-checker-array-access-split/src/Main.java
+++ b/test/527-checker-array-access-split/src/Main.java
@@ -34,9 +34,21 @@
   /// CHECK-START-ARM64: int Main.constantIndexGet(int[]) instruction_simplifier_arm64 (after)
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
 
+
+  /// CHECK-START-ARM: int Main.constantIndexGet(int[]) instruction_simplifier_arm (before)
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
+
+  /// CHECK-START-ARM: int Main.constantIndexGet(int[]) instruction_simplifier_arm (after)
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
+
   public static int constantIndexGet(int array[]) {
     return array[1];
   }
@@ -55,10 +67,23 @@
   /// CHECK:             <<Const2:i\d+>>        IntConstant 2
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Const2>>]
 
 
+  /// CHECK-START-ARM:   void Main.constantIndexSet(int[]) instruction_simplifier_arm (before)
+  /// CHECK:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Const2>>]
+
+  /// CHECK-START-ARM:   void Main.constantIndexSet(int[]) instruction_simplifier_arm (after)
+  /// CHECK:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Const2>>]
+
   public static void constantIndexSet(int array[]) {
     array[1] = 2;
   }
@@ -76,7 +101,20 @@
   /// CHECK:             <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArrayGet [<<Address>>,<<Index>>]
+
+
+  /// CHECK-START-ARM:   int Main.get(int[], int) instruction_simplifier_arm (before)
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
+
+  /// CHECK-START-ARM:   int Main.get(int[], int) instruction_simplifier_arm (after)
+  /// CHECK:             <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArrayGet [<<Address>>,<<Index>>]
 
   public static int get(int array[], int index) {
@@ -102,7 +140,26 @@
   /// CHECK:             <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address>>,<<Index>>,<<Arg>>]
+
+
+  /// CHECK-START-ARM:   void Main.set(int[], int, int) instruction_simplifier_arm (before)
+  /// CHECK:                                    ParameterValue
+  /// CHECK:                                    ParameterValue
+  /// CHECK:             <<Arg:i\d+>>           ParameterValue
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Arg>>]
+
+  /// CHECK-START-ARM:   void Main.set(int[], int, int) instruction_simplifier_arm (after)
+  /// CHECK:                                    ParameterValue
+  /// CHECK:                                    ParameterValue
+  /// CHECK:             <<Arg:i\d+>>           ParameterValue
+  /// CHECK:             <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address>>,<<Index>>,<<Arg>>]
 
   public static void set(int array[], int index, int value) {
@@ -126,10 +183,10 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   /// CHECK-START-ARM64: void Main.getSet(int[], int) GVN_after_arch (after)
@@ -137,12 +194,42 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
 
+
+  /// CHECK-START-ARM:   void Main.getSet(int[], int) instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   void Main.getSet(int[], int) instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   void Main.getSet(int[], int) GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
+
   public static void getSet(int array[], int index) {
     array[index] = array[index] + 1;
   }
@@ -166,11 +253,11 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
   /// CHECK:                                    NewArray
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   /// CHECK-START-ARM64: int[] Main.accrossGC(int[], int) GVN_after_arch (after)
@@ -178,11 +265,45 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
   /// CHECK:                                    NewArray
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:                                    ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+
+  /// CHECK-START-ARM:   int[] Main.accrossGC(int[], int) instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    NewArray
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int[] Main.accrossGC(int[], int) instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    NewArray
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int[] Main.accrossGC(int[], int) GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    NewArray
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:                                    ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   public static int[] accrossGC(int array[], int index) {
@@ -196,6 +317,14 @@
    * Test that the intermediate address is shared between array accesses after
    * the bounds check have been removed by BCE.
    */
+  // For the `instruction_simplifier_<arch> (after)` checker tests below, BCE has already
+  // removed the bounds checks in the loop by the time we reach the architecture-specific
+  // instruction simplifier.
+
+  // Note that we do not care that the `DataOffset` is `12`. But if we do not
+  // specify it and any other `IntConstant` appears before that instruction,
+  // checker will match the previous `IntConstant`, and we will thus fail the
+  // check.
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE1() instruction_simplifier_arm64 (before)
   /// CHECK:             <<Const1:i\d+>>        IntConstant 1
@@ -207,14 +336,6 @@
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
   /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
 
-  // By the time we reach the architecture-specific instruction simplifier, BCE
-  // has removed the bounds checks in the loop.
-
-  // Note that we do not care that the `DataOffset` is `12`. But if we do not
-  // specify it and any other `IntConstant` appears before that instruction,
-  // checker will match the previous `IntConstant`, and we will thus fail the
-  // check.
-
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE1() instruction_simplifier_arm64 (after)
   /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
@@ -222,10 +343,10 @@
   /// CHECK:             <<Index:i\d+>>         Phi
   /// CHECK:                                    If
   //  -------------- Loop
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE1() GVN_after_arch (after)
@@ -235,10 +356,47 @@
   /// CHECK:             <<Index:i\d+>>         Phi
   /// CHECK:                                    If
   //  -------------- Loop
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
+
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE1() instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE1() instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE1() GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
 
   public static int canMergeAfterBCE1() {
@@ -279,12 +437,12 @@
   /// CHECK:                                    If
   //  -------------- Loop
   /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
-  /// CHECK-DAG:         <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address1>>,<<Index>>]
-  /// CHECK-DAG:         <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address2>>,<<Index1>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
-  /// CHECK:             <<Address3:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address3:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:                                    ArraySet [<<Address3>>,<<Index1>>,<<Add>>]
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE2() GVN_after_arch (after)
@@ -295,7 +453,7 @@
   /// CHECK:                                    If
   //  -------------- Loop
   /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
-  /// CHECK-DAG:         <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address>>,<<Index>>]
   /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address>>,<<Index1>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
@@ -304,8 +462,55 @@
   // There should be only one intermediate address computation in the loop.
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE2() GVN_after_arch (after)
-  /// CHECK:                                    Arm64IntermediateAddress
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK:                                    IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
+
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
+  /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Array>>,<<Index1>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index1>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
+  /// CHECK-DAG:         <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK-DAG:         <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address2>>,<<Index1>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
+  /// CHECK:             <<Address3:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:                                    ArraySet [<<Address3>>,<<Index1>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
+  /// CHECK-DAG:         <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address>>,<<Index>>]
+  /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address>>,<<Index1>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
+  /// CHECK:                                    ArraySet [<<Address>>,<<Index1>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() GVN_after_arch (after)
+  /// CHECK:                                    IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
 
   public static int canMergeAfterBCE2() {
     int[] array = {0, 1, 2, 3};
@@ -315,6 +520,37 @@
     return array[array.length - 1];
   }
 
+  /// CHECK-START-ARM: int Main.checkLongFloatDouble() instruction_simplifier_arm (before)
+  /// CHECK-DAG:         <<Array1:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array2:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array3:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                ArrayGet [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array2>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array3>>,<<Index>>]
+
+  /// CHECK-START-ARM: int Main.checkLongFloatDouble() instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Array1:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array2:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array3:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                ArrayGet [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array2>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array3>>,<<Index>>]
+
+  /// CHECK-START-ARM: int Main.checkLongFloatDouble() instruction_simplifier_arm (after)
+  /// CHECK-NOT:                                IntermediateAddress
+
+  public static int checkLongFloatDouble() {
+    long[] array_long = {0, 1, 2, 3};
+    float[] array_float = {(float)0.0, (float)1.0, (float)2.0, (float)3.0};
+    double[] array_double = {0.0, 1.0, 2.0, 3.0};
+    double s = 0.0;
+
+    for (int i = 0; i < 4; i++) {
+      s += (double)array_long[i] + (double)array_float[i] + array_double[i];
+    }
+    return (int)s;
+  }
 
   public static void main(String[] args) {
     int[] array = {123, 456, 789};
@@ -337,5 +573,7 @@
 
     assertIntEquals(4, canMergeAfterBCE1());
     assertIntEquals(6, canMergeAfterBCE2());
+
+    assertIntEquals(18, checkLongFloatDouble());
   }
 }
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index dd6b6f3..8f8b667 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -527,7 +527,7 @@
 # Tests that should fail in the read barrier configuration with the Optimizing compiler (AOT).
 # 484: Baker's fast path based read barrier compiler instrumentation generates code containing
 #      more parallel moves on x86, thus some Checker assertions may fail.
-# 527: On ARM64, the read barrier instrumentation does not support the HArm64IntermediateAddress
+# 527: On ARM64 and ARM, the read barrier instrumentation does not support the HIntermediateAddress
 #      instruction yet (b/26601270).
 # 537: Expects an array copy to be intrinsified on x86-64, but calling-on-slowpath intrinsics are
 #      not yet handled in the read barrier configuration.
diff --git a/test/run-test b/test/run-test
index bbcd4b0..1ef5428 100755
--- a/test/run-test
+++ b/test/run-test
@@ -37,7 +37,7 @@
 if [ -z "$TMPDIR" ]; then
   tmp_dir="/tmp/$USER/${test_dir}"
 else
-  tmp_dir="${TMPDIR}/$USER/${test_dir}"
+  tmp_dir="${TMPDIR}/${test_dir}"
 fi
 checker="${progdir}/../tools/checker/checker.py"
 export JAVA="java"