Merge the 2019-08-01 SPL branch from AOSP-Partner

* security-aosp-nyc-mr2-release:
  Use conservative permissions when creating files in ART

Change-Id: Ifb9781b636a24bcfb2861051b7feef66f52ec404
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 2294ddb..c6eeaa9 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -30,9 +30,15 @@
 # Beware that tests may use the non-debug build for performance, notable 055-enum-performance
 #
 ART_BUILD_TARGET_NDEBUG ?= true
-ART_BUILD_TARGET_DEBUG ?= true
 ART_BUILD_HOST_NDEBUG ?= true
+
+ifneq ($(USE_DEX2OAT_DEBUG),false)
+ART_BUILD_TARGET_DEBUG ?= true
 ART_BUILD_HOST_DEBUG ?= true
+else
+ART_BUILD_TARGET_DEBUG ?= false
+ART_BUILD_HOST_DEBUG ?= false
+endif
 
 # Set this to change what opt level Art is built at.
 ART_DEBUG_OPT_FLAG ?= -O2
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 19af14d..6ac9ad3 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -298,6 +298,8 @@
   compiler/utils/intrusive_forward_list_test.cc \
   compiler/utils/swap_space_test.cc \
   compiler/utils/test_dex_file_builder_test.cc \
+  compiler/utils/transform_array_ref_test.cc \
+  compiler/utils/transform_iterator_test.cc \
 
 COMPILER_GTEST_COMMON_SRC_FILES_all := \
   compiler/jni/jni_cfi_test.cc \
diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h
index 4797540..f06be8a 100644
--- a/cmdline/cmdline_types.h
+++ b/cmdline/cmdline_types.h
@@ -544,7 +544,7 @@
     : background_collector_type_(gc::kCollectorTypeNone) {
 
     if (kUseReadBarrier) {
-      background_collector_type_ = gc::kCollectorTypeCC;  // Disable background compaction for CC.
+      background_collector_type_ = gc::kCollectorTypeCCBackground;  // Background compaction for CC.
     }
   }
 
diff --git a/compiler/jni/jni_cfi_test.cc b/compiler/jni/jni_cfi_test.cc
index 05c85e0..5614451 100644
--- a/compiler/jni/jni_cfi_test.cc
+++ b/compiler/jni/jni_cfi_test.cc
@@ -52,7 +52,7 @@
     std::unique_ptr<ManagedRuntimeCallingConvention> mr_conv(
         ManagedRuntimeCallingConvention::Create(&arena, is_static, is_synchronized, shorty, isa));
     const int frame_size(jni_conv->FrameSize());
-    const std::vector<ManagedRegister>& callee_save_regs = jni_conv->CalleeSaveRegisters();
+    ArrayRef<const ManagedRegister> callee_save_regs = jni_conv->CalleeSaveRegisters();
 
     // Assemble the method.
     std::unique_ptr<Assembler> jni_asm(Assembler::Create(&arena, isa));
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index 16b4386..da72c75 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -1,8 +1,7 @@
 static constexpr uint8_t expected_asm_kThumb2[] = {
     0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x89, 0xB0, 0x00, 0x90,
-    0xCD, 0xF8, 0x84, 0x10, 0x8D, 0xED, 0x22, 0x0A, 0xCD, 0xF8, 0x8C, 0x20,
-    0xCD, 0xF8, 0x90, 0x30, 0x88, 0xB0, 0x08, 0xB0, 0x09, 0xB0, 0xBD, 0xEC,
-    0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x8D,
+    0x21, 0x91, 0x8D, 0xED, 0x22, 0x0A, 0x23, 0x92, 0x24, 0x93, 0x88, 0xB0,
+    0x08, 0xB0, 0x09, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x8D,
 };
 static constexpr uint8_t expected_cfi_kThumb2[] = {
     0x44, 0x0E, 0x1C, 0x85, 0x07, 0x86, 0x06, 0x87, 0x05, 0x88, 0x04, 0x8A,
@@ -11,7 +10,7 @@
     0x55, 0x12, 0x05, 0x56, 0x11, 0x05, 0x57, 0x10, 0x05, 0x58, 0x0F, 0x05,
     0x59, 0x0E, 0x05, 0x5A, 0x0D, 0x05, 0x5B, 0x0C, 0x05, 0x5C, 0x0B, 0x05,
     0x5D, 0x0A, 0x05, 0x5E, 0x09, 0x05, 0x5F, 0x08, 0x42, 0x0E, 0x80, 0x01,
-    0x54, 0x0E, 0xA0, 0x01, 0x42, 0x0E, 0x80, 0x01, 0x0A, 0x42, 0x0E, 0x5C,
+    0x4E, 0x0E, 0xA0, 0x01, 0x42, 0x0E, 0x80, 0x01, 0x0A, 0x42, 0x0E, 0x5C,
     0x44, 0x0E, 0x1C, 0x06, 0x50, 0x06, 0x51, 0x06, 0x52, 0x06, 0x53, 0x06,
     0x54, 0x06, 0x55, 0x06, 0x56, 0x06, 0x57, 0x06, 0x58, 0x06, 0x59, 0x06,
     0x5A, 0x06, 0x5B, 0x06, 0x5C, 0x06, 0x5D, 0x06, 0x5E, 0x06, 0x5F, 0x44,
@@ -47,38 +46,38 @@
 // 0x00000008: sub sp, sp, #36
 // 0x0000000a: .cfi_def_cfa_offset: 128
 // 0x0000000a: str r0, [sp, #0]
-// 0x0000000c: str.w r1, [sp, #132]
-// 0x00000010: vstr.f32 s0, [sp, #136]
-// 0x00000014: str.w r2, [sp, #140]
-// 0x00000018: str.w r3, [sp, #144]
-// 0x0000001c: sub sp, sp, #32
-// 0x0000001e: .cfi_def_cfa_offset: 160
-// 0x0000001e: add sp, sp, #32
-// 0x00000020: .cfi_def_cfa_offset: 128
-// 0x00000020: .cfi_remember_state
-// 0x00000020: add sp, sp, #36
-// 0x00000022: .cfi_def_cfa_offset: 92
-// 0x00000022: vpop.f32 {s16-s31}
-// 0x00000026: .cfi_def_cfa_offset: 28
-// 0x00000026: .cfi_restore_extended: r80
-// 0x00000026: .cfi_restore_extended: r81
-// 0x00000026: .cfi_restore_extended: r82
-// 0x00000026: .cfi_restore_extended: r83
-// 0x00000026: .cfi_restore_extended: r84
-// 0x00000026: .cfi_restore_extended: r85
-// 0x00000026: .cfi_restore_extended: r86
-// 0x00000026: .cfi_restore_extended: r87
-// 0x00000026: .cfi_restore_extended: r88
-// 0x00000026: .cfi_restore_extended: r89
-// 0x00000026: .cfi_restore_extended: r90
-// 0x00000026: .cfi_restore_extended: r91
-// 0x00000026: .cfi_restore_extended: r92
-// 0x00000026: .cfi_restore_extended: r93
-// 0x00000026: .cfi_restore_extended: r94
-// 0x00000026: .cfi_restore_extended: r95
-// 0x00000026: pop {r5, r6, r7, r8, r10, r11, pc}
-// 0x0000002a: .cfi_restore_state
-// 0x0000002a: .cfi_def_cfa_offset: 128
+// 0x0000000c: str r1, [sp, #132]
+// 0x0000000e: vstr.f32 s0, [sp, #136]
+// 0x00000012: str r2, [sp, #140]
+// 0x00000014: str r3, [sp, #144]
+// 0x00000016: sub sp, sp, #32
+// 0x00000018: .cfi_def_cfa_offset: 160
+// 0x00000018: add sp, sp, #32
+// 0x0000001a: .cfi_def_cfa_offset: 128
+// 0x0000001a: .cfi_remember_state
+// 0x0000001a: add sp, sp, #36
+// 0x0000001c: .cfi_def_cfa_offset: 92
+// 0x0000001c: vpop.f32 {s16-s31}
+// 0x00000020: .cfi_def_cfa_offset: 28
+// 0x00000020: .cfi_restore_extended: r80
+// 0x00000020: .cfi_restore_extended: r81
+// 0x00000020: .cfi_restore_extended: r82
+// 0x00000020: .cfi_restore_extended: r83
+// 0x00000020: .cfi_restore_extended: r84
+// 0x00000020: .cfi_restore_extended: r85
+// 0x00000020: .cfi_restore_extended: r86
+// 0x00000020: .cfi_restore_extended: r87
+// 0x00000020: .cfi_restore_extended: r88
+// 0x00000020: .cfi_restore_extended: r89
+// 0x00000020: .cfi_restore_extended: r90
+// 0x00000020: .cfi_restore_extended: r91
+// 0x00000020: .cfi_restore_extended: r92
+// 0x00000020: .cfi_restore_extended: r93
+// 0x00000020: .cfi_restore_extended: r94
+// 0x00000020: .cfi_restore_extended: r95
+// 0x00000020: pop {r5, r6, r7, r8, r10, r11, pc}
+// 0x00000024: .cfi_restore_state
+// 0x00000024: .cfi_def_cfa_offset: 128
 
 static constexpr uint8_t expected_asm_kArm64[] = {
     0xFF, 0x03, 0x03, 0xD1, 0xF3, 0x53, 0x06, 0xA9, 0xF5, 0x5B, 0x07, 0xA9,
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index 9d2732a..29411f0 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -31,10 +31,6 @@
   S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
 };
 
-static const SRegister kHFSCalleeSaveRegisters[] = {
-  S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31
-};
-
 static const DRegister kHFDArgumentRegisters[] = {
   D0, D1, D2, D3, D4, D5, D6, D7
 };
@@ -42,6 +38,57 @@
 static_assert(arraysize(kHFDArgumentRegisters) * 2 == arraysize(kHFSArgumentRegisters),
     "ks d argument registers mismatch");
 
+static constexpr ManagedRegister kCalleeSaveRegisters[] = {
+    // Core registers.
+    ArmManagedRegister::FromCoreRegister(R5),
+    ArmManagedRegister::FromCoreRegister(R6),
+    ArmManagedRegister::FromCoreRegister(R7),
+    ArmManagedRegister::FromCoreRegister(R8),
+    ArmManagedRegister::FromCoreRegister(R10),
+    ArmManagedRegister::FromCoreRegister(R11),
+    // Hard float registers.
+    ArmManagedRegister::FromSRegister(S16),
+    ArmManagedRegister::FromSRegister(S17),
+    ArmManagedRegister::FromSRegister(S18),
+    ArmManagedRegister::FromSRegister(S19),
+    ArmManagedRegister::FromSRegister(S20),
+    ArmManagedRegister::FromSRegister(S21),
+    ArmManagedRegister::FromSRegister(S22),
+    ArmManagedRegister::FromSRegister(S23),
+    ArmManagedRegister::FromSRegister(S24),
+    ArmManagedRegister::FromSRegister(S25),
+    ArmManagedRegister::FromSRegister(S26),
+    ArmManagedRegister::FromSRegister(S27),
+    ArmManagedRegister::FromSRegister(S28),
+    ArmManagedRegister::FromSRegister(S29),
+    ArmManagedRegister::FromSRegister(S30),
+    ArmManagedRegister::FromSRegister(S31)
+};
+
+static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+  // LR is a special callee save which is not reported by CalleeSaveRegisters().
+  uint32_t result = 1 << LR;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsArm().IsCoreRegister()) {
+      result |= (1 << r.AsArm().AsCoreRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t CalculateFpCalleeSpillMask() {
+  uint32_t result = 0;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsArm().IsSRegister()) {
+      result |= (1 << r.AsArm().AsSRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask();
+
 // Calling convention
 
 ManagedRegister ArmManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
@@ -223,32 +270,15 @@
     cur_reg++;  // bump the iterator for every argument
   }
   padding_ = padding;
-
-  callee_save_regs_.push_back(ArmManagedRegister::FromCoreRegister(R5));
-  callee_save_regs_.push_back(ArmManagedRegister::FromCoreRegister(R6));
-  callee_save_regs_.push_back(ArmManagedRegister::FromCoreRegister(R7));
-  callee_save_regs_.push_back(ArmManagedRegister::FromCoreRegister(R8));
-  callee_save_regs_.push_back(ArmManagedRegister::FromCoreRegister(R10));
-  callee_save_regs_.push_back(ArmManagedRegister::FromCoreRegister(R11));
-
-  for (size_t i = 0; i < arraysize(kHFSCalleeSaveRegisters); ++i) {
-    callee_save_regs_.push_back(ArmManagedRegister::FromSRegister(kHFSCalleeSaveRegisters[i]));
-  }
 }
 
 uint32_t ArmJniCallingConvention::CoreSpillMask() const {
   // Compute spill mask to agree with callee saves initialized in the constructor
-  uint32_t result = 0;
-  result = 1 << R5 | 1 << R6 | 1 << R7 | 1 << R8 | 1 << R10 | 1 << R11 | 1 << LR;
-  return result;
+  return kCoreCalleeSpillMask;
 }
 
 uint32_t ArmJniCallingConvention::FpSpillMask() const {
-  uint32_t result = 0;
-  for (size_t i = 0; i < arraysize(kHFSCalleeSaveRegisters); ++i) {
-    result |= (1 << kHFSCalleeSaveRegisters[i]);
-  }
-  return result;
+  return kFpCalleeSpillMask;
 }
 
 ManagedRegister ArmJniCallingConvention::ReturnScratchRegister() const {
@@ -269,6 +299,10 @@
                  kStackAlignment);
 }
 
+ArrayRef<const ManagedRegister> ArmJniCallingConvention::CalleeSaveRegisters() const {
+  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+}
+
 // JniCallingConvention ABI follows AAPCS where longs and doubles must occur
 // in even register numbers and stack slots
 void ArmJniCallingConvention::Next() {
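
A sanity check for the constexpr masks above: they should equal the literal masks they replace in CoreSpillMask()/FpSpillMask(). A minimal sketch, assuming the ArmManagedRegister enum values follow the hardware numbering (R5 == 5 through R11 == 11, LR == 14, and S16..S31 on bits 16..31); illustration only, not part of the patch:

    // 0x4de0 | (1 << 14) is exactly push {r5-r8, r10, r11, lr}.
    static_assert(kCoreCalleeSpillMask ==
                      (1u << 5 | 1u << 6 | 1u << 7 | 1u << 8 | 1u << 10 | 1u << 11 | 1u << 14),
                  "core mask must match the old hard-coded value");
    static_assert(kFpCalleeSpillMask == 0xffff0000u,  // S16..S31 on bits 16..31.
                  "fp mask must match the old hard-coded value");
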
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 35b5093..157880b 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -58,9 +58,7 @@
   void Next() OVERRIDE;  // Override default behavior for AAPCS
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
-  const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
-    return callee_save_regs_;
-  }
+  ArrayRef<const ManagedRegister> CalleeSaveRegisters() const OVERRIDE;
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
   uint32_t FpSpillMask() const OVERRIDE;
@@ -78,9 +76,6 @@
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
  private:
-  // TODO: these values aren't unique and can be shared amongst instances
-  std::vector<ManagedRegister> callee_save_regs_;
-
   // Padding to ensure longs and doubles are not split in AAPCS
   size_t padding_;
 
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 9aef10e..ab56c1c 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -38,10 +38,65 @@
   S0, S1, S2, S3, S4, S5, S6, S7
 };
 
-static const DRegister kDCalleeSaveRegisters[] = {
-  D8, D9, D10, D11, D12, D13, D14, D15
+static constexpr ManagedRegister kCalleeSaveRegisters[] = {
+    // Core registers.
+    // Note: The native JNI function may call into VM runtime functions that suspend
+    // or trigger GC, in which case the JNI method frame becomes the top quick frame.
+    // To satisfy the GC we therefore save LR and the callee-save registers, similar
+    // to the CalleeSaveMethod(RefOnly) frame.
+    // The JNI function is the native function that the Java code wants to call.
+    // The JNI method is the method compiled by the JNI compiler.
+    // Call chain: managed code (Java) --> JNI method --> JNI function.
+    // The thread register (X19) is saved on the stack.
+    Arm64ManagedRegister::FromXRegister(X19),
+    Arm64ManagedRegister::FromXRegister(X20),
+    Arm64ManagedRegister::FromXRegister(X21),
+    Arm64ManagedRegister::FromXRegister(X22),
+    Arm64ManagedRegister::FromXRegister(X23),
+    Arm64ManagedRegister::FromXRegister(X24),
+    Arm64ManagedRegister::FromXRegister(X25),
+    Arm64ManagedRegister::FromXRegister(X26),
+    Arm64ManagedRegister::FromXRegister(X27),
+    Arm64ManagedRegister::FromXRegister(X28),
+    Arm64ManagedRegister::FromXRegister(X29),
+    Arm64ManagedRegister::FromXRegister(LR),
+    // Hard float registers.
+    // Considering the case java_method_1 --> JNI method --> JNI function --> java_method_2,
+    // we may break in java_method_2 and still need the values of DEX registers in
+    // java_method_1, so all callee-saves (in managed code) need to be saved.
+    Arm64ManagedRegister::FromDRegister(D8),
+    Arm64ManagedRegister::FromDRegister(D9),
+    Arm64ManagedRegister::FromDRegister(D10),
+    Arm64ManagedRegister::FromDRegister(D11),
+    Arm64ManagedRegister::FromDRegister(D12),
+    Arm64ManagedRegister::FromDRegister(D13),
+    Arm64ManagedRegister::FromDRegister(D14),
+    Arm64ManagedRegister::FromDRegister(D15),
 };
 
+static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+  uint32_t result = 0u;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsArm64().IsXRegister()) {
+      result |= (1 << r.AsArm64().AsXRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t CalculateFpCalleeSpillMask() {
+  uint32_t result = 0;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsArm64().IsDRegister()) {
+      result |= (1 << r.AsArm64().AsDRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask();
+
 // Calling convention
 ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
   return Arm64ManagedRegister::FromXRegister(X20);  // saved on entry restored on exit
@@ -157,47 +212,14 @@
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
                                                      const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  uint32_t core_spill_mask = CoreSpillMask();
-  DCHECK_EQ(XZR, kNumberOfXRegisters - 1);  // Exclude XZR from the loop (avoid 1 << 32).
-  for (int x_reg = 0; x_reg < kNumberOfXRegisters - 1; ++x_reg) {
-    if (((1 << x_reg) & core_spill_mask) != 0) {
-      callee_save_regs_.push_back(
-          Arm64ManagedRegister::FromXRegister(static_cast<XRegister>(x_reg)));
-    }
-  }
-
-  uint32_t fp_spill_mask = FpSpillMask();
-  for (int d_reg = 0; d_reg < kNumberOfDRegisters; ++d_reg) {
-    if (((1 << d_reg) & fp_spill_mask) != 0) {
-      callee_save_regs_.push_back(
-          Arm64ManagedRegister::FromDRegister(static_cast<DRegister>(d_reg)));
-    }
-  }
 }
 
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
-  // Compute spill mask to agree with callee saves initialized in the constructor.
-  // Note: The native jni function may call to some VM runtime functions which may suspend
-  // or trigger GC. And the jni method frame will become top quick frame in those cases.
-  // So we need to satisfy GC to save LR and callee-save registers which is similar to
-  // CalleeSaveMethod(RefOnly) frame.
-  // Jni function is the native function which the java code wants to call.
-  // Jni method is the method that compiled by jni compiler.
-  // Call chain: managed code(java) --> jni method --> jni function.
-  // Thread register(X19) is saved on stack.
-  return 1 << X19 | 1 << X20 | 1 << X21 | 1 << X22 | 1 << X23 | 1 << X24 |
-         1 << X25 | 1 << X26 | 1 << X27 | 1 << X28 | 1 << X29 | 1 << LR;
+  return kCoreCalleeSpillMask;
 }
 
 uint32_t Arm64JniCallingConvention::FpSpillMask() const {
-  // Considering the case, java_method_1 --> jni method --> jni function --> java_method_2, we may
-  // break on java_method_2 and we still need to find out the values of DEX registers in
-  // java_method_1. So all callee-saves(in managed code) need to be saved.
-  uint32_t result = 0;
-  for (size_t i = 0; i < arraysize(kDCalleeSaveRegisters); ++i) {
-    result |= (1 << kDCalleeSaveRegisters[i]);
-  }
-  return result;
+  return kFpCalleeSpillMask;
 }
 
 ManagedRegister Arm64JniCallingConvention::ReturnScratchRegister() const {
@@ -218,6 +240,10 @@
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
+ArrayRef<const ManagedRegister> Arm64JniCallingConvention::CalleeSaveRegisters() const {
+  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+}
+
 bool Arm64JniCallingConvention::IsCurrentParamInRegister() {
   if (IsCurrentParamAFloatOrDouble()) {
     return (itr_float_and_doubles_ < 8);
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 37c92b2..337e881 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -57,9 +57,7 @@
   // JNI calling convention
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
-  const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
-    return callee_save_regs_;
-  }
+  ArrayRef<const ManagedRegister> CalleeSaveRegisters() const OVERRIDE;
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
   uint32_t FpSpillMask() const OVERRIDE;
@@ -77,9 +75,6 @@
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
  private:
-  // TODO: these values aren't unique and can be shared amongst instances
-  std::vector<ManagedRegister> callee_save_regs_;
-
   DISALLOW_COPY_AND_ASSIGN(Arm64JniCallingConvention);
 };
 
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 2c4b15c..e8f738d 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -17,12 +17,11 @@
 #ifndef ART_COMPILER_JNI_QUICK_CALLING_CONVENTION_H_
 #define ART_COMPILER_JNI_QUICK_CALLING_CONVENTION_H_
 
-#include <vector>
-
 #include "base/arena_object.h"
 #include "handle_scope.h"
 #include "primitive.h"
 #include "thread.h"
+#include "utils/array_ref.h"
 #include "utils/managed_register.h"
 
 namespace art {
@@ -301,7 +300,7 @@
   virtual bool RequiresSmallResultTypeExtension() const = 0;
 
   // Callee save registers to spill prior to native code (which may clobber)
-  virtual const std::vector<ManagedRegister>& CalleeSaveRegisters() const = 0;
+  virtual ArrayRef<const ManagedRegister> CalleeSaveRegisters() const = 0;
 
   // Spill mask values
   virtual uint32_t CoreSpillMask() const = 0;
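
The signature change here replaces per-instance const std::vector<ManagedRegister>& getters with a non-owning ArrayRef view over a shared constexpr array, so each calling-convention object no longer builds its own callee_save_regs_ vector. A minimal sketch of the pattern, using std::span as a stand-in for art::ArrayRef (an assumption for illustration; the real class lives in utils/array_ref.h):

    #include <span>

    static constexpr int kCalleeSaves[] = {5, 6, 7, 8, 10, 11};

    // One shared immutable array; the getter returns a (pointer, size) view,
    // with no allocation and no copy per instance.
    std::span<const int> CalleeSaves() { return kCalleeSaves; }
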
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 27714b8..4311a34 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -112,7 +112,7 @@
 
   // 1. Build the frame saving all callee saves
   const size_t frame_size(main_jni_conv->FrameSize());
-  const std::vector<ManagedRegister>& callee_save_regs = main_jni_conv->CalleeSaveRegisters();
+  ArrayRef<const ManagedRegister> callee_save_regs = main_jni_conv->CalleeSaveRegisters();
   __ BuildFrame(frame_size, mr_conv->MethodRegister(), callee_save_regs, mr_conv->EntrySpills());
   DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size));
 
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index 2d31a98..3d4d140 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -27,6 +27,32 @@
 static const FRegister kFArgumentRegisters[] = { F12, F14 };
 static const DRegister kDArgumentRegisters[] = { D6, D7 };
 
+static constexpr ManagedRegister kCalleeSaveRegisters[] = {
+    // Core registers.
+    MipsManagedRegister::FromCoreRegister(S2),
+    MipsManagedRegister::FromCoreRegister(S3),
+    MipsManagedRegister::FromCoreRegister(S4),
+    MipsManagedRegister::FromCoreRegister(S5),
+    MipsManagedRegister::FromCoreRegister(S6),
+    MipsManagedRegister::FromCoreRegister(S7),
+    MipsManagedRegister::FromCoreRegister(FP),
+    // No hard float callee saves.
+};
+
+static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+  // RA is a special callee save which is not reported by CalleeSaveRegisters().
+  uint32_t result = 1 << RA;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsMips().IsCoreRegister()) {
+      result |= (1 << r.AsMips().AsCoreRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kFpCalleeSpillMask = 0u;
+
 // Calling convention
 ManagedRegister MipsManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
   return MipsManagedRegister::FromCoreRegister(T9);
@@ -161,21 +187,14 @@
     cur_reg++;  // bump the iterator for every argument
   }
   padding_ = padding;
-
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(S2));
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(S3));
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(S4));
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(S5));
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(S6));
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(S7));
-  callee_save_regs_.push_back(MipsManagedRegister::FromCoreRegister(FP));
 }
 
 uint32_t MipsJniCallingConvention::CoreSpillMask() const {
-  // Compute spill mask to agree with callee saves initialized in the constructor
-  uint32_t result = 0;
-  result = 1 << S2 | 1 << S3 | 1 << S4 | 1 << S5 | 1 << S6 | 1 << S7 | 1 << FP | 1 << RA;
-  return result;
+  return kCoreCalleeSpillMask;
+}
+
+uint32_t MipsJniCallingConvention::FpSpillMask() const {
+  return kFpCalleeSpillMask;
 }
 
 ManagedRegister MipsJniCallingConvention::ReturnScratchRegister() const {
@@ -196,6 +215,10 @@
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_, kStackAlignment);
 }
 
+ArrayRef<const ManagedRegister> MipsJniCallingConvention::CalleeSaveRegisters() const {
+  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+}
+
 // JniCallingConvention ABI follows AAPCS where longs and doubles must occur
 // in even register numbers and stack slots
 void MipsJniCallingConvention::Next() {
diff --git a/compiler/jni/quick/mips/calling_convention_mips.h b/compiler/jni/quick/mips/calling_convention_mips.h
index dc45432..5c128b0 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.h
+++ b/compiler/jni/quick/mips/calling_convention_mips.h
@@ -58,14 +58,10 @@
   void Next() OVERRIDE;  // Override default behavior for AAPCS
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
-  const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
-    return callee_save_regs_;
-  }
+  ArrayRef<const ManagedRegister> CalleeSaveRegisters() const OVERRIDE;
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;  // Floats aren't spilled in JNI down call
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
@@ -80,9 +76,6 @@
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
  private:
-  // TODO: these values aren't unique and can be shared amongst instances
-  std::vector<ManagedRegister> callee_save_regs_;
-
   // Padding to ensure longs and doubles are not split in AAPCS
   size_t padding_;
 
diff --git a/compiler/jni/quick/mips64/calling_convention_mips64.cc b/compiler/jni/quick/mips64/calling_convention_mips64.cc
index 807d740..f2e1da8 100644
--- a/compiler/jni/quick/mips64/calling_convention_mips64.cc
+++ b/compiler/jni/quick/mips64/calling_convention_mips64.cc
@@ -31,6 +31,33 @@
   F12, F13, F14, F15, F16, F17, F18, F19
 };
 
+static constexpr ManagedRegister kCalleeSaveRegisters[] = {
+    // Core registers.
+    Mips64ManagedRegister::FromGpuRegister(S2),
+    Mips64ManagedRegister::FromGpuRegister(S3),
+    Mips64ManagedRegister::FromGpuRegister(S4),
+    Mips64ManagedRegister::FromGpuRegister(S5),
+    Mips64ManagedRegister::FromGpuRegister(S6),
+    Mips64ManagedRegister::FromGpuRegister(S7),
+    Mips64ManagedRegister::FromGpuRegister(GP),
+    Mips64ManagedRegister::FromGpuRegister(S8),
+    // No hard float callee saves.
+};
+
+static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+  // RA is a special callee save which is not reported by CalleeSaveRegisters().
+  uint32_t result = 1 << RA;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsMips64().IsGpuRegister()) {
+      result |= (1 << r.AsMips64().AsGpuRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kFpCalleeSpillMask = 0u;
+
 // Calling convention
 ManagedRegister Mips64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
   return Mips64ManagedRegister::FromGpuRegister(T9);
@@ -126,22 +153,14 @@
 Mips64JniCallingConvention::Mips64JniCallingConvention(bool is_static, bool is_synchronized,
                                                        const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S2));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S3));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S4));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S5));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S6));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S7));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(GP));
-  callee_save_regs_.push_back(Mips64ManagedRegister::FromGpuRegister(S8));
 }
 
 uint32_t Mips64JniCallingConvention::CoreSpillMask() const {
-  // Compute spill mask to agree with callee saves initialized in the constructor
-  uint32_t result = 0;
-  result = 1 << S2 | 1 << S3 | 1 << S4 | 1 << S5 | 1 << S6 | 1 << S7 | 1 << GP | 1 << S8 | 1 << RA;
-  DCHECK_EQ(static_cast<size_t>(POPCOUNT(result)), callee_save_regs_.size() + 1);
-  return result;
+  return kCoreCalleeSpillMask;
+}
+
+uint32_t Mips64JniCallingConvention::FpSpillMask() const {
+  return kFpCalleeSpillMask;
 }
 
 ManagedRegister Mips64JniCallingConvention::ReturnScratchRegister() const {
@@ -162,6 +181,10 @@
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
+ArrayRef<const ManagedRegister> Mips64JniCallingConvention::CalleeSaveRegisters() const {
+  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+}
+
 bool Mips64JniCallingConvention::IsCurrentParamInRegister() {
   return itr_args_ < 8;
 }
diff --git a/compiler/jni/quick/mips64/calling_convention_mips64.h b/compiler/jni/quick/mips64/calling_convention_mips64.h
index 3d6aab7..99ea3cd 100644
--- a/compiler/jni/quick/mips64/calling_convention_mips64.h
+++ b/compiler/jni/quick/mips64/calling_convention_mips64.h
@@ -57,14 +57,10 @@
   // JNI calling convention
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
-  const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
-    return callee_save_regs_;
-  }
+  ArrayRef<const ManagedRegister> CalleeSaveRegisters() const OVERRIDE;
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;  // Floats aren't spilled in JNI down call
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
@@ -79,9 +75,6 @@
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
  private:
-  // TODO: these values aren't unique and can be shared amongst instances
-  std::vector<ManagedRegister> callee_save_regs_;
-
   DISALLOW_COPY_AND_ASSIGN(Mips64JniCallingConvention);
 };
 
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 322caca..22c7cd0 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -23,6 +23,28 @@
 namespace art {
 namespace x86 {
 
+static constexpr ManagedRegister kCalleeSaveRegisters[] = {
+    // Core registers.
+    X86ManagedRegister::FromCpuRegister(EBP),
+    X86ManagedRegister::FromCpuRegister(ESI),
+    X86ManagedRegister::FromCpuRegister(EDI),
+    // No hard float callee saves.
+};
+
+static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+  // The spilled PC gets a special marker.
+  uint32_t result = 1 << kNumberOfCpuRegisters;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsX86().IsCpuRegister()) {
+      result |= (1 << r.AsX86().AsCpuRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kFpCalleeSpillMask = 0u;
+
 // Calling convention
 
 ManagedRegister X86ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
@@ -169,13 +191,14 @@
 X86JniCallingConvention::X86JniCallingConvention(bool is_static, bool is_synchronized,
                                                  const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  callee_save_regs_.push_back(X86ManagedRegister::FromCpuRegister(EBP));
-  callee_save_regs_.push_back(X86ManagedRegister::FromCpuRegister(ESI));
-  callee_save_regs_.push_back(X86ManagedRegister::FromCpuRegister(EDI));
 }
 
 uint32_t X86JniCallingConvention::CoreSpillMask() const {
-  return 1 << EBP | 1 << ESI | 1 << EDI | 1 << kNumberOfCpuRegisters;
+  return kCoreCalleeSpillMask;
+}
+
+uint32_t X86JniCallingConvention::FpSpillMask() const {
+  return kFpCalleeSpillMask;
 }
 
 size_t X86JniCallingConvention::FrameSize() {
@@ -192,6 +215,10 @@
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
+ArrayRef<const ManagedRegister> X86JniCallingConvention::CalleeSaveRegisters() const {
+  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+}
+
 bool X86JniCallingConvention::IsCurrentParamInRegister() {
   return false;  // Everything is passed by stack.
 }
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index cdf0956..9d678b7 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -59,14 +59,10 @@
   // JNI calling convention
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
-  const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
-    return callee_save_regs_;
-  }
+  ArrayRef<const ManagedRegister> CalleeSaveRegisters() const OVERRIDE;
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
@@ -81,9 +77,6 @@
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
  private:
-  // TODO: these values aren't unique and can be shared amongst instances
-  std::vector<ManagedRegister> callee_save_regs_;
-
   DISALLOW_COPY_AND_ASSIGN(X86JniCallingConvention);
 };
 
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index b6b11ca..cc4d232 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -24,6 +24,45 @@
 namespace art {
 namespace x86_64 {
 
+static constexpr ManagedRegister kCalleeSaveRegisters[] = {
+    // Core registers.
+    X86_64ManagedRegister::FromCpuRegister(RBX),
+    X86_64ManagedRegister::FromCpuRegister(RBP),
+    X86_64ManagedRegister::FromCpuRegister(R12),
+    X86_64ManagedRegister::FromCpuRegister(R13),
+    X86_64ManagedRegister::FromCpuRegister(R14),
+    X86_64ManagedRegister::FromCpuRegister(R15),
+    // Hard float registers.
+    X86_64ManagedRegister::FromXmmRegister(XMM12),
+    X86_64ManagedRegister::FromXmmRegister(XMM13),
+    X86_64ManagedRegister::FromXmmRegister(XMM14),
+    X86_64ManagedRegister::FromXmmRegister(XMM15),
+};
+
+static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+  // The spilled PC gets a special marker.
+  uint32_t result = 1 << kNumberOfCpuRegisters;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsX86_64().IsCpuRegister()) {
+      result |= (1 << r.AsX86_64().AsCpuRegister().AsRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t CalculateFpCalleeSpillMask() {
+  uint32_t result = 0;
+  for (auto&& r : kCalleeSaveRegisters) {
+    if (r.AsX86_64().IsXmmRegister()) {
+      result |= (1 << r.AsX86_64().AsXmmRegister().AsFloatRegister());
+    }
+  }
+  return result;
+}
+
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kFpCalleeSpillMask = CalculateFpCalleeSpillMask();
+
 // Calling convention
 
 ManagedRegister X86_64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
@@ -125,25 +164,14 @@
 X86_64JniCallingConvention::X86_64JniCallingConvention(bool is_static, bool is_synchronized,
                                                        const char* shorty)
     : JniCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(RBX));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(RBP));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R12));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R13));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R14));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R15));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM12));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM13));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM14));
-  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM15));
 }
 
 uint32_t X86_64JniCallingConvention::CoreSpillMask() const {
-  return 1 << RBX | 1 << RBP | 1 << R12 | 1 << R13 | 1 << R14 | 1 << R15 |
-      1 << kNumberOfCpuRegisters;
+  return kCoreCalleeSpillMask;
 }
 
 uint32_t X86_64JniCallingConvention::FpSpillMask() const {
-  return 1 << XMM12 | 1 << XMM13 | 1 << XMM14 | 1 << XMM15;
+  return kFpCalleeSpillMask;
 }
 
 size_t X86_64JniCallingConvention::FrameSize() {
@@ -160,6 +188,10 @@
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
+ArrayRef<const ManagedRegister> X86_64JniCallingConvention::CalleeSaveRegisters() const {
+  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+}
+
 bool X86_64JniCallingConvention::IsCurrentParamInRegister() {
   return !IsCurrentParamOnStack();
 }
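
Unlike the ARM and MIPS variants, which add LR/RA to the mask separately, the x86/x86-64 masks reserve bit kNumberOfCpuRegisters as a marker for the spilled return PC, above all real registers. A sketch of the resulting x86-64 value, assuming the usual register numbering (RBX == 3, RBP == 5, R12..R15 == 12..15, kNumberOfCpuRegisters == 16); illustration only:

    static_assert(kCoreCalleeSpillMask ==
                      (1u << 3 | 1u << 5 | 1u << 12 | 1u << 13 | 1u << 14 | 1u << 15 |
                       1u << 16 /* spilled-PC marker */),
                  "must match the old hard-coded mask");
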
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index 6e47c9f..e2d3d48 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -55,9 +55,7 @@
   // JNI calling convention
   size_t FrameSize() OVERRIDE;
   size_t OutArgSize() OVERRIDE;
-  const std::vector<ManagedRegister>& CalleeSaveRegisters() const OVERRIDE {
-    return callee_save_regs_;
-  }
+  ArrayRef<const ManagedRegister> CalleeSaveRegisters() const OVERRIDE;
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
   uint32_t FpSpillMask() const OVERRIDE;
@@ -75,9 +73,6 @@
   size_t NumberOfOutgoingStackArgs() OVERRIDE;
 
  private:
-  // TODO: these values aren't unique and can be shared amongst instances
-  std::vector<ManagedRegister> callee_save_regs_;
-
   DISALLOW_COPY_AND_ASSIGN(X86_64JniCallingConvention);
 };
 
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index d4dd978..2471f79 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -31,10 +31,6 @@
 }
 
 uint32_t ArmBaseRelativePatcher::ReserveSpaceEnd(uint32_t offset) {
-  // NOTE: The final thunk can be reserved from InitCodeMethodVisitor::EndClass() while it
-  // may be written early by WriteCodeMethodVisitor::VisitMethod() for a deduplicated chunk
-  // of code. To avoid any alignment discrepancies for the final chunk, we always align the
-  // offset after reserving of writing any chunk.
   uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
   bool needs_thunk = ReserveSpaceProcessPatches(aligned_offset,
                                                 MethodReference(nullptr, 0u),
@@ -46,7 +42,7 @@
     unprocessed_patches_.clear();
 
     thunk_locations_.push_back(aligned_offset);
-    offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), instruction_set_);
+    offset = aligned_offset + thunk_code_.size();
   }
   return offset;
 }
@@ -65,13 +61,7 @@
     if (UNLIKELY(!WriteRelCallThunk(out, ArrayRef<const uint8_t>(thunk_code_)))) {
       return 0u;
     }
-    uint32_t thunk_end_offset = aligned_offset + thunk_code_.size();
-    // Align after writing chunk, see the ReserveSpace() above.
-    offset = CompiledMethod::AlignCode(thunk_end_offset, instruction_set_);
-    aligned_code_delta = offset - thunk_end_offset;
-    if (aligned_code_delta != 0u && !WriteCodeAlignment(out, aligned_code_delta)) {
-      return 0u;
-    }
+    offset = aligned_offset + thunk_code_.size();
   }
   return offset;
 }
@@ -92,7 +82,7 @@
                                                       MethodReference method_ref,
                                                       uint32_t max_extra_space) {
   uint32_t quick_code_size = compiled_method->GetQuickCode().size();
-  uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
+  uint32_t quick_code_offset = compiled_method->AlignCode(offset + sizeof(OatQuickMethodHeader));
   uint32_t next_aligned_offset = compiled_method->AlignCode(quick_code_offset + quick_code_size);
   // Adjust for extra space required by the subclass.
   next_aligned_offset = compiled_method->AlignCode(next_aligned_offset + max_extra_space);
@@ -106,9 +96,9 @@
     if (needs_thunk) {
       // A single thunk will cover all pending patches.
       unprocessed_patches_.clear();
-      uint32_t thunk_location = compiled_method->AlignCode(offset);
+      uint32_t thunk_location = CompiledMethod::AlignCode(offset, instruction_set_);
       thunk_locations_.push_back(thunk_location);
-      offset = CompiledMethod::AlignCode(thunk_location + thunk_code_.size(), instruction_set_);
+      offset = thunk_location + thunk_code_.size();
     }
   }
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index a8078e3..eace3d4 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -48,18 +48,18 @@
                              const ArrayRef<const LinkerPatch>& method3_patches,
                              uint32_t distance_without_thunks) {
     CHECK_EQ(distance_without_thunks % kArmAlignment, 0u);
-    const uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kThumb2) + sizeof(OatQuickMethodHeader);
+    uint32_t method1_offset =
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     AddCompiledMethod(MethodRef(1u), method1_code, method1_patches);
 
     // We want to put the method3 at a very precise offset.
     const uint32_t method3_offset = method1_offset + distance_without_thunks;
-    CHECK_ALIGNED(method3_offset - sizeof(OatQuickMethodHeader), kArmAlignment);
+    CHECK_ALIGNED(method3_offset, kArmAlignment);
 
     // Calculate size of method2 so that we put method3 at the correct place.
+    const uint32_t method1_end = method1_offset + method1_code.size();
     const uint32_t method2_offset =
-        CompiledCode::AlignCode(method1_offset + method1_code.size(), kThumb2) +
-        sizeof(OatQuickMethodHeader);
+        method1_end + CodeAlignmentSize(method1_end) + sizeof(OatQuickMethodHeader);
     const uint32_t method2_size = (method3_offset - sizeof(OatQuickMethodHeader) - method2_offset);
     std::vector<uint8_t> method2_raw_code(method2_size);
     ArrayRef<const uint8_t> method2_code(method2_raw_code);
@@ -78,8 +78,11 @@
     if (result3.second == method3_offset + 1 /* thumb mode */) {
       return false;  // No thunk.
     } else {
-      uint32_t aligned_thunk_size = CompiledCode::AlignCode(ThunkSize(), kThumb2);
-      CHECK_EQ(result3.second, method3_offset + aligned_thunk_size + 1 /* thumb mode */);
+      uint32_t thunk_end =
+          CompiledCode::AlignCode(method3_offset - sizeof(OatQuickMethodHeader), kThumb2) +
+          ThunkSize();
+      uint32_t header_offset = thunk_end + CodeAlignmentSize(thunk_end);
+      CHECK_EQ(result3.second, header_offset + sizeof(OatQuickMethodHeader) + 1 /* thumb mode */);
       return true;   // Thunk present.
     }
   }
@@ -352,9 +355,12 @@
 
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t method3_offset = GetMethodOffset(3u);
+  ASSERT_TRUE(IsAligned<kArmAlignment>(method3_offset));
   uint32_t method3_header_offset = method3_offset - sizeof(OatQuickMethodHeader);
-  ASSERT_TRUE(IsAligned<kArmAlignment>(method3_header_offset));
-  uint32_t thunk_offset = method3_header_offset - CompiledCode::AlignCode(ThunkSize(), kThumb2);
+  uint32_t thunk_offset =
+      RoundDown(method3_header_offset - ThunkSize(), GetInstructionSetAlignment(kThumb2));
+  DCHECK_EQ(thunk_offset + ThunkSize() + CodeAlignmentSize(thunk_offset + ThunkSize()),
+            method3_header_offset);
   ASSERT_TRUE(IsAligned<kArmAlignment>(thunk_offset));
   uint32_t diff = thunk_offset - (method1_offset + bl_offset_in_method1 + 4u /* PC adjustment */);
   ASSERT_EQ(diff & 1u, 0u);
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index b4ecbd8..84aab78 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -83,7 +83,7 @@
 
   // Now that we have the actual offset where the code will be placed, locate the ADRP insns
   // that actually require the thunk.
-  uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
+  uint32_t quick_code_offset = compiled_method->AlignCode(offset + sizeof(OatQuickMethodHeader));
   ArrayRef<const uint8_t> code = compiled_method->GetQuickCode();
   uint32_t thunk_offset = compiled_method->AlignCode(quick_code_offset + code.size());
   DCHECK(compiled_method != nullptr);
@@ -210,7 +210,14 @@
   } else {
     if ((insn & 0xfffffc00) == 0x91000000) {
       // ADD immediate, 64-bit with imm12 == 0 (unset).
-      DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative) << patch.GetType();
+      if (!kEmitCompilerReadBarrier) {
+        DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative) << patch.GetType();
+      } else {
+        // With the read barrier (non-Baker) enabled, it could be kDexCacheArray in the
+        // HLoadString::LoadKind::kDexCachePcRelative case of VisitLoadString().
+        DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
+               patch.GetType() == LinkerPatch::Type::kDexCacheArray) << patch.GetType();
+      }
       shift = 0u;  // No shift for ADD.
     } else {
       // LDR 32-bit or 64-bit with imm12 == 0 (unset).
diff --git a/compiler/linker/arm64/relative_patcher_arm64_test.cc b/compiler/linker/arm64/relative_patcher_arm64_test.cc
index 09729fd..573de73 100644
--- a/compiler/linker/arm64/relative_patcher_arm64_test.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64_test.cc
@@ -67,36 +67,39 @@
                                  const ArrayRef<const LinkerPatch>& last_method_patches,
                                  uint32_t distance_without_thunks) {
     CHECK_EQ(distance_without_thunks % kArm64Alignment, 0u);
-    const uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+    uint32_t method1_offset =
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     AddCompiledMethod(MethodRef(1u), method1_code, method1_patches);
-    const uint32_t gap_start =
-        CompiledCode::AlignCode(method1_offset + method1_code.size(), kArm64);
+    const uint32_t gap_start = method1_offset + method1_code.size();
 
     // We want to put the method3 at a very precise offset.
     const uint32_t last_method_offset = method1_offset + distance_without_thunks;
+    CHECK_ALIGNED(last_method_offset, kArm64Alignment);
     const uint32_t gap_end = last_method_offset - sizeof(OatQuickMethodHeader);
-    CHECK_ALIGNED(gap_end, kArm64Alignment);
 
-    // Fill the gap with intermediate methods in chunks of 2MiB and the last in [2MiB, 4MiB).
+    // Fill the gap with intermediate methods in chunks of 2MiB and the first in [2MiB, 4MiB).
     // (This allows deduplicating the small chunks to avoid using 256MiB of memory for +-128MiB
-    // offsets by this test.)
+    // offsets by this test. Making the first chunk bigger makes it easy to give all
+    // intermediate methods the same end alignment, so thunk insertion adds a predictable
+    // size as long as it comes after the first chunk.)
     uint32_t method_idx = 2u;
     constexpr uint32_t kSmallChunkSize = 2 * MB;
     std::vector<uint8_t> gap_code;
-    size_t gap_size = gap_end - gap_start;
-    for (; gap_size >= 2u * kSmallChunkSize; gap_size -= kSmallChunkSize) {
-      uint32_t chunk_code_size = kSmallChunkSize - sizeof(OatQuickMethodHeader);
+    uint32_t gap_size = gap_end - gap_start;
+    uint32_t num_small_chunks = std::max(gap_size / kSmallChunkSize, 1u) - 1u;
+    uint32_t chunk_start = gap_start;
+    uint32_t chunk_size = gap_size - num_small_chunks * kSmallChunkSize;
+    for (uint32_t i = 0; i <= num_small_chunks; ++i) {  // num_small_chunks+1 iterations.
+      uint32_t chunk_code_size =
+          chunk_size - CodeAlignmentSize(chunk_start) - sizeof(OatQuickMethodHeader);
       gap_code.resize(chunk_code_size, 0u);
       AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(gap_code),
                         ArrayRef<const LinkerPatch>());
       method_idx += 1u;
+      chunk_start += chunk_size;
+      chunk_size = kSmallChunkSize;  // For all but the first chunk.
+      DCHECK_EQ(CodeAlignmentSize(gap_end), CodeAlignmentSize(chunk_start));
     }
-    uint32_t chunk_code_size = gap_size - sizeof(OatQuickMethodHeader);
-    gap_code.resize(chunk_code_size, 0u);
-    AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(gap_code),
-                      ArrayRef<const LinkerPatch>());
-    method_idx += 1u;
 
     // Add the last method and link
     AddCompiledMethod(MethodRef(method_idx), last_method_code, last_method_patches);
@@ -109,8 +112,9 @@
     // There may be a thunk before method2.
     if (last_result.second != last_method_offset) {
       // Thunk present. Check that there's only one.
-      uint32_t aligned_thunk_size = CompiledCode::AlignCode(ThunkSize(), kArm64);
-      CHECK_EQ(last_result.second, last_method_offset + aligned_thunk_size);
+      uint32_t thunk_end = CompiledCode::AlignCode(gap_end, kArm64) + ThunkSize();
+      uint32_t header_offset = thunk_end + CodeAlignmentSize(thunk_end);
+      CHECK_EQ(last_result.second, header_offset + sizeof(OatQuickMethodHeader));
     }
     return method_idx;
   }
@@ -341,7 +345,7 @@
                         uint32_t dex_cache_arrays_begin,
                         uint32_t element_offset) {
     uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     ASSERT_LT(method1_offset, adrp_offset);
     CHECK_ALIGNED(adrp_offset, 4u);
     uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
@@ -391,7 +395,7 @@
                         bool has_thunk,
                         uint32_t string_offset) {
     uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     ASSERT_LT(method1_offset, adrp_offset);
     CHECK_ALIGNED(adrp_offset, 4u);
     uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
@@ -614,10 +618,12 @@
 
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t last_method_offset = GetMethodOffset(last_method_idx);
+  ASSERT_TRUE(IsAligned<kArm64Alignment>(last_method_offset));
   uint32_t last_method_header_offset = last_method_offset - sizeof(OatQuickMethodHeader);
-  ASSERT_TRUE(IsAligned<kArm64Alignment>(last_method_header_offset));
-  uint32_t thunk_offset = last_method_header_offset - CompiledCode::AlignCode(ThunkSize(), kArm64);
-  ASSERT_TRUE(IsAligned<kArm64Alignment>(thunk_offset));
+  uint32_t thunk_offset =
+      RoundDown(last_method_header_offset - ThunkSize(), GetInstructionSetAlignment(kArm64));
+  DCHECK_EQ(thunk_offset + ThunkSize() + CodeAlignmentSize(thunk_offset + ThunkSize()),
+            last_method_header_offset);
   uint32_t diff = thunk_offset - (method1_offset + bl_offset_in_method1);
   CHECK_ALIGNED(diff, 4u);
   ASSERT_LT(diff, 128 * MB);
diff --git a/compiler/linker/relative_patcher_test.h b/compiler/linker/relative_patcher_test.h
index ec69107..d21f33e 100644
--- a/compiler/linker/relative_patcher_test.h
+++ b/compiler/linker/relative_patcher_test.h
@@ -98,6 +98,14 @@
         patches));
   }
 
+  uint32_t CodeAlignmentSize(uint32_t header_offset_to_align) {
+    // We want to align the code rather than the preheader.
+    uint32_t unaligned_code_offset = header_offset_to_align + sizeof(OatQuickMethodHeader);
+    uint32_t aligned_code_offset =
+        CompiledMethod::AlignCode(unaligned_code_offset, instruction_set_);
+    return aligned_code_offset - unaligned_code_offset;
+  }
+
   void Link() {
     // Reserve space.
     static_assert(kTrampolineOffset == 0u, "Unexpected trampoline offset.");
@@ -106,9 +114,8 @@
     for (auto& compiled_method : compiled_methods_) {
       offset = patcher_->ReserveSpace(offset, compiled_method.get(), compiled_method_refs_[idx]);
 
-      uint32_t aligned_offset = compiled_method->AlignCode(offset);
-      uint32_t aligned_code_delta = aligned_offset - offset;
-      offset += aligned_code_delta;
+      uint32_t alignment_size = CodeAlignmentSize(offset);
+      offset += alignment_size;
 
       offset += sizeof(OatQuickMethodHeader);
       uint32_t quick_code_offset = offset + compiled_method->CodeDelta();
@@ -136,11 +143,10 @@
     for (auto& compiled_method : compiled_methods_) {
       offset = patcher_->WriteThunks(&out_, offset);
 
-      uint32_t aligned_offset = compiled_method->AlignCode(offset);
-      uint32_t aligned_code_delta = aligned_offset - offset;
-      CHECK_LE(aligned_code_delta, sizeof(kPadding));
-      out_.WriteFully(kPadding, aligned_code_delta);
-      offset += aligned_code_delta;
+      uint32_t alignment_size = CodeAlignmentSize(offset);
+      CHECK_LE(alignment_size, sizeof(kPadding));
+      out_.WriteFully(kPadding, alignment_size);
+      offset += alignment_size;
 
       out_.WriteFully(dummy_header, sizeof(OatQuickMethodHeader));
       offset += sizeof(OatQuickMethodHeader);
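
CodeAlignmentSize() pads so that the code, rather than the OatQuickMethodHeader preceding it, lands on the instruction-set alignment. A worked example with assumed numbers (64-byte code alignment as on arm64, 32-byte header); illustration only:

    constexpr uint32_t Align64(uint32_t v) { return (v + 63u) & ~63u; }
    constexpr uint32_t kHeaderSize = 32;        // assumed size of OatQuickMethodHeader
    constexpr uint32_t header_offset = 0x104;   // incoming unaligned offset
    // Padding = aligned code offset - unaligned code offset = 0x140 - 0x124 = 0x1c,
    // so the header starts at 0x120 (deliberately unaligned) and the code at 0x140.
    static_assert(Align64(header_offset + kHeaderSize) - (header_offset + kHeaderSize) == 0x1c,
                  "28 bytes of padding inserted before the header");
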
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index a02c024..d6f29f9 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -86,6 +86,13 @@
   OatHeader* const oat_header_;
 };
 
+inline uint32_t CodeAlignmentSize(uint32_t header_offset, const CompiledMethod& compiled_method) {
+  // We want to align the code rather than the preheader.
+  uint32_t unaligned_code_offset = header_offset + sizeof(OatQuickMethodHeader);
+  uint32_t aligned_code_offset = compiled_method.AlignCode(unaligned_code_offset);
+  return aligned_code_offset - unaligned_code_offset;
+}
+
 }  // anonymous namespace
 
 // Defines the location of the raw dex file to write.
@@ -816,8 +823,8 @@
                               uint32_t thumb_offset) {
     offset_ = writer_->relative_patcher_->ReserveSpace(
         offset_, compiled_method, MethodReference(dex_file_, it.GetMemberIndex()));
-    offset_ = compiled_method->AlignCode(offset_);
-    DCHECK_ALIGNED_PARAM(offset_,
+    offset_ += CodeAlignmentSize(offset_, *compiled_method);
+    DCHECK_ALIGNED_PARAM(offset_ + sizeof(OatQuickMethodHeader),
                          GetInstructionSetAlignment(compiled_method->GetInstructionSet()));
     return offset_ + sizeof(OatQuickMethodHeader) + thumb_offset;
   }
@@ -1010,17 +1017,16 @@
           ReportWriteFailure("relative call thunk", it);
           return false;
         }
-        uint32_t aligned_offset = compiled_method->AlignCode(offset_);
-        uint32_t aligned_code_delta = aligned_offset - offset_;
-        if (aligned_code_delta != 0) {
-          if (!writer_->WriteCodeAlignment(out, aligned_code_delta)) {
+        uint32_t alignment_size = CodeAlignmentSize(offset_, *compiled_method);
+        if (alignment_size != 0) {
+          if (!writer_->WriteCodeAlignment(out, alignment_size)) {
             ReportWriteFailure("code alignment padding", it);
             return false;
           }
-          offset_ += aligned_code_delta;
+          offset_ += alignment_size;
           DCHECK_OFFSET_();
         }
-        DCHECK_ALIGNED_PARAM(offset_,
+        DCHECK_ALIGNED_PARAM(offset_ + sizeof(OatQuickMethodHeader),
                              GetInstructionSetAlignment(compiled_method->GetInstructionSet()));
         DCHECK_EQ(method_offsets.code_offset_,
                   offset_ + sizeof(OatQuickMethodHeader) + compiled_method->CodeDelta())
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index f0c4eaf..582c1e3 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -891,14 +891,15 @@
 
   static bool HasSameInputAtBackEdges(HPhi* phi) {
     DCHECK(phi->IsLoopHeaderPhi());
+    auto&& inputs = phi->GetInputs();
     // Start with input 1. Input 0 is from the incoming block.
-    HInstruction* input1 = phi->InputAt(1);
+    HInstruction* input1 = inputs[1];
     DCHECK(phi->GetBlock()->GetLoopInformation()->IsBackEdge(
         *phi->GetBlock()->GetPredecessors()[1]));
-    for (size_t i = 2, e = phi->InputCount(); i < e; ++i) {
+    for (size_t i = 2; i < inputs.size(); ++i) {
       DCHECK(phi->GetBlock()->GetLoopInformation()->IsBackEdge(
           *phi->GetBlock()->GetPredecessors()[i]));
-      if (input1 != phi->InputAt(i)) {
+      if (input1 != inputs[i]) {
         return false;
       }
     }
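This hunk is one instance of a pattern repeated throughout the patch: bind the inputs view once via GetInputs() and use size()/operator[], instead of calling InputCount()/InputAt(i) on every iteration. A toy model, with std::vector standing in for ART's ArrayRef-like view:

    #include <cstddef>
    #include <vector>

    struct Node {
      std::vector<int> inputs;
      const std::vector<int>& GetInputs() const { return inputs; }  // cheap view
    };

    bool AllEqualFromSecond(const Node& n) {
      auto&& inputs = n.GetInputs();  // fetched once, indexed thereafter
      for (size_t i = 2; i < inputs.size(); ++i) {
        if (inputs[i] != inputs[1]) return false;
      }
      return true;
    }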
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index e7fa4e4..895cb47 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -50,6 +50,7 @@
 #include "mirror/array-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/object_reference.h"
+#include "mirror/string.h"
 #include "parallel_move_resolver.h"
 #include "ssa_liveness_analysis.h"
 #include "utils/assembler.h"
@@ -110,10 +111,10 @@
         << " " << locations->Out();
   }
 
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
-    DCHECK(CheckType(instruction->InputAt(i)->GetType(), locations->InAt(i)))
-      << instruction->InputAt(i)->GetType()
-      << " " << locations->InAt(i);
+  auto&& inputs = instruction->GetInputs();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    DCHECK(CheckType(inputs[i]->GetType(), locations->InAt(i)))
+      << inputs[i]->GetType() << " " << locations->InAt(i);
   }
 
   HEnvironment* environment = instruction->GetEnvironment();
@@ -139,6 +140,12 @@
   return pointer_size * index;
 }
 
+uint32_t CodeGenerator::GetArrayLengthOffset(HArrayLength* array_length) {
+  return array_length->IsStringLength()
+      ? mirror::String::CountOffset().Uint32Value()
+      : mirror::Array::LengthOffset().Uint32Value();
+}
+
 bool CodeGenerator::GoesToNextBlock(HBasicBlock* current, HBasicBlock* next) const {
   DCHECK_EQ((*block_order_)[current_block_index_], current);
   return GetNextBlockToEmit() == FirstNonEmptyBlock(next);
@@ -277,7 +284,8 @@
   DCHECK(!block_order.empty());
   DCHECK(block_order[0] == GetGraph()->GetEntryBlock());
   ComputeSpillMask();
-  first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
+  first_register_slot_in_slow_path_ = RoundUp(
+      (number_of_out_slots + number_of_spill_slots) * kVRegSize, GetPreferredSlotsAlignment());
 
   if (number_of_spill_slots == 0
       && !HasAllocatedCalleeSaveRegisters()
@@ -288,8 +296,7 @@
     SetFrameSize(CallPushesPC() ? GetWordSize() : 0);
   } else {
     SetFrameSize(RoundUp(
-        number_of_spill_slots * kVRegSize
-        + number_of_out_slots * kVRegSize
+        first_register_slot_in_slow_path_
         + maximum_number_of_live_core_registers * GetWordSize()
         + maximum_number_of_live_fpu_registers * GetFloatingPointSpillSlotSize()
         + FrameEntrySpillSize(),
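The new RoundUp of first_register_slot_in_slow_path_ exists so a backend can demand an alignment for the slow-path register-save area; ARM64 (see its header hunk below) overrides GetPreferredSlotsAlignment to the X-register size so STP/LDP offsets are encodable. A worked example under assumed slot counts:

    #include <cassert>
    #include <cstdint>

    uint32_t RoundUp(uint32_t x, uint32_t n) { return (x + n - 1) & ~(n - 1); }

    int main() {
      constexpr uint32_t kVRegSize = 4;
      uint32_t slots_end = (3 /* out */ + 2 /* spill */) * kVRegSize;  // 20 bytes
      // Old layout: register saves started at 20, an offset ARM64 STP/LDP of
      // X registers (8-byte granularity) cannot address directly.
      uint32_t first_register_slot = RoundUp(slots_end, 8 /* kXRegSizeInBytes */);
      assert(first_register_slot == 24);
    }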
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index d69c410..90c08d5 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -211,6 +211,8 @@
                                 size_t maximum_number_of_live_fpu_registers,
                                 size_t number_of_out_slots,
                                 const ArenaVector<HBasicBlock*>& block_order);
+  // Backends can override this as necessary. For most, no special alignment is required.
+  virtual uint32_t GetPreferredSlotsAlignment() const { return 1; }
 
   uint32_t GetFrameSize() const { return frame_size_; }
   void SetFrameSize(uint32_t size) { frame_size_ = size; }
@@ -340,6 +342,11 @@
   // Pointer variant for ArtMethod and ArtField arrays.
   size_t GetCachePointerOffset(uint32_t index);
 
+  // Helper that returns the offset of the array's length field.
+  // Note: Besides the normal arrays, we also use the HArrayLength for
+  // accessing the String's `count` field in String intrinsics.
+  static uint32_t GetArrayLengthOffset(HArrayLength* array_length);
+
   void EmitParallelMoves(Location from1,
                          Location to1,
                          Primitive::Type type1,
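Per the comment just added, HArrayLength now serves double duty: ordinary array lengths and the String `count` field used by String intrinsics, so every backend's VisitArrayLength (see the per-architecture hunks below) switches from hard-coding mirror::Array::LengthOffset() to this helper. A toy dispatch, with placeholder offsets rather than ART's real object layout:

    #include <cstdint>

    constexpr uint32_t kArrayLengthOffset = 8;  // placeholder, not ART's layout
    constexpr uint32_t kStringCountOffset = 8;  // placeholder, not ART's layout

    uint32_t GetArrayLengthOffset(bool is_string_length) {
      return is_string_length ? kStringCountOffset : kArrayLengthOffset;
    }

    int main() { return GetArrayLengthOffset(true) == kStringCountOffset ? 0 : 1; }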
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 45e9b58..731ee39 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -426,7 +426,9 @@
            instruction_->IsLoadClass() ||
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast())
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
+            instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
@@ -489,8 +491,12 @@
     Register reg_out = out_.AsRegister<Register>();
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
-    DCHECK(!instruction_->IsInvoke() ||
-           (instruction_->IsInvokeStaticOrDirect() &&
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
             instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for heap reference slow path: "
         << instruction_->DebugName();
@@ -503,7 +509,7 @@
     // introduce a copy of it, `index`.
     Location index = index_;
     if (index_.IsValid()) {
-      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics.
       if (instruction_->IsArrayGet()) {
         // Compute the actual memory offset and store it in `index`.
         Register index_reg = index_.AsRegister<Register>();
@@ -551,7 +557,11 @@
             "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
         __ AddConstant(index_reg, index_reg, offset_);
       } else {
-        DCHECK(instruction_->IsInvoke());
+        // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile
+        // intrinsics, `index_` is not shifted by a scale factor of 2
+        // (as in the case of ArrayGet), as it is actually an offset
+        // to an object field within an object.
+        DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
         DCHECK(instruction_->GetLocations()->Intrinsified());
         DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
                (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
@@ -1253,6 +1263,44 @@
 void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) {
 }
 
+void InstructionCodeGeneratorARM::GenerateVcmp(HInstruction* instruction) {
+  Primitive::Type type = instruction->InputAt(0)->GetType();
+  Location lhs_loc = instruction->GetLocations()->InAt(0);
+  Location rhs_loc = instruction->GetLocations()->InAt(1);
+  if (rhs_loc.IsConstant()) {
+    // 0.0 is the only immediate that can be encoded directly in
+    // a VCMP instruction.
+    //
+    // Both the JLS (section 15.20.1) and the JVMS (section 6.5)
+    // specify that in a floating-point comparison, positive zero
+    // and negative zero are considered equal, so we can use the
+    // literal 0.0 for both cases here.
+    //
+    // Note however that some methods (Float.equals, Float.compare,
+    // Float.compareTo, Double.equals, Double.compare,
+    // Double.compareTo, Math.max, Math.min, StrictMath.max,
+    // StrictMath.min) consider 0.0 to be (strictly) greater than
+    // -0.0. So if we ever translate calls to these methods into a
+    // HCompare instruction, we must handle the -0.0 case with
+    // care here.
+    DCHECK(rhs_loc.GetConstant()->IsArithmeticZero());
+    if (type == Primitive::kPrimFloat) {
+      __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>());
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()));
+    }
+  } else {
+    if (type == Primitive::kPrimFloat) {
+      __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>());
+    } else {
+      DCHECK_EQ(type, Primitive::kPrimDouble);
+      __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>()));
+    }
+  }
+}
+
 void InstructionCodeGeneratorARM::GenerateFPJumps(HCondition* cond,
                                                   Label* true_label,
                                                   Label* false_label ATTRIBUTE_UNUSED) {
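The zero-immediate reasoning in GenerateVcmp can be checked directly: IEEE-754 equality (which the JLS adopts for ==) does not distinguish the two zeros, while the bit-level sign does, which is what the comment's caveat about Float.compare and friends points at. A small self-contained check:

    #include <cassert>
    #include <cmath>

    int main() {
      assert(0.0f == -0.0f);  // one vcmp-with-#0.0 covers both constants
      // Float.compare-style methods order -0.0 below 0.0 via the sign bit,
      // so an HCompare lowering of those would need separate handling.
      assert(std::signbit(-0.0f) && !std::signbit(0.0f));
    }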
@@ -1353,22 +1401,14 @@
   Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in;
   Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in;
 
-  LocationSummary* locations = condition->GetLocations();
-  Location left = locations->InAt(0);
-  Location right = locations->InAt(1);
-
   Primitive::Type type = condition->InputAt(0)->GetType();
   switch (type) {
     case Primitive::kPrimLong:
       GenerateLongComparesAndJumps(condition, true_target, false_target);
       break;
     case Primitive::kPrimFloat:
-      __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>());
-      GenerateFPJumps(condition, true_target, false_target);
-      break;
     case Primitive::kPrimDouble:
-      __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()),
-               FromLowSToD(right.AsFpuRegisterPairLow<SRegister>()));
+      GenerateVcmp(condition);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     default:
@@ -1549,7 +1589,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(1, ArithmeticZeroOrFpuRegister(cond->InputAt(1)));
       if (!cond->IsEmittedAtUseSite()) {
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       }
@@ -1596,12 +1636,8 @@
       GenerateLongComparesAndJumps(cond, &true_label, &false_label);
       break;
     case Primitive::kPrimFloat:
-      __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>());
-      GenerateFPJumps(cond, &true_label, &false_label);
-      break;
     case Primitive::kPrimDouble:
-      __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()),
-               FromLowSToD(right.AsFpuRegisterPairLow<SRegister>()));
+      GenerateVcmp(cond);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
   }
@@ -2274,8 +2310,7 @@
         case Primitive::kPrimFloat: {
           // Processing a Dex `float-to-int' instruction.
           SRegister temp = locations->GetTemp(0).AsFpuRegisterPairLow<SRegister>();
-          __ vmovs(temp, in.AsFpuRegister<SRegister>());
-          __ vcvtis(temp, temp);
+          __ vcvtis(temp, in.AsFpuRegister<SRegister>());
           __ vmovrs(out.AsRegister<Register>(), temp);
           break;
         }
@@ -2283,9 +2318,7 @@
         case Primitive::kPrimDouble: {
           // Processing a Dex `double-to-int' instruction.
           SRegister temp_s = locations->GetTemp(0).AsFpuRegisterPairLow<SRegister>();
-          DRegister temp_d = FromLowSToD(temp_s);
-          __ vmovd(temp_d, FromLowSToD(in.AsFpuRegisterPairLow<SRegister>()));
-          __ vcvtid(temp_s, temp_d);
+          __ vcvtid(temp_s, FromLowSToD(in.AsFpuRegisterPairLow<SRegister>()));
           __ vmovrs(out.AsRegister<Register>(), temp_s);
           break;
         }
@@ -2464,7 +2497,7 @@
 
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, ArmEncodableConstantOrRegister(add->InputAt(1), ADD));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
@@ -2501,13 +2534,18 @@
       break;
 
     case Primitive::kPrimLong: {
-      DCHECK(second.IsRegisterPair());
-      __ adds(out.AsRegisterPairLow<Register>(),
-              first.AsRegisterPairLow<Register>(),
-              ShifterOperand(second.AsRegisterPairLow<Register>()));
-      __ adc(out.AsRegisterPairHigh<Register>(),
-             first.AsRegisterPairHigh<Register>(),
-             ShifterOperand(second.AsRegisterPairHigh<Register>()));
+      if (second.IsConstant()) {
+        uint64_t value = static_cast<uint64_t>(Int64FromConstant(second.GetConstant()));
+        GenerateAddLongConst(out, first, value);
+      } else {
+        DCHECK(second.IsRegisterPair());
+        __ adds(out.AsRegisterPairLow<Register>(),
+                first.AsRegisterPairLow<Register>(),
+                ShifterOperand(second.AsRegisterPairLow<Register>()));
+        __ adc(out.AsRegisterPairHigh<Register>(),
+               first.AsRegisterPairHigh<Register>(),
+               ShifterOperand(second.AsRegisterPairHigh<Register>()));
+      }
       break;
     }
 
@@ -2541,7 +2579,7 @@
 
     case Primitive::kPrimLong: {
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, ArmEncodableConstantOrRegister(sub->InputAt(1), SUB));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
     }
@@ -2577,13 +2615,18 @@
     }
 
     case Primitive::kPrimLong: {
-      DCHECK(second.IsRegisterPair());
-      __ subs(out.AsRegisterPairLow<Register>(),
-              first.AsRegisterPairLow<Register>(),
-              ShifterOperand(second.AsRegisterPairLow<Register>()));
-      __ sbc(out.AsRegisterPairHigh<Register>(),
-             first.AsRegisterPairHigh<Register>(),
-             ShifterOperand(second.AsRegisterPairHigh<Register>()));
+      if (second.IsConstant()) {
+        uint64_t value = static_cast<uint64_t>(Int64FromConstant(second.GetConstant()));
+        GenerateAddLongConst(out, first, -value);
+      } else {
+        DCHECK(second.IsRegisterPair());
+        __ subs(out.AsRegisterPairLow<Register>(),
+                first.AsRegisterPairLow<Register>(),
+                ShifterOperand(second.AsRegisterPairLow<Register>()));
+        __ sbc(out.AsRegisterPairHigh<Register>(),
+               first.AsRegisterPairHigh<Register>(),
+               ShifterOperand(second.AsRegisterPairHigh<Register>()));
+      }
       break;
     }
 
@@ -3621,7 +3664,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(1, ArithmeticZeroOrFpuRegister(compare->InputAt(1)));
       locations->SetOut(Location::RequiresRegister());
       break;
     }
@@ -3666,12 +3709,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       __ LoadImmediate(out, 0);
-      if (type == Primitive::kPrimFloat) {
-        __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>());
-      } else {
-        __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()),
-                 FromLowSToD(right.AsFpuRegisterPairLow<SRegister>()));
-      }
+      GenerateVcmp(compare);
       __ vmstat();  // transfer FP status register to ARM APSR.
       less_cond = ARMFPCondition(kCondLT, compare->IsGtBias());
       break;
@@ -3697,7 +3735,7 @@
 void LocationsBuilderARM::VisitPhi(HPhi* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+  for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     locations->SetInAt(i, Location::Any());
   }
   locations->SetOut(Location::Any());
@@ -3965,6 +4003,17 @@
   }
 }
 
+Location LocationsBuilderARM::ArithmeticZeroOrFpuRegister(HInstruction* input) {
+  DCHECK(input->GetType() == Primitive::kPrimDouble || input->GetType() == Primitive::kPrimFloat)
+      << input->GetType();
+  if ((input->IsFloatConstant() && (input->AsFloatConstant()->IsArithmeticZero())) ||
+      (input->IsDoubleConstant() && (input->AsDoubleConstant()->IsArithmeticZero()))) {
+    return Location::ConstantLocation(input->AsConstant());
+  } else {
+    return Location::RequiresFpuRegister();
+  }
+}
+
 Location LocationsBuilderARM::ArmEncodableConstantOrRegister(HInstruction* constant,
                                                              Opcode opcode) {
   DCHECK(!Primitive::IsFloatingPointType(constant->GetType()));
@@ -3979,31 +4028,51 @@
                                                        Opcode opcode) {
   uint64_t value = static_cast<uint64_t>(Int64FromConstant(input_cst));
   if (Primitive::Is64BitType(input_cst->GetType())) {
-    return CanEncodeConstantAsImmediate(Low32Bits(value), opcode) &&
-        CanEncodeConstantAsImmediate(High32Bits(value), opcode);
+    Opcode high_opcode = opcode;
+    SetCc low_set_cc = kCcDontCare;
+    switch (opcode) {
+      case SUB:
+        // Flip the operation to an ADD.
+        value = -value;
+        opcode = ADD;
+        FALLTHROUGH_INTENDED;
+      case ADD:
+        if (Low32Bits(value) == 0u) {
+          return CanEncodeConstantAsImmediate(High32Bits(value), opcode, kCcDontCare);
+        }
+        high_opcode = ADC;
+        low_set_cc = kCcSet;
+        break;
+      default:
+        break;
+    }
+    return CanEncodeConstantAsImmediate(Low32Bits(value), opcode, low_set_cc) &&
+        CanEncodeConstantAsImmediate(High32Bits(value), high_opcode, kCcDontCare);
   } else {
     return CanEncodeConstantAsImmediate(Low32Bits(value), opcode);
   }
 }
 
-bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode) {
+bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value,
+                                                       Opcode opcode,
+                                                       SetCc set_cc) {
   ShifterOperand so;
   ArmAssembler* assembler = codegen_->GetAssembler();
-  if (assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, opcode, value, &so)) {
+  if (assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, opcode, value, set_cc, &so)) {
     return true;
   }
   Opcode neg_opcode = kNoOperand;
   switch (opcode) {
-    case AND:
-      neg_opcode = BIC;
-      break;
-    case ORR:
-      neg_opcode = ORN;
-      break;
+    case AND: neg_opcode = BIC; value = ~value; break;
+    case ORR: neg_opcode = ORN; value = ~value; break;
+    case ADD: neg_opcode = SUB; value = -value; break;
+    case ADC: neg_opcode = SBC; value = ~value; break;
+    case SUB: neg_opcode = ADD; value = -value; break;
+    case SBC: neg_opcode = ADC; value = ~value; break;
     default:
       return false;
   }
-  return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, ~value, &so);
+  return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, value, set_cc, &so);
 }
 
 void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction,
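The widened switch in CanEncodeConstantAsImmediate relies on standard ARM identities: AND/BIC and ORR/ORN pair via complement, ADD/SUB via negation, and ADC/SBC via complement (since a + b + c == a - ~b - (1 - c)). A quick numeric check of the AND/BIC case, with a mask chosen to fail ARM's 8-bit rotated immediate encoding:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t value = 0xFFFFFF00u;  // 24 set bits: no ARM immediate encoding
      uint32_t bic_imm = ~value;     // 0x000000FF: trivially encodable
      uint32_t x = 0x12345678u;
      assert((x & value) == (x & ~bic_imm));  // AND value == BIC bic_imm
    }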
@@ -4453,12 +4522,10 @@
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
   bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
-  bool object_array_set_with_read_barrier =
-      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+      may_need_runtime_call_for_type_check ?
           LocationSummary::kCallOnSlowPath :
           LocationSummary::kNoCall);
 
@@ -4744,7 +4811,7 @@
 
 void InstructionCodeGeneratorARM::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
+  uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
   __ LoadFromOffset(kLoadWord, out, obj, offset);
@@ -5886,6 +5953,34 @@
   __ eor(out, first, ShifterOperand(value));
 }
 
+void InstructionCodeGeneratorARM::GenerateAddLongConst(Location out,
+                                                       Location first,
+                                                       uint64_t value) {
+  Register out_low = out.AsRegisterPairLow<Register>();
+  Register out_high = out.AsRegisterPairHigh<Register>();
+  Register first_low = first.AsRegisterPairLow<Register>();
+  Register first_high = first.AsRegisterPairHigh<Register>();
+  uint32_t value_low = Low32Bits(value);
+  uint32_t value_high = High32Bits(value);
+  if (value_low == 0u) {
+    if (out_low != first_low) {
+      __ mov(out_low, ShifterOperand(first_low));
+    }
+    __ AddConstant(out_high, first_high, value_high);
+    return;
+  }
+  __ AddConstantSetFlags(out_low, first_low, value_low);
+  ShifterOperand so;
+  if (__ ShifterOperandCanHold(out_high, first_high, ADC, value_high, kCcDontCare, &so)) {
+    __ adc(out_high, first_high, so);
+  } else if (__ ShifterOperandCanHold(out_low, first_low, SBC, ~value_high, kCcDontCare, &so)) {
+    __ sbc(out_high, first_high, so);
+  } else {
+    LOG(FATAL) << "Unexpected constant " << value_high;
+    UNREACHABLE();
+  }
+}
+
 void InstructionCodeGeneratorARM::HandleBitwiseOperation(HBinaryOperation* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   Location first = locations->InAt(0);
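GenerateAddLongConst (above) splits a 64-bit constant add into ADDS on the low words followed by ADC on the high words, falling back to SBC with the complemented high word when that encodes and ADC does not; both compute the same result by the identity noted earlier. The carry plumbing, modeled in plain C++:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t first = 0x00000001FFFFFFFFull;
      uint64_t value = 0x0000000000000001ull;
      uint32_t low = uint32_t(first) + uint32_t(value);  // ADDS: 0, carry out
      uint32_t carry = low < uint32_t(first) ? 1u : 0u;
      uint32_t high =
          uint32_t(first >> 32) + uint32_t(value >> 32) + carry;  // ADC
      assert(((uint64_t{high} << 32) | low) == first + value);
    }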
@@ -6083,8 +6178,9 @@
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
+  ScaleFactor no_scale_factor = TIMES_1;
   GenerateReferenceLoadWithBakerReadBarrier(
-      instruction, ref, obj, offset, no_index, temp, needs_null_check);
+      instruction, ref, obj, offset, no_index, no_scale_factor, temp, needs_null_check);
 }
 
 void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -6097,10 +6193,14 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  static_assert(
+      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
+  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(
-      instruction, ref, obj, data_offset, index, temp, needs_null_check);
+      instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check);
 }
 
 void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -6108,6 +6208,7 @@
                                                                  Register obj,
                                                                  uint32_t offset,
                                                                  Location index,
+                                                                 ScaleFactor scale_factor,
                                                                  Location temp,
                                                                  bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
@@ -6144,35 +6245,31 @@
   // /* LockWord */ lock_word = LockWord(monitor)
   static_assert(sizeof(LockWord) == sizeof(int32_t),
                 "art::LockWord and int32_t have different sizes.");
-  // /* uint32_t */ rb_state = lock_word.ReadBarrierState()
-  __ Lsr(temp_reg, temp_reg, LockWord::kReadBarrierStateShift);
-  __ and_(temp_reg, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask));
-  static_assert(
-      LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_,
-      "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_.");
 
-  // Introduce a dependency on the high bits of rb_state, which shall
-  // be all zeroes, to prevent load-load reordering, and without using
+  // Introduce a dependency on the lock_word including the rb_state,
+  // which shall prevent load-load reordering without using
   // a memory barrier (which would be more expensive).
-  // IP = rb_state & ~LockWord::kReadBarrierStateMask = 0
-  __ bic(IP, temp_reg, ShifterOperand(LockWord::kReadBarrierStateMask));
-  // obj is unchanged by this operation, but its value now depends on
-  // IP, which depends on temp_reg.
-  __ add(obj, obj, ShifterOperand(IP));
+  // obj is unchanged by this operation, but its value now depends on temp_reg.
+  __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32));
 
   // The actual reference load.
   if (index.IsValid()) {
-    static_assert(
-        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
-        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
-    // /* HeapReference<Object> */ ref =
-    //     *(obj + offset + index * sizeof(HeapReference<Object>))
+    // Load types involving an "index": ArrayGet and
+    // UnsafeGetObject/UnsafeGetObjectVolatile intrinsics.
+    // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
     if (index.IsConstant()) {
       size_t computed_offset =
-          (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + offset;
+          (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset;
       __ LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset);
     } else {
-      __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
+      // Handle the special case of the
+      // UnsafeGetObject/UnsafeGetObjectVolatile intrinsics, which use
+      // a register pair as index ("long offset"), of which only the low
+      // part contains data.
+      Register index_reg = index.IsRegisterPair()
+          ? index.AsRegisterPairLow<Register>()
+          : index.AsRegister<Register>();
+      __ add(IP, obj, ShifterOperand(index_reg, LSL, scale_factor));
       __ LoadFromOffset(kLoadWord, ref_reg, IP, offset);
     }
   } else {
@@ -6190,8 +6287,14 @@
 
   // if (rb_state == ReadBarrier::gray_ptr_)
   //   ref = ReadBarrier::Mark(ref);
-  __ cmp(temp_reg, ShifterOperand(ReadBarrier::gray_ptr_));
-  __ b(slow_path->GetEntryLabel(), EQ);
+  // Given the numeric representation, it's enough to check the low bit of the
+  // rb_state. We do that by shifting the bit out of the lock word with LSRS
+  // which can be a 16-bit instruction unlike the TST immediate.
+  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+  __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1);
+  __ b(slow_path->GetEntryLabel(), CS);  // Carry flag is the last bit shifted out by LSRS.
   __ Bind(slow_path->GetExitLabel());
 }
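The LSRS/CS sequence above works because of the asserted encoding (white = 0, gray = 1, black = 2): gray is the only state with the low rb_state bit set, and LSRS by kReadBarrierStateShift + 1 drops exactly that bit into the carry flag. A C model of the bit extraction, with the shift position assumed for illustration:

    #include <cassert>
    #include <cstdint>

    int main() {
      constexpr uint32_t kReadBarrierStateShift = 28;  // assumed position
      uint32_t lock_word_gray = 1u << kReadBarrierStateShift;  // rb_state == 1
      uint32_t lock_word_white = 0u;                           // rb_state == 0
      // Carry after LSRS #(shift + 1) is the last bit shifted out:
      auto carry = [](uint32_t w) { return (w >> kReadBarrierStateShift) & 1u; };
      assert(carry(lock_word_gray) == 1u);   // take the mark slow path
      assert(carry(lock_word_white) == 0u);  // fall through
    }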
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 0020f7b..ede7b61 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -179,9 +179,10 @@
   void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
 
+  Location ArithmeticZeroOrFpuRegister(HInstruction* input);
   Location ArmEncodableConstantOrRegister(HInstruction* constant, Opcode opcode);
   bool CanEncodeConstantAsImmediate(HConstant* input_cst, Opcode opcode);
-  bool CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode);
+  bool CanEncodeConstantAsImmediate(uint32_t value, Opcode opcode, SetCc set_cc = kCcDontCare);
 
   CodeGeneratorARM* const codegen_;
   InvokeDexCallingConventionVisitorARM parameter_visitor_;
@@ -218,6 +219,7 @@
   void GenerateAndConst(Register out, Register first, uint32_t value);
   void GenerateOrrConst(Register out, Register first, uint32_t value);
   void GenerateEorConst(Register out, Register first, uint32_t value);
+  void GenerateAddLongConst(Location out, Location first, uint64_t value);
   void HandleBitwiseOperation(HBinaryOperation* operation);
   void HandleCondition(HCondition* condition);
   void HandleIntegerRotate(LocationSummary* locations);
@@ -280,6 +282,7 @@
   void GenerateCompareTestAndBranch(HCondition* condition,
                                     Label* true_target,
                                     Label* false_target);
+  void GenerateVcmp(HInstruction* instruction);
   void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label);
   void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
@@ -464,6 +467,16 @@
                                              Location index,
                                              Location temp,
                                              bool needs_null_check);
+  // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
+  // and GenerateArrayLoadWithBakerReadBarrier.
+  void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                 Location ref,
+                                                 Register obj,
+                                                 uint32_t offset,
+                                                 Location index,
+                                                 ScaleFactor scale_factor,
+                                                 Location temp,
+                                                 bool needs_null_check);
 
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
@@ -519,16 +532,6 @@
   void GenerateExplicitNullCheck(HNullCheck* instruction);
 
  private:
-  // Factored implementation of GenerateFieldLoadWithBakerReadBarrier
-  // and GenerateArrayLoadWithBakerReadBarrier.
-  void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
-                                                 Location ref,
-                                                 Register obj,
-                                                 uint32_t offset,
-                                                 Location index,
-                                                 Location temp,
-                                                 bool needs_null_check);
-
   Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
 
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, Literal*>;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 8e58b15..e247451 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -594,7 +594,9 @@
            instruction_->IsLoadClass() ||
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast())
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
+            instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
@@ -657,8 +659,12 @@
     Primitive::Type type = Primitive::kPrimNot;
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(out_.reg()));
-    DCHECK(!instruction_->IsInvoke() ||
-           (instruction_->IsInvokeStaticOrDirect() &&
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
             instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for heap reference slow path: "
         << instruction_->DebugName();
@@ -676,7 +682,7 @@
     // introduce a copy of it, `index`.
     Location index = index_;
     if (index_.IsValid()) {
-      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics.
       if (instruction_->IsArrayGet()) {
         // Compute the actual memory offset and store it in `index`.
         Register index_reg = RegisterFrom(index_, Primitive::kPrimInt);
@@ -724,7 +730,11 @@
             "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
         __ Add(index_reg, index_reg, Operand(offset_));
       } else {
-        DCHECK(instruction_->IsInvoke());
+        // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile
+        // intrinsics, `index_` is not shifted by a scale factor of 2
+        // (as in the case of ArrayGet), as it is actually an offset
+        // to an object field within an object.
+        DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
         DCHECK(instruction_->GetLocations()->Intrinsified());
         DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
                (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
@@ -1260,17 +1270,21 @@
       UseScratchRegisterScope temps(GetVIXLAssembler());
       HConstant* src_cst = source.GetConstant();
       CPURegister temp;
-      if (src_cst->IsIntConstant() || src_cst->IsNullConstant()) {
-        temp = temps.AcquireW();
-      } else if (src_cst->IsLongConstant()) {
-        temp = temps.AcquireX();
-      } else if (src_cst->IsFloatConstant()) {
-        temp = temps.AcquireS();
+      if (src_cst->IsZeroBitPattern()) {
+        temp = (src_cst->IsLongConstant() || src_cst->IsDoubleConstant()) ? xzr : wzr;
       } else {
-        DCHECK(src_cst->IsDoubleConstant());
-        temp = temps.AcquireD();
+        if (src_cst->IsIntConstant()) {
+          temp = temps.AcquireW();
+        } else if (src_cst->IsLongConstant()) {
+          temp = temps.AcquireX();
+        } else if (src_cst->IsFloatConstant()) {
+          temp = temps.AcquireS();
+        } else {
+          DCHECK(src_cst->IsDoubleConstant());
+          temp = temps.AcquireD();
+        }
+        MoveConstant(temp, src_cst);
       }
-      MoveConstant(temp, src_cst);
       __ Str(temp, StackOperandFrom(destination));
     } else {
       DCHECK(source.IsStackSlot() || source.IsDoubleStackSlot());
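The Move hunk above short-circuits zero constants to the arm64 zero registers. The payoff rests on a bit-level fact that is easy to verify: every constant classified as a zero bit pattern (0, 0L, 0.0f, 0.0, null) really is all zero bits, so wzr/xzr already hold the value and no scratch register or materialization is needed. Presumably this is also why the test is IsZeroBitPattern rather than a numeric comparison, since -0.0 compares equal to zero but is not a zero bit pattern.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      float zf = 0.0f;
      double zd = 0.0;
      uint32_t bits32;
      uint64_t bits64;
      std::memcpy(&bits32, &zf, sizeof(bits32));
      std::memcpy(&bits64, &zd, sizeof(bits64));
      assert(bits32 == 0u && bits64 == 0u);  // storable straight from wzr/xzr
    }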
@@ -2118,9 +2132,9 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitArrayLength(HArrayLength* instruction) {
+  uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
   BlockPoolsScope block_pools(GetVIXLAssembler());
-  __ Ldr(OutputRegister(instruction),
-         HeapOperand(InputRegisterAt(instruction, 0), mirror::Array::LengthOffset()));
+  __ Ldr(OutputRegister(instruction), HeapOperand(InputRegisterAt(instruction, 0), offset));
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
@@ -2128,11 +2142,9 @@
   Primitive::Type value_type = instruction->GetComponentType();
 
   bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
-  bool object_array_set_with_read_barrier =
-      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      (may_need_runtime_call_for_type_check  || object_array_set_with_read_barrier) ?
+      may_need_runtime_call_for_type_check ?
           LocationSummary::kCallOnSlowPath :
           LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
@@ -4401,7 +4413,7 @@
 
 void LocationsBuilderARM64::VisitPhi(HPhi* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+  for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     locations->SetInAt(i, Location::Any());
   }
   locations->SetOut(Location::Any());
@@ -4977,8 +4989,16 @@
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
-  GenerateReferenceLoadWithBakerReadBarrier(
-      instruction, ref, obj, offset, no_index, temp, needs_null_check, use_load_acquire);
+  size_t no_scale_factor = 0U;
+  GenerateReferenceLoadWithBakerReadBarrier(instruction,
+                                            ref,
+                                            obj,
+                                            offset,
+                                            no_index,
+                                            no_scale_factor,
+                                            temp,
+                                            needs_null_check,
+                                            use_load_acquire);
 }
 
 void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -4995,10 +5015,21 @@
   // never use Load-Acquire instructions on ARM64.
   const bool use_load_acquire = false;
 
+  static_assert(
+      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  GenerateReferenceLoadWithBakerReadBarrier(
-      instruction, ref, obj, data_offset, index, temp, needs_null_check, use_load_acquire);
+  size_t scale_factor = Primitive::ComponentSizeShift(Primitive::kPrimNot);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction,
+                                            ref,
+                                            obj,
+                                            data_offset,
+                                            index,
+                                            scale_factor,
+                                            temp,
+                                            needs_null_check,
+                                            use_load_acquire);
 }
 
 void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -5006,15 +5037,16 @@
                                                                    vixl::Register obj,
                                                                    uint32_t offset,
                                                                    Location index,
+                                                                   size_t scale_factor,
                                                                    Register temp,
                                                                    bool needs_null_check,
                                                                    bool use_load_acquire) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
-  // If `index` is a valid location, then we are emitting an array
-  // load, so we shouldn't be using a Load Acquire instruction.
-  // In other words: `index.IsValid()` => `!use_load_acquire`.
-  DCHECK(!index.IsValid() || !use_load_acquire);
+  // If we are emitting an array load, we should not be using a
+  // Load Acquire instruction.  In other words:
+  // `instruction->IsArrayGet()` => `!use_load_acquire`.
+  DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
 
   MacroAssembler* masm = GetVIXLAssembler();
   UseScratchRegisterScope temps(masm);
@@ -5051,40 +5083,42 @@
   // /* LockWord */ lock_word = LockWord(monitor)
   static_assert(sizeof(LockWord) == sizeof(int32_t),
                 "art::LockWord and int32_t have different sizes.");
-  // /* uint32_t */ rb_state = lock_word.ReadBarrierState()
-  __ Lsr(temp, temp, LockWord::kReadBarrierStateShift);
-  __ And(temp, temp, Operand(LockWord::kReadBarrierStateMask));
-  static_assert(
-      LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_,
-      "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_.");
 
-  // Introduce a dependency on the high bits of rb_state, which shall
-  // be all zeroes, to prevent load-load reordering, and without using
+  // Introduce a dependency on the lock_word including rb_state,
+  // to prevent load-load reordering, and without using
   // a memory barrier (which would be more expensive).
-  // temp2 = rb_state & ~LockWord::kReadBarrierStateMask = 0
-  Register temp2 = temps.AcquireW();
-  __ Bic(temp2, temp, Operand(LockWord::kReadBarrierStateMask));
-  // obj is unchanged by this operation, but its value now depends on
-  // temp2, which depends on temp.
-  __ Add(obj, obj, Operand(temp2));
-  temps.Release(temp2);
+  // obj is unchanged by this operation, but its value now depends on temp.
+  __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32));
 
   // The actual reference load.
   if (index.IsValid()) {
-    static_assert(
-        sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
-        "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
-    // /* HeapReference<Object> */ ref =
-    //     *(obj + offset + index * sizeof(HeapReference<Object>))
-    const size_t shift_amount = Primitive::ComponentSizeShift(type);
-    if (index.IsConstant()) {
-      uint32_t computed_offset = offset + (Int64ConstantFrom(index) << shift_amount);
-      Load(type, ref_reg, HeapOperand(obj, computed_offset));
+    // Load types involving an "index".
+    if (use_load_acquire) {
+      // UnsafeGetObjectVolatile intrinsic case.
+      // Register `index` is not an index in an object array, but an
+      // offset to an object reference field within object `obj`.
+      DCHECK(instruction->IsInvoke()) << instruction->DebugName();
+      DCHECK(instruction->GetLocations()->Intrinsified());
+      DCHECK(instruction->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile)
+          << instruction->AsInvoke()->GetIntrinsic();
+      DCHECK_EQ(offset, 0U);
+      DCHECK_EQ(scale_factor, 0U);
+      DCHECK(!needs_null_check);
+      // /* HeapReference<Object> */ ref = *(obj + index)
+      MemOperand field = HeapOperand(obj, XRegisterFrom(index));
+      LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false);
     } else {
-      temp2 = temps.AcquireW();
-      __ Add(temp2, obj, offset);
-      Load(type, ref_reg, HeapOperand(temp2, XRegisterFrom(index), LSL, shift_amount));
-      temps.Release(temp2);
+      // ArrayGet and UnsafeGetObject intrinsics cases.
+      // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
+      if (index.IsConstant()) {
+        uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor);
+        Load(type, ref_reg, HeapOperand(obj, computed_offset));
+      } else {
+        Register temp2 = temps.AcquireW();
+        __ Add(temp2, obj, offset);
+        Load(type, ref_reg, HeapOperand(temp2, XRegisterFrom(index), LSL, scale_factor));
+        temps.Release(temp2);
+      }
     }
   } else {
     // /* HeapReference<Object> */ ref = *(obj + offset)
@@ -5106,8 +5140,11 @@
 
   // if (rb_state == ReadBarrier::gray_ptr_)
   //   ref = ReadBarrier::Mark(ref);
-  __ Cmp(temp, ReadBarrier::gray_ptr_);
-  __ B(eq, slow_path->GetEntryLabel());
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+  __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
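ARM64 reaches the same end with a single TBNZ: under the identical white/gray/black encoding, testing bit kReadBarrierStateShift of the lock word branches to the mark slow path exactly for gray objects. The condition it evaluates, as a one-line model (shift position assumed as in the ARM sketch):

    #include <cassert>
    #include <cstdint>

    int main() {
      constexpr uint32_t kReadBarrierStateShift = 28;  // assumed
      auto tbnz = [](uint32_t w) {
        return ((w >> kReadBarrierStateShift) & 1u) != 0u;
      };
      assert(tbnz(1u << kReadBarrierStateShift));  // gray: branch taken
      assert(!tbnz(0u));                           // white: fall through
    }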
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 422963e..dea597c 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -417,6 +417,10 @@
     block_labels_.resize(GetGraph()->GetBlocks().size());
   }
 
+  // We want to use the STP and LDP instructions to spill and restore registers for slow paths.
+  // These instructions can only encode offsets that are multiples of the register size accessed.
+  uint32_t GetPreferredSlotsAlignment() const OVERRIDE { return vixl::kXRegSizeInBytes; }
+
   JumpTableARM64* CreateJumpTable(HPackedSwitch* switch_instr) {
     jump_tables_.emplace_back(new (GetGraph()->GetArena()) JumpTableARM64(switch_instr));
     return jump_tables_.back().get();
@@ -515,6 +519,17 @@
                                              Location index,
                                              vixl::Register temp,
                                              bool needs_null_check);
+  // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
+  // and GenerateArrayLoadWithBakerReadBarrier.
+  void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                 Location ref,
+                                                 vixl::Register obj,
+                                                 uint32_t offset,
+                                                 Location index,
+                                                 size_t scale_factor,
+                                                 vixl::Register temp,
+                                                 bool needs_null_check,
+                                                 bool use_load_acquire);
 
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
@@ -570,17 +585,6 @@
   void GenerateExplicitNullCheck(HNullCheck* instruction);
 
  private:
-  // Factored implementation of GenerateFieldLoadWithBakerReadBarrier
-  // and GenerateArrayLoadWithBakerReadBarrier.
-  void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
-                                                 Location ref,
-                                                 vixl::Register obj,
-                                                 uint32_t offset,
-                                                 Location index,
-                                                 vixl::Register temp,
-                                                 bool needs_null_check,
-                                                 bool use_load_acquire);
-
   using Uint64ToLiteralMap = ArenaSafeMap<uint64_t, vixl::Literal<uint64_t>*>;
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, vixl::Literal<uint32_t>*>;
   using MethodToLiteralMap = ArenaSafeMap<MethodReference,
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 06248a3..be6c68b 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1803,7 +1803,7 @@
 
 void InstructionCodeGeneratorMIPS::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
+  uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
   __ LoadFromOffset(kLoadWord, out, obj, offset);
@@ -4439,7 +4439,7 @@
 
 void LocationsBuilderMIPS::VisitPhi(HPhi* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+  for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     locations->SetInAt(i, Location::Any());
   }
   locations->SetOut(Location::Any());
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 9b405bb..d83ad09 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1426,7 +1426,7 @@
 
 void InstructionCodeGeneratorMIPS64::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
+  uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
   GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>();
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
   __ LoadFromOffset(kLoadWord, out, obj, offset);
@@ -3594,7 +3594,7 @@
 
 void LocationsBuilderMIPS64::VisitPhi(HPhi* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+  for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     locations->SetInAt(i, Location::Any());
   }
   locations->SetOut(Location::Any());
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 51d9b7c..b2d9614 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -443,7 +443,9 @@
            instruction_->IsLoadClass() ||
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast())
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
+            instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
@@ -506,8 +508,12 @@
     Register reg_out = out_.AsRegister<Register>();
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out));
-    DCHECK(!instruction_->IsInvoke() ||
-           (instruction_->IsInvokeStaticOrDirect() &&
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
             instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for heap reference slow path: "
         << instruction_->DebugName();
@@ -520,7 +526,7 @@
     // introduce a copy of it, `index`.
     Location index = index_;
     if (index_.IsValid()) {
-      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics.
       if (instruction_->IsArrayGet()) {
         // Compute the actual memory offset and store it in `index`.
         Register index_reg = index_.AsRegister<Register>();
@@ -568,7 +574,11 @@
             "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
         __ AddImmediate(index_reg, Immediate(offset_));
       } else {
-        DCHECK(instruction_->IsInvoke());
+        // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile
+        // intrinsics, `index_` is not shifted by a scale factor of 2
+        // (as in the case of ArrayGet), as it is actually an offset
+        // to an object field within an object.
+        DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
         DCHECK(instruction_->GetLocations()->Intrinsified());
         DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
                (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
@@ -4247,7 +4257,7 @@
 void LocationsBuilderX86::VisitPhi(HPhi* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+  for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     locations->SetInAt(i, Location::Any());
   }
   locations->SetOut(Location::Any());
@@ -5203,12 +5213,10 @@
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
   bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
-  bool object_array_set_with_read_barrier =
-      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+      may_need_runtime_call_for_type_check ?
           LocationSummary::kCallOnSlowPath :
           LocationSummary::kNoCall);
 
@@ -5498,7 +5506,7 @@
 
 void InstructionCodeGeneratorX86::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
+  uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
   Register obj = locations->InAt(0).AsRegister<Register>();
   Register out = locations->Out().AsRegister<Register>();
   __ movl(out, Address(obj, offset));
@@ -6855,6 +6863,9 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  static_assert(
+      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
   Address src = index.IsConstant() ?
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 28b52a1..4e7cee8 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -464,7 +464,9 @@
            instruction_->IsLoadClass() ||
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast())
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
+            instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
@@ -527,8 +529,12 @@
     CpuRegister reg_out = out_.AsRegister<CpuRegister>();
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out.AsRegister())) << out_;
-    DCHECK(!instruction_->IsInvoke() ||
-           (instruction_->IsInvokeStaticOrDirect() &&
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           ((instruction_->IsInvokeStaticOrDirect() || instruction_->IsInvokeVirtual()) &&
             instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for heap reference slow path: "
         << instruction_->DebugName();
@@ -541,7 +547,7 @@
     // introduce a copy of it, `index`.
     Location index = index_;
     if (index_.IsValid()) {
-      // Handle `index_` for HArrayGet and intrinsic UnsafeGetObject.
+      // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics.
       if (instruction_->IsArrayGet()) {
         // Compute real offset and store it in index_.
         Register index_reg = index_.AsRegister<CpuRegister>().AsRegister();
@@ -589,7 +595,11 @@
             "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
         __ AddImmediate(CpuRegister(index_reg), Immediate(offset_));
       } else {
-        DCHECK(instruction_->IsInvoke());
+        // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile
+        // intrinsics, `index_` is not shifted by a scale factor of 2
+        // (as in the case of ArrayGet), as it is actually an offset
+        // to an object field within an object.
+        DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
         DCHECK(instruction_->GetLocations()->Intrinsified());
         DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
                (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
@@ -4040,7 +4050,7 @@
 void LocationsBuilderX86_64::VisitPhi(HPhi* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+  for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     locations->SetInAt(i, Location::Any());
   }
   locations->SetOut(Location::Any());
@@ -4697,12 +4707,10 @@
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
   bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck();
-  bool object_array_set_with_read_barrier =
-      kEmitCompilerReadBarrier && (value_type == Primitive::kPrimNot);
 
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
       instruction,
-      (may_need_runtime_call_for_type_check || object_array_set_with_read_barrier) ?
+      may_need_runtime_call_for_type_check ?
           LocationSummary::kCallOnSlowPath :
           LocationSummary::kNoCall);
 
@@ -4973,7 +4981,7 @@
 
 void InstructionCodeGeneratorX86_64::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
+  uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
   CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   __ movl(out, Address(obj, offset));
@@ -6319,6 +6327,9 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  static_assert(
+      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
   Address src = index.IsConstant() ?
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index 968e267..2bd2403 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -335,9 +335,7 @@
   }
 
   // Ensure the inputs of `instruction` are defined in a block of the graph.
-  for (HInputIterator input_it(instruction); !input_it.Done();
-       input_it.Advance()) {
-    HInstruction* input = input_it.Current();
+  for (HInstruction* input : instruction->GetInputs()) {
     const HInstructionList& list = input->IsPhi()
         ? input->GetBlock()->GetPhis()
         : input->GetBlock()->GetInstructions();
@@ -364,7 +362,8 @@
                             instruction->GetId()));
     }
     size_t use_index = use.GetIndex();
-    if ((use_index >= user->InputCount()) || (user->InputAt(use_index) != instruction)) {
+    auto&& user_inputs = user->GetInputs();
+    if ((use_index >= user_inputs.size()) || (user_inputs[use_index] != instruction)) {
       AddError(StringPrintf("User %s:%d of instruction %s:%d has a wrong "
                             "UseListNode index.",
                             user->DebugName(),
@@ -387,8 +386,9 @@
   }
 
   // Ensure 'instruction' has pointers to its inputs' use entries.
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
-    HUserRecord<HInstruction*> input_record = instruction->InputRecordAt(i);
+  auto&& input_records = instruction->GetInputRecords();
+  for (size_t i = 0; i < input_records.size(); ++i) {
+    const HUserRecord<HInstruction*>& input_record = input_records[i];
     HInstruction* input = input_record.GetInstruction();
     if ((input_record.GetBeforeUseNode() == input->GetUses().end()) ||
         (input_record.GetUseNode() == input->GetUses().end()) ||
@@ -490,8 +490,7 @@
   VisitInstruction(invoke);
 
   if (invoke->IsStaticWithExplicitClinitCheck()) {
-    size_t last_input_index = invoke->InputCount() - 1;
-    HInstruction* last_input = invoke->InputAt(last_input_index);
+    HInstruction* last_input = invoke->GetInputs().back();
     if (last_input == nullptr) {
       AddError(StringPrintf("Static invoke %s:%d marked as having an explicit clinit check "
                             "has a null pointer as last input.",
@@ -673,16 +672,21 @@
 
 static bool IsConstantEquivalent(HInstruction* insn1, HInstruction* insn2, BitVector* visited) {
   if (insn1->IsPhi() &&
-      insn1->AsPhi()->IsVRegEquivalentOf(insn2) &&
-      insn1->InputCount() == insn2->InputCount()) {
+      insn1->AsPhi()->IsVRegEquivalentOf(insn2)) {
+    auto&& insn1_inputs = insn1->GetInputs();
+    auto&& insn2_inputs = insn2->GetInputs();
+    if (insn1_inputs.size() != insn2_inputs.size()) {
+      return false;
+    }
+
     // Testing only one of the two inputs for recursion is sufficient.
     if (visited->IsBitSet(insn1->GetId())) {
       return true;
     }
     visited->SetBit(insn1->GetId());
 
-    for (size_t i = 0, e = insn1->InputCount(); i < e; ++i) {
-      if (!IsConstantEquivalent(insn1->InputAt(i), insn2->InputAt(i), visited)) {
+    for (size_t i = 0; i < insn1_inputs.size(); ++i) {
+      if (!IsConstantEquivalent(insn1_inputs[i], insn2_inputs[i], visited)) {
         return false;
       }
     }
@@ -698,15 +702,16 @@
   VisitInstruction(phi);
 
   // Ensure the first input of a phi is not itself.
-  if (phi->InputAt(0) == phi) {
+  ArrayRef<HUserRecord<HInstruction*>> input_records = phi->GetInputRecords();
+  if (input_records[0].GetInstruction() == phi) {
     AddError(StringPrintf("Loop phi %d in block %d is its own first input.",
                           phi->GetId(),
                           phi->GetBlock()->GetBlockId()));
   }
 
   // Ensure that the inputs have the same primitive kind as the phi.
-  for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
-    HInstruction* input = phi->InputAt(i);
+  for (size_t i = 0; i < input_records.size(); ++i) {
+    HInstruction* input = input_records[i].GetInstruction();
     if (Primitive::PrimitiveKind(input->GetType()) != Primitive::PrimitiveKind(phi->GetType())) {
         AddError(StringPrintf(
             "Input %d at index %zu of phi %d from block %d does not have the "
@@ -729,8 +734,7 @@
     // because we do not remove the corresponding inputs when we prove that an
     // instruction cannot throw. Instead, we at least test that all phis have the
     // same, non-zero number of inputs (b/24054676).
-    size_t input_count_this = phi->InputCount();
-    if (input_count_this == 0u) {
+    if (input_records.empty()) {
       AddError(StringPrintf("Phi %d in catch block %d has zero inputs.",
                             phi->GetId(),
                             phi->GetBlock()->GetBlockId()));
@@ -738,12 +742,12 @@
       HInstruction* next_phi = phi->GetNext();
       if (next_phi != nullptr) {
         size_t input_count_next = next_phi->InputCount();
-        if (input_count_this != input_count_next) {
+        if (input_records.size() != input_count_next) {
           AddError(StringPrintf("Phi %d in catch block %d has %zu inputs, "
                                 "but phi %d has %zu inputs.",
                                 phi->GetId(),
                                 phi->GetBlock()->GetBlockId(),
-                                input_count_this,
+                                input_records.size(),
                                 next_phi->GetId(),
                                 input_count_next));
         }
@@ -753,17 +757,17 @@
     // Ensure the number of inputs of a non-catch phi is the same as the number
     // of its predecessors.
     const ArenaVector<HBasicBlock*>& predecessors = phi->GetBlock()->GetPredecessors();
-    if (phi->InputCount() != predecessors.size()) {
+    if (input_records.size() != predecessors.size()) {
       AddError(StringPrintf(
           "Phi %d in block %d has %zu inputs, "
           "but block %d has %zu predecessors.",
-          phi->GetId(), phi->GetBlock()->GetBlockId(), phi->InputCount(),
+          phi->GetId(), phi->GetBlock()->GetBlockId(), input_records.size(),
           phi->GetBlock()->GetBlockId(), predecessors.size()));
     } else {
       // Ensure phi input at index I either comes from the Ith
       // predecessor or from a block that dominates this predecessor.
-      for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
-        HInstruction* input = phi->InputAt(i);
+      for (size_t i = 0; i < input_records.size(); ++i) {
+        HInstruction* input = input_records[i].GetInstruction();
         HBasicBlock* predecessor = predecessors[i];
         if (!(input->GetBlock() == predecessor
               || input->GetBlock()->Dominates(predecessor))) {
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 46db6e3..3084a4f 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -394,6 +394,11 @@
         << instance_of->MustDoNullCheck() << std::noboolalpha;
   }
 
+  void VisitArrayLength(HArrayLength* array_length) OVERRIDE {
+    StartAttributeStream("is_string_length") << std::boolalpha
+        << array_length->IsStringLength() << std::noboolalpha;
+  }
+
   void VisitArraySet(HArraySet* array_set) OVERRIDE {
     StartAttributeStream("value_can_be_null") << std::boolalpha
         << array_set->GetValueCanBeNull() << std::noboolalpha;
@@ -492,12 +497,13 @@
 
   void PrintInstruction(HInstruction* instruction) {
     output_ << instruction->DebugName();
-    if (instruction->InputCount() > 0) {
-      StringList inputs;
-      for (HInputIterator it(instruction); !it.Done(); it.Advance()) {
-        inputs.NewEntryStream() << GetTypeId(it.Current()->GetType()) << it.Current()->GetId();
+    auto&& inputs = instruction->GetInputs();
+    if (!inputs.empty()) {
+      StringList input_list;
+      for (const HInstruction* input : inputs) {
+        input_list.NewEntryStream() << GetTypeId(input->GetType()) << input->GetId();
       }
-      StartAttributeStream() << inputs;
+      StartAttributeStream() << input_list;
     }
     instruction->Accept(this);
     if (instruction->HasEnvironment()) {
@@ -539,12 +545,12 @@
       StartAttributeStream("liveness") << instruction->GetLifetimePosition();
       LocationSummary* locations = instruction->GetLocations();
       if (locations != nullptr) {
-        StringList inputs;
-        for (size_t i = 0; i < instruction->InputCount(); ++i) {
-          DumpLocation(inputs.NewEntryStream(), locations->InAt(i));
+        StringList input_list;
+        for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
+          DumpLocation(input_list.NewEntryStream(), locations->InAt(i));
         }
         std::ostream& attr = StartAttributeStream("locations");
-        attr << inputs << "->";
+        attr << input_list << "->";
         DumpLocation(attr, locations->Out());
       }
     }
@@ -734,8 +740,8 @@
       HInstruction* instruction = it.Current();
       output_ << instruction->GetId() << " " << GetTypeId(instruction->GetType())
               << instruction->GetId() << "[ ";
-      for (HInputIterator inputs(instruction); !inputs.Done(); inputs.Advance()) {
-        output_ << inputs.Current()->GetId() << " ";
+      for (const HInstruction* input : instruction->GetInputs()) {
+        output_ << input->GetId() << " ";
       }
       output_ << "]\n";
     }
diff --git a/compiler/optimizing/induction_var_analysis.cc b/compiler/optimizing/induction_var_analysis.cc
index c06d19d..0a5cf80 100644
--- a/compiler/optimizing/induction_var_analysis.cc
+++ b/compiler/optimizing/induction_var_analysis.cc
@@ -152,8 +152,8 @@
 
   // Visit all descendants.
   uint32_t low = d1;
-  for (size_t i = 0, count = instruction->InputCount(); i < count; ++i) {
-    low = std::min(low, VisitDescendant(loop, instruction->InputAt(i)));
+  for (HInstruction* input : instruction->GetInputs()) {
+    low = std::min(low, VisitDescendant(loop, input));
   }
 
   // Lower or found SCC?
@@ -341,11 +341,11 @@
                                                                          HInstruction* phi,
                                                                          size_t input_index) {
   // Match all phi inputs from input_index onwards exactly.
-  const size_t count = phi->InputCount();
-  DCHECK_LT(input_index, count);
-  InductionInfo* a = LookupInfo(loop, phi->InputAt(input_index));
-  for (size_t i = input_index + 1; i < count; i++) {
-    InductionInfo* b = LookupInfo(loop, phi->InputAt(i));
+  auto&& inputs = phi->GetInputs();
+  DCHECK_LT(input_index, inputs.size());
+  InductionInfo* a = LookupInfo(loop, inputs[input_index]);
+  for (size_t i = input_index + 1; i < inputs.size(); i++) {
+    InductionInfo* b = LookupInfo(loop, inputs[i]);
     if (!InductionEqual(a, b)) {
       return nullptr;
     }
@@ -464,12 +464,12 @@
 HInductionVarAnalysis::InductionInfo* HInductionVarAnalysis::SolvePhi(HInstruction* phi,
                                                                       size_t input_index) {
   // Match all phi inputs from input_index onwards exactly.
-  const size_t count = phi->InputCount();
-  DCHECK_LT(input_index, count);
-  auto ita = cycle_.find(phi->InputAt(input_index));
+  auto&& inputs = phi->GetInputs();
+  DCHECK_LT(input_index, inputs.size());
+  auto ita = cycle_.find(inputs[input_index]);
   if (ita != cycle_.end()) {
-    for (size_t i = input_index + 1; i < count; i++) {
-      auto itb = cycle_.find(phi->InputAt(i));
+    for (size_t i = input_index + 1; i < inputs.size(); i++) {
+      auto itb = cycle_.find(inputs[i]);
       if (itb == cycle_.end() ||
           !HInductionVarAnalysis::InductionEqual(ita->second, itb->second)) {
         return nullptr;
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index d7b3856..304afc3 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -101,6 +101,7 @@
   void SimplifyCompare(HInvoke* invoke, bool is_signum, Primitive::Type type);
   void SimplifyIsNaN(HInvoke* invoke);
   void SimplifyFP2Int(HInvoke* invoke);
+  void SimplifyStringIsEmptyOrLength(HInvoke* invoke);
   void SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind);
 
   OptimizingCompilerStats* stats_;
@@ -234,21 +235,40 @@
 
 void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) {
   DCHECK(instruction->IsShl() || instruction->IsShr() || instruction->IsUShr());
-  HConstant* input_cst = instruction->GetConstantRight();
-  HInstruction* input_other = instruction->GetLeastConstantLeft();
+  HInstruction* shift_amount = instruction->GetRight();
+  HInstruction* value = instruction->GetLeft();
 
-  if (input_cst != nullptr) {
-    int64_t cst = Int64FromConstant(input_cst);
-    int64_t mask = (input_other->GetType() == Primitive::kPrimLong)
-        ? kMaxLongShiftDistance
-        : kMaxIntShiftDistance;
-    if ((cst & mask) == 0) {
+  int64_t implicit_mask = (value->GetType() == Primitive::kPrimLong)
+      ? kMaxLongShiftDistance
+      : kMaxIntShiftDistance;
+
+  if (shift_amount->IsConstant()) {
+    int64_t cst = Int64FromConstant(shift_amount->AsConstant());
+    if ((cst & implicit_mask) == 0) {
       // Replace code looking like
-      //    SHL dst, src, 0
+      //    SHL dst, value, 0
       // with
-      //    src
-      instruction->ReplaceWith(input_other);
+      //    value
+      instruction->ReplaceWith(value);
       instruction->GetBlock()->RemoveInstruction(instruction);
+      RecordSimplification();
+      return;
+    }
+  }
+
+  // Shift operations implicitly mask the shift amount according to the type width. Get rid of
+  // unnecessary explicit masking operations on the shift amount.
+  // Replace code looking like
+  //    AND masked_shift, shift, <superset of implicit mask>
+  //    SHL dst, value, masked_shift
+  // with
+  //    SHL dst, value, shift
+  if (shift_amount->IsAnd()) {
+    HAnd* and_insn = shift_amount->AsAnd();
+    HConstant* mask = and_insn->GetConstantRight();
+    if ((mask != nullptr) && ((Int64FromConstant(mask) & implicit_mask) == implicit_mask)) {
+      instruction->ReplaceInput(and_insn->GetLeastConstantLeft(), 1);
+      RecordSimplification();
     }
   }
 }
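
Note on the rule added above: it relies on the fact that Java shift semantics (and the underlying hardware) already reduce the shift distance modulo the type width, so an AND against any superset of that implicit mask cannot change the result. A minimal standalone C++ sketch of the equivalence; the explicit `& 0x1f` stands in for the implicit masking, since a plain C++ shift by 32 or more would be undefined:

    #include <cassert>
    #include <cstdint>

    // Before simplification the IR shape is:
    //   AND masked_shift, shift, 0xff   (0xff is a superset of the implicit mask 0x1f)
    //   SHL dst, value, masked_shift
    uint32_t ShlMasked(uint32_t value, uint32_t shift) {
      return value << ((shift & 0xff) & 0x1f);
    }

    // After simplification:
    //   SHL dst, value, shift
    uint32_t ShlPlain(uint32_t value, uint32_t shift) {
      return value << (shift & 0x1f);
    }

    int main() {
      for (uint32_t s = 0; s < 512; ++s) {
        assert(ShlMasked(0x12345678u, s) == ShlPlain(0x12345678u, s));
      }
      return 0;
    }
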
@@ -277,6 +297,7 @@
   if (!shl->GetRight()->HasUses()) {
     shl->GetRight()->GetBlock()->RemoveInstruction(shl->GetRight());
   }
+  RecordSimplification();
   return true;
 }
 
@@ -906,6 +927,7 @@
     if (Primitive::IsIntegralType(instruction->GetType())) {
       instruction->ReplaceWith(input_other);
       instruction->GetBlock()->RemoveInstruction(instruction);
+      RecordSimplification();
       return;
     }
   }
@@ -998,6 +1020,7 @@
     //    src
     instruction->ReplaceWith(instruction->GetLeft());
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1115,6 +1138,7 @@
     //    src
     instruction->ReplaceWith(input_other);
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1175,6 +1199,7 @@
     //    src
     instruction->ReplaceWith(input_other);
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1215,6 +1240,7 @@
       //    0
       instruction->ReplaceWith(input_cst);
       instruction->GetBlock()->RemoveInstruction(instruction);
+      RecordSimplification();
     } else if (IsPowerOfTwo(factor)) {
       // Replace code looking like
       //    MUL dst, src, pow_of_2
@@ -1333,6 +1359,7 @@
     //    src
     instruction->ReplaceWith(input_other);
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1346,6 +1373,7 @@
     //    src
     instruction->ReplaceWith(instruction->GetLeft());
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1381,6 +1409,7 @@
     // yields `-0.0`.
     instruction->ReplaceWith(input_other);
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1459,6 +1488,7 @@
     //    src
     instruction->ReplaceWith(input_other);
     instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
     return;
   }
 
@@ -1538,7 +1568,7 @@
   HRor* ror = new (GetGraph()->GetArena()) HRor(type, value, distance);
   invoke->GetBlock()->ReplaceAndRemoveInstructionWith(invoke, ror);
   // Remove ClinitCheck and LoadClass, if possible.
-  HInstruction* clinit = invoke->InputAt(invoke->InputCount() - 1);
+  HInstruction* clinit = invoke->GetInputs().back();
   if (clinit->IsClinitCheck() && !clinit->HasUses()) {
     clinit->GetBlock()->RemoveInstruction(clinit);
     HInstruction* ldclass = clinit->InputAt(0);
@@ -1673,6 +1703,27 @@
   invoke->ReplaceWithExceptInReplacementAtIndex(select, 0);  // false at index 0
 }
 
+void InstructionSimplifierVisitor::SimplifyStringIsEmptyOrLength(HInvoke* invoke) {
+  HInstruction* str = invoke->InputAt(0);
+  uint32_t dex_pc = invoke->GetDexPc();
+  // We treat String as an array to allow DCE and BCE to seamlessly work on strings,
+  // so create the HArrayLength.
+  HArrayLength* length = new (GetGraph()->GetArena()) HArrayLength(str, dex_pc);
+  length->MarkAsStringLength();
+  HInstruction* replacement;
+  if (invoke->GetIntrinsic() == Intrinsics::kStringIsEmpty) {
+    // For String.isEmpty(), create the `HEqual` representing `length == 0`.
+    invoke->GetBlock()->InsertInstructionBefore(length, invoke);
+    HIntConstant* zero = GetGraph()->GetIntConstant(0);
+    HEqual* equal = new (GetGraph()->GetArena()) HEqual(length, zero, dex_pc);
+    replacement = equal;
+  } else {
+    DCHECK_EQ(invoke->GetIntrinsic(), Intrinsics::kStringLength);
+    replacement = length;
+  }
+  invoke->GetBlock()->ReplaceAndRemoveInstructionWith(invoke, replacement);
+}
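
To illustrate the rewrite this helper performs: the intrinsic invoke disappears from the graph and is replaced by an HArrayLength flagged as a string length (the is_string_length attribute added to the graph visualizer earlier in this patch makes the flag visible), plus an HEqual against zero for the isEmpty case. A rough before/after sketch, with purely illustrative instruction ids:

    Before:  i7 InvokeVirtual [ i5 ]      intrinsic: StringIsEmpty
    After:   i8 ArrayLength   [ i5 ]      is_string_length: true
             i9 Equal         [ i8, i0 ]  (i0 being the IntConstant 0)

Since the result is now an ordinary array length node, BCE and DCE can reason about it exactly as they do for arrays, which is the motivation stated in the comment above.
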
+
 void InstructionSimplifierVisitor::SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind) {
   uint32_t dex_pc = invoke->GetDexPc();
   HMemoryBarrier* mem_barrier = new (GetGraph()->GetArena()) HMemoryBarrier(barrier_kind, dex_pc);
@@ -1719,6 +1770,10 @@
     case Intrinsics::kDoubleDoubleToLongBits:
       SimplifyFP2Int(instruction);
       break;
+    case Intrinsics::kStringIsEmpty:
+    case Intrinsics::kStringLength:
+      SimplifyStringIsEmptyOrLength(instruction);
+      break;
     case Intrinsics::kUnsafeLoadFence:
       SimplifyMemBarrier(instruction, MemBarrierKind::kLoadAny);
       break;
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index 5d4c4e2..418d59c 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -388,10 +388,8 @@
     case kIntrinsicGetCharsNoCheck:
       return Intrinsics::kStringGetCharsNoCheck;
     case kIntrinsicIsEmptyOrLength:
-      // The inliner can handle these two cases - and this is the preferred approach
-      // since after inlining the call is no longer visible (as opposed to waiting
-      // until codegen to handle intrinsic).
-      return Intrinsics::kNone;
+      return ((method.d.data & kIntrinsicFlagIsEmpty) == 0) ?
+          Intrinsics::kStringLength : Intrinsics::kStringIsEmpty;
     case kIntrinsicIndexOf:
       return ((method.d.data & kIntrinsicFlagBase0) == 0) ?
           Intrinsics::kStringIndexOfAfter : Intrinsics::kStringIndexOf;
diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h
index 863dd1c..214250f 100644
--- a/compiler/optimizing/intrinsics.h
+++ b/compiler/optimizing/intrinsics.h
@@ -30,6 +30,10 @@
 // Temporary measure until we have caught up with the Java 7 definition of Math.round. b/26327751
 static constexpr bool kRoundIsPlusPointFive = false;
 
+// Positive floating-point infinities.
+static constexpr uint32_t kPositiveInfinityFloat = 0x7f800000U;
+static constexpr uint64_t kPositiveInfinityDouble = UINT64_C(0x7ff0000000000000);
+
 // Recognize intrinsics from HInvoke nodes.
 class IntrinsicsRecognizer : public HOptimization {
  public:
@@ -235,6 +239,8 @@
 UNREACHABLE_INTRINSIC(Arch, LongCompare)            \
 UNREACHABLE_INTRINSIC(Arch, IntegerSignum)          \
 UNREACHABLE_INTRINSIC(Arch, LongSignum)             \
+UNREACHABLE_INTRINSIC(Arch, StringIsEmpty)          \
+UNREACHABLE_INTRINSIC(Arch, StringLength)           \
 UNREACHABLE_INTRINSIC(Arch, UnsafeLoadFence)        \
 UNREACHABLE_INTRINSIC(Arch, UnsafeStoreFence)       \
 UNREACHABLE_INTRINSIC(Arch, UnsafeFullFence)
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 6c253ad..0ec0366 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -47,19 +47,6 @@
   if (res == nullptr) {
     return false;
   }
-  if (kEmitCompilerReadBarrier && res->CanCall()) {
-    // Generating an intrinsic for this HInvoke may produce an
-    // IntrinsicSlowPathARM slow path.  Currently this approach
-    // does not work when using read barriers, as the emitted
-    // calling sequence will make use of another slow path
-    // (ReadBarrierForRootSlowPathARM for HInvokeStaticOrDirect,
-    // ReadBarrierSlowPathARM for HInvokeVirtual).  So we bail
-    // out in this case.
-    //
-    // TODO: Find a way to have intrinsics work with read barriers.
-    invoke->SetLocations(nullptr);
-    return false;
-  }
   return res->Intrinsified();
 }
 
@@ -524,8 +511,8 @@
       if (kEmitCompilerReadBarrier) {
         if (kUseBakerReadBarrier) {
           Location temp = locations->GetTemp(0);
-          codegen->GenerateArrayLoadWithBakerReadBarrier(
-              invoke, trg_loc, base, 0U, offset_loc, temp, /* needs_null_check */ false);
+          codegen->GenerateReferenceLoadWithBakerReadBarrier(
+              invoke, trg_loc, base, 0U, offset_loc, TIMES_1, temp, /* needs_null_check */ false);
           if (is_volatile) {
             __ dmb(ISH);
           }
@@ -581,10 +568,11 @@
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  locations->SetOut(Location::RequiresRegister(),
+                    can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
-    // path in InstructionCodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
+    // path in InstructionCodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier.
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -919,9 +907,10 @@
   // The UnsafeCASObject intrinsic is missing a read barrier, and
   // therefore sometimes does not work as expected (b/25883050).
   // Turn it off temporarily as a quick fix, until the read barrier is
-  // implemented (see TODO in GenCAS below).
+  // implemented (see TODO in GenCAS).
   //
-  // TODO(rpl): Fix this issue and re-enable this intrinsic with read barriers.
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
+  // this intrinsic.
   if (kEmitCompilerReadBarrier) {
     return;
   }
@@ -932,6 +921,15 @@
   GenCas(invoke->GetLocations(), Primitive::kPrimInt, codegen_);
 }
 void IntrinsicCodeGeneratorARM::VisitUnsafeCASObject(HInvoke* invoke) {
+  // The UnsafeCASObject intrinsic is missing a read barrier, and
+  // therefore sometimes does not work as expected (b/25883050).
+  // Turn it off temporarily as a quick fix, until the read barrier is
+  // implemented (see TODO in GenCAS).
+  //
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
+  // this intrinsic.
+  DCHECK(!kEmitCompilerReadBarrier);
+
   GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_);
 }
 
@@ -987,31 +985,126 @@
 void IntrinsicLocationsBuilderARM::VisitStringCompareTo(HInvoke* invoke) {
-  // The inputs plus one temp.
+  // The inputs plus three temps.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCall,
+                                                            invoke->InputAt(1)->CanBeNull()
+                                                                ? LocationSummary::kCallOnSlowPath
+                                                                : LocationSummary::kNoCall,
                                                             kIntrinsified);
-  InvokeRuntimeCallingConvention calling_convention;
-  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
-  locations->SetOut(Location::RegisterLocation(R0));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void IntrinsicCodeGeneratorARM::VisitStringCompareTo(HInvoke* invoke) {
   ArmAssembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
+  Register str = locations->InAt(0).AsRegister<Register>();
+  Register arg = locations->InAt(1).AsRegister<Register>();
+  Register out = locations->Out().AsRegister<Register>();
+
+  Register temp0 = locations->GetTemp(0).AsRegister<Register>();
+  Register temp1 = locations->GetTemp(1).AsRegister<Register>();
+  Register temp2 = locations->GetTemp(2).AsRegister<Register>();
+
+  Label loop;
+  Label find_char_diff;
+  Label end;
+
+  // Get offsets of count and value fields within a string object.
+  const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  Register argument = locations->InAt(1).AsRegister<Register>();
-  __ cmp(argument, ShifterOperand(0));
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
-  codegen_->AddSlowPath(slow_path);
-  __ b(slow_path->GetEntryLabel(), EQ);
+  // Take the slow path and throw if the input can be and is null.
+  SlowPathCode* slow_path = nullptr;
+  const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
+  if (can_slow_path) {
+    slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
+    codegen_->AddSlowPath(slow_path);
+    __ CompareAndBranchIfZero(arg, slow_path->GetEntryLabel());
+  }
 
-  __ LoadFromOffset(
-      kLoadWord, LR, TR, QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pStringCompareTo).Int32Value());
-  __ blx(LR);
-  __ Bind(slow_path->GetExitLabel());
+  // Reference equality check, return 0 if same reference.
+  __ subs(out, str, ShifterOperand(arg));
+  __ b(&end, EQ);
+  // Load lengths of this and argument strings.
+  __ ldr(temp2, Address(str, count_offset));
+  __ ldr(temp1, Address(arg, count_offset));
+  // out = length diff.
+  __ subs(out, temp2, ShifterOperand(temp1));
+  // temp0 = min(len(str), len(arg)).
+  __ it(Condition::LT, kItElse);
+  __ mov(temp0, ShifterOperand(temp2), Condition::LT);
+  __ mov(temp0, ShifterOperand(temp1), Condition::GE);
+  // Shorter string is empty?
+  __ CompareAndBranchIfZero(temp0, &end);
+
+  // Store offset of string value in preparation for comparison loop.
+  __ mov(temp1, ShifterOperand(value_offset));
+
+  // Assertions that must hold in order to compare multiple characters at a time.
+  CHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment),
+                "String data must be 8-byte aligned for unrolled CompareTo loop.");
+
+  const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
+  DCHECK_EQ(char_size, 2u);
+
+  // Unrolled loop comparing 4x16-bit chars per iteration (ok because of string data alignment).
+  __ Bind(&loop);
+  __ ldr(IP, Address(str, temp1));
+  __ ldr(temp2, Address(arg, temp1));
+  __ cmp(IP, ShifterOperand(temp2));
+  __ b(&find_char_diff, NE);
+  __ add(temp1, temp1, ShifterOperand(char_size * 2));
+  __ sub(temp0, temp0, ShifterOperand(2));
+
+  __ ldr(IP, Address(str, temp1));
+  __ ldr(temp2, Address(arg, temp1));
+  __ cmp(IP, ShifterOperand(temp2));
+  __ b(&find_char_diff, NE);
+  __ add(temp1, temp1, ShifterOperand(char_size * 2));
+  __ subs(temp0, temp0, ShifterOperand(2));
+
+  __ b(&loop, GT);
+  __ b(&end);
+
+  // Find the single 16-bit character difference.
+  __ Bind(&find_char_diff);
+  // Get the bit position of the first character that differs.
+  __ eor(temp1, temp2, ShifterOperand(IP));
+  __ rbit(temp1, temp1);
+  __ clz(temp1, temp1);
+
+  // temp0 = number of 16-bit characters remaining to compare.
+  // (It could be < 1 if a difference is found after the first SUB in the comparison loop,
+  // i.e. past the end of the shorter string's data.)
+
+  // (temp1 >> 4) = index of the character where the difference occurs in the last two words
+  // compared, on the interval [0,1] (0 if the low half-word differs, 1 if the high half-word
+  // differs).
+
+  // If temp0 <= (temp1 >> 4), the difference occurs outside the remaining string data, so just
+  // return length diff (out).
+  __ cmp(temp0, ShifterOperand(temp1, LSR, 4));
+  __ b(&end, LE);
+  // Extract the characters and calculate the difference.
+  __ bic(temp1, temp1, ShifterOperand(0xf));
+  __ Lsr(temp2, temp2, temp1);
+  __ Lsr(IP, IP, temp1);
+  __ movt(temp2, 0);
+  __ movt(IP, 0);
+  __ sub(out, IP, ShifterOperand(temp2));
+
+  __ Bind(&end);
+
+  if (can_slow_path) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
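
The bit manipulation in the find_char_diff block can be checked on the host: RBIT followed by CLZ computes the index of the lowest set bit (i.e. a count of trailing zeros), shifting that right by 4 gives the half-word index within the 32-bit word, and clearing the low 4 bits with BIC turns it back into a shift distance of 0 or 16. A sketch of the same computation, assuming GCC/Clang's __builtin_ctz:

    #include <cassert>
    #include <cstdint>

    // Signed difference of the first differing 16-bit char in two 32-bit
    // words known to differ (little-endian half-word order, as on ARM).
    int32_t FindCharDiff(uint32_t str_word, uint32_t arg_word) {
      uint32_t diff_bits = str_word ^ arg_word;        // EOR
      uint32_t bit = __builtin_ctz(diff_bits);         // RBIT + CLZ
      uint32_t shift = bit & ~UINT32_C(0xf);           // BIC low 4 bits: 0 or 16
      uint16_t str_char = static_cast<uint16_t>(str_word >> shift);  // LSR + MOVT #0
      uint16_t arg_char = static_cast<uint16_t>(arg_word >> shift);
      return static_cast<int32_t>(str_char) - static_cast<int32_t>(arg_char);
    }

    int main() {
      uint32_t str_word = 'a' | ('b' << 16);
      uint32_t arg_word = 'a' | ('c' << 16);
      assert(FindCharDiff(str_word, arg_word) == 'b' - 'c');
      return 0;
    }
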
 
 void IntrinsicLocationsBuilderARM::VisitStringEquals(HInvoke* invoke) {
@@ -1055,17 +1148,22 @@
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  // Check if input is null, return false if it is.
-  __ CompareAndBranchIfZero(arg, &return_false);
+  StringEqualsOptimizations optimizations(invoke);
+  if (!optimizations.GetArgumentNotNull()) {
+    // Check if input is null, return false if it is.
+    __ CompareAndBranchIfZero(arg, &return_false);
+  }
 
-  // Instanceof check for the argument by comparing class fields.
-  // All string objects must have the same type since String cannot be subclassed.
-  // Receiver must be a string object, so its class field is equal to all strings' class fields.
-  // If the argument is a string object, its class field must be equal to receiver's class field.
-  __ ldr(temp, Address(str, class_offset));
-  __ ldr(temp1, Address(arg, class_offset));
-  __ cmp(temp, ShifterOperand(temp1));
-  __ b(&return_false, NE);
+  if (!optimizations.GetArgumentIsString()) {
+    // Instanceof check for the argument by comparing class fields.
+    // All string objects must have the same type since String cannot be subclassed.
+    // Receiver must be a string object, so its class field is equal to all strings' class fields.
+    // If the argument is a string object, its class field must be equal to receiver's class field.
+    __ ldr(temp, Address(str, class_offset));
+    __ ldr(temp1, Address(arg, class_offset));
+    __ cmp(temp, ShifterOperand(temp1));
+    __ b(&return_false, NE);
+  }
 
   // Load lengths of this and argument strings.
   __ ldr(temp, Address(str, count_offset));
@@ -1082,7 +1180,7 @@
 
   // Assertions that must hold in order to compare strings 2 characters at a time.
   DCHECK_ALIGNED(value_offset, 4);
-  static_assert(IsAligned<4>(kObjectAlignment), "String of odd length is not zero padded");
+  static_assert(IsAligned<4>(kObjectAlignment), "String data must be aligned for fast compare.");
 
   __ LoadImmediate(temp1, value_offset);
 
@@ -1115,16 +1213,16 @@
                                        ArenaAllocator* allocator,
                                        bool start_at_zero) {
   LocationSummary* locations = invoke->GetLocations();
-  Register tmp_reg = locations->GetTemp(0).AsRegister<Register>();
 
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
-  // or directly dispatch if we have a constant.
+  // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
   SlowPathCode* slow_path = nullptr;
-  if (invoke->InputAt(1)->IsIntConstant()) {
-    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+  HInstruction* code_point = invoke->InputAt(1);
+  if (code_point->IsIntConstant()) {
+    if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
         std::numeric_limits<uint16_t>::max()) {
       // Always needs the slow-path. We could directly dispatch to it, but this case should be
       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
@@ -1134,16 +1232,18 @@
       __ Bind(slow_path->GetExitLabel());
       return;
     }
-  } else {
+  } else if (code_point->GetType() != Primitive::kPrimChar) {
     Register char_reg = locations->InAt(1).AsRegister<Register>();
-    __ LoadImmediate(tmp_reg, std::numeric_limits<uint16_t>::max());
-    __ cmp(char_reg, ShifterOperand(tmp_reg));
+    // 0xffff is not a modified immediate, but 0x10000 is, so use `>= 0x10000`
+    // instead of `> 0xffff`.
+    __ cmp(char_reg,
+           ShifterOperand(static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1));
     slow_path = new (allocator) IntrinsicSlowPathARM(invoke);
     codegen->AddSlowPath(slow_path);
-    __ b(slow_path->GetEntryLabel(), HI);
+    __ b(slow_path->GetEntryLabel(), HS);
   }
 
   if (start_at_zero) {
+    Register tmp_reg = locations->GetTemp(0).AsRegister<Register>();
     DCHECK_EQ(tmp_reg, R2);
     // Start-index = 0.
     __ LoadImmediate(tmp_reg, 0);
@@ -1170,7 +1270,7 @@
   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetOut(Location::RegisterLocation(R0));
 
-  // Need a temp for slow-path codepoint compare, and need to send start-index=0.
+  // Need to send start-index=0.
   locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
 }
 
@@ -1190,9 +1290,6 @@
   locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
   locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
   locations->SetOut(Location::RegisterLocation(R0));
-
-  // Need a temp for slow-path codepoint compare.
-  locations->AddTemp(Location::RequiresRegister());
 }
 
 void IntrinsicCodeGeneratorARM::VisitStringIndexOfAfter(HInvoke* invoke) {
@@ -1285,6 +1382,12 @@
 }
 
 void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) {
+  // TODO(rpl): Implement read barriers in the SystemArrayCopy
+  // intrinsic and re-enable it (b/29516905).
+  if (kEmitCompilerReadBarrier) {
+    return;
+  }
+
   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
   LocationSummary* locations = invoke->GetLocations();
   if (locations == nullptr) {
@@ -1369,11 +1472,11 @@
   }
 }
 
-// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
-// Note that this code path is not used (yet) because we do not
-// intrinsify methods that can go into the IntrinsicSlowPathARM
-// slow path.
 void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) {
+  // TODO(rpl): Implement read barriers in the SystemArrayCopy
+  // intrinsic and re-enable it (b/29516905).
+  DCHECK(!kEmitCompilerReadBarrier);
+
   ArmAssembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
@@ -1929,6 +2032,50 @@
   __ revsh(out, in);
 }
 
+static void GenBitCount(HInvoke* instr, bool is64bit, ArmAssembler* assembler) {
+  DCHECK(instr->GetType() == Primitive::kPrimInt);
+  DCHECK((is64bit && instr->InputAt(0)->GetType() == Primitive::kPrimLong) ||
+         (!is64bit && instr->InputAt(0)->GetType() == Primitive::kPrimInt));
+
+  LocationSummary* locations = instr->GetLocations();
+  Location     in = locations->InAt(0);
+  Register  src_0 = is64bit ? in.AsRegisterPairLow<Register>() : in.AsRegister<Register>();
+  Register  src_1 = is64bit ? in.AsRegisterPairHigh<Register>() : src_0;
+  SRegister tmp_s = locations->GetTemp(0).AsFpuRegisterPairLow<SRegister>();
+  DRegister tmp_d = FromLowSToD(tmp_s);
+  Register  out_r = locations->Out().AsRegister<Register>();
+
+  // Move data from core register(s) to temp D-reg for bit count calculation, then move back.
+  // According to the Cortex-A57 and Cortex-A72 optimization guides, compared to transferring to
+  // a full D-reg, transferring data from a core reg to the upper or lower half of a VFP D-reg
+  // requires extra latency. That's why, for the integer bit count, we use 'vmov d0, r0, r0'
+  // instead of 'vmov d0[0], r0'.
+  __ vmovdrr(tmp_d, src_1, src_0);                         // Temp DReg |--src_1|--src_0|
+  __ vcntd(tmp_d, tmp_d);                                  // Temp DReg |c|c|c|c|c|c|c|c|
+  __ vpaddld(tmp_d, tmp_d, 8, /* is_unsigned */ true);     // Temp DReg |--c|--c|--c|--c|
+  __ vpaddld(tmp_d, tmp_d, 16, /* is_unsigned */ true);    // Temp DReg |------c|------c|
+  if (is64bit) {
+    __ vpaddld(tmp_d, tmp_d, 32, /* is_unsigned */ true);  // Temp DReg |--------------c|
+  }
+  __ vmovrs(out_r, tmp_s);
+}
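
The VCNT/VPADDL sequence above is the standard SIMD popcount reduction: VCNT.8 produces a bit count per byte, and each VPADDL pairwise-adds adjacent lanes, doubling the lane width, until a single count remains. A scalar model of the same reduction over a 64-bit value, assuming GCC/Clang's __builtin_popcount:

    #include <cassert>
    #include <cstdint>

    // Models vcnt.8: a bit count in every byte lane.   |c|c|c|c|c|c|c|c|
    uint64_t Vcnt8(uint64_t v) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i += 8) {
        r |= static_cast<uint64_t>(__builtin_popcount((v >> i) & 0xff)) << i;
      }
      return r;
    }

    // Models vpaddl: pairwise-add adjacent lanes, doubling the lane width.
    uint64_t Vpaddl(uint64_t v, int lane_bits) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i += 2 * lane_bits) {
        uint64_t lo = (v >> i) & ((1ULL << lane_bits) - 1);
        uint64_t hi = (v >> (i + lane_bits)) & ((1ULL << lane_bits) - 1);
        r |= (lo + hi) << i;
      }
      return r;
    }

    int main() {
      uint64_t x = 0xf0f0f0f0f0f0f0f0ULL;  // 32 set bits.
      uint64_t d = Vcnt8(x);               // |c|c|c|c|c|c|c|c|
      d = Vpaddl(d, 8);                    // |--c|--c|--c|--c|
      d = Vpaddl(d, 16);                   // |------c|------c|
      d = Vpaddl(d, 32);                   // |--------------c|
      assert(d == 32);
      return 0;
    }
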
+
+void IntrinsicLocationsBuilderARM::VisitIntegerBitCount(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM::VisitIntegerBitCount(HInvoke* invoke) {
+  GenBitCount(invoke, /* is64bit */ false, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderARM::VisitLongBitCount(HInvoke* invoke) {
+  VisitIntegerBitCount(invoke);
+}
+
+void IntrinsicCodeGeneratorARM::VisitLongBitCount(HInvoke* invoke) {
+  GenBitCount(invoke, /* is64bit */ true, GetAssembler());
+}
+
 void IntrinsicLocationsBuilderARM::VisitStringGetCharsNoCheck(HInvoke* invoke) {
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
                                                             LocationSummary::kNoCall,
@@ -1939,7 +2086,7 @@
   locations->SetInAt(3, Location::RequiresRegister());
   locations->SetInAt(4, Location::RequiresRegister());
 
-  locations->AddTemp(Location::RequiresRegister());
+  // Temporary registers to store lengths of strings and for calculations.
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
@@ -1967,33 +2114,108 @@
   Register dstObj = locations->InAt(3).AsRegister<Register>();
   Register dstBegin = locations->InAt(4).AsRegister<Register>();
 
-  Register src_ptr = locations->GetTemp(0).AsRegister<Register>();
-  Register src_ptr_end = locations->GetTemp(1).AsRegister<Register>();
+  Register num_chr = locations->GetTemp(0).AsRegister<Register>();
+  Register src_ptr = locations->GetTemp(1).AsRegister<Register>();
   Register dst_ptr = locations->GetTemp(2).AsRegister<Register>();
-  Register tmp = locations->GetTemp(3).AsRegister<Register>();
 
   // src range to copy.
   __ add(src_ptr, srcObj, ShifterOperand(value_offset));
-  __ add(src_ptr_end, src_ptr, ShifterOperand(srcEnd, LSL, 1));
   __ add(src_ptr, src_ptr, ShifterOperand(srcBegin, LSL, 1));
 
   // dst to be copied.
   __ add(dst_ptr, dstObj, ShifterOperand(data_offset));
   __ add(dst_ptr, dst_ptr, ShifterOperand(dstBegin, LSL, 1));
 
+  __ subs(num_chr, srcEnd, ShifterOperand(srcBegin));
+
   // Do the copy.
-  Label loop, done;
-  __ Bind(&loop);
-  __ cmp(src_ptr, ShifterOperand(src_ptr_end));
+  Label loop, remainder, done;
+
+  // Early out for valid zero-length retrievals.
   __ b(&done, EQ);
-  __ ldrh(tmp, Address(src_ptr, char_size, Address::PostIndex));
-  __ strh(tmp, Address(dst_ptr, char_size, Address::PostIndex));
-  __ b(&loop);
+
+  // Subtract into IP so that num_chr does not need repairing on the < 4 character path.
+  __ subs(IP, num_chr, ShifterOperand(4));
+  __ b(&remainder, LT);
+
+  // Keep the result of the earlier subs; we are going to fetch at least 4 characters.
+  __ mov(num_chr, ShifterOperand(IP));
+
+  // Main loop used for longer fetches; loads and stores 4x16-bit characters at a time.
+  // (LDRD/STRD fault on unaligned addresses and it's not worth inlining extra code
+  // to handle alignment everywhere this intrinsic applies.)
+  __ Bind(&loop);
+  __ ldr(IP, Address(src_ptr, char_size * 2));
+  __ subs(num_chr, num_chr, ShifterOperand(4));
+  __ str(IP, Address(dst_ptr, char_size * 2));
+  __ ldr(IP, Address(src_ptr, char_size * 4, Address::PostIndex));
+  __ str(IP, Address(dst_ptr, char_size * 4, Address::PostIndex));
+  __ b(&loop, GE);
+
+  __ adds(num_chr, num_chr, ShifterOperand(4));
+  __ b(&done, EQ);
+
+  // Main loop for < 4 character case and remainder handling. Loads and stores one
+  // 16-bit Java character at a time.
+  __ Bind(&remainder);
+  __ ldrh(IP, Address(src_ptr, char_size, Address::PostIndex));
+  __ subs(num_chr, num_chr, ShifterOperand(1));
+  __ strh(IP, Address(dst_ptr, char_size, Address::PostIndex));
+  __ b(&remainder, GT);
+
   __ Bind(&done);
 }
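
In host-side terms, the copy above is a main loop moving four 16-bit chars per iteration via two 32-bit load/store pairs, followed by a one-char-at-a-time remainder loop (LDRH/STRH). A minimal sketch; memcpy stands in for the word accesses so that no 8-byte alignment is assumed:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Copies num_chr 16-bit chars, four at a time, with a one-char remainder
    // loop; mirrors the structure of the intrinsic's unrolled copy.
    void CopyChars(const uint16_t* src, uint16_t* dst, int num_chr) {
      while (num_chr >= 4) {
        // The intrinsic uses two 32-bit LDR/STR pairs here.
        std::memcpy(dst, src, 4 * sizeof(uint16_t));
        src += 4;
        dst += 4;
        num_chr -= 4;
      }
      while (num_chr > 0) {  // Remainder: one 16-bit char at a time.
        *dst++ = *src++;
        --num_chr;
      }
    }

    int main() {
      uint16_t src[7] = {1, 2, 3, 4, 5, 6, 7};
      uint16_t dst[7] = {};
      CopyChars(src, dst, 7);
      assert(std::memcmp(src, dst, sizeof(src)) == 0);
      return 0;
    }
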
 
-UNIMPLEMENTED_INTRINSIC(ARM, IntegerBitCount)
-UNIMPLEMENTED_INTRINSIC(ARM, LongBitCount)
+void IntrinsicLocationsBuilderARM::VisitFloatIsInfinite(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM::VisitFloatIsInfinite(HInvoke* invoke) {
+  ArmAssembler* const assembler = GetAssembler();
+  LocationSummary* const locations = invoke->GetLocations();
+  const Register out = locations->Out().AsRegister<Register>();
+  // Shifting left by 1 bit makes the value encodable as an immediate operand;
+  // we don't care about the sign bit anyway.
+  constexpr uint32_t infinity = kPositiveInfinityFloat << 1U;
+
+  __ vmovrs(out, locations->InAt(0).AsFpuRegister<SRegister>());
+  // We don't care about the sign bit, so shift left.
+  __ Lsl(out, out, 1);
+  __ eor(out, out, ShifterOperand(infinity));
+  // If the result is 0, then it has 32 leading zeros, and less than that otherwise.
+  __ clz(out, out);
+  // Any number less than 32 logically shifted right by 5 bits results in 0;
+  // the same operation on 32 yields 1.
+  __ Lsr(out, out, 5);
+}
+
+void IntrinsicLocationsBuilderARM::VisitDoubleIsInfinite(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM::VisitDoubleIsInfinite(HInvoke* invoke) {
+  ArmAssembler* const assembler = GetAssembler();
+  LocationSummary* const locations = invoke->GetLocations();
+  const Register out = locations->Out().AsRegister<Register>();
+  // The highest 32 bits of double precision positive infinity separated into
+  // two constants encodable as immediate operands.
+  constexpr uint32_t infinity_high  = 0x7f000000U;
+  constexpr uint32_t infinity_high2 = 0x00f00000U;
+
+  static_assert((infinity_high | infinity_high2) ==
+                    static_cast<uint32_t>(kPositiveInfinityDouble >> 32U),
+                "The constants do not add up to the high 32 bits of double "
+                "precision positive infinity.");
+  __ vmovrrd(IP, out, FromLowSToD(locations->InAt(0).AsFpuRegisterPairLow<SRegister>()));
+  __ eor(out, out, ShifterOperand(infinity_high));
+  __ eor(out, out, ShifterOperand(infinity_high2));
+  // We don't care about the sign bit, so shift left.
+  __ orr(out, IP, ShifterOperand(out, LSL, 1));
+  // If the result is 0, then it has 32 leading zeros, and less than that otherwise.
+  __ clz(out, out);
+  // Any number less than 32 logically shifted right by 5 bits results in 0;
+  // the same operation on 32 yields 1.
+  __ Lsr(out, out, 5);
+}
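
Both visitors above reduce isInfinite to: XOR against the infinity bit pattern with the sign bit discarded, then map zero to 1 and anything else to 0 via CLZ and a right shift by 5, since only zero has 32 leading zeros (the double variant XORs the high word against the infinity pattern, shifts out its sign bit, and ORs in the low word before the same CLZ/LSR trick). A host-side sketch of the float case, reusing the kPositiveInfinityFloat constant defined in intrinsics.h above; __builtin_clz(0) is undefined in C++, so the zero case is modeled explicitly:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <limits>

    constexpr uint32_t kPositiveInfinityFloat = 0x7f800000U;

    // Branch-free isInfinite: LSL #1 drops the sign bit, EOR compares against
    // (infinity << 1), CLZ + LSR #5 maps 0 to 1 and every non-zero value to 0.
    int FloatIsInfinite(float value) {
      uint32_t bits;
      std::memcpy(&bits, &value, sizeof(bits));
      uint32_t x = (bits << 1) ^ (kPositiveInfinityFloat << 1);
      uint32_t clz = (x == 0) ? 32u : static_cast<uint32_t>(__builtin_clz(x));
      return static_cast<int>(clz >> 5);
    }

    int main() {
      const float inf = std::numeric_limits<float>::infinity();
      assert(FloatIsInfinite(inf) == 1);
      assert(FloatIsInfinite(-inf) == 1);
      assert(FloatIsInfinite(3.5f) == 0);
      assert(FloatIsInfinite(std::numeric_limits<float>::quiet_NaN()) == 0);
      return 0;
    }
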
+
 UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble)
 UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat)
 UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble)
@@ -2008,8 +2230,6 @@
 UNIMPLEMENTED_INTRINSIC(ARM, UnsafeCASLong)     // High register pressure.
 UNIMPLEMENTED_INTRINSIC(ARM, SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ARM, ReferenceGetReferent)
-UNIMPLEMENTED_INTRINSIC(ARM, FloatIsInfinite)
-UNIMPLEMENTED_INTRINSIC(ARM, DoubleIsInfinite)
 UNIMPLEMENTED_INTRINSIC(ARM, IntegerHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM, LongHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM, IntegerLowestOneBit)
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 696fa52..f2d5e08 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -47,6 +47,7 @@
 using helpers::WRegisterFrom;
 using helpers::XRegisterFrom;
 using helpers::InputRegisterAt;
+using helpers::OutputRegister;
 
 namespace {
 
@@ -148,19 +149,6 @@
   if (res == nullptr) {
     return false;
   }
-  if (kEmitCompilerReadBarrier && res->CanCall()) {
-    // Generating an intrinsic for this HInvoke may produce an
-    // IntrinsicSlowPathARM64 slow path.  Currently this approach
-    // does not work when using read barriers, as the emitted
-    // calling sequence will make use of another slow path
-    // (ReadBarrierForRootSlowPathARM64 for HInvokeStaticOrDirect,
-    // ReadBarrierSlowPathARM64 for HInvokeVirtual).  So we bail
-    // out in this case.
-    //
-    // TODO: Find a way to have intrinsics work with read barriers.
-    invoke->SetLocations(nullptr);
-    return false;
-  }
   return res->Intrinsified();
 }
 
@@ -620,54 +608,66 @@
   __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
 }
 
-static void CreateFPToIntPlusTempLocations(ArenaAllocator* arena, HInvoke* invoke) {
+static void CreateFPToIntPlusFPTempLocations(ArenaAllocator* arena, HInvoke* invoke) {
   LocationSummary* locations = new (arena) LocationSummary(invoke,
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::RequiresFpuRegister());
   locations->SetOut(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresFpuRegister());
 }
 
-static void GenMathRound(LocationSummary* locations,
-                         bool is_double,
-                         vixl::MacroAssembler* masm) {
-  FPRegister in_reg = is_double ?
-      DRegisterFrom(locations->InAt(0)) : SRegisterFrom(locations->InAt(0));
-  Register out_reg = is_double ?
-      XRegisterFrom(locations->Out()) : WRegisterFrom(locations->Out());
-  UseScratchRegisterScope temps(masm);
-  FPRegister temp1_reg = temps.AcquireSameSizeAs(in_reg);
+static void GenMathRound(HInvoke* invoke, bool is_double, vixl::MacroAssembler* masm) {
+  // Java 8 API definition for Math.round():
+  // Return the closest long or int to the argument, with ties rounding to positive infinity.
+  //
+  // There is no single instruction in ARMv8 that can support the above definition.
+  // We choose to use FCVTAS here because it has the closest semantics.
+  // FCVTAS performs rounding to the nearest integer, with ties away from zero.
+  // For most inputs (positive values, zero or NaN), this instruction is enough.
+  // We only need a little extra handling after FCVTAS if the input is a negative half value
+  // (a tie such as -2.5).
+  //
+  // The reason why we didn't choose the FCVTPS instruction here is that, although it performs
+  // rounding toward positive infinity, it doesn't perform rounding to nearest.
+  // For example, FCVTPS(-1.9) = -1 and FCVTPS(1.1) = 2.
+  // If we were using this instruction, more handling code would be needed for most inputs.
+  LocationSummary* l = invoke->GetLocations();
+  FPRegister in_reg = is_double ? DRegisterFrom(l->InAt(0)) : SRegisterFrom(l->InAt(0));
+  FPRegister tmp_fp = is_double ? DRegisterFrom(l->GetTemp(0)) : SRegisterFrom(l->GetTemp(0));
+  Register out_reg = is_double ? XRegisterFrom(l->Out()) : WRegisterFrom(l->Out());
+  vixl::Label done;
 
-  // 0.5 can be encoded as an immediate, so use fmov.
-  if (is_double) {
-    __ Fmov(temp1_reg, static_cast<double>(0.5));
-  } else {
-    __ Fmov(temp1_reg, static_cast<float>(0.5));
-  }
-  __ Fadd(temp1_reg, in_reg, temp1_reg);
-  __ Fcvtms(out_reg, temp1_reg);
+  // Round to nearest integer, ties away from zero.
+  __ Fcvtas(out_reg, in_reg);
+
+  // For positive values, zero or NaN inputs, rounding is done.
+  __ Tbz(out_reg, out_reg.size() - 1, &done);
+
+  // Handle input < 0 cases.
+  // If input is negative but not a tie, previous result (round to nearest) is valid.
+  // If input is a negative tie, out_reg += 1.
+  __ Frinta(tmp_fp, in_reg);
+  __ Fsub(tmp_fp, in_reg, tmp_fp);
+  __ Fcmp(tmp_fp, 0.5);
+  __ Cinc(out_reg, out_reg, eq);
+
+  __ Bind(&done);
 }
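
A numeric check of the fix-up above: for in = -2.5, FCVTAS returns -3 (nearest, ties away from zero) and FRINTA(in) = -3.0, so in - FRINTA(in) = 0.5 and the CINC bumps the result to -2, matching Java's ties-toward-positive-infinity rule. A host-side model in standard C++ (std::round also ties away from zero, like FCVTAS/FRINTA):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Models: FCVTAS (round to nearest, ties away from zero), then for negative
    // results add 1 when the input was exactly a half value (FRINTA/FSUB/FCMP/CINC).
    int64_t JavaRound(double in) {
      double nearest = std::round(in);              // ties away from zero
      int64_t out = static_cast<int64_t>(nearest);  // FCVTAS result
      if (out < 0 && in - nearest == 0.5) {         // negative tie: Java rounds toward +inf
        ++out;                                      // CINC
      }
      return out;
    }

    int main() {
      assert(JavaRound(2.5) == 3);    // positive tie: both semantics agree
      assert(JavaRound(-2.5) == -2);  // negative tie: fixed up
      assert(JavaRound(-2.6) == -3);  // negative non-tie: FCVTAS already correct
      assert(JavaRound(-0.5) == 0);
      return 0;
    }
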
 
 void IntrinsicLocationsBuilderARM64::VisitMathRoundDouble(HInvoke* invoke) {
-  // See intrinsics.h.
-  if (kRoundIsPlusPointFive) {
-    CreateFPToIntPlusTempLocations(arena_, invoke);
-  }
+  CreateFPToIntPlusFPTempLocations(arena_, invoke);
 }
 
 void IntrinsicCodeGeneratorARM64::VisitMathRoundDouble(HInvoke* invoke) {
-  GenMathRound(invoke->GetLocations(), /* is_double */ true, GetVIXLAssembler());
+  GenMathRound(invoke, /* is_double */ true, GetVIXLAssembler());
 }
 
 void IntrinsicLocationsBuilderARM64::VisitMathRoundFloat(HInvoke* invoke) {
-  // See intrinsics.h.
-  if (kRoundIsPlusPointFive) {
-    CreateFPToIntPlusTempLocations(arena_, invoke);
-  }
+  CreateFPToIntPlusFPTempLocations(arena_, invoke);
 }
 
 void IntrinsicCodeGeneratorARM64::VisitMathRoundFloat(HInvoke* invoke) {
-  GenMathRound(invoke->GetLocations(), /* is_double */ false, GetVIXLAssembler());
+  GenMathRound(invoke, /* is_double */ false, GetVIXLAssembler());
 }
 
 void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) {
@@ -790,8 +790,15 @@
     // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
     UseScratchRegisterScope temps(masm);
     Register temp = temps.AcquireW();
-    codegen->GenerateArrayLoadWithBakerReadBarrier(
-        invoke, trg_loc, base, 0U, offset_loc, temp, /* needs_null_check */ false);
+    codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke,
+                                                       trg_loc,
+                                                       base,
+                                                       /* offset */ 0U,
+                                                       /* index */ offset_loc,
+                                                       /* scale_factor */ 0U,
+                                                       temp,
+                                                       /* needs_null_check */ false,
+                                                       is_volatile);
   } else {
     // Other cases.
     MemOperand mem_op(base.X(), offset);
@@ -820,7 +827,8 @@
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  locations->SetOut(Location::RequiresRegister(),
+                    can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitUnsafeGet(HInvoke* invoke) {
@@ -1101,9 +1109,10 @@
   // The UnsafeCASObject intrinsic is missing a read barrier, and
   // therefore sometimes does not work as expected (b/25883050).
   // Turn it off temporarily as a quick fix, until the read barrier is
-  // implemented (see TODO in GenCAS below).
+  // implemented (see TODO in GenCAS).
   //
-  // TODO(rpl): Fix this issue and re-enable this intrinsic with read barriers.
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
+  // this intrinsic.
   if (kEmitCompilerReadBarrier) {
     return;
   }
@@ -1118,6 +1127,15 @@
   GenCas(invoke->GetLocations(), Primitive::kPrimLong, codegen_);
 }
 void IntrinsicCodeGeneratorARM64::VisitUnsafeCASObject(HInvoke* invoke) {
+  // The UnsafeCASObject intrinsic is missing a read barrier, and
+  // therefore sometimes does not work as expected (b/25883050).
+  // Turn it off temporarily as a quick fix, until the read barrier is
+  // implemented (see TODO in GenCAS).
+  //
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
+  // this intrinsic.
+  DCHECK(!kEmitCompilerReadBarrier);
+
   GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_);
 }
 
@@ -1173,31 +1191,118 @@
 
 void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
-                                                            LocationSummary::kCall,
+                                                            invoke->InputAt(1)->CanBeNull()
+                                                                ? LocationSummary::kCallOnSlowPath
+                                                                : LocationSummary::kNoCall,
                                                             kIntrinsified);
-  InvokeRuntimeCallingConvention calling_convention;
-  locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
-  locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
-  locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
 void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) {
   vixl::MacroAssembler* masm = GetVIXLAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
+  Register str = XRegisterFrom(locations->InAt(0));
+  Register arg = XRegisterFrom(locations->InAt(1));
+  Register out = OutputRegister(invoke);
+
+  Register temp0 = WRegisterFrom(locations->GetTemp(0));
+  Register temp1 = WRegisterFrom(locations->GetTemp(1));
+  Register temp2 = WRegisterFrom(locations->GetTemp(2));
+
+  vixl::Label loop;
+  vixl::Label find_char_diff;
+  vixl::Label end;
+
+  // Get offsets of count and value fields within a string object.
+  const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  Register argument = WRegisterFrom(locations->InAt(1));
-  __ Cmp(argument, 0);
-  SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen_->AddSlowPath(slow_path);
-  __ B(eq, slow_path->GetEntryLabel());
+  // Take the slow path and throw if the argument can be null and actually is null.
+  SlowPathCodeARM64* slow_path = nullptr;
+  const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
+  if (can_slow_path) {
+    slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+    codegen_->AddSlowPath(slow_path);
+    __ Cbz(arg, slow_path->GetEntryLabel());
+  }
 
-  __ Ldr(
-      lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pStringCompareTo).Int32Value()));
-  __ Blr(lr);
-  __ Bind(slow_path->GetExitLabel());
+  // Reference equality check, return 0 if same reference.
+  __ Subs(out, str, arg);
+  __ B(&end, eq);
+  // Load lengths of this and argument strings.
+  __ Ldr(temp0, MemOperand(str.X(), count_offset));
+  __ Ldr(temp1, MemOperand(arg.X(), count_offset));
+  // Return zero if both strings are empty.
+  __ Orr(out, temp0, temp1);
+  __ Cbz(out, &end);
+  // out = length diff.
+  __ Subs(out, temp0, temp1);
+  // temp2 = min(len(str), len(arg)).
+  __ Csel(temp2, temp1, temp0, ge);
+  // Shorter string is empty?
+  __ Cbz(temp2, &end);
+
+  // Store offset of string value in preparation for comparison loop.
+  __ Mov(temp1, value_offset);
+
+  UseScratchRegisterScope scratch_scope(masm);
+  Register temp4 = scratch_scope.AcquireX();
+
+  // Assertions that must hold in order to compare strings 4 characters at a time.
+  DCHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
+
+  const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
+  DCHECK_EQ(char_size, 2u);
+
+  // Promote temp0 to an X reg, ready for LDR.
+  temp0 = temp0.X();
+
+  // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
+  __ Bind(&loop);
+  __ Ldr(temp4, MemOperand(str.X(), temp1));
+  __ Ldr(temp0, MemOperand(arg.X(), temp1));
+  __ Cmp(temp4, temp0);
+  __ B(ne, &find_char_diff);
+  __ Add(temp1, temp1, char_size * 4);
+  __ Subs(temp2, temp2, 4);
+  __ B(gt, &loop);
+  __ B(&end);
+
+  // Promote temp1 to an X reg, ready for EOR.
+  temp1 = temp1.X();
+
+  // Find the single 16-bit character difference.
+  __ Bind(&find_char_diff);
+  // Get the bit position of the first character that differs.
+  __ Eor(temp1, temp0, temp4);
+  __ Rbit(temp1, temp1);
+  __ Clz(temp1, temp1);
+  // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then
+  // the difference occurs outside the remaining string data, so just return length diff (out).
+  __ Cmp(temp2, Operand(temp1, LSR, 4));
+  __ B(le, &end);
+  // Extract the characters and calculate the difference.
+  __ Bic(temp1, temp1, 0xf);
+  __ Lsr(temp0, temp0, temp1);
+  __ Lsr(temp4, temp4, temp1);
+  __ And(temp4, temp4, 0xffff);
+  __ Sub(out, temp4, Operand(temp0, UXTH));
+
+  __ Bind(&end);
+
+  if (can_slow_path) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
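
The inline fast path above compares four 16-bit characters per 64-bit load and, on a
mismatch, locates the first differing character with Eor/Rbit/Clz. A minimal C++ sketch
of the same logic -- assuming little-endian loads and character data that is 8-byte
aligned and zero-padded to a multiple of four chars, as the DCHECK/static_assert above
require; the helper name is illustrative and __builtin_ctzll stands in for Rbit+Clz:

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // Sketch: scalar model of the inlined String.compareTo fast path.
    int32_t CompareToSketch(const uint16_t* lhs, int32_t lhs_len,
                            const uint16_t* rhs, int32_t rhs_len) {
      int32_t min_len = std::min(lhs_len, rhs_len);
      for (int32_t i = 0; i < min_len; i += 4) {
        uint64_t a, b;
        memcpy(&a, lhs + i, sizeof(a));  // Ldr: four chars at once.
        memcpy(&b, rhs + i, sizeof(b));
        if (a != b) {
          uint64_t diff = a ^ b;               // Eor
          int bit = __builtin_ctzll(diff);     // Rbit + Clz
          if (i + bit / 16 >= min_len) break;  // Difference lies in the padding.
          int shift = bit & ~0xf;              // Bic: round down to a char boundary.
          return static_cast<int32_t>((a >> shift) & 0xffff) -
                 static_cast<int32_t>((b >> shift) & 0xffff);
        }
      }
      return lhs_len - rhs_len;  // Common prefix: return the length difference.
    }
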
 
 void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
@@ -1239,21 +1344,26 @@
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  // Check if input is null, return false if it is.
-  __ Cbz(arg, &return_false);
+  StringEqualsOptimizations optimizations(invoke);
+  if (!optimizations.GetArgumentNotNull()) {
+    // Check if input is null, return false if it is.
+    __ Cbz(arg, &return_false);
+  }
 
   // Reference equality check, return true if same reference.
   __ Cmp(str, arg);
   __ B(&return_true, eq);
 
-  // Instanceof check for the argument by comparing class fields.
-  // All string objects must have the same type since String cannot be subclassed.
-  // Receiver must be a string object, so its class field is equal to all strings' class fields.
-  // If the argument is a string object, its class field must be equal to receiver's class field.
-  __ Ldr(temp, MemOperand(str.X(), class_offset));
-  __ Ldr(temp1, MemOperand(arg.X(), class_offset));
-  __ Cmp(temp, temp1);
-  __ B(&return_false, ne);
+  if (!optimizations.GetArgumentIsString()) {
+    // Instanceof check for the argument by comparing class fields.
+    // All string objects must have the same type since String cannot be subclassed.
+    // Receiver must be a string object, so its class field is equal to all strings' class fields.
+    // If the argument is a string object, its class field must be equal to receiver's class field.
+    __ Ldr(temp, MemOperand(str.X(), class_offset));
+    __ Ldr(temp1, MemOperand(arg.X(), class_offset));
+    __ Cmp(temp, temp1);
+    __ B(&return_false, ne);
+  }
 
   // Load lengths of this and argument strings.
   __ Ldr(temp, MemOperand(str.X(), count_offset));
@@ -1302,16 +1412,16 @@
                                        ArenaAllocator* allocator,
                                        bool start_at_zero) {
   LocationSummary* locations = invoke->GetLocations();
-  Register tmp_reg = WRegisterFrom(locations->GetTemp(0));
 
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
-  // or directly dispatch if we have a constant.
+  // or directly dispatch for a large constant, or omit the slow path for a small constant or a char.
   SlowPathCodeARM64* slow_path = nullptr;
-  if (invoke->InputAt(1)->IsIntConstant()) {
-    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) > 0xFFFFU) {
+  HInstruction* code_point = invoke->InputAt(1);
+  if (code_point->IsIntConstant()) {
+    if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) > 0xFFFFU) {
       // Always needs the slow-path. We could directly dispatch to it, but this case should be
       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
       slow_path = new (allocator) IntrinsicSlowPathARM64(invoke);
@@ -1320,17 +1430,17 @@
       __ Bind(slow_path->GetExitLabel());
       return;
     }
-  } else {
+  } else if (code_point->GetType() != Primitive::kPrimChar) {
     Register char_reg = WRegisterFrom(locations->InAt(1));
-    __ Mov(tmp_reg, 0xFFFF);
-    __ Cmp(char_reg, Operand(tmp_reg));
+    __ Tst(char_reg, 0xFFFF0000);
     slow_path = new (allocator) IntrinsicSlowPathARM64(invoke);
     codegen->AddSlowPath(slow_path);
-    __ B(hi, slow_path->GetEntryLabel());
+    __ B(ne, slow_path->GetEntryLabel());
   }
 
   if (start_at_zero) {
     // Start-index = 0.
+    Register tmp_reg = WRegisterFrom(locations->GetTemp(0));
     __ Mov(tmp_reg, 0);
   }
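
The guard above got cheaper in two ways: an input statically typed as kPrimChar needs no
runtime check at all (a char is 16-bit by construction), and when a check is still
needed, a single Tst against 0xFFFF0000 replaces materializing 0xFFFF and comparing.
The unsigned identity it relies on, as a sketch:

    #include <cstdint>

    // Sketch: the Tst-based guard is equivalent to the old Cmp/B(hi) form.
    // For any uint32_t v: (v > 0xFFFF) == ((v & 0xFFFF0000u) != 0).
    bool NeedsSlowPath(uint32_t code_point) {
      return (code_point & 0xFFFF0000u) != 0;  // Tst char_reg, 0xFFFF0000
    }
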
 
@@ -1354,7 +1464,7 @@
   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
 
-  // Need a temp for slow-path codepoint compare, and need to send start_index=0.
+  // Need to send start_index=0.
   locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
 }
 
@@ -1374,9 +1484,6 @@
   locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
   locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
   locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
-
-  // Need a temp for slow-path codepoint compare.
-  locations->AddTemp(Location::RequiresRegister());
 }
 
 void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
@@ -1655,6 +1762,7 @@
 
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
 }
 
 void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
@@ -1680,29 +1788,57 @@
   Register dstBegin = XRegisterFrom(locations->InAt(4));
 
   Register src_ptr = XRegisterFrom(locations->GetTemp(0));
-  Register src_ptr_end = XRegisterFrom(locations->GetTemp(1));
+  Register num_chr = XRegisterFrom(locations->GetTemp(1));
+  Register tmp1 = XRegisterFrom(locations->GetTemp(2));
 
   UseScratchRegisterScope temps(masm);
   Register dst_ptr = temps.AcquireX();
-  Register tmp = temps.AcquireW();
+  Register tmp2 = temps.AcquireX();
 
-  // src range to copy.
+  // src address to copy from.
   __ Add(src_ptr, srcObj, Operand(value_offset));
-  __ Add(src_ptr_end, src_ptr, Operand(srcEnd, LSL, 1));
   __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
 
-  // dst to be copied.
+  // dst start address to copy to.
   __ Add(dst_ptr, dstObj, Operand(data_offset));
   __ Add(dst_ptr, dst_ptr, Operand(dstBegin, LSL, 1));
 
+  __ Sub(num_chr, srcEnd, srcBegin);
+
   // Do the copy.
-  vixl::Label loop, done;
+  vixl::Label loop;
+  vixl::Label done;
+  vixl::Label remainder;
+
+  // Early out for valid zero-length retrievals.
+  __ Cbz(num_chr, &done);
+
+  // Subtract into tmp1 so num_chr keeps its value on the < 8 character path.
+  __ Subs(tmp1, num_chr, 8);
+  __ B(lt, &remainder);
+
+  // Keep the result of the earlier Subs; we are going to fetch at least 8 characters.
+  __ Mov(num_chr, tmp1);
+
+  // Main loop for longer fetches: loads and stores 8x16-bit characters at a time.
+  // (Unaligned addresses are acceptable here and not worth inlining extra code to rectify.)
   __ Bind(&loop);
-  __ Cmp(src_ptr, src_ptr_end);
-  __ B(&done, eq);
-  __ Ldrh(tmp, MemOperand(src_ptr, char_size, vixl::PostIndex));
-  __ Strh(tmp, MemOperand(dst_ptr, char_size, vixl::PostIndex));
-  __ B(&loop);
+  __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, vixl::PostIndex));
+  __ Subs(num_chr, num_chr, 8);
+  __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, vixl::PostIndex));
+  __ B(ge, &loop);
+
+  __ Adds(num_chr, num_chr, 8);
+  __ B(eq, &done);
+
+  // Loop for the < 8 character case and remainder handling; loads and stores one
+  // 16-bit Java character at a time.
+  __ Bind(&remainder);
+  __ Ldrh(tmp1, MemOperand(src_ptr, char_size, vixl::PostIndex));
+  __ Subs(num_chr, num_chr, 1);
+  __ Strh(tmp1, MemOperand(dst_ptr, char_size, vixl::PostIndex));
+  __ B(gt, &remainder);
+
   __ Bind(&done);
 }
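
The rewritten copy above moves eight characters per iteration via a 16-byte load/store
pair (Ldp/Stp) and falls back to a halfword loop for short copies and remainders. A C++
sketch of the control flow, assuming the caller already validated the ranges (which is
what the "NoCheck" in the intrinsic's name implies):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Sketch: structure of the inlined getCharsNoCheck copy.
    void GetCharsSketch(const uint16_t* src, uint16_t* dst, size_t num_chr) {
      if (num_chr == 0) return;  // Early out (Cbz num_chr, &done).
      while (num_chr >= 8) {     // Ldp/Stp path: eight chars per iteration.
        memcpy(dst, src, 8 * sizeof(uint16_t));
        src += 8;
        dst += 8;
        num_chr -= 8;
      }
      while (num_chr != 0) {     // Ldrh/Strh remainder path.
        *dst++ = *src++;
        --num_chr;
      }
    }
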
 
@@ -1943,6 +2079,12 @@
 // We want to use two temporary registers in order to reduce the register pressure in arm64.
 // So we don't use the CodeGenerator::CreateSystemArrayCopyLocationSummary.
 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) {
+  // TODO(rpl): Implement read barriers in the SystemArrayCopy
+  // intrinsic and re-enable it (b/29516905).
+  if (kEmitCompilerReadBarrier) {
+    return;
+  }
+
   // Check to see if we have known failures that will cause us to have to bail out
   // to the runtime, and just generate the runtime call directly.
   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
@@ -1995,6 +2137,10 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
+  // TODO(rpl): Implement read barriers in the SystemArrayCopy
+  // intrinsic and re-enable it (b/29516905).
+  DCHECK(!kEmitCompilerReadBarrier);
+
   vixl::MacroAssembler* masm = GetVIXLAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
@@ -2207,9 +2353,46 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+static void GenIsInfinite(LocationSummary* locations,
+                          bool is64bit,
+                          vixl::MacroAssembler* masm) {
+  Operand infinity;
+  Register out;
+
+  if (is64bit) {
+    infinity = kPositiveInfinityDouble;
+    out = XRegisterFrom(locations->Out());
+  } else {
+    infinity = kPositiveInfinityFloat;
+    out = WRegisterFrom(locations->Out());
+  }
+
+  const Register zero = vixl::Assembler::AppropriateZeroRegFor(out);
+
+  MoveFPToInt(locations, is64bit, masm);
+  __ Eor(out, out, infinity);
+  // We don't care about the sign bit, so shift left.
+  __ Cmp(zero, Operand(out, LSL, 1));
+  __ Cset(out, eq);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFloatIsInfinite(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFloatIsInfinite(HInvoke* invoke) {
+  GenIsInfinite(invoke->GetLocations(), /* is64bit */ false, GetVIXLAssembler());
+}
+
+void IntrinsicLocationsBuilderARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
+  CreateFPToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
+  GenIsInfinite(invoke->GetLocations(), /* is64bit */ true, GetVIXLAssembler());
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
-UNIMPLEMENTED_INTRINSIC(ARM64, FloatIsInfinite)
-UNIMPLEMENTED_INTRINSIC(ARM64, DoubleIsInfinite)
 UNIMPLEMENTED_INTRINSIC(ARM64, IntegerHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM64, LongHighestOneBit)
 UNIMPLEMENTED_INTRINSIC(ARM64, IntegerLowestOneBit)
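
GenIsInfinite above (and the MIPS hunk below, which switches the magic numbers to the
named kPositiveInfinity constants) rests on one bit trick: XOR the raw bits with the
positive-infinity pattern, shift the sign bit out, and test for zero. In portable C++
(a sketch; 0x7FF0000000000000 is the IEEE-754 bit pattern of +infinity for doubles):

    #include <cstdint>
    #include <cstring>

    // Sketch: branch-free isInfinite via Eor + LSL #1 + compare against zero.
    bool IsInfiniteSketch(double value) {
      constexpr uint64_t kPosInfBits = UINT64_C(0x7FF0000000000000);
      uint64_t bits;
      memcpy(&bits, &value, sizeof(bits));  // MoveFPToInt.
      // After the XOR, only the sign bit can remain set for +/-infinity;
      // shifting left by one discards it.
      return ((bits ^ kPosInfBits) << 1) == 0;
    }
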
diff --git a/compiler/optimizing/intrinsics_list.h b/compiler/optimizing/intrinsics_list.h
index dd9294d..db60238 100644
--- a/compiler/optimizing/intrinsics_list.h
+++ b/compiler/optimizing/intrinsics_list.h
@@ -107,6 +107,8 @@
   V(StringGetCharsNoCheck, kDirect, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow) \
   V(StringIndexOf, kDirect, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow) \
   V(StringIndexOfAfter, kDirect, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow) \
+  V(StringIsEmpty, kDirect, kNeedsEnvironmentOrCache, kReadSideEffects, kNoThrow) \
+  V(StringLength, kDirect, kNeedsEnvironmentOrCache, kReadSideEffects, kNoThrow) \
   V(StringNewStringFromBytes, kStatic, kNeedsEnvironmentOrCache, kAllSideEffects, kCanThrow) \
   V(StringNewStringFromChars, kStatic, kNeedsEnvironmentOrCache, kAllSideEffects, kCanThrow) \
   V(StringNewStringFromString, kStatic, kNeedsEnvironmentOrCache, kAllSideEffects, kCanThrow) \
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 19c6a22..fa250a3 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2067,11 +2067,12 @@
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  // Check for code points > 0xFFFF. Either a slow-path check when we
-  // don't know statically, or directly dispatch if we have a constant.
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch for a large constant, or omit the slow path for a small constant or a char.
   SlowPathCodeMIPS* slow_path = nullptr;
-  if (invoke->InputAt(1)->IsIntConstant()) {
-    if (!IsUint<16>(invoke->InputAt(1)->AsIntConstant()->GetValue())) {
+  HInstruction* code_point = invoke->InputAt(1);
+  if (code_point->IsIntConstant()) {
+    if (!IsUint<16>(code_point->AsIntConstant()->GetValue())) {
       // Always needs the slow-path. We could directly dispatch to it,
       // but this case should be rare, so for simplicity just put the
       // full slow-path down and branch unconditionally.
@@ -2081,7 +2082,7 @@
       __ Bind(slow_path->GetExitLabel());
       return;
     }
-  } else {
+  } else if (code_point->GetType() != Primitive::kPrimChar) {
     Register char_reg = locations->InAt(1).AsRegister<Register>();
     // The "bltu" conditional branch tests to see if the character value
     // fits in a valid 16-bit (MIPS halfword) value. If it doesn't then
@@ -2283,10 +2284,10 @@
     // If one, or more, of the exponent bits is zero, then the number can't be infinite.
     if (type == Primitive::kPrimDouble) {
       __ MoveFromFpuHigh(TMP, in);
-      __ LoadConst32(AT, 0x7FF00000);
+      __ LoadConst32(AT, High32Bits(kPositiveInfinityDouble));
     } else {
       __ Mfc1(TMP, in);
-      __ LoadConst32(AT, 0x7F800000);
+      __ LoadConst32(AT, kPositiveInfinityFloat);
     }
     __ Xor(TMP, TMP, AT);
 
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index cf973aa..aa978e5 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -1477,11 +1477,12 @@
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  // Check for code points > 0xFFFF. Either a slow-path check when we
-  // don't know statically, or directly dispatch if we have a constant.
+  // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
+  // or directly dispatch for a large constant, or omit the slow path for a small constant or a char.
   SlowPathCodeMIPS64* slow_path = nullptr;
-  if (invoke->InputAt(1)->IsIntConstant()) {
-    if (!IsUint<16>(invoke->InputAt(1)->AsIntConstant()->GetValue())) {
+  HInstruction* code_point = invoke->InputAt(1);
+  if (code_point->IsIntConstant()) {
+    if (!IsUint<16>(code_point->AsIntConstant()->GetValue())) {
       // Always needs the slow-path. We could directly dispatch to it,
       // but this case should be rare, so for simplicity just put the
       // full slow-path down and branch unconditionally.
@@ -1491,7 +1492,7 @@
       __ Bind(slow_path->GetExitLabel());
       return;
     }
-  } else {
+  } else if (code_point->GetType() != Primitive::kPrimChar) {
     GpuRegister char_reg = locations->InAt(1).AsRegister<GpuRegister>();
     __ LoadConst32(tmp_reg, std::numeric_limits<uint16_t>::max());
     slow_path = new (allocator) IntrinsicSlowPathMIPS64(invoke);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 4aab3e2..580e744 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -60,19 +60,6 @@
   if (res == nullptr) {
     return false;
   }
-  if (kEmitCompilerReadBarrier && res->CanCall()) {
-    // Generating an intrinsic for this HInvoke may produce an
-    // IntrinsicSlowPathX86 slow path.  Currently this approach
-    // does not work when using read barriers, as the emitted
-    // calling sequence will make use of another slow path
-    // (ReadBarrierForRootSlowPathX86 for HInvokeStaticOrDirect,
-    // ReadBarrierSlowPathX86 for HInvokeVirtual).  So we bail
-    // out in this case.
-    //
-    // TODO: Find a way to have intrinsics work with read barriers.
-    invoke->SetLocations(nullptr);
-    return false;
-  }
   return res->Intrinsified();
 }
 
@@ -1319,11 +1306,11 @@
     __ j(kEqual, &return_false);
   }
 
-  // Instanceof check for the argument by comparing class fields.
-  // All string objects must have the same type since String cannot be subclassed.
-  // Receiver must be a string object, so its class field is equal to all strings' class fields.
-  // If the argument is a string object, its class field must be equal to receiver's class field.
   if (!optimizations.GetArgumentIsString()) {
+    // Instanceof check for the argument by comparing class fields.
+    // All string objects must have the same type since String cannot be subclassed.
+    // Receiver must be a string object, so its class field is equal to all strings' class fields.
+    // If the argument is a string object, its class field must be equal to receiver's class field.
     __ movl(ecx, Address(str, class_offset));
     __ cmpl(ecx, Address(arg, class_offset));
     __ j(kNotEqual, &return_false);
@@ -1418,10 +1405,11 @@
   DCHECK_EQ(out, EDI);
 
   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
-  // or directly dispatch if we have a constant.
+  // or directly dispatch for a large constant, or omit the slow path for a small constant or a char.
   SlowPathCode* slow_path = nullptr;
-  if (invoke->InputAt(1)->IsIntConstant()) {
-    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+  HInstruction* code_point = invoke->InputAt(1);
+  if (code_point->IsIntConstant()) {
+    if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
     std::numeric_limits<uint16_t>::max()) {
       // Always needs the slow-path. We could directly dispatch to it, but this case should be
       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
@@ -1431,7 +1419,7 @@
       __ Bind(slow_path->GetExitLabel());
       return;
     }
-  } else {
+  } else if (code_point->GetType() != Primitive::kPrimChar) {
     __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
     slow_path = new (allocator) IntrinsicSlowPathX86(invoke);
     codegen->AddSlowPath(slow_path);
@@ -1919,12 +1907,13 @@
     if (is_volatile) {
       // Need to use XMM to read volatile.
       locations->AddTemp(Location::RequiresFpuRegister());
-      locations->SetOut(Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
     } else {
       locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
     }
   } else {
-    locations->SetOut(Location::RequiresRegister());
+    locations->SetOut(Location::RequiresRegister(),
+                      can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
@@ -2150,9 +2139,9 @@
   // The UnsafeCASObject intrinsic is missing a read barrier, and
   // therefore sometimes does not work as expected (b/25883050).
   // Turn it off temporarily as a quick fix, until the read barrier is
-  // implemented.
+  // implemented (see TODO in GenCAS).
   //
-  // TODO(rpl): Implement a read barrier in GenCAS below and re-enable
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
   // this intrinsic.
   if (kEmitCompilerReadBarrier) {
     return;
@@ -2277,6 +2266,15 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitUnsafeCASObject(HInvoke* invoke) {
+  // The UnsafeCASObject intrinsic is missing a read barrier, and
+  // therefore sometimes does not work as expected (b/25883050).
+  // Turn it off temporarily as a quick fix, until the read barrier is
+  // implemented (see TODO in GenCAS).
+  //
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
+  // this intrinsic.
+  DCHECK(!kEmitCompilerReadBarrier);
+
   GenCAS(Primitive::kPrimNot, invoke, codegen_);
 }
 
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index f726a25..91eea6b 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -54,19 +54,6 @@
   if (res == nullptr) {
     return false;
   }
-  if (kEmitCompilerReadBarrier && res->CanCall()) {
-    // Generating an intrinsic for this HInvoke may produce an
-    // IntrinsicSlowPathX86_64 slow path.  Currently this approach
-    // does not work when using read barriers, as the emitted
-    // calling sequence will make use of another slow path
-    // (ReadBarrierForRootSlowPathX86_64 for HInvokeStaticOrDirect,
-    // ReadBarrierSlowPathX86_64 for HInvokeVirtual).  So we bail
-    // out in this case.
-    //
-    // TODO: Find a way to have intrinsics work with read barriers.
-    invoke->SetLocations(nullptr);
-    return false;
-  }
   return res->Intrinsified();
 }
 
@@ -1122,14 +1109,20 @@
 
 
 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
+  // TODO(rpl): Implement read barriers in the SystemArrayCopy
+  // intrinsic and re-enable it (b/29516905).
+  if (kEmitCompilerReadBarrier) {
+    return;
+  }
+
   CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
 }
 
-// TODO: Implement read barriers in the SystemArrayCopy intrinsic.
-// Note that this code path is not used (yet) because we do not
-// intrinsify methods that can go into the IntrinsicSlowPathX86_64
-// slow path.
 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
+  // TODO(rpl): Implement read barriers in the SystemArrayCopy
+  // intrinsic and re-enable it (b/29516905).
+  DCHECK(!kEmitCompilerReadBarrier);
+
   X86_64Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
@@ -1419,17 +1412,22 @@
   // Note that the null check must have been done earlier.
   DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
 
-  // Check if input is null, return false if it is.
-  __ testl(arg, arg);
-  __ j(kEqual, &return_false);
+  StringEqualsOptimizations optimizations(invoke);
+  if (!optimizations.GetArgumentNotNull()) {
+    // Check if input is null, return false if it is.
+    __ testl(arg, arg);
+    __ j(kEqual, &return_false);
+  }
 
-  // Instanceof check for the argument by comparing class fields.
-  // All string objects must have the same type since String cannot be subclassed.
-  // Receiver must be a string object, so its class field is equal to all strings' class fields.
-  // If the argument is a string object, its class field must be equal to receiver's class field.
-  __ movl(rcx, Address(str, class_offset));
-  __ cmpl(rcx, Address(arg, class_offset));
-  __ j(kNotEqual, &return_false);
+  if (!optimizations.GetArgumentIsString()) {
+    // Instanceof check for the argument by comparing class fields.
+    // All string objects must have the same type since String cannot be subclassed.
+    // Receiver must be a string object, so its class field is equal to all strings' class fields.
+    // If the argument is a string object, its class field must be equal to receiver's class field.
+    __ movl(rcx, Address(str, class_offset));
+    __ cmpl(rcx, Address(arg, class_offset));
+    __ j(kNotEqual, &return_false);
+  }
 
   // Reference equality check, return true if same reference.
   __ cmpl(str, arg);
@@ -1520,10 +1518,11 @@
   DCHECK_EQ(out.AsRegister(), RDI);
 
   // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
-  // or directly dispatch if we have a constant.
+  // or directly dispatch for a large constant, or omit the slow path for a small constant or a char.
   SlowPathCode* slow_path = nullptr;
-  if (invoke->InputAt(1)->IsIntConstant()) {
-    if (static_cast<uint32_t>(invoke->InputAt(1)->AsIntConstant()->GetValue()) >
+  HInstruction* code_point = invoke->InputAt(1);
+  if (code_point->IsIntConstant()) {
+    if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
     std::numeric_limits<uint16_t>::max()) {
       // Always needs the slow-path. We could directly dispatch to it, but this case should be
       // rare, so for simplicity just put the full slow-path down and branch unconditionally.
@@ -1533,7 +1532,7 @@
       __ Bind(slow_path->GetExitLabel());
       return;
     }
-  } else {
+  } else if (code_point->GetType() != Primitive::kPrimChar) {
     __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
     slow_path = new (allocator) IntrinsicSlowPathX86_64(invoke);
     codegen->AddSlowPath(slow_path);
@@ -1988,7 +1987,8 @@
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(),
+                    can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
     // path in InstructionCodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier.
@@ -2175,9 +2175,9 @@
   // The UnsafeCASObject intrinsic is missing a read barrier, and
   // therefore sometimes does not work as expected (b/25883050).
   // Turn it off temporarily as a quick fix, until the read barrier is
-  // implemented.
+  // implemented (see TODO in GenCAS).
   //
-  // TODO(rpl): Implement a read barrier in GenCAS below and re-enable
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
   // this intrinsic.
   if (kEmitCompilerReadBarrier) {
     return;
@@ -2293,6 +2293,15 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
+  // The UnsafeCASObject intrinsic is missing a read barrier, and
+  // therefore sometimes does not work as expected (b/25883050).
+  // Turn it off temporarily as a quick fix, until the read barrier is
+  // implemented (see TODO in GenCAS).
+  //
+  // TODO(rpl): Implement read barrier support in GenCAS and re-enable
+  // this intrinsic.
+  DCHECK(!kEmitCompilerReadBarrier);
+
   GenCAS(Primitive::kPrimNot, invoke, codegen_);
 }
 
diff --git a/compiler/optimizing/licm.cc b/compiler/optimizing/licm.cc
index 7543cd6..a0ded74 100644
--- a/compiler/optimizing/licm.cc
+++ b/compiler/optimizing/licm.cc
@@ -30,8 +30,8 @@
 static bool InputsAreDefinedBeforeLoop(HInstruction* instruction) {
   DCHECK(instruction->IsInLoop());
   HLoopInformation* info = instruction->GetBlock()->GetLoopInformation();
-  for (HInputIterator it(instruction); !it.Done(); it.Advance()) {
-    HLoopInformation* input_loop = it.Current()->GetBlock()->GetLoopInformation();
+  for (const HInstruction* input : instruction->GetInputs()) {
+    HLoopInformation* input_loop = input->GetBlock()->GetLoopInformation();
     // We only need to check whether the input is defined in the loop. If it is not
     // it is defined before the loop.
     if (input_loop != nullptr && input_loop->IsIn(*info)) {
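
This licm.cc hunk is the first consumer of the input-iteration rework in nodes.h below:
HInputIterator is removed and HInstruction::GetInputs() returns a lightweight view that
supports range-based for. The indexed form remains available when the operand position
matters; a usage sketch:

    // Sketch: both iteration styles offered by the new API.
    for (const HInstruction* input : instruction->GetInputs()) {
      // Position-independent use of each input.
    }
    auto&& inputs = instruction->GetInputs();
    for (size_t i = 0; i < inputs.size(); ++i) {
      inputs[i]->AddUseAt(instruction, i);  // As in UpdateInputsUsers below.
    }
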
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 63bbc2c..3f27c91 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -38,7 +38,13 @@
 class Location : public ValueObject {
  public:
   enum OutputOverlap {
+    // The liveness of the output overlaps the liveness of one or
+    // several input(s); the register allocator cannot reuse an
+    // input's location for the output's location.
     kOutputOverlap,
+    // The liveness of the output does not overlap the liveness of any
+    // input; the register allocator is allowed to reuse an input's
+    // location for the output's location.
     kNoOutputOverlap
   };
 
@@ -494,6 +500,10 @@
     return inputs_.size();
   }
 
+  // Set the output location.  Argument `overlaps` tells whether the
+  // output overlaps any of the inputs (if so, it cannot share the
+  // same register as one of the inputs); it is set to
+  // `Location::kOutputOverlap` by default for safety.
   void SetOut(Location location, Location::OutputOverlap overlaps = Location::kOutputOverlap) {
     DCHECK(output_.IsInvalid());
     output_overlaps_ = overlaps;
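
Several hunks in this change (the Unsafe getters on ARM64, x86 and x86-64, and the new
StringCompareTo locations) pick between these two modes, so the distinction matters:
kNoOutputOverlap is an optimization hint that lets the register allocator place the
output in a register also holding an input, which is only safe when the generated code
reads every input for the last time before writing the output. A hedged sketch:

    // Sketch: choosing the overlap mode in a locations builder.
    // Output written strictly after the last read of every input:
    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
    // Output written early, or live across a slow path that still reads the
    // inputs -- then it must not share a register with them:
    //   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
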
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index ca76bc0..8bb0d66 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -101,10 +101,7 @@
 }
 
 static void RemoveAsUser(HInstruction* instruction) {
-  for (size_t i = 0; i < instruction->InputCount(); i++) {
-    instruction->RemoveAsUserOfInput(i);
-  }
-
+  instruction->RemoveAsUserOfAllInputs();
   RemoveEnvironmentUses(instruction);
 }
 
@@ -757,8 +754,9 @@
 }
 
 static void UpdateInputsUsers(HInstruction* instruction) {
-  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
-    instruction->InputAt(i)->AddUseAt(instruction, i);
+  auto&& inputs = instruction->GetInputs();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i]->AddUseAt(instruction, i);
   }
   // Environment should be created later.
   DCHECK(!instruction->HasEnvironment());
@@ -1126,9 +1124,10 @@
 void HPhi::RemoveInputAt(size_t index) {
   RemoveAsUserOfInput(index);
   inputs_.erase(inputs_.begin() + index);
-  for (size_t i = index, e = InputCount(); i < e; ++i) {
-    DCHECK_EQ(InputRecordAt(i).GetUseNode()->GetIndex(), i + 1u);
-    InputRecordAt(i).GetUseNode()->SetIndex(i);
+  // Update indexes in use nodes of inputs that have been pulled forward by the erase().
+  for (size_t i = index, e = inputs_.size(); i < e; ++i) {
+    DCHECK_EQ(inputs_[i].GetUseNode()->GetIndex(), i + 1u);
+    inputs_[i].GetUseNode()->SetIndex(i);
   }
 }
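
The fixup loop above maintains an invariant the rest of this change relies on: the use
node recorded for input i caches i as its operand index, so erasing an element must
renumber every record behind it. A self-contained sketch of the pattern:

    #include <cstddef>
    #include <vector>

    struct UseRecord { size_t index; };  // Stand-in for the use-node data.

    // Sketch: mirror of the renumbering in RemoveInputAt above.
    void RemoveAt(std::vector<UseRecord>& inputs, size_t k) {
      inputs.erase(inputs.begin() + k);
      for (size_t i = k; i < inputs.size(); ++i) {
        // Each surviving record at position i sat at i + 1 before the erase.
        inputs[i].index = i;
      }
    }
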
 
@@ -1324,16 +1323,18 @@
   return this == instruction->GetPreviousDisregardingMoves();
 }
 
-bool HInstruction::Equals(HInstruction* other) const {
+bool HInstruction::Equals(const HInstruction* other) const {
   if (!InstructionTypeEquals(other)) return false;
   DCHECK_EQ(GetKind(), other->GetKind());
   if (!InstructionDataEquals(other)) return false;
   if (GetType() != other->GetType()) return false;
-  if (InputCount() != other->InputCount()) return false;
-
-  for (size_t i = 0, e = InputCount(); i < e; ++i) {
-    if (InputAt(i) != other->InputAt(i)) return false;
+  auto&& inputs = GetInputs();
+  auto&& other_inputs = other->GetInputs();
+  if (inputs.size() != other_inputs.size()) return false;
+  for (size_t i = 0; i != inputs.size(); ++i) {
+    if (inputs[i] != other_inputs[i]) return false;
   }
+
   DCHECK_EQ(ComputeHashCode(), other->ComputeHashCode());
   return true;
 }
@@ -2390,9 +2391,9 @@
   inputs_.insert(inputs_.begin() + index, HUserRecord<HInstruction*>(input));
   input->AddUseAt(this, index);
   // Update indexes in use nodes of inputs that have been pushed further back by the insert().
-  for (size_t i = index + 1u, size = inputs_.size(); i != size; ++i) {
-    DCHECK_EQ(InputRecordAt(i).GetUseNode()->GetIndex(), i - 1u);
-    InputRecordAt(i).GetUseNode()->SetIndex(i);
+  for (size_t i = index + 1u, e = inputs_.size(); i < e; ++i) {
+    DCHECK_EQ(inputs_[i].GetUseNode()->GetIndex(), i - 1u);
+    inputs_[i].GetUseNode()->SetIndex(i);
   }
 }
 
@@ -2400,9 +2401,9 @@
   RemoveAsUserOfInput(index);
   inputs_.erase(inputs_.begin() + index);
   // Update indexes in use nodes of inputs that have been pulled forward by the erase().
-  for (size_t i = index, e = InputCount(); i < e; ++i) {
-    DCHECK_EQ(InputRecordAt(i).GetUseNode()->GetIndex(), i + 1u);
-    InputRecordAt(i).GetUseNode()->SetIndex(i);
+  for (size_t i = index, e = inputs_.size(); i < e; ++i) {
+    DCHECK_EQ(inputs_[i].GetUseNode()->GetIndex(), i + 1u);
+    inputs_[i].GetUseNode()->SetIndex(i);
   }
 }
 
@@ -2440,8 +2441,8 @@
   }
 }
 
-bool HLoadString::InstructionDataEquals(HInstruction* other) const {
-  HLoadString* other_load_string = other->AsLoadString();
+bool HLoadString::InstructionDataEquals(const HInstruction* other) const {
+  const HLoadString* other_load_string = other->AsLoadString();
   if (string_index_ != other_load_string->string_index_ ||
       GetPackedFields() != other_load_string->GetPackedFields()) {
     return false;
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index f3915a2..f1d164e 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -37,6 +37,7 @@
 #include "primitive.h"
 #include "utils/array_ref.h"
 #include "utils/intrusive_forward_list.h"
+#include "utils/transform_array_ref.h"
 
 namespace art {
 
@@ -1333,12 +1334,12 @@
 FOR_EACH_INSTRUCTION(FORWARD_DECLARATION)
 #undef FORWARD_DECLARATION
 
-#define DECLARE_INSTRUCTION(type)                                       \
-  InstructionKind GetKindInternal() const OVERRIDE { return k##type; }  \
-  const char* DebugName() const OVERRIDE { return #type; }              \
-  bool InstructionTypeEquals(HInstruction* other) const OVERRIDE {      \
-    return other->Is##type();                                           \
-  }                                                                     \
+#define DECLARE_INSTRUCTION(type)                                         \
+  InstructionKind GetKindInternal() const OVERRIDE { return k##type; }    \
+  const char* DebugName() const OVERRIDE { return #type; }                \
+  bool InstructionTypeEquals(const HInstruction* other) const OVERRIDE {  \
+    return other->Is##type();                                             \
+  }                                                                       \
   void Accept(HGraphVisitor* visitor) OVERRIDE
 
 #define DECLARE_ABSTRACT_INSTRUCTION(type)                              \
@@ -1798,16 +1799,41 @@
     return IsLoopHeaderPhi() && GetBlock()->GetLoopInformation()->IsIrreducible();
   }
 
-  virtual size_t InputCount() const = 0;
+  virtual ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() = 0;
+
+  ArrayRef<const HUserRecord<HInstruction*>> GetInputRecords() const {
+    // One virtual method is enough, just const_cast<> and then re-add the const.
+    return ArrayRef<const HUserRecord<HInstruction*>>(
+        const_cast<HInstruction*>(this)->GetInputRecords());
+  }
+
+  auto GetInputs() {
+    return MakeTransformArrayRef(
+        GetInputRecords(),
+        [](HUserRecord<HInstruction*>& record) -> HInstruction* {
+            return record.GetInstruction();
+        });
+  }
+
+  auto GetInputs() const {
+    return MakeTransformArrayRef(
+        GetInputRecords(),
+        [](const HUserRecord<HInstruction*>& record) -> const HInstruction* {
+            return record.GetInstruction();
+        });
+  }
+
+  size_t InputCount() const { return GetInputRecords().size(); }
   HInstruction* InputAt(size_t i) const { return InputRecordAt(i).GetInstruction(); }
 
+  void SetRawInputAt(size_t index, HInstruction* input) {
+    SetRawInputRecordAt(index, HUserRecord<HInstruction*>(input));
+  }
+
   virtual void Accept(HGraphVisitor* visitor) = 0;
   virtual const char* DebugName() const = 0;
 
   virtual Primitive::Type GetType() const { return Primitive::kPrimVoid; }
-  void SetRawInputAt(size_t index, HInstruction* input) {
-    SetRawInputRecordAt(index, HUserRecord<HInstruction*>(input));
-  }
 
   virtual bool NeedsEnvironment() const { return false; }
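
The GetInputs() accessors added above return a transforming view: iteration yields
HInstruction* directly while the underlying storage stays HUserRecord, with no copying
or allocation. A generic sketch of the idea behind MakeTransformArrayRef (the real
helper in utils/transform_array_ref.h is more general, with full iterator and const
support):

    #include <cstddef>

    // Sketch: a minimal transforming view over an existing array.
    template <typename T, typename Fn>
    class TransformViewSketch {
     public:
      TransformViewSketch(T* data, size_t size, Fn fn)
          : data_(data), size_(size), fn_(fn) {}
      size_t size() const { return size_; }
      decltype(auto) operator[](size_t i) const { return fn_(data_[i]); }
     private:
      T* data_;
      size_t size_;
      Fn fn_;
    };
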
 
@@ -1872,6 +1898,14 @@
     input_use.GetInstruction()->FixUpUserRecordsAfterUseRemoval(before_use_node);
   }
 
+  void RemoveAsUserOfAllInputs() {
+    for (const HUserRecord<HInstruction*>& input_use : GetInputRecords()) {
+      HUseList<HInstruction*>::iterator before_use_node = input_use.GetBeforeUseNode();
+      input_use.GetInstruction()->uses_.erase_after(before_use_node);
+      input_use.GetInstruction()->FixUpUserRecordsAfterUseRemoval(before_use_node);
+    }
+  }
+
   const HUseList<HInstruction*>& GetUses() const { return uses_; }
   const HUseList<HEnvironment*>& GetEnvUses() const { return env_uses_; }
 
@@ -1976,21 +2010,21 @@
   virtual bool CanBeMoved() const { return false; }
 
   // Returns whether the two instructions are of the same kind.
-  virtual bool InstructionTypeEquals(HInstruction* other ATTRIBUTE_UNUSED) const {
+  virtual bool InstructionTypeEquals(const HInstruction* other ATTRIBUTE_UNUSED) const {
     return false;
   }
 
   // Returns whether any data encoded in the two instructions is equal.
   // This method does not look at the inputs. Both instructions must be
   // of the same type, otherwise the method has undefined behavior.
-  virtual bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const {
+  virtual bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const {
     return false;
   }
 
   // Returns whether two instructions are equal, that is:
   // 1) They have the same type and contain the same data (InstructionDataEquals).
   // 2) Their inputs are identical.
-  bool Equals(HInstruction* other) const;
+  bool Equals(const HInstruction* other) const;
 
   // TODO: Remove this indirection when the [[pure]] attribute proposal (n3744)
   // is adopted and implemented by our C++ compiler(s). Fow now, we need to hide
@@ -2001,8 +2035,8 @@
 
   virtual size_t ComputeHashCode() const {
     size_t result = GetKind();
-    for (size_t i = 0, e = InputCount(); i < e; ++i) {
-      result = (result * 31) + InputAt(i)->GetId();
+    for (const HInstruction* input : GetInputs()) {
+      result = (result * 31) + input->GetId();
     }
     return result;
   }
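
ComputeHashCode is the conventional polynomial accumulator over the input IDs, now fed
by the range-based view: for an instruction whose kind enumerates to 7 with inputs of
ID 3 and ID 5, the hash is (7 * 31 + 3) * 31 + 5 = 6825. Equal instructions therefore
hash equal, which the DCHECK at the end of HInstruction::Equals above exploits.
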
@@ -2052,8 +2086,14 @@
   static constexpr size_t kNumberOfGenericPackedBits = kFlagReferenceTypeIsExact + 1;
   static constexpr size_t kMaxNumberOfPackedBits = sizeof(uint32_t) * kBitsPerByte;
 
-  virtual const HUserRecord<HInstruction*> InputRecordAt(size_t i) const = 0;
-  virtual void SetRawInputRecordAt(size_t index, const HUserRecord<HInstruction*>& input) = 0;
+  const HUserRecord<HInstruction*> InputRecordAt(size_t i) const {
+    return GetInputRecords()[i];
+  }
+
+  void SetRawInputRecordAt(size_t index, const HUserRecord<HInstruction*>& input) {
+    ArrayRef<HUserRecord<HInstruction*>> input_records = GetInputRecords();
+    input_records[index] = input;
+  }
 
   uint32_t GetPackedFields() const {
     return packed_fields_;
@@ -2174,21 +2214,6 @@
 };
 std::ostream& operator<<(std::ostream& os, const HInstruction::InstructionKind& rhs);
 
-class HInputIterator : public ValueObject {
- public:
-  explicit HInputIterator(HInstruction* instruction) : instruction_(instruction), index_(0) {}
-
-  bool Done() const { return index_ == instruction_->InputCount(); }
-  HInstruction* Current() const { return instruction_->InputAt(index_); }
-  void Advance() { index_++; }
-
- private:
-  HInstruction* instruction_;
-  size_t index_;
-
-  DISALLOW_COPY_AND_ASSIGN(HInputIterator);
-};
-
 class HInstructionIterator : public ValueObject {
  public:
   explicit HInstructionIterator(const HInstructionList& instructions)
@@ -2238,17 +2263,9 @@
       : HInstruction(side_effects, dex_pc), inputs_() {}
   virtual ~HTemplateInstruction() {}
 
-  size_t InputCount() const OVERRIDE { return N; }
-
- protected:
-  const HUserRecord<HInstruction*> InputRecordAt(size_t i) const OVERRIDE {
-    DCHECK_LT(i, N);
-    return inputs_[i];
-  }
-
-  void SetRawInputRecordAt(size_t i, const HUserRecord<HInstruction*>& input) OVERRIDE {
-    DCHECK_LT(i, N);
-    inputs_[i] = input;
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE FINAL {
+    return ArrayRef<HUserRecord<HInstruction*>>(inputs_);
   }
 
  private:
@@ -2266,18 +2283,9 @@
 
   virtual ~HTemplateInstruction() {}
 
-  size_t InputCount() const OVERRIDE { return 0; }
-
- protected:
-  const HUserRecord<HInstruction*> InputRecordAt(size_t i ATTRIBUTE_UNUSED) const OVERRIDE {
-    LOG(FATAL) << "Unreachable";
-    UNREACHABLE();
-  }
-
-  void SetRawInputRecordAt(size_t i ATTRIBUTE_UNUSED,
-                           const HUserRecord<HInstruction*>& input ATTRIBUTE_UNUSED) OVERRIDE {
-    LOG(FATAL) << "Unreachable";
-    UNREACHABLE();
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE FINAL {
+    return ArrayRef<HUserRecord<HInstruction*>>();
   }
 
  private:
@@ -2309,7 +2317,7 @@
 
 // Represents dex's RETURN_VOID opcode. A HReturnVoid is a control flow
 // instruction that branches to the exit block.
-class HReturnVoid : public HTemplateInstruction<0> {
+class HReturnVoid FINAL : public HTemplateInstruction<0> {
  public:
   explicit HReturnVoid(uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(SideEffects::None(), dex_pc) {}
@@ -2324,7 +2332,7 @@
 
 // Represents dex's RETURN opcodes. A HReturn is a control flow
 // instruction that branches to the exit block.
-class HReturn : public HTemplateInstruction<1> {
+class HReturn FINAL : public HTemplateInstruction<1> {
  public:
   explicit HReturn(HInstruction* value, uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(SideEffects::None(), dex_pc) {
@@ -2339,7 +2347,7 @@
   DISALLOW_COPY_AND_ASSIGN(HReturn);
 };
 
-class HPhi : public HInstruction {
+class HPhi FINAL : public HInstruction {
  public:
   HPhi(ArenaAllocator* arena,
        uint32_t reg_number,
@@ -2365,7 +2373,10 @@
 
   bool IsCatchPhi() const { return GetBlock()->IsCatchBlock(); }
 
-  size_t InputCount() const OVERRIDE { return inputs_.size(); }
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE FINAL {
+    return ArrayRef<HUserRecord<HInstruction*>>(inputs_);
+  }
 
   void AddInput(HInstruction* input);
   void RemoveInputAt(size_t index);
@@ -2415,15 +2426,6 @@
 
   DECLARE_INSTRUCTION(Phi);
 
- protected:
-  const HUserRecord<HInstruction*> InputRecordAt(size_t index) const OVERRIDE {
-    return inputs_[index];
-  }
-
-  void SetRawInputRecordAt(size_t index, const HUserRecord<HInstruction*>& input) OVERRIDE {
-    inputs_[index] = input;
-  }
-
  private:
   static constexpr size_t kFieldType = HInstruction::kNumberOfGenericPackedBits;
   static constexpr size_t kFieldTypeSize =
@@ -2434,7 +2436,7 @@
   static_assert(kNumberOfPhiPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
   using TypeField = BitField<Primitive::Type, kFieldType, kFieldTypeSize>;
 
-  ArenaVector<HUserRecord<HInstruction*> > inputs_;
+  ArenaVector<HUserRecord<HInstruction*>> inputs_;
   const uint32_t reg_number_;
 
   DISALLOW_COPY_AND_ASSIGN(HPhi);
@@ -2443,7 +2445,7 @@
 // The exit instruction is the only instruction of the exit block.
 // Instructions aborting the method (HThrow and HReturn) must branch to the
 // exit block.
-class HExit : public HTemplateInstruction<0> {
+class HExit FINAL : public HTemplateInstruction<0> {
  public:
   explicit HExit(uint32_t dex_pc = kNoDexPc) : HTemplateInstruction(SideEffects::None(), dex_pc) {}
 
@@ -2456,7 +2458,7 @@
 };
 
 // Jumps from one block to another.
-class HGoto : public HTemplateInstruction<0> {
+class HGoto FINAL : public HTemplateInstruction<0> {
  public:
   explicit HGoto(uint32_t dex_pc = kNoDexPc) : HTemplateInstruction(SideEffects::None(), dex_pc) {}
 
@@ -2496,9 +2498,9 @@
   DISALLOW_COPY_AND_ASSIGN(HConstant);
 };
 
-class HNullConstant : public HConstant {
+class HNullConstant FINAL : public HConstant {
  public:
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -2520,7 +2522,7 @@
 
 // Constants of the type int. Those can be from Dex instructions, or
 // synthesized (for example with the if-eqz instruction).
-class HIntConstant : public HConstant {
+class HIntConstant FINAL : public HConstant {
  public:
   int32_t GetValue() const { return value_; }
 
@@ -2528,7 +2530,7 @@
     return static_cast<uint64_t>(static_cast<uint32_t>(value_));
   }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     DCHECK(other->IsIntConstant()) << other->DebugName();
     return other->AsIntConstant()->value_ == value_;
   }
@@ -2561,13 +2563,13 @@
   DISALLOW_COPY_AND_ASSIGN(HIntConstant);
 };
 
-class HLongConstant : public HConstant {
+class HLongConstant FINAL : public HConstant {
  public:
   int64_t GetValue() const { return value_; }
 
   uint64_t GetValueAsUint64() const OVERRIDE { return value_; }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     DCHECK(other->IsLongConstant()) << other->DebugName();
     return other->AsLongConstant()->value_ == value_;
   }
@@ -2591,7 +2593,7 @@
   DISALLOW_COPY_AND_ASSIGN(HLongConstant);
 };
 
-class HFloatConstant : public HConstant {
+class HFloatConstant FINAL : public HConstant {
  public:
   float GetValue() const { return value_; }
 
@@ -2599,7 +2601,7 @@
     return static_cast<uint64_t>(bit_cast<uint32_t, float>(value_));
   }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     DCHECK(other->IsFloatConstant()) << other->DebugName();
     return other->AsFloatConstant()->GetValueAsUint64() == GetValueAsUint64();
   }
@@ -2644,13 +2646,13 @@
   DISALLOW_COPY_AND_ASSIGN(HFloatConstant);
 };
 
-class HDoubleConstant : public HConstant {
+class HDoubleConstant FINAL : public HConstant {
  public:
   double GetValue() const { return value_; }
 
   uint64_t GetValueAsUint64() const OVERRIDE { return bit_cast<uint64_t, double>(value_); }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     DCHECK(other->IsDoubleConstant()) << other->DebugName();
     return other->AsDoubleConstant()->GetValueAsUint64() == GetValueAsUint64();
   }
@@ -2697,7 +2699,7 @@
 
 // Conditional branch. A block ending with an HIf instruction must have
 // two successors.
-class HIf : public HTemplateInstruction<1> {
+class HIf FINAL : public HTemplateInstruction<1> {
  public:
   explicit HIf(HInstruction* input, uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(SideEffects::None(), dex_pc) {
@@ -2726,7 +2728,7 @@
 // non-exceptional control flow.
 // Normal-flow successor is stored at index zero, exception handlers under
 // higher indices in no particular order.
-class HTryBoundary : public HTemplateInstruction<0> {
+class HTryBoundary FINAL : public HTemplateInstruction<0> {
  public:
   enum class BoundaryKind {
     kEntry,
@@ -2784,7 +2786,7 @@
 };
 
 // Deoptimize to interpreter, upon checking a condition.
-class HDeoptimize : public HTemplateInstruction<1> {
+class HDeoptimize FINAL : public HTemplateInstruction<1> {
  public:
   // We set CanTriggerGC to prevent any intermediate address to be live
   // at the point of the `HDeoptimize`.
@@ -2794,7 +2796,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
   bool NeedsEnvironment() const OVERRIDE { return true; }
@@ -2809,7 +2811,7 @@
 // Represents the ArtMethod that was passed as a first argument to
 // the method. It is used by instructions that depend on it, like
 // instructions that work with the dex cache.
-class HCurrentMethod : public HExpression<0> {
+class HCurrentMethod FINAL : public HExpression<0> {
  public:
   explicit HCurrentMethod(Primitive::Type type, uint32_t dex_pc = kNoDexPc)
       : HExpression(type, SideEffects::None(), dex_pc) {}
@@ -2822,7 +2824,7 @@
 
 // Fetches an ArtMethod from the virtual table or the interface method table
 // of a class.
-class HClassTableGet : public HExpression<1> {
+class HClassTableGet FINAL : public HExpression<1> {
  public:
   enum class TableKind {
     kVTable,
@@ -2841,7 +2843,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     return other->AsClassTableGet()->GetIndex() == index_ &&
         other->AsClassTableGet()->GetPackedFields() == GetPackedFields();
   }
@@ -2869,7 +2871,7 @@
 // PackedSwitch (jump table). A block ending with a PackedSwitch instruction will
 // have one successor for each entry in the switch table, and the final successor
 // will be the block containing the next Dex opcode.
-class HPackedSwitch : public HTemplateInstruction<1> {
+class HPackedSwitch FINAL : public HTemplateInstruction<1> {
  public:
   HPackedSwitch(int32_t start_value,
                 uint32_t num_entries,
@@ -2911,7 +2913,7 @@
   Primitive::Type GetResultType() const { return GetType(); }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -2983,7 +2985,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -3056,7 +3058,7 @@
   ComparisonBias GetBias() const { return GetPackedField<ComparisonBiasField>(); }
   void SetBias(ComparisonBias bias) { SetPackedField<ComparisonBiasField>(bias); }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     return GetPackedFields() == other->AsCondition()->GetPackedFields();
   }
 
@@ -3114,7 +3116,7 @@
 };
 
 // Instruction to check if two inputs are equal to each other.
-class HEqual : public HCondition {
+class HEqual FINAL : public HCondition {
  public:
   HEqual(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3158,7 +3160,7 @@
   DISALLOW_COPY_AND_ASSIGN(HEqual);
 };
 
-class HNotEqual : public HCondition {
+class HNotEqual FINAL : public HCondition {
  public:
   HNotEqual(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3201,7 +3203,7 @@
   DISALLOW_COPY_AND_ASSIGN(HNotEqual);
 };
 
-class HLessThan : public HCondition {
+class HLessThan FINAL : public HCondition {
  public:
   HLessThan(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3238,7 +3240,7 @@
   DISALLOW_COPY_AND_ASSIGN(HLessThan);
 };
 
-class HLessThanOrEqual : public HCondition {
+class HLessThanOrEqual FINAL : public HCondition {
  public:
   HLessThanOrEqual(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3275,7 +3277,7 @@
   DISALLOW_COPY_AND_ASSIGN(HLessThanOrEqual);
 };
 
-class HGreaterThan : public HCondition {
+class HGreaterThan FINAL : public HCondition {
  public:
   HGreaterThan(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3312,7 +3314,7 @@
   DISALLOW_COPY_AND_ASSIGN(HGreaterThan);
 };
 
-class HGreaterThanOrEqual : public HCondition {
+class HGreaterThanOrEqual FINAL : public HCondition {
  public:
   HGreaterThanOrEqual(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3349,7 +3351,7 @@
   DISALLOW_COPY_AND_ASSIGN(HGreaterThanOrEqual);
 };
 
-class HBelow : public HCondition {
+class HBelow FINAL : public HCondition {
  public:
   HBelow(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3389,7 +3391,7 @@
   DISALLOW_COPY_AND_ASSIGN(HBelow);
 };
 
-class HBelowOrEqual : public HCondition {
+class HBelowOrEqual FINAL : public HCondition {
  public:
   HBelowOrEqual(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3429,7 +3431,7 @@
   DISALLOW_COPY_AND_ASSIGN(HBelowOrEqual);
 };
 
-class HAbove : public HCondition {
+class HAbove FINAL : public HCondition {
  public:
   HAbove(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3469,7 +3471,7 @@
   DISALLOW_COPY_AND_ASSIGN(HAbove);
 };
 
-class HAboveOrEqual : public HCondition {
+class HAboveOrEqual FINAL : public HCondition {
  public:
   HAboveOrEqual(HInstruction* first, HInstruction* second, uint32_t dex_pc = kNoDexPc)
       : HCondition(first, second, dex_pc) {}
@@ -3511,7 +3513,7 @@
 
 // Instruction to check how two inputs compare to each other.
 // Result is 0 if input0 == input1, 1 if input0 > input1, or -1 if input0 < input1.
-class HCompare : public HBinaryOperation {
+class HCompare FINAL : public HBinaryOperation {
  public:
   // Note that `comparison_type` is the type of comparison performed
   // between the comparison's inputs, not the type of the instantiated
@@ -3560,7 +3562,7 @@
     return MakeConstantComparison(ComputeFP(x->GetValue(), y->GetValue()), GetDexPc());
   }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     return GetPackedFields() == other->AsCompare()->GetPackedFields();
   }
 
@@ -3600,7 +3602,7 @@
   DISALLOW_COPY_AND_ASSIGN(HCompare);
 };
 
-class HNewInstance : public HExpression<2> {
+class HNewInstance FINAL : public HExpression<2> {
  public:
   HNewInstance(HInstruction* cls,
                HCurrentMethod* current_method,
@@ -3689,10 +3691,13 @@
 
 class HInvoke : public HInstruction {
  public:
-  size_t InputCount() const OVERRIDE { return inputs_.size(); }
-
   bool NeedsEnvironment() const OVERRIDE;
 
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE {
+    return ArrayRef<HUserRecord<HInstruction*>>(inputs_);
+  }
+
   void SetArgumentAt(size_t index, HInstruction* argument) {
     SetRawInputAt(index, argument);
   }
@@ -3729,7 +3734,7 @@
 
   bool CanBeMoved() const OVERRIDE { return IsIntrinsic(); }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     return intrinsic_ != Intrinsics::kNone && intrinsic_ == other->AsInvoke()->intrinsic_;
   }
 
@@ -3780,14 +3785,6 @@
     SetPackedFlag<kFlagCanThrow>(true);
   }
 
-  const HUserRecord<HInstruction*> InputRecordAt(size_t index) const OVERRIDE {
-    return inputs_[index];
-  }
-
-  void SetRawInputRecordAt(size_t index, const HUserRecord<HInstruction*>& input) OVERRIDE {
-    inputs_[index] = input;
-  }
-
   void SetCanThrow(bool can_throw) { SetPackedFlag<kFlagCanThrow>(can_throw); }
 
   uint32_t number_of_arguments_;
@@ -3802,7 +3799,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInvoke);
 };
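With GetInputRecords() as the single virtual input accessor returning an ArrayRef view, the remaining input helpers no longer need to be virtual. A minimal sketch (not part of the patch) of how they can be derived on HInstruction, assuming the ArrayRef API used above:

    // Non-virtual wrappers, expressed purely in terms of GetInputRecords().
    size_t InputCount() const { return GetInputRecords().size(); }
    HInstruction* InputAt(size_t index) const {
      return GetInputRecords()[index].GetInstruction();
    }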
 
-class HInvokeUnresolved : public HInvoke {
+class HInvokeUnresolved FINAL : public HInvoke {
  public:
   HInvokeUnresolved(ArenaAllocator* arena,
                     uint32_t number_of_arguments,
@@ -3825,7 +3822,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeUnresolved);
 };
 
-class HInvokeStaticOrDirect : public HInvoke {
+class HInvokeStaticOrDirect FINAL : public HInvoke {
  public:
   // Requirements of this method call regarding the class
   // initialization (clinit) check of its declaring class.
@@ -3954,6 +3951,25 @@
     InsertInputAt(GetSpecialInputIndex(), input);
   }
 
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE {
+    ArrayRef<HUserRecord<HInstruction*>> input_records = HInvoke::GetInputRecords();
+    if (kIsDebugBuild && IsStaticWithExplicitClinitCheck()) {
+      DCHECK(!input_records.empty());
+      DCHECK_GT(input_records.size(), GetNumberOfArguments());
+      HInstruction* last_input = input_records.back().GetInstruction();
+      // Note: `last_input` may be null during arguments setup.
+      if (last_input != nullptr) {
+        // `last_input` is the last input of a static invoke marked as having
+        // an explicit clinit check. It must either be:
+        // - an art::HClinitCheck instruction, set by art::HGraphBuilder; or
+        // - an art::HLoadClass instruction, set by art::PrepareForRegisterAllocation.
+        DCHECK(last_input->IsClinitCheck() || last_input->IsLoadClass()) << last_input->DebugName();
+      }
+    }
+    return input_records;
+  }
+
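Because GetNumberOfArguments() excludes the trailing clinit-check input, callers can walk only the real arguments without special-casing the last record. A hedged usage sketch (VisitArguments and the processing step are hypothetical, not part of this patch):

    void VisitArguments(HInvokeStaticOrDirect* invoke) {
      ArrayRef<HUserRecord<HInstruction*>> records = invoke->GetInputRecords();
      // GetNumberOfArguments() does not count a trailing HClinitCheck/HLoadClass.
      for (size_t i = 0, e = invoke->GetNumberOfArguments(); i < e; ++i) {
        HInstruction* argument = records[i].GetInstruction();
        // ... process `argument` ...
      }
    }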
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const OVERRIDE {
     // We access the method via the dex cache so we can't do an implicit null check.
     // TODO: for intrinsics we can generate implicit null checks.
@@ -4038,8 +4054,8 @@
   // instruction; only relevant for static calls with explicit clinit check.
   void RemoveExplicitClinitCheck(ClinitCheckRequirement new_requirement) {
     DCHECK(IsStaticWithExplicitClinitCheck());
-    size_t last_input_index = InputCount() - 1;
-    HInstruction* last_input = InputAt(last_input_index);
+    size_t last_input_index = inputs_.size() - 1u;
+    HInstruction* last_input = inputs_.back().GetInstruction();
     DCHECK(last_input != nullptr);
     DCHECK(last_input->IsLoadClass() || last_input->IsClinitCheck()) << last_input->DebugName();
     RemoveAsUserOfInput(last_input_index);
@@ -4068,20 +4084,6 @@
   DECLARE_INSTRUCTION(InvokeStaticOrDirect);
 
  protected:
-  const HUserRecord<HInstruction*> InputRecordAt(size_t i) const OVERRIDE {
-    const HUserRecord<HInstruction*> input_record = HInvoke::InputRecordAt(i);
-    if (kIsDebugBuild && IsStaticWithExplicitClinitCheck() && (i == InputCount() - 1)) {
-      HInstruction* input = input_record.GetInstruction();
-      // `input` is the last input of a static invoke marked as having
-      // an explicit clinit check. It must either be:
-      // - an art::HClinitCheck instruction, set by art::HGraphBuilder; or
-      // - an art::HLoadClass instruction, set by art::PrepareForRegisterAllocation.
-      DCHECK(input != nullptr);
-      DCHECK(input->IsClinitCheck() || input->IsLoadClass()) << input->DebugName();
-    }
-    return input_record;
-  }
-
   void InsertInputAt(size_t index, HInstruction* input);
   void RemoveInputAt(size_t index);
 
@@ -4114,7 +4116,7 @@
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind rhs);
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs);
 
-class HInvokeVirtual : public HInvoke {
+class HInvokeVirtual FINAL : public HInvoke {
  public:
   HInvokeVirtual(ArenaAllocator* arena,
                  uint32_t number_of_arguments,
@@ -4140,7 +4142,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeVirtual);
 };
 
-class HInvokeInterface : public HInvoke {
+class HInvokeInterface FINAL : public HInvoke {
  public:
   HInvokeInterface(ArenaAllocator* arena,
                    uint32_t number_of_arguments,
@@ -4167,7 +4169,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInvokeInterface);
 };
 
-class HNeg : public HUnaryOperation {
+class HNeg FINAL : public HUnaryOperation {
  public:
   HNeg(Primitive::Type result_type, HInstruction* input, uint32_t dex_pc = kNoDexPc)
       : HUnaryOperation(result_type, input, dex_pc) {
@@ -4195,7 +4197,7 @@
   DISALLOW_COPY_AND_ASSIGN(HNeg);
 };
 
-class HNewArray : public HExpression<2> {
+class HNewArray FINAL : public HExpression<2> {
  public:
   HNewArray(HInstruction* length,
             HCurrentMethod* current_method,
@@ -4234,7 +4236,7 @@
   DISALLOW_COPY_AND_ASSIGN(HNewArray);
 };
 
-class HAdd : public HBinaryOperation {
+class HAdd FINAL : public HBinaryOperation {
  public:
   HAdd(Primitive::Type result_type,
        HInstruction* left,
@@ -4269,7 +4271,7 @@
   DISALLOW_COPY_AND_ASSIGN(HAdd);
 };
 
-class HSub : public HBinaryOperation {
+class HSub FINAL : public HBinaryOperation {
  public:
   HSub(Primitive::Type result_type,
        HInstruction* left,
@@ -4302,7 +4304,7 @@
   DISALLOW_COPY_AND_ASSIGN(HSub);
 };
 
-class HMul : public HBinaryOperation {
+class HMul FINAL : public HBinaryOperation {
  public:
   HMul(Primitive::Type result_type,
        HInstruction* left,
@@ -4337,7 +4339,7 @@
   DISALLOW_COPY_AND_ASSIGN(HMul);
 };
 
-class HDiv : public HBinaryOperation {
+class HDiv FINAL : public HBinaryOperation {
  public:
   HDiv(Primitive::Type result_type,
        HInstruction* left,
@@ -4389,7 +4391,7 @@
   DISALLOW_COPY_AND_ASSIGN(HDiv);
 };
 
-class HRem : public HBinaryOperation {
+class HRem FINAL : public HBinaryOperation {
  public:
   HRem(Primitive::Type result_type,
        HInstruction* left,
@@ -4440,7 +4442,7 @@
   DISALLOW_COPY_AND_ASSIGN(HRem);
 };
 
-class HDivZeroCheck : public HExpression<1> {
+class HDivZeroCheck FINAL : public HExpression<1> {
  public:
   // `HDivZeroCheck` can trigger GC, as it may call the `ArithmeticException`
   // constructor.
@@ -4453,7 +4455,7 @@
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -4466,7 +4468,7 @@
   DISALLOW_COPY_AND_ASSIGN(HDivZeroCheck);
 };
 
-class HShl : public HBinaryOperation {
+class HShl FINAL : public HBinaryOperation {
  public:
   HShl(Primitive::Type result_type,
        HInstruction* value,
@@ -4512,7 +4514,7 @@
   DISALLOW_COPY_AND_ASSIGN(HShl);
 };
 
-class HShr : public HBinaryOperation {
+class HShr FINAL : public HBinaryOperation {
  public:
   HShr(Primitive::Type result_type,
        HInstruction* value,
@@ -4558,7 +4560,7 @@
   DISALLOW_COPY_AND_ASSIGN(HShr);
 };
 
-class HUShr : public HBinaryOperation {
+class HUShr FINAL : public HBinaryOperation {
  public:
   HUShr(Primitive::Type result_type,
         HInstruction* value,
@@ -4606,7 +4608,7 @@
   DISALLOW_COPY_AND_ASSIGN(HUShr);
 };
 
-class HAnd : public HBinaryOperation {
+class HAnd FINAL : public HBinaryOperation {
  public:
   HAnd(Primitive::Type result_type,
        HInstruction* left,
@@ -4643,7 +4645,7 @@
   DISALLOW_COPY_AND_ASSIGN(HAnd);
 };
 
-class HOr : public HBinaryOperation {
+class HOr FINAL : public HBinaryOperation {
  public:
   HOr(Primitive::Type result_type,
       HInstruction* left,
@@ -4680,7 +4682,7 @@
   DISALLOW_COPY_AND_ASSIGN(HOr);
 };
 
-class HXor : public HBinaryOperation {
+class HXor FINAL : public HBinaryOperation {
  public:
   HXor(Primitive::Type result_type,
        HInstruction* left,
@@ -4717,7 +4719,7 @@
   DISALLOW_COPY_AND_ASSIGN(HXor);
 };
 
-class HRor : public HBinaryOperation {
+class HRor FINAL : public HBinaryOperation {
  public:
   HRor(Primitive::Type result_type, HInstruction* value, HInstruction* distance)
     : HBinaryOperation(result_type, value, distance) {
@@ -4770,7 +4772,7 @@
 
 // The value of a parameter in this method. Its location depends on
 // the calling convention.
-class HParameterValue : public HExpression<0> {
+class HParameterValue FINAL : public HExpression<0> {
  public:
   HParameterValue(const DexFile& dex_file,
                   uint16_t type_index,
@@ -4812,13 +4814,13 @@
   DISALLOW_COPY_AND_ASSIGN(HParameterValue);
 };
 
-class HNot : public HUnaryOperation {
+class HNot FINAL : public HUnaryOperation {
  public:
   HNot(Primitive::Type result_type, HInstruction* input, uint32_t dex_pc = kNoDexPc)
       : HUnaryOperation(result_type, input, dex_pc) {}
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -4845,13 +4847,13 @@
   DISALLOW_COPY_AND_ASSIGN(HNot);
 };
 
-class HBooleanNot : public HUnaryOperation {
+class HBooleanNot FINAL : public HUnaryOperation {
  public:
   explicit HBooleanNot(HInstruction* input, uint32_t dex_pc = kNoDexPc)
       : HUnaryOperation(Primitive::Type::kPrimBoolean, input, dex_pc) {}
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -4882,7 +4884,7 @@
   DISALLOW_COPY_AND_ASSIGN(HBooleanNot);
 };
 
-class HTypeConversion : public HExpression<1> {
+class HTypeConversion FINAL : public HExpression<1> {
  public:
   // Instantiate a type conversion of `input` to `result_type`.
   HTypeConversion(Primitive::Type result_type, HInstruction* input, uint32_t dex_pc)
@@ -4899,7 +4901,9 @@
   Primitive::Type GetResultType() const { return GetType(); }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
 
   // Try to statically evaluate the conversion and return a HConstant
   // containing the result.  If the input cannot be converted, return nullptr.
@@ -4925,7 +4929,7 @@
 
 static constexpr uint32_t kNoRegNumber = -1;
 
-class HNullCheck : public HExpression<1> {
+class HNullCheck FINAL : public HExpression<1> {
  public:
   // `HNullCheck` can trigger GC, as it may call the `NullPointerException`
   // constructor.
@@ -4935,7 +4939,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -4987,7 +4991,7 @@
   const Handle<mirror::DexCache> dex_cache_;
 };
 
-class HInstanceFieldGet : public HExpression<1> {
+class HInstanceFieldGet FINAL : public HExpression<1> {
  public:
   HInstanceFieldGet(HInstruction* value,
                     Primitive::Type field_type,
@@ -5013,8 +5017,8 @@
 
   bool CanBeMoved() const OVERRIDE { return !IsVolatile(); }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
-    HInstanceFieldGet* other_get = other->AsInstanceFieldGet();
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    const HInstanceFieldGet* other_get = other->AsInstanceFieldGet();
     return GetFieldOffset().SizeValue() == other_get->GetFieldOffset().SizeValue();
   }
 
@@ -5039,7 +5043,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInstanceFieldGet);
 };
 
-class HInstanceFieldSet : public HTemplateInstruction<2> {
+class HInstanceFieldSet FINAL : public HTemplateInstruction<2> {
  public:
   HInstanceFieldSet(HInstruction* object,
                     HInstruction* value,
@@ -5090,7 +5094,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInstanceFieldSet);
 };
 
-class HArrayGet : public HExpression<2> {
+class HArrayGet FINAL : public HExpression<2> {
  public:
   HArrayGet(HInstruction* array,
             HInstruction* index,
@@ -5105,7 +5109,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const OVERRIDE {
@@ -5142,7 +5146,7 @@
   DISALLOW_COPY_AND_ASSIGN(HArrayGet);
 };
 
-class HArraySet : public HTemplateInstruction<3> {
+class HArraySet FINAL : public HTemplateInstruction<3> {
  public:
   HArraySet(HInstruction* array,
             HInstruction* index,
@@ -5239,7 +5243,7 @@
   DISALLOW_COPY_AND_ASSIGN(HArraySet);
 };
 
-class HArrayLength : public HExpression<1> {
+class HArrayLength FINAL : public HExpression<1> {
  public:
   HArrayLength(HInstruction* array, uint32_t dex_pc)
       : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) {
@@ -5249,20 +5253,33 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
   bool CanDoImplicitNullCheckOn(HInstruction* obj) const OVERRIDE {
     return obj == InputAt(0);
   }
 
+  void MarkAsStringLength() { SetPackedFlag<kFlagIsStringLength>(); }
+  bool IsStringLength() const { return GetPackedFlag<kFlagIsStringLength>(); }
+
   DECLARE_INSTRUCTION(ArrayLength);
 
  private:
+  // We treat a String as an array, creating the HArrayLength from the String.length()
+  // or String.isEmpty() intrinsic in the instruction simplifier. We could always
+  // determine whether a particular HArrayLength is actually a String.length() by
+  // looking at the type of the input, but that requires holding the mutator lock,
+  // so we prefer a flag that code generators can test without taking the lock.
+  static constexpr size_t kFlagIsStringLength = kNumberOfExpressionPackedBits;
+  static constexpr size_t kNumberOfArrayLengthPackedBits = kFlagIsStringLength + 1;
+  static_assert(kNumberOfArrayLengthPackedBits <= HInstruction::kMaxNumberOfPackedBits,
+                "Too many packed fields.");
+
   DISALLOW_COPY_AND_ASSIGN(HArrayLength);
 };
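As an illustration of the new flag, a sketch of how a simplifier pass might replace a String.length() call with a tagged HArrayLength, using only the constructor and flag API declared above (`arena`, `str` and `invoke` are assumed to come from the surrounding pass):

    HArrayLength* length = new (arena) HArrayLength(str, invoke->GetDexPc());
    length->MarkAsStringLength();  // Remember the String origin for codegen.
    invoke->GetBlock()->ReplaceAndRemoveInstructionWith(invoke, length);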
 
-class HBoundsCheck : public HExpression<2> {
+class HBoundsCheck FINAL : public HExpression<2> {
  public:
   // `HBoundsCheck` can trigger GC, as it may call the `IndexOutOfBoundsException`
   // constructor.
@@ -5274,7 +5291,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -5290,7 +5307,7 @@
   DISALLOW_COPY_AND_ASSIGN(HBoundsCheck);
 };
 
-class HSuspendCheck : public HTemplateInstruction<0> {
+class HSuspendCheck FINAL : public HTemplateInstruction<0> {
  public:
   explicit HSuspendCheck(uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(SideEffects::CanTriggerGC(), dex_pc), slow_path_(nullptr) {}
@@ -5332,7 +5349,7 @@
 /**
  * Instruction to load a Class object.
  */
-class HLoadClass : public HExpression<1> {
+class HLoadClass FINAL : public HExpression<1> {
  public:
   HLoadClass(HCurrentMethod* current_method,
              uint16_t type_index,
@@ -5358,7 +5375,7 @@
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     // Note that we don't need to test for generate_clinit_check_.
     // Whether or not we need to generate the clinit check is processed in
     // prepare_for_register_allocator based on existing HInvokes and HClinitChecks.
@@ -5436,7 +5453,7 @@
   DISALLOW_COPY_AND_ASSIGN(HLoadClass);
 };
 
-class HLoadString : public HExpression<1> {
+class HLoadString FINAL : public HInstruction {
  public:
   // Determines how to load the String.
   enum class LoadKind {
@@ -5475,12 +5492,12 @@
               uint32_t string_index,
               const DexFile& dex_file,
               uint32_t dex_pc)
-      : HExpression(Primitive::kPrimNot, SideEffectsForArchRuntimeCalls(), dex_pc),
+      : HInstruction(SideEffectsForArchRuntimeCalls(), dex_pc),
+        special_input_(HUserRecord<HInstruction*>(current_method)),
         string_index_(string_index) {
     SetPackedFlag<kFlagIsInDexCache>(false);
     SetPackedField<LoadKindField>(LoadKind::kDexCacheViaMethod);
     load_data_.ref.dex_file = &dex_file;
-    SetRawInputAt(0, current_method);
   }
 
   void SetLoadKindWithAddress(LoadKind load_kind, uint64_t address) {
@@ -5527,7 +5544,7 @@
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE;
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE;
 
   size_t ComputeHashCode() const OVERRIDE { return string_index_; }
 
@@ -5563,16 +5580,22 @@
     SetSideEffects(SideEffects::None());
   }
 
-  size_t InputCount() const OVERRIDE {
-    return (InputAt(0) != nullptr) ? 1u : 0u;
+  void AddSpecialInput(HInstruction* special_input);
+
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE FINAL {
+    return ArrayRef<HUserRecord<HInstruction*>>(
+        &special_input_, (special_input_.GetInstruction() != nullptr) ? 1u : 0u);
   }
 
-  void AddSpecialInput(HInstruction* special_input);
+  Primitive::Type GetType() const OVERRIDE {
+    return Primitive::kPrimNot;
+  }
 
   DECLARE_INSTRUCTION(LoadString);
 
  private:
-  static constexpr size_t kFlagIsInDexCache = kNumberOfExpressionPackedBits;
+  static constexpr size_t kFlagIsInDexCache = kNumberOfGenericPackedBits;
   static constexpr size_t kFieldLoadKind = kFlagIsInDexCache + 1;
   static constexpr size_t kFieldLoadKindSize =
       MinimumBitsToStore(static_cast<size_t>(LoadKind::kLast));
@@ -5596,6 +5619,8 @@
 
   void SetLoadKindInternal(LoadKind load_kind);
 
+  HUserRecord<HInstruction*> special_input_;
+
   // String index serves also as the hash code and it's also needed for slow-paths,
   // so it must not be overwritten with other load data.
   uint32_t string_index_;
@@ -5630,15 +5655,17 @@
   // The special input is used for PC-relative loads on some architectures.
   DCHECK(GetLoadKind() == LoadKind::kBootImageLinkTimePcRelative ||
          GetLoadKind() == LoadKind::kDexCachePcRelative) << GetLoadKind();
-  DCHECK(InputAt(0) == nullptr);
-  SetRawInputAt(0u, special_input);
+  // HLoadString::GetInputRecords() still returns an empty array at this point
+  // (the special input is null), so set the input record directly via `special_input_`.
+  DCHECK(special_input_.GetInstruction() == nullptr);
+  special_input_ = HUserRecord<HInstruction*>(special_input);
   special_input->AddUseAt(this, 0);
 }
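The special input thus behaves like an input list of length 0 or 1, so generic input-walking code keeps working unmodified, e.g. (ProcessInput is a hypothetical consumer):

    for (HInstruction* input : load_string->GetInputs()) {
      ProcessInput(input);  // Runs at most once, only after AddSpecialInput().
    }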
 
 /**
  * Performs an initialization check on its Class object input.
  */
-class HClinitCheck : public HExpression<1> {
+class HClinitCheck FINAL : public HExpression<1> {
  public:
   HClinitCheck(HLoadClass* constant, uint32_t dex_pc)
       : HExpression(
@@ -5649,7 +5676,7 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -5668,7 +5695,7 @@
   DISALLOW_COPY_AND_ASSIGN(HClinitCheck);
 };
 
-class HStaticFieldGet : public HExpression<1> {
+class HStaticFieldGet FINAL : public HExpression<1> {
  public:
   HStaticFieldGet(HInstruction* cls,
                   Primitive::Type field_type,
@@ -5695,8 +5722,8 @@
 
   bool CanBeMoved() const OVERRIDE { return !IsVolatile(); }
 
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
-    HStaticFieldGet* other_get = other->AsStaticFieldGet();
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    const HStaticFieldGet* other_get = other->AsStaticFieldGet();
     return GetFieldOffset().SizeValue() == other_get->GetFieldOffset().SizeValue();
   }
 
@@ -5717,7 +5744,7 @@
   DISALLOW_COPY_AND_ASSIGN(HStaticFieldGet);
 };
 
-class HStaticFieldSet : public HTemplateInstruction<2> {
+class HStaticFieldSet FINAL : public HTemplateInstruction<2> {
  public:
   HStaticFieldSet(HInstruction* cls,
                   HInstruction* value,
@@ -5765,7 +5792,7 @@
   DISALLOW_COPY_AND_ASSIGN(HStaticFieldSet);
 };
 
-class HUnresolvedInstanceFieldGet : public HExpression<1> {
+class HUnresolvedInstanceFieldGet FINAL : public HExpression<1> {
  public:
   HUnresolvedInstanceFieldGet(HInstruction* obj,
                               Primitive::Type field_type,
@@ -5790,7 +5817,7 @@
   DISALLOW_COPY_AND_ASSIGN(HUnresolvedInstanceFieldGet);
 };
 
-class HUnresolvedInstanceFieldSet : public HTemplateInstruction<2> {
+class HUnresolvedInstanceFieldSet FINAL : public HTemplateInstruction<2> {
  public:
   HUnresolvedInstanceFieldSet(HInstruction* obj,
                               HInstruction* value,
@@ -5828,7 +5855,7 @@
   DISALLOW_COPY_AND_ASSIGN(HUnresolvedInstanceFieldSet);
 };
 
-class HUnresolvedStaticFieldGet : public HExpression<0> {
+class HUnresolvedStaticFieldGet FINAL : public HExpression<0> {
  public:
   HUnresolvedStaticFieldGet(Primitive::Type field_type,
                             uint32_t field_index,
@@ -5851,7 +5878,7 @@
   DISALLOW_COPY_AND_ASSIGN(HUnresolvedStaticFieldGet);
 };
 
-class HUnresolvedStaticFieldSet : public HTemplateInstruction<1> {
+class HUnresolvedStaticFieldSet FINAL : public HTemplateInstruction<1> {
  public:
   HUnresolvedStaticFieldSet(HInstruction* value,
                             Primitive::Type field_type,
@@ -5888,7 +5915,7 @@
 };
 
 // Implement the move-exception DEX instruction.
-class HLoadException : public HExpression<0> {
+class HLoadException FINAL : public HExpression<0> {
  public:
   explicit HLoadException(uint32_t dex_pc = kNoDexPc)
       : HExpression(Primitive::kPrimNot, SideEffects::None(), dex_pc) {}
@@ -5903,7 +5930,7 @@
 
 // Implicit part of move-exception which clears thread-local exception storage.
 // Must not be removed because the runtime expects the TLS to get cleared.
-class HClearException : public HTemplateInstruction<0> {
+class HClearException FINAL : public HTemplateInstruction<0> {
  public:
   explicit HClearException(uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(SideEffects::AllWrites(), dex_pc) {}
@@ -5914,7 +5941,7 @@
   DISALLOW_COPY_AND_ASSIGN(HClearException);
 };
 
-class HThrow : public HTemplateInstruction<1> {
+class HThrow FINAL : public HTemplateInstruction<1> {
  public:
   HThrow(HInstruction* exception, uint32_t dex_pc)
       : HTemplateInstruction(SideEffects::CanTriggerGC(), dex_pc) {
@@ -5951,7 +5978,7 @@
 
 std::ostream& operator<<(std::ostream& os, TypeCheckKind rhs);
 
-class HInstanceOf : public HExpression<2> {
+class HInstanceOf FINAL : public HExpression<2> {
  public:
   HInstanceOf(HInstruction* object,
               HLoadClass* constant,
@@ -5968,7 +5995,7 @@
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -6005,7 +6032,7 @@
   DISALLOW_COPY_AND_ASSIGN(HInstanceOf);
 };
 
-class HBoundType : public HExpression<1> {
+class HBoundType FINAL : public HExpression<1> {
  public:
   HBoundType(HInstruction* input, uint32_t dex_pc = kNoDexPc)
       : HExpression(Primitive::kPrimNot, SideEffects::None(), dex_pc),
@@ -6049,7 +6076,7 @@
   DISALLOW_COPY_AND_ASSIGN(HBoundType);
 };
 
-class HCheckCast : public HTemplateInstruction<2> {
+class HCheckCast FINAL : public HTemplateInstruction<2> {
  public:
   HCheckCast(HInstruction* object,
              HLoadClass* constant,
@@ -6064,7 +6091,7 @@
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
     return true;
   }
 
@@ -6094,7 +6121,7 @@
   DISALLOW_COPY_AND_ASSIGN(HCheckCast);
 };
 
-class HMemoryBarrier : public HTemplateInstruction<0> {
+class HMemoryBarrier FINAL : public HTemplateInstruction<0> {
  public:
   explicit HMemoryBarrier(MemBarrierKind barrier_kind, uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(
@@ -6119,7 +6146,7 @@
   DISALLOW_COPY_AND_ASSIGN(HMemoryBarrier);
 };
 
-class HMonitorOperation : public HTemplateInstruction<1> {
+class HMonitorOperation FINAL : public HTemplateInstruction<1> {
  public:
   enum class OperationKind {
     kEnter,
@@ -6164,7 +6191,7 @@
   DISALLOW_COPY_AND_ASSIGN(HMonitorOperation);
 };
 
-class HSelect : public HExpression<3> {
+class HSelect FINAL : public HExpression<3> {
  public:
   HSelect(HInstruction* condition,
           HInstruction* true_value,
@@ -6187,7 +6214,9 @@
   HInstruction* GetCondition() const { return InputAt(2); }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
 
   bool CanBeNull() const OVERRIDE {
     return GetTrueValue()->CanBeNull() || GetFalseValue()->CanBeNull();
@@ -6277,7 +6306,7 @@
 
 static constexpr size_t kDefaultNumberOfMoves = 4;
 
-class HParallelMove : public HTemplateInstruction<0> {
+class HParallelMove FINAL : public HTemplateInstruction<0> {
  public:
   explicit HParallelMove(ArenaAllocator* arena, uint32_t dex_pc = kNoDexPc)
       : HTemplateInstruction(SideEffects::None(), dex_pc),
diff --git a/compiler/optimizing/nodes_arm.h b/compiler/optimizing/nodes_arm.h
index 6a1dbb9..371e8ef 100644
--- a/compiler/optimizing/nodes_arm.h
+++ b/compiler/optimizing/nodes_arm.h
@@ -19,7 +19,7 @@
 
 namespace art {
 
-class HArmDexCacheArraysBase : public HExpression<0> {
+class HArmDexCacheArraysBase FINAL : public HExpression<0> {
  public:
   explicit HArmDexCacheArraysBase(const DexFile& dex_file)
       : HExpression(Primitive::kPrimInt, SideEffects::None(), kNoDexPc),
diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h
index 173852a..06b073c 100644
--- a/compiler/optimizing/nodes_arm64.h
+++ b/compiler/optimizing/nodes_arm64.h
@@ -21,7 +21,7 @@
 
 namespace art {
 
-class HArm64DataProcWithShifterOp : public HExpression<2> {
+class HArm64DataProcWithShifterOp FINAL : public HExpression<2> {
  public:
   enum OpKind {
     kLSL,   // Logical shift left.
@@ -56,8 +56,8 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other_instr) const OVERRIDE {
-    HArm64DataProcWithShifterOp* other = other_instr->AsArm64DataProcWithShifterOp();
+  bool InstructionDataEquals(const HInstruction* other_instr) const OVERRIDE {
+    const HArm64DataProcWithShifterOp* other = other_instr->AsArm64DataProcWithShifterOp();
     return instr_kind_ == other->instr_kind_ &&
         op_kind_ == other->op_kind_ &&
         shift_amount_ == other->shift_amount_;
@@ -97,7 +97,7 @@
 // This instruction computes an intermediate address pointing in the 'middle' of an object. The
 // result pointer cannot be handled by GC, so extra care is taken to make sure that this value is
 // never used across anything that can trigger GC.
-class HArm64IntermediateAddress : public HExpression<2> {
+class HArm64IntermediateAddress FINAL : public HExpression<2> {
  public:
   HArm64IntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc)
       : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) {
@@ -106,7 +106,9 @@
   }
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
   bool IsActualObject() const OVERRIDE { return false; }
 
   HInstruction* GetBaseAddress() const { return InputAt(0); }
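For context, a sketch of how a pass might introduce the intermediate address so that several accesses to the same array share the base-plus-data-offset computation (the enclosing pass, `arena`, `offset_constant` and `array_get` are assumptions for illustration):

    HArm64IntermediateAddress* address =
        new (arena) HArm64IntermediateAddress(array, offset_constant, kNoDexPc);
    block->InsertInstructionBefore(address, array_get);
    array_get->ReplaceInput(address, 0);  // The get now reads via the raw address.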
diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h
index c10c718..f2d5cf3 100644
--- a/compiler/optimizing/nodes_shared.h
+++ b/compiler/optimizing/nodes_shared.h
@@ -19,7 +19,7 @@
 
 namespace art {
 
-class HMultiplyAccumulate : public HExpression<3> {
+class HMultiplyAccumulate FINAL : public HExpression<3> {
  public:
   HMultiplyAccumulate(Primitive::Type type,
                       InstructionKind op,
@@ -38,7 +38,7 @@
   static constexpr int kInputMulRightIndex = 2;
 
   bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
     return op_kind_ == other->AsMultiplyAccumulate()->op_kind_;
   }
 
@@ -53,7 +53,7 @@
   DISALLOW_COPY_AND_ASSIGN(HMultiplyAccumulate);
 };
 
-class HBitwiseNegatedRight : public HBinaryOperation {
+class HBitwiseNegatedRight FINAL : public HBinaryOperation {
  public:
   HBitwiseNegatedRight(Primitive::Type result_type,
                             InstructionKind op,
diff --git a/compiler/optimizing/nodes_x86.h b/compiler/optimizing/nodes_x86.h
index 0b3a84d..c3696b5 100644
--- a/compiler/optimizing/nodes_x86.h
+++ b/compiler/optimizing/nodes_x86.h
@@ -20,7 +20,7 @@
 namespace art {
 
 // Compute the address of the method for X86 Constant area support.
-class HX86ComputeBaseMethodAddress : public HExpression<0> {
+class HX86ComputeBaseMethodAddress FINAL : public HExpression<0> {
  public:
   // Treat the value as an int32_t, but it is really a 32 bit native pointer.
   HX86ComputeBaseMethodAddress()
@@ -33,7 +33,7 @@
 };
 
 // Load a constant value from the constant table.
-class HX86LoadFromConstantTable : public HExpression<2> {
+class HX86LoadFromConstantTable FINAL : public HExpression<2> {
  public:
   HX86LoadFromConstantTable(HX86ComputeBaseMethodAddress* method_base,
                             HConstant* constant)
@@ -57,7 +57,7 @@
 };
 
 // Version of HNeg with access to the constant table for FP types.
-class HX86FPNeg : public HExpression<2> {
+class HX86FPNeg FINAL : public HExpression<2> {
  public:
   HX86FPNeg(Primitive::Type result_type,
             HInstruction* input,
@@ -76,7 +76,7 @@
 };
 
 // X86 version of HPackedSwitch that holds a pointer to the base method address.
-class HX86PackedSwitch : public HTemplateInstruction<2> {
+class HX86PackedSwitch FINAL : public HTemplateInstruction<2> {
  public:
   HX86PackedSwitch(int32_t start_value,
                    int32_t num_entries,
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index fc66823..db0d88c 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -32,21 +32,21 @@
 // 0x00000012: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kArm64[] = {
-    0xE0, 0x0F, 0x1C, 0xF8, 0xF4, 0xD7, 0x02, 0xA9, 0xFE, 0x1F, 0x00, 0xF9,
-    0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF4, 0xD7, 0x42, 0xA9,
-    0xFE, 0x1F, 0x40, 0xF9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6,
+    0xE0, 0x0F, 0x1C, 0xF8, 0xF4, 0x17, 0x00, 0xF9, 0xF5, 0x7B, 0x03, 0xA9,
+    0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF4, 0x17, 0x40, 0xF9,
+    0xF5, 0x7B, 0x43, 0xA9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6,
 };
 static constexpr uint8_t expected_cfi_kArm64[] = {
-    0x44, 0x0E, 0x40, 0x44, 0x94, 0x06, 0x95, 0x04, 0x44, 0x9E, 0x02, 0x44,
+    0x44, 0x0E, 0x40, 0x44, 0x94, 0x06, 0x44, 0x95, 0x04, 0x9E, 0x02, 0x44,
     0x05, 0x48, 0x0A, 0x05, 0x49, 0x08, 0x0A, 0x44, 0x06, 0x48, 0x06, 0x49,
-    0x44, 0xD4, 0xD5, 0x44, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40,
+    0x44, 0xD4, 0x44, 0xD5, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: str x0, [sp, #-64]!
 // 0x00000004: .cfi_def_cfa_offset: 64
-// 0x00000004: stp x20, x21, [sp, #40]
+// 0x00000004: str x20, [sp, #40]
 // 0x00000008: .cfi_offset: r20 at cfa-24
-// 0x00000008: .cfi_offset: r21 at cfa-16
-// 0x00000008: str lr, [sp, #56]
+// 0x00000008: stp x21, lr, [sp, #48]
+// 0x0000000c: .cfi_offset: r21 at cfa-16
 // 0x0000000c: .cfi_offset: r30 at cfa-8
 // 0x0000000c: stp d8, d9, [sp, #24]
 // 0x00000010: .cfi_offset_extended: r72 at cfa-40
@@ -55,10 +55,10 @@
 // 0x00000010: ldp d8, d9, [sp, #24]
 // 0x00000014: .cfi_restore_extended: r72
 // 0x00000014: .cfi_restore_extended: r73
-// 0x00000014: ldp x20, x21, [sp, #40]
+// 0x00000014: ldr x20, [sp, #40]
 // 0x00000018: .cfi_restore: r20
-// 0x00000018: .cfi_restore: r21
-// 0x00000018: ldr lr, [sp, #56]
+// 0x00000018: ldp x21, lr, [sp, #48]
+// 0x0000001c: .cfi_restore: r21
 // 0x0000001c: .cfi_restore: r30
 // 0x0000001c: add sp, sp, #0x40 (64)
 // 0x00000020: .cfi_def_cfa_offset: 0
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index dafbd3d..cb2fc0a 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -202,8 +202,9 @@
     }
 
     // Ensure that we can load FP arguments from the constant area.
-    for (size_t i = 0, e = invoke->InputCount(); i < e; i++) {
-      HConstant* input = invoke->InputAt(i)->AsConstant();
+    auto&& inputs = invoke->GetInputs();
+    for (size_t i = 0; i < inputs.size(); i++) {
+      HConstant* input = inputs[i]->AsConstant();
       if (input != nullptr && Primitive::IsFloatingPointType(input->GetType())) {
         ReplaceInput(invoke, input, i, true);
       }
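A note on the `auto&&` bindings introduced throughout this patch: assuming GetInputs() returns a small view object by value, `auto&&` binds to the temporary and extends its lifetime to the end of the enclosing scope, so indexing the view later in the loop stays valid:

    auto&& inputs = invoke->GetInputs();  // Lifetime-extended temporary view.
    for (size_t i = 0; i < inputs.size(); ++i) {
      HInstruction* input = inputs[i];
      // ... use `input` ...
    }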
diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc
index 38e42f7..1719df0 100644
--- a/compiler/optimizing/prepare_for_register_allocation.cc
+++ b/compiler/optimizing/prepare_for_register_allocation.cc
@@ -173,8 +173,7 @@
 
 void PrepareForRegisterAllocation::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
   if (invoke->IsStaticWithExplicitClinitCheck()) {
-    size_t last_input_index = invoke->InputCount() - 1;
-    HLoadClass* last_input = invoke->InputAt(last_input_index)->AsLoadClass();
+    HLoadClass* last_input = invoke->GetInputs().back()->AsLoadClass();
     DCHECK(last_input != nullptr)
         << "Last input is not HLoadClass. It is " << last_input->DebugName();
 
diff --git a/compiler/optimizing/pretty_printer.h b/compiler/optimizing/pretty_printer.h
index ee32518..f9bef68 100644
--- a/compiler/optimizing/pretty_printer.h
+++ b/compiler/optimizing/pretty_printer.h
@@ -39,16 +39,17 @@
   }
 
   void PrintPostInstruction(HInstruction* instruction) {
-    if (instruction->InputCount() != 0) {
+    auto&& inputs = instruction->GetInputs();
+    if (!inputs.empty()) {
       PrintString("(");
       bool first = true;
-      for (HInputIterator it(instruction); !it.Done(); it.Advance()) {
+      for (const HInstruction* input : inputs) {
         if (first) {
           first = false;
         } else {
           PrintString(", ");
         }
-        PrintInt(it.Current()->GetId());
+        PrintInt(input->GetId());
       }
       PrintString(")");
     }
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index 479ffc2..3e6adcb 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -823,13 +823,13 @@
 void ReferenceTypePropagation::UpdatePhi(HPhi* instr) {
   DCHECK(instr->IsLive());
 
-  size_t input_count = instr->InputCount();
+  auto&& inputs = instr->GetInputs();
   size_t first_input_index_not_null = 0;
-  while (first_input_index_not_null < input_count &&
-      instr->InputAt(first_input_index_not_null)->IsNullConstant()) {
+  while (first_input_index_not_null < inputs.size() &&
+      inputs[first_input_index_not_null]->IsNullConstant()) {
     first_input_index_not_null++;
   }
-  if (first_input_index_not_null == input_count) {
+  if (first_input_index_not_null == inputs.size()) {
     // All inputs are NullConstants, set the type to object.
     // This may happen in the presence of inlining.
     instr->SetReferenceTypeInfo(instr->GetBlock()->GetGraph()->GetInexactObjectRti());
@@ -844,11 +844,11 @@
     return;
   }
 
-  for (size_t i = first_input_index_not_null + 1; i < input_count; i++) {
-    if (instr->InputAt(i)->IsNullConstant()) {
+  for (size_t i = first_input_index_not_null + 1; i < inputs.size(); i++) {
+    if (inputs[i]->IsNullConstant()) {
       continue;
     }
-    new_rti = MergeTypes(new_rti, instr->InputAt(i)->GetReferenceTypeInfo());
+    new_rti = MergeTypes(new_rti, inputs[i]->GetReferenceTypeInfo());
     if (new_rti.IsValid() && new_rti.IsObjectClass()) {
       if (!new_rti.IsExact()) {
         break;
@@ -879,8 +879,8 @@
   if (instr->IsPhi()) {
     HPhi* phi = instr->AsPhi();
     bool new_can_be_null = false;
-    for (size_t i = 0; i < phi->InputCount(); i++) {
-      if (phi->InputAt(i)->CanBeNull()) {
+    for (HInstruction* input : phi->GetInputs()) {
+      if (input->CanBeNull()) {
         new_can_be_null = true;
         break;
       }
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 4405b80..4a6b835 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -305,7 +305,7 @@
     BlockRegisters(position, position + 1, /* caller_save_only */ true);
   }
 
-  for (size_t i = 0; i < instruction->InputCount(); ++i) {
+  for (size_t i = 0; i < locations->GetInputCount(); ++i) {
     Location input = locations->InAt(i);
     if (input.IsRegister() || input.IsFpuRegister()) {
       BlockRegister(input, position, position + 1);
@@ -753,10 +753,11 @@
   if (defined_by != nullptr && !current->IsSplit()) {
     LocationSummary* locations = defined_by->GetLocations();
     if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) {
-      for (size_t i = 0, e = defined_by->InputCount(); i < e; ++i) {
+      auto&& inputs = defined_by->GetInputs();
+      for (size_t i = 0; i < inputs.size(); ++i) {
         // Take the last interval of the input. It is the location of that interval
         // that will be used at `defined_by`.
-        LiveInterval* interval = defined_by->InputAt(i)->GetLiveInterval()->GetLastSibling();
+        LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling();
         // Note that interval may have not been processed yet.
         // TODO: Handle non-split intervals last in the work list.
         if (locations->InAt(i).IsValid()
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index 52ee3fb..261821a 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -123,8 +123,7 @@
 static bool TypePhiFromInputs(HPhi* phi) {
   Primitive::Type common_type = phi->GetType();
 
-  for (HInputIterator it(phi); !it.Done(); it.Advance()) {
-    HInstruction* input = it.Current();
+  for (HInstruction* input : phi->GetInputs()) {
     if (input->IsPhi() && input->AsPhi()->IsDead()) {
       // Phis are constructed live so if an input is a dead phi, it must have
       // been made dead due to type conflict. Mark this phi conflicting too.
@@ -169,8 +168,7 @@
     // or `common_type` is integral and we do not need to retype ambiguous inputs
     // because they are always constructed with the integral type candidate.
     if (kIsDebugBuild) {
-      for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
-        HInstruction* input = phi->InputAt(i);
+      for (HInstruction* input : phi->GetInputs()) {
         if (common_type == Primitive::kPrimVoid) {
           DCHECK(input->IsPhi() && input->GetType() == Primitive::kPrimVoid);
         } else {
@@ -183,8 +181,9 @@
     return true;
   } else {
     DCHECK(common_type == Primitive::kPrimNot || Primitive::IsFloatingPointType(common_type));
-    for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
-      HInstruction* input = phi->InputAt(i);
+    auto&& inputs = phi->GetInputs();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      HInstruction* input = inputs[i];
       if (input->GetType() != common_type) {
         // Input type does not match phi's type. Try to retype the input or
         // generate a suitably typed equivalent.
@@ -615,11 +614,14 @@
       || (next->AsPhi()->GetRegNumber() != phi->GetRegNumber())
       || (next->GetType() != type)) {
     ArenaAllocator* allocator = graph_->GetArena();
-    HPhi* new_phi = new (allocator) HPhi(allocator, phi->GetRegNumber(), phi->InputCount(), type);
-    for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
-      // Copy the inputs. Note that the graph may not be correctly typed
-      // by doing this copy, but the type propagation phase will fix it.
-      new_phi->SetRawInputAt(i, phi->InputAt(i));
+    auto&& inputs = phi->GetInputs();
+    HPhi* new_phi =
+        new (allocator) HPhi(allocator, phi->GetRegNumber(), inputs.size(), type);
+    // Copy the inputs. Note that the graph may not be correctly typed
+    // by doing this copy, but the type propagation phase will fix it.
+    ArrayRef<HUserRecord<HInstruction*>> new_input_records = new_phi->GetInputRecords();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      new_input_records[i] = HUserRecord<HInstruction*>(inputs[i]);
     }
     phi->GetBlock()->InsertPhiAfter(new_phi, phi);
     DCHECK(new_phi->IsLive());
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 36e0d99..212d935 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -177,8 +177,9 @@
 static void RecursivelyProcessInputs(HInstruction* current,
                                      HInstruction* actual_user,
                                      BitVector* live_in) {
-  for (size_t i = 0, e = current->InputCount(); i < e; ++i) {
-    HInstruction* input = current->InputAt(i);
+  auto&& inputs = current->GetInputs();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    HInstruction* input = inputs[i];
     bool has_in_location = current->GetLocations()->InAt(i).IsValid();
     bool has_out_location = input->GetLocations()->Out().IsValid();
 
@@ -430,12 +431,12 @@
         // If the instruction dies at the phi assignment, we can try having the
         // same register.
         if (end == user->GetBlock()->GetPredecessors()[input_index]->GetLifetimeEnd()) {
-          for (size_t i = 0, e = user->InputCount(); i < e; ++i) {
+          auto&& inputs = user->GetInputs();
+          for (size_t i = 0; i < inputs.size(); ++i) {
             if (i == input_index) {
               continue;
             }
-            HInstruction* input = user->InputAt(i);
-            Location location = input->GetLiveInterval()->GetLocationAt(
+            Location location = inputs[i]->GetLiveInterval()->GetLocationAt(
                 user->GetBlock()->GetPredecessors()[i]->GetLifetimeEnd() - 1);
             if (location.IsRegisterKind()) {
               int reg = RegisterOrLowRegister(location);
@@ -471,10 +472,10 @@
   if (defined_by_->IsPhi()) {
     // Try to use the same register as one of the inputs.
     const ArenaVector<HBasicBlock*>& predecessors = defined_by_->GetBlock()->GetPredecessors();
-    for (size_t i = 0, e = defined_by_->InputCount(); i < e; ++i) {
-      HInstruction* input = defined_by_->InputAt(i);
+    auto&& inputs = defined_by_->GetInputs();
+    for (size_t i = 0; i < inputs.size(); ++i) {
       size_t end = predecessors[i]->GetLifetimeEnd();
-      LiveInterval* input_interval = input->GetLiveInterval()->GetSiblingAt(end - 1);
+      LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(end - 1);
       if (input_interval->GetEnd() == end) {
         // If the input dies at the end of the predecessor, we know its register can
         // be reused.
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 1fcba8b..dc98864 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -797,8 +797,8 @@
   bool IsUsingInputRegister() const {
     CHECK(kIsDebugBuild) << "Function should be used only for DCHECKs";
     if (defined_by_ != nullptr && !IsSplit()) {
-      for (HInputIterator it(defined_by_); !it.Done(); it.Advance()) {
-        LiveInterval* interval = it.Current()->GetLiveInterval();
+      for (const HInstruction* input : defined_by_->GetInputs()) {
+        LiveInterval* interval = input->GetLiveInterval();
 
         // Find the interval that covers `defined_by`_. Calls to this function
         // are made outside the linear scan, hence we need to use CoversSlow.
@@ -828,8 +828,8 @@
       if (locations->OutputCanOverlapWithInputs()) {
         return false;
       }
-      for (HInputIterator it(defined_by_); !it.Done(); it.Advance()) {
-        LiveInterval* interval = it.Current()->GetLiveInterval();
+      for (const HInstruction* input : defined_by_->GetInputs()) {
+        LiveInterval* interval = input->GetLiveInterval();
 
         // Find the interval that covers `defined_by`_. Calls to this function
         // are made outside the linear scan, hence we need to use CoversSlow.
diff --git a/compiler/optimizing/ssa_phi_elimination.cc b/compiler/optimizing/ssa_phi_elimination.cc
index c67612e..b1ec99a 100644
--- a/compiler/optimizing/ssa_phi_elimination.cc
+++ b/compiler/optimizing/ssa_phi_elimination.cc
@@ -67,8 +67,8 @@
   while (!worklist_.empty()) {
     HPhi* phi = worklist_.back();
     worklist_.pop_back();
-    for (HInputIterator it(phi); !it.Done(); it.Advance()) {
-      HPhi* input = it.Current()->AsPhi();
+    for (HInstruction* raw_input : phi->GetInputs()) {
+      HPhi* input = raw_input->AsPhi();
       if (input != nullptr && input->IsDead()) {
         // Input is a dead phi. Revive it and add to the worklist. We make sure
         // that the phi was not dead initially (see definition of `initially_live`).
@@ -102,9 +102,7 @@
           }
         }
         // Remove the phi from use lists of its inputs.
-        for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
-          phi->RemoveAsUserOfInput(i);
-        }
+        phi->RemoveAsUserOfAllInputs();
         // Remove the phi from environments that use it.
         for (const HUseListNode<HEnvironment*>& use : phi->GetEnvUses()) {
           HEnvironment* user = use.GetUser();
@@ -159,8 +157,7 @@
     bool irreducible_loop_phi_in_cycle = phi->IsIrreducibleLoopHeaderPhi();
 
     // First do a simple loop over inputs and check if they are all the same.
-    for (size_t j = 0; j < phi->InputCount(); ++j) {
-      HInstruction* input = phi->InputAt(j);
+    for (HInstruction* input : phi->GetInputs()) {
       if (input == phi) {
         continue;
       } else if (candidate == nullptr) {
@@ -181,8 +178,7 @@
         DCHECK(!current->IsLoopHeaderPhi() ||
                current->GetBlock()->IsLoopPreHeaderFirstPredecessor());
 
-        for (size_t j = 0; j < current->InputCount(); ++j) {
-          HInstruction* input = current->InputAt(j);
+        for (HInstruction* input : current->GetInputs()) {
           if (input == current) {
             continue;
           } else if (input->IsPhi()) {
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index e5f91dc..a7f4547 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -386,8 +386,9 @@
 
 constexpr size_t kFramePointerSize = kArmPointerSize;
 
-void ArmAssembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                              const std::vector<ManagedRegister>& callee_save_regs,
+void ArmAssembler::BuildFrame(size_t frame_size,
+                              ManagedRegister method_reg,
+                              ArrayRef<const ManagedRegister> callee_save_regs,
                               const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_EQ(buffer_.Size(), 0U);  // Nothing emitted yet
   CHECK_ALIGNED(frame_size, kStackAlignment);
@@ -442,7 +443,7 @@
 }
 
 void ArmAssembler::RemoveFrame(size_t frame_size,
-                              const std::vector<ManagedRegister>& callee_save_regs) {
+                               ArrayRef<const ManagedRegister> callee_save_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   cfi_.RememberState();
 
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index ffbe786..91fe0e1 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -671,10 +671,15 @@
   virtual void vcmpdz(DRegister dd, Condition cond = AL) = 0;
   virtual void vmstat(Condition cond = AL) = 0;  // VMRS APSR_nzcv, FPSCR
 
+  virtual void vcntd(DRegister dd, DRegister dm) = 0;
+  virtual void vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) = 0;
+
   virtual void vpushs(SRegister reg, int nregs, Condition cond = AL) = 0;
   virtual void vpushd(DRegister reg, int nregs, Condition cond = AL) = 0;
   virtual void vpops(SRegister reg, int nregs, Condition cond = AL) = 0;
   virtual void vpopd(DRegister reg, int nregs, Condition cond = AL) = 0;
+  virtual void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) = 0;
+  virtual void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) = 0;
 
   // Branch instructions.
   virtual void b(Label* label, Condition cond = AL) = 0;
@@ -751,32 +756,7 @@
     }
   }
 
-  void LoadDImmediate(DRegister sd, double value, Condition cond = AL) {
-    if (!vmovd(sd, value, cond)) {
-      uint64_t int_value = bit_cast<uint64_t, double>(value);
-      if (int_value == bit_cast<uint64_t, double>(0.0)) {
-        // 0.0 is quite common, so we special case it by loading
-        // 2.0 in `sd` and then substracting it.
-        bool success = vmovd(sd, 2.0, cond);
-        CHECK(success);
-        vsubd(sd, sd, sd, cond);
-      } else {
-        if (sd < 16) {
-          SRegister low = static_cast<SRegister>(sd << 1);
-          SRegister high = static_cast<SRegister>(low + 1);
-          LoadSImmediate(low, bit_cast<float, uint32_t>(Low32Bits(int_value)), cond);
-          if (High32Bits(int_value) == Low32Bits(int_value)) {
-            vmovs(high, low);
-          } else {
-            LoadSImmediate(high, bit_cast<float, uint32_t>(High32Bits(int_value)), cond);
-          }
-        } else {
-          LOG(FATAL) << "Unimplemented loading of double into a D register "
-                     << "that cannot be split into two S registers";
-        }
-      }
-    }
-  }
+  virtual void LoadDImmediate(DRegister dd, double value, Condition cond = AL) = 0;
 
   virtual void MarkExceptionHandler(Label* label) = 0;
   virtual void LoadFromOffset(LoadOperandType type,
@@ -907,12 +887,13 @@
   //
 
   // Emit code that will create an activation on the stack
-  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                  const std::vector<ManagedRegister>& callee_save_regs,
+  void BuildFrame(size_t frame_size,
+                  ManagedRegister method_reg,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+  void RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> callee_save_regs)
     OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
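ArrayRef<const ManagedRegister> is a non-owning (pointer, length) view, so callers can hand over registers held in a std::vector or a plain array without copying. A hedged sketch (GetCalleeSaves() is a hypothetical source of registers):

    std::vector<ManagedRegister> regs = GetCalleeSaves();
    assembler->RemoveFrame(frame_size, ArrayRef<const ManagedRegister>(regs));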
diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc
index 0a227b2..2805d86 100644
--- a/compiler/utils/arm/assembler_arm32.cc
+++ b/compiler/utils/arm/assembler_arm32.cc
@@ -1106,6 +1106,18 @@
 }
 
 
+void Arm32Assembler::vldmiad(Register, DRegister, int, Condition) {
+  LOG(FATAL) << "Unimplemented.";
+  UNREACHABLE();
+}
+
+
+void Arm32Assembler::vstmiad(Register, DRegister, int, Condition) {
+  LOG(FATAL) << "Unimplemented.";
+  UNREACHABLE();
+}
+
+
 void Arm32Assembler::EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond) {
   CHECK_NE(cond, kNoCondition);
   CHECK_GT(nregs, 0);
@@ -1264,6 +1276,31 @@
   Emit(encoding);
 }
 
+void Arm32Assembler::vcntd(DRegister dd, DRegister dm) {
+  uint32_t encoding = (B31 | B30 | B29 | B28 | B25 | B24 | B23 | B21 | B20) |
+    ((static_cast<int32_t>(dd) >> 4) * B22) |
+    ((static_cast<uint32_t>(dd) & 0xf) * B12) |
+    (B10 | B8) |
+    ((static_cast<int32_t>(dm) >> 4) * B5) |
+    (static_cast<uint32_t>(dm) & 0xf);
+
+  Emit(encoding);
+}
+
+void Arm32Assembler::vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) {
+  CHECK(size == 8 || size == 16 || size == 32) << size;
+  uint32_t encoding = (B31 | B30 | B29 | B28 | B25 | B24 | B23 | B21 | B20) |
+    ((static_cast<uint32_t>(size >> 4) & 0x3) * B18) |
+    ((static_cast<int32_t>(dd) >> 4) * B22) |
+    ((static_cast<uint32_t>(dd) & 0xf) * B12) |
+    (B9) |
+    (is_unsigned ? B7 : 0) |
+    ((static_cast<int32_t>(dm) >> 4) * B5) |
+    (static_cast<uint32_t>(dm) & 0xf);
+
+  Emit(encoding);
+}
+
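The multiplications by the Bn constants are shifts: Bn is 1 << n, so `x * Bn` places `x` at bit n. A worked example for `vcnt.8 d19, d20` under the encoding above:

    // dd = 19: D = 19 >> 4 = 1 -> bit 22;  Vd = 19 & 0xf = 3 -> bits 15:12.
    // dm = 20: M = 20 >> 4 = 1 -> bit 5;   Vm = 20 & 0xf = 4 -> bits 3:0.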
 
 void Arm32Assembler::svc(uint32_t imm24) {
   CHECK(IsUint<24>(imm24)) << imm24;
@@ -1461,6 +1498,34 @@
   }
 }
 
+void Arm32Assembler::LoadDImmediate(DRegister dd, double value, Condition cond) {
+  if (!vmovd(dd, value, cond)) {
+    uint64_t int_value = bit_cast<uint64_t, double>(value);
+    if (int_value == bit_cast<uint64_t, double>(0.0)) {
+      // 0.0 is quite common, so we special case it by loading
+      // 2.0 in `dd` and then subtracting it.
+      bool success = vmovd(dd, 2.0, cond);
+      CHECK(success);
+      vsubd(dd, dd, dd, cond);
+    } else {
+      if (dd < 16) {
+        // Note: Depending on the particular CPU, this may cause a register
+        // forwarding hazard, negatively impacting performance.
+        SRegister low = static_cast<SRegister>(dd << 1);
+        SRegister high = static_cast<SRegister>(low + 1);
+        LoadSImmediate(low, bit_cast<float, uint32_t>(Low32Bits(int_value)), cond);
+        if (High32Bits(int_value) == Low32Bits(int_value)) {
+          vmovs(high, low);
+        } else {
+          LoadSImmediate(high, bit_cast<float, uint32_t>(High32Bits(int_value)), cond);
+        }
+      } else {
+        LOG(FATAL) << "Unimplemented loading of double into a D register "
+                   << "that cannot be split into two S registers";
+      }
+    }
+  }
+}
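The 0.0 special case works because the VFP VMOV (immediate) encoding covers only a small set of constants of the form ±m × 2^e, which includes 2.0 but not 0.0, while x - x yields +0.0 for any finite x in the default round-to-nearest mode:

    // Emitted sequence for LoadDImmediate(d0, 0.0):
    //   vmov.f64 d0, #2.0    // 2.0 is representable as a VFP immediate.
    //   vsub.f64 d0, d0, d0  // 2.0 - 2.0 == +0.0.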
 
 // Implementation note: this method must emit at most one instruction when
 // Address::CanHoldLoadOffsetArm.
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
index bc6020e..63be2e2 100644
--- a/compiler/utils/arm/assembler_arm32.h
+++ b/compiler/utils/arm/assembler_arm32.h
@@ -205,10 +205,15 @@
   void vcmpdz(DRegister dd, Condition cond = AL) OVERRIDE;
   void vmstat(Condition cond = AL) OVERRIDE;  // VMRS APSR_nzcv, FPSCR
 
+  void vcntd(DRegister dd, DRegister dm) OVERRIDE;
+  void vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) OVERRIDE;
+
   void vpushs(SRegister reg, int nregs, Condition cond = AL) OVERRIDE;
   void vpushd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
   void vpops(SRegister reg, int nregs, Condition cond = AL) OVERRIDE;
   void vpopd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
+  void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
+  void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
 
   // Branch instructions.
   void b(Label* label, Condition cond = AL) OVERRIDE;
@@ -267,6 +272,7 @@
 
   // Load and Store. May clobber IP.
   void LoadImmediate(Register rd, int32_t value, Condition cond = AL) OVERRIDE;
+  void LoadDImmediate(DRegister dd, double value, Condition cond = AL) OVERRIDE;
   void MarkExceptionHandler(Label* label) OVERRIDE;
   void LoadFromOffset(LoadOperandType type,
                       Register reg,
diff --git a/compiler/utils/arm/assembler_arm32_test.cc b/compiler/utils/arm/assembler_arm32_test.cc
index e570e22..b214062 100644
--- a/compiler/utils/arm/assembler_arm32_test.cc
+++ b/compiler/utils/arm/assembler_arm32_test.cc
@@ -899,4 +899,43 @@
   T3Helper(&arm::Arm32Assembler::revsh, true, "revsh{cond} {reg1}, {reg2}", "revsh");
 }
 
+TEST_F(AssemblerArm32Test, vcnt) {
+  // Different D register numbers are used here to test register encoding.
+  // The source register number is encoded as M:Vm and the destination register number as D:Vd.
+  // For source and destination registers in D0..D15, the M and D bits should be 0.
+  // For source and destination registers in D16..D31, the M and D bits should be 1.
+  GetAssembler()->vcntd(arm::D0, arm::D1);
+  GetAssembler()->vcntd(arm::D19, arm::D20);
+  GetAssembler()->vcntd(arm::D0, arm::D9);
+  GetAssembler()->vcntd(arm::D16, arm::D20);
+
+  std::string expected =
+      "vcnt.8 d0, d1\n"
+      "vcnt.8 d19, d20\n"
+      "vcnt.8 d0, d9\n"
+      "vcnt.8 d16, d20\n";
+
+  DriverStr(expected, "vcnt");
+}
+
+TEST_F(AssemblerArm32Test, vpaddl) {
+  // Different D register numbers are used here to test register encoding.
+  // The source register number is encoded as M:Vm and the destination register number as D:Vd.
+  // For source and destination registers in D0..D15, the M and D bits should be 0.
+  // For source and destination registers in D16..D31, the M and D bits should be 1.
+  // Different data types (signed and unsigned) are also tested.
+  GetAssembler()->vpaddld(arm::D0, arm::D0, 8, true);
+  GetAssembler()->vpaddld(arm::D20, arm::D20, 8, false);
+  GetAssembler()->vpaddld(arm::D0, arm::D20, 16, false);
+  GetAssembler()->vpaddld(arm::D20, arm::D0, 32, true);
+
+  std::string expected =
+      "vpaddl.u8 d0, d0\n"
+      "vpaddl.s8 d20, d20\n"
+      "vpaddl.s16 d0, d20\n"
+      "vpaddl.u32 d20, d0\n";
+
+  DriverStr(expected, "vpaddl");
+}
+
 }  // namespace art
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 546dd65..bd5875f 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -1917,7 +1917,7 @@
 
     case kLongOrFPLiteral1KiB:
       return 4u;
-    case kLongOrFPLiteral256KiB:
+    case kLongOrFPLiteral64KiB:
       return 10u;
     case kLongOrFPLiteralFar:
       return 14u;
@@ -1989,7 +1989,7 @@
       break;
     case kLiteral1MiB:
     case kLiteral64KiB:
-    case kLongOrFPLiteral256KiB:
+    case kLongOrFPLiteral64KiB:
     case kLiteralAddr64KiB:
       DCHECK_GE(diff, 4);  // The target must be at least 4 bytes after the ADD rX, PC.
       diff -= 4;        // One extra 32-bit MOV.
@@ -2018,6 +2018,45 @@
   return adjustment;
 }
 
+bool Thumb2Assembler::Fixup::IsCandidateForEmitEarly() const {
+  DCHECK(size_ == original_size_);
+  if (target_ == kUnresolved) {
+    return false;
+  }
+  // GetOffset() does not depend on current_code_size for branches, only for literals.
+  constexpr uint32_t current_code_size = 0u;
+  switch (GetSize()) {
+    case kBranch16Bit:
+      return IsInt(cond_ != AL ? 9 : 12, GetOffset(current_code_size));
+    case kBranch32Bit:
+      // We don't support conditional branches beyond +-1MiB
+      // or unconditional branches beyond +-16MiB.
+      return true;
+
+    case kCbxz16Bit:
+      return IsUint<7>(GetOffset(current_code_size));
+    case kCbxz32Bit:
+      return IsInt<9>(GetOffset(current_code_size));
+    case kCbxz48Bit:
+      // We don't support conditional branches beyond +-1MiB.
+      return true;
+
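+    // Literal fixups' offsets depend on the final code size (see above), so they are
+    // never emitted early.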
+    case kLiteral1KiB:
+    case kLiteral4KiB:
+    case kLiteral64KiB:
+    case kLiteral1MiB:
+    case kLiteralFar:
+    case kLiteralAddr1KiB:
+    case kLiteralAddr4KiB:
+    case kLiteralAddr64KiB:
+    case kLiteralAddrFar:
+    case kLongOrFPLiteral1KiB:
+    case kLongOrFPLiteral64KiB:
+    case kLongOrFPLiteralFar:
+      return false;
+  }
+}
+
 uint32_t Thumb2Assembler::Fixup::AdjustSizeIfNeeded(uint32_t current_code_size) {
   uint32_t old_code_size = current_code_size;
   switch (GetSize()) {
@@ -2105,10 +2144,10 @@
       if (IsUint<10>(GetOffset(current_code_size))) {
         break;
       }
-      current_code_size += IncreaseSize(kLongOrFPLiteral256KiB);
+      current_code_size += IncreaseSize(kLongOrFPLiteral64KiB);
       FALLTHROUGH_INTENDED;
-    case kLongOrFPLiteral256KiB:
-      if (IsUint<18>(GetOffset(current_code_size))) {
+    case kLongOrFPLiteral64KiB:
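+      // MOVW provides a 16-bit unsigned immediate, hence the 64KiB range.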
+      if (IsUint<16>(GetOffset(current_code_size))) {
         break;
       }
       current_code_size += IncreaseSize(kLongOrFPLiteralFar);
@@ -2269,11 +2308,10 @@
       buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(encoding & 0xffff));
       break;
     }
-    case kLongOrFPLiteral256KiB: {
-      int32_t offset = GetOffset(code_size);
-      int32_t mov_encoding = MovModImmEncoding32(IP, offset & ~0x3ff);
+    case kLongOrFPLiteral64KiB: {
+      int32_t mov_encoding = MovwEncoding32(IP, GetOffset(code_size));
       int16_t add_pc_encoding = AddRdnRmEncoding16(IP, PC);
-      int32_t ldr_encoding = LoadWideOrFpEncoding(IP, offset & 0x3ff);    // DCHECKs type_.
+      int32_t ldr_encoding = LoadWideOrFpEncoding(IP, 0u);    // DCHECKs type_.
       buffer->Store<int16_t>(location_, mov_encoding >> 16);
       buffer->Store<int16_t>(location_ + 2u, static_cast<int16_t>(mov_encoding & 0xffff));
       buffer->Store<int16_t>(location_ + 4u, add_pc_encoding);
@@ -2326,7 +2364,7 @@
   }
 
   Register rn = ad.GetRegister();
-  if (IsHighRegister(rn) && rn != SP && rn != PC) {
+  if (IsHighRegister(rn) && (byte || half || (rn != SP && rn != PC))) {
     must_be_32bit = true;
   }
 
@@ -2338,24 +2376,24 @@
     // Immediate offset
     int32_t offset = ad.GetOffset();
 
-    // The 16 bit SP relative instruction can only have a 10 bit offset.
-    if (rn == SP && offset >= (1 << 10)) {
-      must_be_32bit = true;
-    }
-
     if (byte) {
       // 5 bit offset, no shift.
-      if (offset >= (1 << 5)) {
+      if ((offset & ~0x1f) != 0) {
         must_be_32bit = true;
       }
     } else if (half) {
-      // 6 bit offset, shifted by 1.
-      if (offset >= (1 << 6)) {
+      // 5 bit offset, shifted by 1.
+      if ((offset & ~(0x1f << 1)) != 0) {
+        must_be_32bit = true;
+      }
+    } else if (rn == SP || rn == PC) {
+      // The 16 bit SP/PC relative instruction can only have an (imm8 << 2) offset.
+      if ((offset & ~(0xff << 2)) != 0) {
         must_be_32bit = true;
       }
     } else {
-      // 7 bit offset, shifted by 2.
-      if (offset >= (1 << 7)) {
+      // 5 bit offset, shifted by 2.
+      if ((offset & ~(0x1f << 2)) != 0) {
         must_be_32bit = true;
       }
     }
@@ -2371,7 +2409,7 @@
     } else {
       // 16 bit thumb1.
       uint8_t opA = 0;
-      bool sp_relative = false;
+      bool sp_or_pc_relative = false;
 
       if (byte) {
         opA = 7U /* 0b0111 */;
@@ -2380,7 +2418,10 @@
       } else {
         if (rn == SP) {
           opA = 9U /* 0b1001 */;
-          sp_relative = true;
+          sp_or_pc_relative = true;
+        } else if (rn == PC) {
+          opA = 4U;
+          sp_or_pc_relative = true;
         } else {
           opA = 6U /* 0b0110 */;
         }
@@ -2389,7 +2430,7 @@
           (load ? B11 : 0);
 
       CHECK_GE(offset, 0);
-      if (sp_relative) {
+      if (sp_or_pc_relative) {
        // SP/PC relative, 10 bit offset.
         CHECK_LT(offset, (1 << 10));
         CHECK_ALIGNED(offset, 4);
@@ -2457,6 +2498,9 @@
         } else if (!byte) {
           encoding |= B22;
         }
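+        // Sign-extending loads (LDRSB/LDRSH) set B24 in the 32-bit encoding.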
+        if (load && is_signed && (byte || half)) {
+          encoding |= B24;
+        }
         Emit32(encoding);
       } else {
         // 16 bit register offset.
@@ -3015,9 +3059,49 @@
 }
 
 
+void Thumb2Assembler::vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond) {
+  int32_t rest = B23;
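+  // B23 sets U = 1: increment-after addressing without writeback (the IA form).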
+  EmitVLdmOrStm(rest,
+                static_cast<uint32_t>(reg),
+                nregs,
+                base_reg,
+                /*is_load*/ true,
+                /*dbl*/ true,
+                cond);
+}
+
+
+void Thumb2Assembler::vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond) {
+  int32_t rest = B23;
+  EmitVLdmOrStm(rest,
+                static_cast<uint32_t>(reg),
+                nregs,
+                base_reg,
+                /*is_load*/ false,
+                /*dbl*/ true,
+                cond);
+}
+
+
 void Thumb2Assembler::EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond) {
+  int32_t rest = B21 | (push ? B24 : B23);
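+  // VPUSH is VSTMDB SP! (P = 1, W = 1) and VPOP is VLDMIA SP! (U = 1, W = 1).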
+  EmitVLdmOrStm(rest, reg, nregs, SP, /*is_load*/ !push, dbl, cond);
+}
+
+
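+// Shared emitter for VLDM/VSTM and VPUSH/VPOP; `rest` supplies the P/U/W
+// addressing-mode bits.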
+void Thumb2Assembler::EmitVLdmOrStm(int32_t rest,
+                                    uint32_t reg,
+                                    int nregs,
+                                    Register rn,
+                                    bool is_load,
+                                    bool dbl,
+                                    Condition cond) {
   CheckCondition(cond);
 
+  DCHECK_GT(nregs, 0);
+  DCHECK_LE(reg + nregs, 32u);
+  DCHECK(!dbl || (nregs <= 16));
+
   uint32_t D;
   uint32_t Vd;
   if (dbl) {
@@ -3029,14 +3113,17 @@
     D = reg & 1;
     Vd = (reg >> 1) & 15U /* 0b1111 */;
   }
-  int32_t encoding = B27 | B26 | B21 | B19 | B18 | B16 |
-                    B11 | B9 |
-        (dbl ? B8 : 0) |
-        (push ? B24 : (B23 | B20)) |
-        14U /* 0b1110 */ << 28 |
-        nregs << (dbl ? 1 : 0) |
-        D << 22 |
-        Vd << 12;
+
+  int32_t encoding = rest |
+                     14U /* 0b1110 */ << 28 |
+                     B27 | B26 | B11 | B9 |
+                     (is_load ? B20 : 0) |
+                     static_cast<int16_t>(rn) << 16 |
+                     D << 22 |
+                     Vd << 12 |
+                     (dbl ? B8 : 0) |
+                     nregs << (dbl ? 1 : 0);
+
   Emit32(encoding);
 }
 
@@ -3117,6 +3204,30 @@
   Emit32(encoding);
 }
 
+void Thumb2Assembler::vcntd(DRegister dd, DRegister dm) {
+  uint32_t encoding = (B31 | B30 | B29 | B28 | B27 | B26 | B25 | B24 | B23 | B21 | B20) |
+    ((static_cast<int32_t>(dd) >> 4) * B22) |
+    ((static_cast<uint32_t>(dd) & 0xf) * B12) |
+    (B10 | B8) |
+    ((static_cast<int32_t>(dm) >> 4) * B5) |
+    (static_cast<uint32_t>(dm) & 0xf);
+
+  Emit32(encoding);
+}
+
+void Thumb2Assembler::vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) {
+  CHECK(size == 8 || size == 16 || size == 32) << size;
+  uint32_t encoding = (B31 | B30 | B29 | B28 | B27 | B26 | B25 | B24 | B23 | B21 | B20) |
+    ((static_cast<uint32_t>(size >> 4) & 0x3) * B18) |
+    ((static_cast<int32_t>(dd) >> 4) * B22) |
+    ((static_cast<uint32_t>(dd) & 0xf) * B12) |
+    (B9) |
+    (is_unsigned ? B7 : 0) |
+    ((static_cast<int32_t>(dm) >> 4) * B5) |
+    (static_cast<uint32_t>(dm) & 0xf);
+
+  Emit32(encoding);
+}
 
 void Thumb2Assembler::svc(uint32_t imm8) {
   CHECK(IsUint<8>(imm8)) << imm8;
@@ -3271,6 +3382,30 @@
 
 void Thumb2Assembler::Bind(Label* label) {
   BindLabel(label, buffer_.Size());
+
+  // Try to emit some Fixups now to reduce the memory needed during the branch fixup later.
+  while (!fixups_.empty() && fixups_.back().IsCandidateForEmitEarly()) {
+    const Fixup& last_fixup = fixups_.back();
+    // Fixups are ordered by location, so the candidate can surely be emitted if it is
+    // a forward branch. If it's a backward branch, it may go over any number of other
+    // fixups. We could check for any number of emit-early candidates, but we want this
+    // heuristic to be quick, so we check just one.
+    uint32_t target = last_fixup.GetTarget();
+    if (target < last_fixup.GetLocation() &&
+        fixups_.size() >= 2u &&
+        fixups_[fixups_.size() - 2u].GetLocation() >= target) {
+      const Fixup& prev_fixup = fixups_[fixups_.size() - 2u];
+      if (!prev_fixup.IsCandidateForEmitEarly()) {
+        break;
+      }
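+      // Both of the last two fixups are emittable; bail out if a third fixup might lie
+      // within the span covered by either branch.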
+      uint32_t min_target = std::min(target, prev_fixup.GetTarget());
+      if (fixups_.size() >= 3u && fixups_[fixups_.size() - 3u].GetLocation() >= min_target) {
+        break;
+      }
+    }
+    last_fixup.Emit(&buffer_, buffer_.Size());
+    fixups_.pop_back();
+  }
 }
 
 
@@ -3574,6 +3709,24 @@
   }
 }
 
+void Thumb2Assembler::LoadDImmediate(DRegister dd, double value, Condition cond) {
+  if (!vmovd(dd, value, cond)) {
+    uint64_t int_value = bit_cast<uint64_t, double>(value);
+    if (int_value == bit_cast<uint64_t, double>(0.0)) {
+      // 0.0 is quite common, so we special-case it by loading
+      // 2.0 in `dd` and then subtracting it.
+      bool success = vmovd(dd, 2.0, cond);
+      CHECK(success);
+      vsubd(dd, dd, dd, cond);
+    } else {
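+      // Reuse an existing 64-bit literal with the same bit pattern, if any.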
+      Literal* literal = literal64_dedupe_map_.GetOrCreate(
+          int_value,
+          [this, int_value]() { return NewLiteral<uint64_t>(int_value); });
+      LoadLiteral(dd, literal);
+    }
+  }
+}
+
 int32_t Thumb2Assembler::GetAllowedLoadOffsetBits(LoadOperandType type) {
   switch (type) {
     case kLoadSignedByte:
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index ce310a4..47a55eb 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -43,6 +43,7 @@
         fixups_(arena->Adapter(kArenaAllocAssembler)),
         fixup_dependents_(arena->Adapter(kArenaAllocAssembler)),
         literals_(arena->Adapter(kArenaAllocAssembler)),
+        literal64_dedupe_map_(std::less<uint64_t>(), arena->Adapter(kArenaAllocAssembler)),
         jump_tables_(arena->Adapter(kArenaAllocAssembler)),
         last_position_adjustment_(0u),
         last_old_position_(0u),
@@ -250,10 +251,15 @@
   void vcmpdz(DRegister dd, Condition cond = AL) OVERRIDE;
   void vmstat(Condition cond = AL) OVERRIDE;  // VMRS APSR_nzcv, FPSCR
 
+  void vcntd(DRegister dd, DRegister dm) OVERRIDE;
+  void vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) OVERRIDE;
+
   void vpushs(SRegister reg, int nregs, Condition cond = AL) OVERRIDE;
   void vpushd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
   void vpops(SRegister reg, int nregs, Condition cond = AL) OVERRIDE;
   void vpopd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
+  void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
+  void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
 
   // Branch instructions.
   void b(Label* label, Condition cond = AL);
@@ -316,6 +322,7 @@
 
   // Load and Store. May clobber IP.
   void LoadImmediate(Register rd, int32_t value, Condition cond = AL) OVERRIDE;
+  void LoadDImmediate(DRegister dd, double value, Condition cond = AL) OVERRIDE;
   void MarkExceptionHandler(Label* label) OVERRIDE;
   void LoadFromOffset(LoadOperandType type,
                       Register reg,
@@ -461,8 +468,8 @@
       // Load long or FP literal variants.
       // VLDR s/dX, label; 32-bit insn, up to 1KiB offset; 4 bytes.
       kLongOrFPLiteral1KiB,
-      // MOV ip, modimm + ADD ip, pc + VLDR s/dX, [IP, #imm8*4]; up to 256KiB offset; 10 bytes.
-      kLongOrFPLiteral256KiB,
+      // MOV ip, imm16 + ADD ip, pc + VLDR s/dX, [IP, #0]; up to 64KiB offset; 10 bytes.
+      kLongOrFPLiteral64KiB,
       // MOV ip, imm16 + MOVT ip, imm16 + ADD ip, pc + VLDR s/dX, [IP]; any offset; 14 bytes.
       kLongOrFPLiteralFar,
     };
@@ -497,7 +504,7 @@
     // Load wide literal.
     static Fixup LoadWideLiteral(uint32_t location, Register rt, Register rt2,
                                  Size size = kLongOrFPLiteral1KiB) {
-      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral64KiB ||
              size == kLongOrFPLiteralFar);
       DCHECK(!IsHighRegister(rt) || (size != kLiteral1KiB && size != kLiteral64KiB));
       return Fixup(rt, rt2, kNoSRegister, kNoDRegister,
@@ -507,7 +514,7 @@
     // Load FP single literal.
     static Fixup LoadSingleLiteral(uint32_t location, SRegister sd,
                                    Size size = kLongOrFPLiteral1KiB) {
-      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral64KiB ||
              size == kLongOrFPLiteralFar);
       return Fixup(kNoRegister, kNoRegister, sd, kNoDRegister,
                    AL, kLoadFPLiteralSingle, size, location);
@@ -516,7 +523,7 @@
     // Load FP double literal.
     static Fixup LoadDoubleLiteral(uint32_t location, DRegister dd,
                                    Size size = kLongOrFPLiteral1KiB) {
-      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral256KiB ||
+      DCHECK(size == kLongOrFPLiteral1KiB || size == kLongOrFPLiteral64KiB ||
              size == kLongOrFPLiteralFar);
       return Fixup(kNoRegister, kNoRegister, kNoSRegister, dd,
                    AL, kLoadFPLiteralDouble, size, location);
@@ -568,6 +575,10 @@
       return location_;
     }
 
+    uint32_t GetTarget() const {
+      return target_;
+    }
+
     uint32_t GetAdjustment() const {
       return adjustment_;
     }
@@ -587,6 +598,11 @@
       target_ = target;
     }
 
+    // Branches with bound targets that are in range can be emitted early.
+    // However, the caller still needs to check if the branch doesn't go over
+    // another Fixup that's not ready to be emitted.
+    bool IsCandidateForEmitEarly() const;
+
     // Check if the current size is OK for current location_, target_ and adjustment_.
     // If not, increase the size. Return the size increase, 0 if unchanged.
     // If the target if after this Fixup, also add the difference to adjustment_,
@@ -745,6 +761,14 @@
                   SRegister sn,
                   SRegister sm);
 
+  void EmitVLdmOrStm(int32_t rest,
+                     uint32_t reg,
+                     int nregs,
+                     Register rn,
+                     bool is_load,
+                     bool dbl,
+                     Condition cond);
+
   void EmitVFPddd(Condition cond,
                   int32_t opcode,
                   DRegister dd,
@@ -867,6 +891,9 @@
   // without invalidating pointers and references to existing elements.
   ArenaDeque<Literal> literals_;
 
+  // Deduplication map for 64-bit literals, used for LoadDImmediate().
+  ArenaSafeMap<uint64_t, Literal*> literal64_dedupe_map_;
+
   // Jump table list.
   ArenaDeque<JumpTable> jump_tables_;
 
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index b5cafcb..d0799d6 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -279,6 +279,148 @@
   DriverStr(expected, "smull");
 }
 
+TEST_F(AssemblerThumb2Test, LoadByteFromThumbOffset) {
+  arm::LoadOperandType type = arm::kLoadUnsignedByte;
+
+  __ LoadFromOffset(type, arm::R0, arm::R7, 0);
+  __ LoadFromOffset(type, arm::R1, arm::R7, 31);
+  __ LoadFromOffset(type, arm::R2, arm::R7, 32);
+  __ LoadFromOffset(type, arm::R3, arm::R7, 4095);
+  __ LoadFromOffset(type, arm::R4, arm::SP, 0);
+
+  const char* expected =
+      "ldrb r0, [r7, #0]\n"
+      "ldrb r1, [r7, #31]\n"
+      "ldrb.w r2, [r7, #32]\n"
+      "ldrb.w r3, [r7, #4095]\n"
+      "ldrb.w r4, [sp, #0]\n";
+  DriverStr(expected, "LoadByteFromThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, StoreByteToThumbOffset) {
+  arm::StoreOperandType type = arm::kStoreByte;
+
+  __ StoreToOffset(type, arm::R0, arm::R7, 0);
+  __ StoreToOffset(type, arm::R1, arm::R7, 31);
+  __ StoreToOffset(type, arm::R2, arm::R7, 32);
+  __ StoreToOffset(type, arm::R3, arm::R7, 4095);
+  __ StoreToOffset(type, arm::R4, arm::SP, 0);
+
+  const char* expected =
+      "strb r0, [r7, #0]\n"
+      "strb r1, [r7, #31]\n"
+      "strb.w r2, [r7, #32]\n"
+      "strb.w r3, [r7, #4095]\n"
+      "strb.w r4, [sp, #0]\n";
+  DriverStr(expected, "StoreByteToThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, LoadHalfFromThumbOffset) {
+  arm::LoadOperandType type = arm::kLoadUnsignedHalfword;
+
+  __ LoadFromOffset(type, arm::R0, arm::R7, 0);
+  __ LoadFromOffset(type, arm::R1, arm::R7, 62);
+  __ LoadFromOffset(type, arm::R2, arm::R7, 64);
+  __ LoadFromOffset(type, arm::R3, arm::R7, 4094);
+  __ LoadFromOffset(type, arm::R4, arm::SP, 0);
+  __ LoadFromOffset(type, arm::R5, arm::R7, 1);  // Unaligned
+
+  const char* expected =
+      "ldrh r0, [r7, #0]\n"
+      "ldrh r1, [r7, #62]\n"
+      "ldrh.w r2, [r7, #64]\n"
+      "ldrh.w r3, [r7, #4094]\n"
+      "ldrh.w r4, [sp, #0]\n"
+      "ldrh.w r5, [r7, #1]\n";
+  DriverStr(expected, "LoadHalfFromThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, StoreHalfToThumbOffset) {
+  arm::StoreOperandType type = arm::kStoreHalfword;
+
+  __ StoreToOffset(type, arm::R0, arm::R7, 0);
+  __ StoreToOffset(type, arm::R1, arm::R7, 62);
+  __ StoreToOffset(type, arm::R2, arm::R7, 64);
+  __ StoreToOffset(type, arm::R3, arm::R7, 4094);
+  __ StoreToOffset(type, arm::R4, arm::SP, 0);
+  __ StoreToOffset(type, arm::R5, arm::R7, 1);  // Unaligned
+
+  const char* expected =
+      "strh r0, [r7, #0]\n"
+      "strh r1, [r7, #62]\n"
+      "strh.w r2, [r7, #64]\n"
+      "strh.w r3, [r7, #4094]\n"
+      "strh.w r4, [sp, #0]\n"
+      "strh.w r5, [r7, #1]\n";
+  DriverStr(expected, "StoreHalfToThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, LoadWordFromSpPlusOffset) {
+  arm::LoadOperandType type = arm::kLoadWord;
+
+  __ LoadFromOffset(type, arm::R0, arm::SP, 0);
+  __ LoadFromOffset(type, arm::R1, arm::SP, 124);
+  __ LoadFromOffset(type, arm::R2, arm::SP, 128);
+  __ LoadFromOffset(type, arm::R3, arm::SP, 1020);
+  __ LoadFromOffset(type, arm::R4, arm::SP, 1024);
+  __ LoadFromOffset(type, arm::R5, arm::SP, 4092);
+  __ LoadFromOffset(type, arm::R6, arm::SP, 1);  // Unaligned
+
+  const char* expected =
+      "ldr r0, [sp, #0]\n"
+      "ldr r1, [sp, #124]\n"
+      "ldr r2, [sp, #128]\n"
+      "ldr r3, [sp, #1020]\n"
+      "ldr.w r4, [sp, #1024]\n"
+      "ldr.w r5, [sp, #4092]\n"
+      "ldr.w r6, [sp, #1]\n";
+  DriverStr(expected, "LoadWordFromSpPlusOffset");
+}
+
+TEST_F(AssemblerThumb2Test, StoreWordToSpPlusOffset) {
+  arm::StoreOperandType type = arm::kStoreWord;
+
+  __ StoreToOffset(type, arm::R0, arm::SP, 0);
+  __ StoreToOffset(type, arm::R1, arm::SP, 124);
+  __ StoreToOffset(type, arm::R2, arm::SP, 128);
+  __ StoreToOffset(type, arm::R3, arm::SP, 1020);
+  __ StoreToOffset(type, arm::R4, arm::SP, 1024);
+  __ StoreToOffset(type, arm::R5, arm::SP, 4092);
+  __ StoreToOffset(type, arm::R6, arm::SP, 1);  // Unaligned
+
+  const char* expected =
+      "str r0, [sp, #0]\n"
+      "str r1, [sp, #124]\n"
+      "str r2, [sp, #128]\n"
+      "str r3, [sp, #1020]\n"
+      "str.w r4, [sp, #1024]\n"
+      "str.w r5, [sp, #4092]\n"
+      "str.w r6, [sp, #1]\n";
+  DriverStr(expected, "StoreWordToSpPlusOffset");
+}
+
+TEST_F(AssemblerThumb2Test, LoadWordFromPcPlusOffset) {
+  arm::LoadOperandType type = arm::kLoadWord;
+
+  __ LoadFromOffset(type, arm::R0, arm::PC, 0);
+  __ LoadFromOffset(type, arm::R1, arm::PC, 124);
+  __ LoadFromOffset(type, arm::R2, arm::PC, 128);
+  __ LoadFromOffset(type, arm::R3, arm::PC, 1020);
+  __ LoadFromOffset(type, arm::R4, arm::PC, 1024);
+  __ LoadFromOffset(type, arm::R5, arm::PC, 4092);
+  __ LoadFromOffset(type, arm::R6, arm::PC, 1);  // Unaligned
+
+  const char* expected =
+      "ldr r0, [pc, #0]\n"
+      "ldr r1, [pc, #124]\n"
+      "ldr r2, [pc, #128]\n"
+      "ldr r3, [pc, #1020]\n"
+      "ldr.w r4, [pc, #1024]\n"
+      "ldr.w r5, [pc, #4092]\n"
+      "ldr.w r6, [pc, #1]\n";
+  DriverStr(expected, "LoadWordFromPcPlusOffset");
+}
+
 TEST_F(AssemblerThumb2Test, StoreWordToThumbOffset) {
   arm::StoreOperandType type = arm::kStoreWord;
   int32_t offset = 4092;
@@ -869,10 +1011,11 @@
   }
 
   std::string expected =
-      "mov.w ip, #((2f - 1f - 4) & ~0x3ff)\n"
+      // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
+      "movw ip, #(0x408 - 0x4 - 4)\n"
       "1:\n"
       "add ip, pc\n"
-      "ldrd r1, r3, [ip, #((2f - 1b - 4) & 0x3ff)]\n" +
+      "ldrd r1, r3, [ip, #0]\n" +
       RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
       ".align 2, 0\n"
       "2:\n"
@@ -884,48 +1027,78 @@
             __ GetAdjustedPosition(label.Position()));
 }
 
-TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax256KiB) {
+TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax64KiB) {
   // The literal size must match but the type doesn't, so use an int32_t rather than float.
   arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678);
   __ LoadLiteral(arm::S3, literal);
   Label label;
   __ Bind(&label);
-  constexpr size_t kLdrR0R0Count = (1 << 17) - 3u;
-  for (size_t i = 0; i != kLdrR0R0Count; ++i) {
-    __ ldr(arm::R0, arm::Address(arm::R0));
-  }
-
-  std::string expected =
-      "mov.w ip, #((2f - 1f - 4) & ~0x3ff)\n"
-      "1:\n"
-      "add ip, pc\n"
-      "vldr s3, [ip, #((2f - 1b - 4) & 0x3ff)]\n" +
-      RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
-      ".align 2, 0\n"
-      "2:\n"
-      ".word 0x12345678\n";
-  DriverStr(expected, "LoadLiteralSingleMax256KiB");
-
-  EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u,
-            __ GetAdjustedPosition(label.Position()));
-}
-
-TEST_F(AssemblerThumb2Test, LoadLiteralDoubleBeyondMax256KiB) {
-  // The literal size must match but the type doesn't, so use an int64_t rather than double.
-  arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321));
-  __ LoadLiteral(arm::D3, literal);
-  Label label;
-  __ Bind(&label);
-  constexpr size_t kLdrR0R0Count = (1 << 17) - 2u;
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 3u;
   for (size_t i = 0; i != kLdrR0R0Count; ++i) {
     __ ldr(arm::R0, arm::Address(arm::R0));
   }
 
   std::string expected =
       // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
-      "movw ip, #(0x40000 & 0xffff)\n"
+      "movw ip, #(0x10004 - 0x4 - 4)\n"
+      "1:\n"
+      "add ip, pc\n"
+      "vldr s3, [ip, #0]\n" +
+      RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
+      ".align 2, 0\n"
+      "2:\n"
+      ".word 0x12345678\n";
+  DriverStr(expected, "LoadLiteralSingleMax64KiB");
+
+  EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u,
+            __ GetAdjustedPosition(label.Position()));
+}
+
+TEST_F(AssemblerThumb2Test, LoadLiteralSingleMax64KiB_UnalignedPC) {
+  // The literal size must match but the type doesn't, so use an int32_t rather than float.
+  arm::Literal* literal = __ NewLiteral<int32_t>(0x12345678);
+  __ ldr(arm::R0, arm::Address(arm::R0));
+  __ LoadLiteral(arm::S3, literal);
+  Label label;
+  __ Bind(&label);
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 4u;
+  for (size_t i = 0; i != kLdrR0R0Count; ++i) {
+    __ ldr(arm::R0, arm::Address(arm::R0));
+  }
+
+  std::string expected =
+      "ldr r0, [r0]\n"
+      // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
+      "movw ip, #(0x10004 - 0x6 - 4)\n"
+      "1:\n"
+      "add ip, pc\n"
+      "vldr s3, [ip, #0]\n" +
+      RepeatInsn(kLdrR0R0Count, "ldr r0, [r0]\n") +
+      ".align 2, 0\n"
+      "2:\n"
+      ".word 0x12345678\n";
+  DriverStr(expected, "LoadLiteralSingleMax64KiB_UnalignedPC");
+
+  EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 6u,
+            __ GetAdjustedPosition(label.Position()));
+}
+
+TEST_F(AssemblerThumb2Test, LoadLiteralDoubleBeyondMax64KiB) {
+  // The literal size must match but the type doesn't, so use an int64_t rather than double.
+  arm::Literal* literal = __ NewLiteral<int64_t>(INT64_C(0x1234567887654321));
+  __ LoadLiteral(arm::D3, literal);
+  Label label;
+  __ Bind(&label);
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 2u;
+  for (size_t i = 0; i != kLdrR0R0Count; ++i) {
+    __ ldr(arm::R0, arm::Address(arm::R0));
+  }
+
+  std::string expected =
+      // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
+      "movw ip, #((0x1000c - 0x8 - 4) & 0xffff)\n"
       // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt.
-      "movt ip, #(0x40000 >> 16)\n"
+      "movt ip, #((0x1000c - 0x8 - 4) >> 16)\n"
       "1:\n"
       "add ip, pc\n"
       "vldr d3, [ip, #0]\n" +
@@ -934,7 +1107,7 @@
       "2:\n"
       ".word 0x87654321\n"
       ".word 0x12345678\n";
-  DriverStr(expected, "LoadLiteralDoubleBeyondMax256KiB");
+  DriverStr(expected, "LoadLiteralDoubleBeyondMax64KiB");
 
   EXPECT_EQ(static_cast<uint32_t>(label.Position()) + 10u,
             __ GetAdjustedPosition(label.Position()));
@@ -946,16 +1119,16 @@
   __ LoadLiteral(arm::D3, literal);
   Label label;
   __ Bind(&label);
-  constexpr size_t kLdrR0R0Count = (1 << 17) - 2u + 0x1234;
+  constexpr size_t kLdrR0R0Count = (1 << 15) - 2u + 0x1234;
   for (size_t i = 0; i != kLdrR0R0Count; ++i) {
     __ ldr(arm::R0, arm::Address(arm::R0));
   }
 
   std::string expected =
       // "as" does not consider ((2f - 1f - 4) & 0xffff) a constant expression for movw.
-      "movw ip, #((0x40000 + 2 * 0x1234) & 0xffff)\n"
+      "movw ip, #((0x1000c + 2 * 0x1234 - 0x8 - 4) & 0xffff)\n"
       // "as" does not consider ((2f - 1f - 4) >> 16) a constant expression for movt.
-      "movt ip, #((0x40000 + 2 * 0x1234) >> 16)\n"
+      "movt ip, #((0x1000c + 2 * 0x1234 - 0x8 - 4) >> 16)\n"
       "1:\n"
       "add ip, pc\n"
       "vldr d3, [ip, #0]\n" +
@@ -1380,4 +1553,104 @@
   DriverStr(expected, "revsh");
 }
 
+TEST_F(AssemblerThumb2Test, vcnt) {
+  // Different D register numbers are used here to test register encoding.
+  // The source register number is encoded as M:Vm and the destination register number as D:Vd.
+  // For source and destination registers in D0..D15, the M and D bits should be 0.
+  // For source and destination registers in D16..D31, the M and D bits should be 1.
+  __ vcntd(arm::D0, arm::D1);
+  __ vcntd(arm::D19, arm::D20);
+  __ vcntd(arm::D0, arm::D9);
+  __ vcntd(arm::D16, arm::D20);
+
+  std::string expected =
+      "vcnt.8 d0, d1\n"
+      "vcnt.8 d19, d20\n"
+      "vcnt.8 d0, d9\n"
+      "vcnt.8 d16, d20\n";
+
+  DriverStr(expected, "vcnt");
+}
+
+TEST_F(AssemblerThumb2Test, vpaddl) {
+  // Different D register numbers are used here to test register encoding.
+  // The source register number is encoded as M:Vm and the destination register number as D:Vd.
+  // For source and destination registers in D0..D15, the M and D bits should be 0.
+  // For source and destination registers in D16..D31, the M and D bits should be 1.
+  // Different data types (signed and unsigned) are also tested.
+  __ vpaddld(arm::D0, arm::D0, 8, true);
+  __ vpaddld(arm::D20, arm::D20, 8, false);
+  __ vpaddld(arm::D0, arm::D20, 16, false);
+  __ vpaddld(arm::D20, arm::D0, 32, true);
+
+  std::string expected =
+      "vpaddl.u8 d0, d0\n"
+      "vpaddl.s8 d20, d20\n"
+      "vpaddl.s16 d0, d20\n"
+      "vpaddl.u32 d20, d0\n";
+
+  DriverStr(expected, "vpaddl");
+}
+
+TEST_F(AssemblerThumb2Test, LoadFromShiftedRegOffset) {
+  arm::Address mem_address(arm::R0, arm::R1, arm::Shift::LSL, 2);
+
+  __ ldrsb(arm::R2, mem_address);
+  __ ldrb(arm::R2, mem_address);
+  __ ldrsh(arm::R2, mem_address);
+  __ ldrh(arm::R2, mem_address);
+  __ ldr(arm::R2, mem_address);
+
+  std::string expected =
+      "ldrsb r2, [r0, r1, LSL #2]\n"
+      "ldrb r2, [r0, r1, LSL #2]\n"
+      "ldrsh r2, [r0, r1, LSL #2]\n"
+      "ldrh r2, [r0, r1, LSL #2]\n"
+      "ldr r2, [r0, r1, LSL #2]\n";
+
+  DriverStr(expected, "LoadFromShiftedRegOffset");
+}
+
+TEST_F(AssemblerThumb2Test, VStmLdmPushPop) {
+  // Different D register numbers are used here to test register encoding.
+  // The source register number is encoded as M:Vm and the destination register number as D:Vd.
+  // For source and destination registers in D0..D15, the M and D bits should be 0.
+  // For source and destination registers in D16..D31, the M and D bits should be 1.
+  // Both S and D register lists of various lengths are tested.
+  __ vstmiad(arm::R0, arm::D0, 4);
+  __ vldmiad(arm::R1, arm::D9, 5);
+  __ vpopd(arm::D0, 4);
+  __ vpushd(arm::D9, 5);
+  __ vpops(arm::S0, 4);
+  __ vpushs(arm::S9, 5);
+  __ vpushs(arm::S16, 5);
+  __ vpushd(arm::D0, 16);
+  __ vpushd(arm::D1, 15);
+  __ vpushd(arm::D8, 16);
+  __ vpushd(arm::D31, 1);
+  __ vpushs(arm::S0, 32);
+  __ vpushs(arm::S1, 31);
+  __ vpushs(arm::S16, 16);
+  __ vpushs(arm::S31, 1);
+
+  std::string expected =
+      "vstmia r0, {d0 - d3}\n"
+      "vldmia r1, {d9 - d13}\n"
+      "vpop {d0 - d3}\n"
+      "vpush {d9 - d13}\n"
+      "vpop {s0 - s3}\n"
+      "vpush {s9 - s13}\n"
+      "vpush {s16 - s20}\n"
+      "vpush {d0 - d15}\n"
+      "vpush {d1 - d15}\n"
+      "vpush {d8 - d23}\n"
+      "vpush {d31}\n"
+      "vpush {s0 - s31}\n"
+      "vpush {s1 - s31}\n"
+      "vpush {s16 - s31}\n"
+      "vpush {s31}\n";
+
+  DriverStr(expected, "VStmLdmPushPop");
+}
+
 }  // namespace art
diff --git a/compiler/utils/arm/managed_register_arm.h b/compiler/utils/arm/managed_register_arm.h
index 5b84058..276db44 100644
--- a/compiler/utils/arm/managed_register_arm.h
+++ b/compiler/utils/arm/managed_register_arm.h
@@ -85,34 +85,34 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class ArmManagedRegister : public ManagedRegister {
  public:
-  Register AsCoreRegister() const {
+  constexpr Register AsCoreRegister() const {
     CHECK(IsCoreRegister());
     return static_cast<Register>(id_);
   }
 
-  SRegister AsSRegister() const {
+  constexpr SRegister AsSRegister() const {
     CHECK(IsSRegister());
     return static_cast<SRegister>(id_ - kNumberOfCoreRegIds);
   }
 
-  DRegister AsDRegister() const {
+  constexpr DRegister AsDRegister() const {
     CHECK(IsDRegister());
     return static_cast<DRegister>(id_ - kNumberOfCoreRegIds - kNumberOfSRegIds);
   }
 
-  SRegister AsOverlappingDRegisterLow() const {
+  constexpr SRegister AsOverlappingDRegisterLow() const {
     CHECK(IsOverlappingDRegister());
     DRegister d_reg = AsDRegister();
     return static_cast<SRegister>(d_reg * 2);
   }
 
-  SRegister AsOverlappingDRegisterHigh() const {
+  constexpr SRegister AsOverlappingDRegisterHigh() const {
     CHECK(IsOverlappingDRegister());
     DRegister d_reg = AsDRegister();
     return static_cast<SRegister>(d_reg * 2 + 1);
   }
 
-  RegisterPair AsRegisterPair() const {
+  constexpr RegisterPair AsRegisterPair() const {
     CHECK(IsRegisterPair());
     Register reg_low = AsRegisterPairLow();
     if (reg_low == R1) {
@@ -122,50 +122,50 @@
     }
   }
 
-  Register AsRegisterPairLow() const {
+  constexpr Register AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
    // Appropriate mapping of register ids allows the use of AllocIdLow().
     return FromRegId(AllocIdLow()).AsCoreRegister();
   }
 
-  Register AsRegisterPairHigh() const {
+  constexpr Register AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
    // Appropriate mapping of register ids allows the use of AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCoreRegister();
   }
 
-  bool IsCoreRegister() const {
+  constexpr bool IsCoreRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfCoreRegIds);
   }
 
-  bool IsSRegister() const {
+  constexpr bool IsSRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - kNumberOfCoreRegIds;
     return (0 <= test) && (test < kNumberOfSRegIds);
   }
 
-  bool IsDRegister() const {
+  constexpr bool IsDRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfCoreRegIds + kNumberOfSRegIds);
     return (0 <= test) && (test < kNumberOfDRegIds);
   }
 
   // Returns true if this DRegister overlaps SRegisters.
-  bool IsOverlappingDRegister() const {
+  constexpr bool IsOverlappingDRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfCoreRegIds + kNumberOfSRegIds);
     return (0 <= test) && (test < kNumberOfOverlappingDRegIds);
   }
 
-  bool IsRegisterPair() const {
+  constexpr bool IsRegisterPair() const {
     CHECK(IsValidManagedRegister());
     const int test =
         id_ - (kNumberOfCoreRegIds + kNumberOfSRegIds + kNumberOfDRegIds);
     return (0 <= test) && (test < kNumberOfPairRegIds);
   }
 
-  bool IsSameType(ArmManagedRegister test) const {
+  constexpr bool IsSameType(ArmManagedRegister test) const {
     CHECK(IsValidManagedRegister() && test.IsValidManagedRegister());
     return
       (IsCoreRegister() && test.IsCoreRegister()) ||
@@ -182,29 +182,29 @@
 
   void Print(std::ostream& os) const;
 
-  static ArmManagedRegister FromCoreRegister(Register r) {
+  static constexpr ArmManagedRegister FromCoreRegister(Register r) {
     CHECK_NE(r, kNoRegister);
     return FromRegId(r);
   }
 
-  static ArmManagedRegister FromSRegister(SRegister r) {
+  static constexpr ArmManagedRegister FromSRegister(SRegister r) {
     CHECK_NE(r, kNoSRegister);
     return FromRegId(r + kNumberOfCoreRegIds);
   }
 
-  static ArmManagedRegister FromDRegister(DRegister r) {
+  static constexpr ArmManagedRegister FromDRegister(DRegister r) {
     CHECK_NE(r, kNoDRegister);
     return FromRegId(r + (kNumberOfCoreRegIds + kNumberOfSRegIds));
   }
 
-  static ArmManagedRegister FromRegisterPair(RegisterPair r) {
+  static constexpr ArmManagedRegister FromRegisterPair(RegisterPair r) {
     CHECK_NE(r, kNoRegisterPair);
     return FromRegId(r + (kNumberOfCoreRegIds +
                           kNumberOfSRegIds + kNumberOfDRegIds));
   }
 
   // Return a RegisterPair consisting of Register r_low and r_low + 1.
-  static ArmManagedRegister FromCoreRegisterPair(Register r_low) {
+  static constexpr ArmManagedRegister FromCoreRegisterPair(Register r_low) {
     if (r_low != R1) {  // not the dalvik special case
       CHECK_NE(r_low, kNoRegister);
       CHECK_EQ(0, (r_low % 2));
@@ -217,7 +217,7 @@
   }
 
   // Return a DRegister overlapping SRegister r_low and r_low + 1.
-  static ArmManagedRegister FromSRegisterPair(SRegister r_low) {
+  static constexpr ArmManagedRegister FromSRegisterPair(SRegister r_low) {
     CHECK_NE(r_low, kNoSRegister);
     CHECK_EQ(0, (r_low % 2));
     const int r = r_low / 2;
@@ -226,7 +226,7 @@
   }
 
  private:
-  bool IsValidManagedRegister() const {
+  constexpr bool IsValidManagedRegister() const {
     return (0 <= id_) && (id_ < kNumberOfRegIds);
   }
 
@@ -251,9 +251,9 @@
 
   friend class ManagedRegister;
 
-  explicit ArmManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
+  explicit constexpr ArmManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
 
-  static ArmManagedRegister FromRegId(int reg_id) {
+  static constexpr ArmManagedRegister FromRegId(int reg_id) {
     ArmManagedRegister reg(reg_id);
     CHECK(reg.IsValidManagedRegister());
     return reg;
@@ -264,7 +264,7 @@
 
 }  // namespace arm
 
-inline arm::ArmManagedRegister ManagedRegister::AsArm() const {
+constexpr inline arm::ArmManagedRegister ManagedRegister::AsArm() const {
   arm::ArmManagedRegister reg(id_);
   CHECK(reg.IsNoRegister() || reg.IsValidManagedRegister());
   return reg;
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index eb5112b..54ed62b 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -648,6 +648,15 @@
 void Arm64Assembler::SpillRegisters(vixl::CPURegList registers, int offset) {
   int size = registers.RegisterSizeInBytes();
   const Register sp = vixl_masm_->StackPointer();
+  // Since we are operating on register pairs, we would like the offset to be aligned to
+  // twice the register size. However, if the number of registers is even, peeling one off
+  // would leave an extra single store at the end, so we realign only for an odd count.
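+  // E.g. spilling three registers at offset == size: one STR first moves the offset to
+  // 2 * size, so the following STP is 2 * size aligned.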
+  if (!IsAlignedParam(offset, 2 * size) && registers.Count() % 2 != 0) {
+    const CPURegister& dst0 = registers.PopLowestIndex();
+    ___ Str(dst0, MemOperand(sp, offset));
+    cfi_.RelOffset(DWARFReg(dst0), offset);
+    offset += size;
+  }
   while (registers.Count() >= 2) {
     const CPURegister& dst0 = registers.PopLowestIndex();
     const CPURegister& dst1 = registers.PopLowestIndex();
@@ -667,6 +676,13 @@
 void Arm64Assembler::UnspillRegisters(vixl::CPURegList registers, int offset) {
   int size = registers.RegisterSizeInBytes();
   const Register sp = vixl_masm_->StackPointer();
+  // Be consistent with the logic for spilling registers.
+  if (!IsAlignedParam(offset, 2 * size) && registers.Count() % 2 != 0) {
+    const CPURegister& dst0 = registers.PopLowestIndex();
+    ___ Ldr(dst0, MemOperand(sp, offset));
+    cfi_.Restore(DWARFReg(dst0));
+    offset += size;
+  }
   while (registers.Count() >= 2) {
     const CPURegister& dst0 = registers.PopLowestIndex();
     const CPURegister& dst1 = registers.PopLowestIndex();
@@ -683,8 +699,9 @@
   DCHECK(registers.IsEmpty());
 }
 
-void Arm64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                                const std::vector<ManagedRegister>& callee_save_regs,
+void Arm64Assembler::BuildFrame(size_t frame_size,
+                                ManagedRegister method_reg,
+                                ArrayRef<const ManagedRegister> callee_save_regs,
                                 const ManagedRegisterEntrySpills& entry_spills) {
   // Setup VIXL CPURegList for callee-saves.
   CPURegList core_reg_list(CPURegister::kRegister, kXRegSize, 0);
@@ -741,7 +758,7 @@
 }
 
 void Arm64Assembler::RemoveFrame(size_t frame_size,
-                                 const std::vector<ManagedRegister>& callee_save_regs) {
+                                 ArrayRef<const ManagedRegister> callee_save_regs) {
   // Setup VIXL CPURegList for callee-saves.
   CPURegList core_reg_list(CPURegister::kRegister, kXRegSize, 0);
   CPURegList fp_reg_list(CPURegister::kFPRegister, kDRegSize, 0);
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index c4e5de7..91171a8 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -109,12 +109,13 @@
   void UnspillRegisters(vixl::CPURegList registers, int offset);
 
   // Emit code that will create an activation on the stack.
-  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                  const std::vector<ManagedRegister>& callee_save_regs,
+  void BuildFrame(size_t frame_size,
+                  ManagedRegister method_reg,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack.
-  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+  void RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> callee_save_regs)
       OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
diff --git a/compiler/utils/arm64/managed_register_arm64.h b/compiler/utils/arm64/managed_register_arm64.h
index 46be1c5..f7d74d2 100644
--- a/compiler/utils/arm64/managed_register_arm64.h
+++ b/compiler/utils/arm64/managed_register_arm64.h
@@ -56,80 +56,80 @@
 
 class Arm64ManagedRegister : public ManagedRegister {
  public:
-  XRegister AsXRegister() const {
+  constexpr XRegister AsXRegister() const {
     CHECK(IsXRegister());
     return static_cast<XRegister>(id_);
   }
 
-  WRegister AsWRegister() const {
+  constexpr WRegister AsWRegister() const {
     CHECK(IsWRegister());
     return static_cast<WRegister>(id_ - kNumberOfXRegIds);
   }
 
-  DRegister AsDRegister() const {
+  constexpr DRegister AsDRegister() const {
     CHECK(IsDRegister());
     return static_cast<DRegister>(id_ - kNumberOfXRegIds - kNumberOfWRegIds);
   }
 
-  SRegister AsSRegister() const {
+  constexpr SRegister AsSRegister() const {
     CHECK(IsSRegister());
     return static_cast<SRegister>(id_ - kNumberOfXRegIds - kNumberOfWRegIds -
                                   kNumberOfDRegIds);
   }
 
-  WRegister AsOverlappingWRegister() const {
+  constexpr WRegister AsOverlappingWRegister() const {
     CHECK(IsValidManagedRegister());
     if (IsZeroRegister()) return WZR;
     return static_cast<WRegister>(AsXRegister());
   }
 
-  XRegister AsOverlappingXRegister() const {
+  constexpr XRegister AsOverlappingXRegister() const {
     CHECK(IsValidManagedRegister());
     return static_cast<XRegister>(AsWRegister());
   }
 
-  SRegister AsOverlappingSRegister() const {
+  constexpr SRegister AsOverlappingSRegister() const {
     CHECK(IsValidManagedRegister());
     return static_cast<SRegister>(AsDRegister());
   }
 
-  DRegister AsOverlappingDRegister() const {
+  constexpr DRegister AsOverlappingDRegister() const {
     CHECK(IsValidManagedRegister());
     return static_cast<DRegister>(AsSRegister());
   }
 
-  bool IsXRegister() const {
+  constexpr bool IsXRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfXRegIds);
   }
 
-  bool IsWRegister() const {
+  constexpr bool IsWRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - kNumberOfXRegIds;
     return (0 <= test) && (test < kNumberOfWRegIds);
   }
 
-  bool IsDRegister() const {
+  constexpr bool IsDRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfXRegIds + kNumberOfWRegIds);
     return (0 <= test) && (test < kNumberOfDRegIds);
   }
 
-  bool IsSRegister() const {
+  constexpr bool IsSRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfXRegIds + kNumberOfWRegIds + kNumberOfDRegIds);
     return (0 <= test) && (test < kNumberOfSRegIds);
   }
 
-  bool IsGPRegister() const {
+  constexpr bool IsGPRegister() const {
     return IsXRegister() || IsWRegister();
   }
 
-  bool IsFPRegister() const {
+  constexpr bool IsFPRegister() const {
     return IsDRegister() || IsSRegister();
   }
 
-  bool IsSameType(Arm64ManagedRegister test) const {
+  constexpr bool IsSameType(Arm64ManagedRegister test) const {
     CHECK(IsValidManagedRegister() && test.IsValidManagedRegister());
     return
       (IsXRegister() && test.IsXRegister()) ||
@@ -145,53 +145,53 @@
 
   void Print(std::ostream& os) const;
 
-  static Arm64ManagedRegister FromXRegister(XRegister r) {
+  static constexpr Arm64ManagedRegister FromXRegister(XRegister r) {
     CHECK_NE(r, kNoRegister);
     return FromRegId(r);
   }
 
-  static Arm64ManagedRegister FromWRegister(WRegister r) {
+  static constexpr Arm64ManagedRegister FromWRegister(WRegister r) {
     CHECK_NE(r, kNoWRegister);
     return FromRegId(r + kNumberOfXRegIds);
   }
 
-  static Arm64ManagedRegister FromDRegister(DRegister r) {
+  static constexpr Arm64ManagedRegister FromDRegister(DRegister r) {
     CHECK_NE(r, kNoDRegister);
     return FromRegId(r + (kNumberOfXRegIds + kNumberOfWRegIds));
   }
 
-  static Arm64ManagedRegister FromSRegister(SRegister r) {
+  static constexpr Arm64ManagedRegister FromSRegister(SRegister r) {
     CHECK_NE(r, kNoSRegister);
     return FromRegId(r + (kNumberOfXRegIds + kNumberOfWRegIds +
                           kNumberOfDRegIds));
   }
 
   // Returns the X register overlapping W register r.
-  static Arm64ManagedRegister FromWRegisterX(WRegister r) {
+  static constexpr Arm64ManagedRegister FromWRegisterX(WRegister r) {
     CHECK_NE(r, kNoWRegister);
     return FromRegId(r);
   }
 
   // Return the D register overlapping S register r.
-  static Arm64ManagedRegister FromSRegisterD(SRegister r) {
+  static constexpr Arm64ManagedRegister FromSRegisterD(SRegister r) {
     CHECK_NE(r, kNoSRegister);
     return FromRegId(r + (kNumberOfXRegIds + kNumberOfWRegIds));
   }
 
  private:
-  bool IsValidManagedRegister() const {
+  constexpr bool IsValidManagedRegister() const {
     return (0 <= id_) && (id_ < kNumberOfRegIds);
   }
 
-  bool IsStackPointer() const {
+  constexpr bool IsStackPointer() const {
     return IsXRegister() && (id_ == SP);
   }
 
-  bool IsZeroRegister() const {
+  constexpr bool IsZeroRegister() const {
     return IsXRegister() && (id_ == XZR);
   }
 
-  int RegId() const {
+  constexpr int RegId() const {
     CHECK(!IsNoRegister());
     return id_;
   }
@@ -202,9 +202,9 @@
 
   friend class ManagedRegister;
 
-  explicit Arm64ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
+  explicit constexpr Arm64ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
 
-  static Arm64ManagedRegister FromRegId(int reg_id) {
+  static constexpr Arm64ManagedRegister FromRegId(int reg_id) {
     Arm64ManagedRegister reg(reg_id);
     CHECK(reg.IsValidManagedRegister());
     return reg;
@@ -215,7 +215,7 @@
 
 }  // namespace arm64
 
-inline arm64::Arm64ManagedRegister ManagedRegister::AsArm64() const {
+constexpr inline arm64::Arm64ManagedRegister ManagedRegister::AsArm64() const {
   arm64::Arm64ManagedRegister reg(id_);
   CHECK(reg.IsNoRegister() || reg.IsValidManagedRegister());
   return reg;
diff --git a/compiler/utils/array_ref.h b/compiler/utils/array_ref.h
index 5c33639..8dc9ab4 100644
--- a/compiler/utils/array_ref.h
+++ b/compiler/utils/array_ref.h
@@ -39,9 +39,6 @@
  */
 template <typename T>
 class ArrayRef {
- private:
-  struct tag { };
-
  public:
   typedef T value_type;
   typedef T& reference;
@@ -63,14 +60,14 @@
 
   template <size_t size>
   explicit constexpr ArrayRef(T (&array)[size])
-    : array_(array), size_(size) {
+      : array_(array), size_(size) {
   }
 
-  template <typename U, size_t size>
-  explicit constexpr ArrayRef(U (&array)[size],
-                              typename std::enable_if<std::is_same<T, const U>::value, tag>::type
-                                  t ATTRIBUTE_UNUSED = tag())
-    : array_(array), size_(size) {
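+  // SFINAE: this overload participates only when T is const U, i.e. it constructs an
+  // ArrayRef<const U> from a non-const U[size].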
+  template <typename U,
+            size_t size,
+            typename = typename std::enable_if<std::is_same<T, const U>::value>::type>
+  explicit constexpr ArrayRef(U (&array)[size])
+      : array_(array), size_(size) {
   }
 
   constexpr ArrayRef(T* array_in, size_t size_in)
@@ -165,13 +162,21 @@
   value_type* data() { return array_; }
   const value_type* data() const { return array_; }
 
-  ArrayRef SubArray(size_type pos) const {
-    return SubArray(pos, size_ - pos);
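+  // The const overloads return ArrayRef<const T> so that constness propagates to the
+  // referenced elements.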
+  ArrayRef SubArray(size_type pos) {
+    return SubArray(pos, size() - pos);
   }
-  ArrayRef SubArray(size_type pos, size_type length) const {
+  ArrayRef<const T> SubArray(size_type pos) const {
+    return SubArray(pos, size() - pos);
+  }
+  ArrayRef SubArray(size_type pos, size_type length) {
     DCHECK_LE(pos, size());
     DCHECK_LE(length, size() - pos);
-    return ArrayRef(array_ + pos, length);
+    return ArrayRef(data() + pos, length);
+  }
+  ArrayRef<const T> SubArray(size_type pos, size_type length) const {
+    DCHECK_LE(pos, size());
+    DCHECK_LE(length, size() - pos);
+    return ArrayRef<const T>(data() + pos, length);
   }
 
  private:
diff --git a/compiler/utils/assembler.h b/compiler/utils/assembler.h
index 96da03d..e64f643 100644
--- a/compiler/utils/assembler.h
+++ b/compiler/utils/assembler.h
@@ -32,6 +32,7 @@
 #include "memory_region.h"
 #include "mips/constants_mips.h"
 #include "offsets.h"
+#include "utils/array_ref.h"
 #include "x86/constants_x86.h"
 #include "x86_64/constants_x86_64.h"
 
@@ -373,13 +374,14 @@
   virtual void Comment(const char* format ATTRIBUTE_UNUSED, ...) {}
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
+  virtual void BuildFrame(size_t frame_size,
+                          ManagedRegister method_reg,
+                          ArrayRef<const ManagedRegister> callee_save_regs,
                           const ManagedRegisterEntrySpills& entry_spills) = 0;
 
   // Emit code that will remove an activation from the stack
   virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs) = 0;
+                           ArrayRef<const ManagedRegister> callee_save_regs) = 0;
 
   virtual void IncreaseFrameSize(size_t adjust) = 0;
   virtual void DecreaseFrameSize(size_t adjust) = 0;
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index 893daff..46adb3f 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -47,40 +47,40 @@
   // ManagedRegister is a value class. There exists no method to change the
   // internal state. We therefore allow a copy constructor and an
   // assignment-operator.
-  ManagedRegister(const ManagedRegister& other) : id_(other.id_) { }
+  constexpr ManagedRegister(const ManagedRegister& other) : id_(other.id_) { }
 
   ManagedRegister& operator=(const ManagedRegister& other) {
     id_ = other.id_;
     return *this;
   }
 
-  arm::ArmManagedRegister AsArm() const;
-  arm64::Arm64ManagedRegister AsArm64() const;
-  mips::MipsManagedRegister AsMips() const;
-  mips64::Mips64ManagedRegister AsMips64() const;
-  x86::X86ManagedRegister AsX86() const;
-  x86_64::X86_64ManagedRegister AsX86_64() const;
+  constexpr arm::ArmManagedRegister AsArm() const;
+  constexpr arm64::Arm64ManagedRegister AsArm64() const;
+  constexpr mips::MipsManagedRegister AsMips() const;
+  constexpr mips64::Mips64ManagedRegister AsMips64() const;
+  constexpr x86::X86ManagedRegister AsX86() const;
+  constexpr x86_64::X86_64ManagedRegister AsX86_64() const;
 
   // It is valid to invoke Equals on and with a NoRegister.
-  bool Equals(const ManagedRegister& other) const {
+  constexpr bool Equals(const ManagedRegister& other) const {
     return id_ == other.id_;
   }
 
-  bool IsNoRegister() const {
+  constexpr bool IsNoRegister() const {
     return id_ == kNoRegister;
   }
 
-  static ManagedRegister NoRegister() {
+  static constexpr ManagedRegister NoRegister() {
     return ManagedRegister();
   }
 
-  int RegId() const { return id_; }
-  explicit ManagedRegister(int reg_id) : id_(reg_id) { }
+  constexpr int RegId() const { return id_; }
+  explicit constexpr ManagedRegister(int reg_id) : id_(reg_id) { }
 
  protected:
   static const int kNoRegister = -1;
 
-  ManagedRegister() : id_(kNoRegister) { }
+  constexpr ManagedRegister() : id_(kNoRegister) { }
 
   int id_;
 };
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index a1798c0..9368301 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -2438,8 +2438,9 @@
 
 constexpr size_t kFramePointerSize = 4;
 
-void MipsAssembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                               const std::vector<ManagedRegister>& callee_save_regs,
+void MipsAssembler::BuildFrame(size_t frame_size,
+                               ManagedRegister method_reg,
+                               ArrayRef<const ManagedRegister> callee_save_regs,
                                const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   DCHECK(!overwriting_);
@@ -2453,7 +2454,7 @@
   cfi_.RelOffset(DWARFReg(RA), stack_offset);
   for (int i = callee_save_regs.size() - 1; i >= 0; --i) {
     stack_offset -= kFramePointerSize;
-    Register reg = callee_save_regs.at(i).AsMips().AsCoreRegister();
+    Register reg = callee_save_regs[i].AsMips().AsCoreRegister();
     StoreToOffset(kStoreWord, reg, SP, stack_offset);
     cfi_.RelOffset(DWARFReg(reg), stack_offset);
   }
@@ -2482,7 +2483,7 @@
 }
 
 void MipsAssembler::RemoveFrame(size_t frame_size,
-                                const std::vector<ManagedRegister>& callee_save_regs) {
+                                ArrayRef<const ManagedRegister> callee_save_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   DCHECK(!overwriting_);
   cfi_.RememberState();
@@ -2490,7 +2491,7 @@
   // Pop callee saves and return address.
   int stack_offset = frame_size - (callee_save_regs.size() * kFramePointerSize) - kFramePointerSize;
   for (size_t i = 0; i < callee_save_regs.size(); ++i) {
-    Register reg = callee_save_regs.at(i).AsMips().AsCoreRegister();
+    Register reg = callee_save_regs[i].AsMips().AsCoreRegister();
     LoadFromOffset(kLoadWord, reg, SP, stack_offset);
     cfi_.Restore(DWARFReg(reg));
     stack_offset += kFramePointerSize;
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index ecb67bd..d5e6285 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -414,11 +414,11 @@
   // Emit code that will create an activation on the stack.
   void BuildFrame(size_t frame_size,
                   ManagedRegister method_reg,
-                  const std::vector<ManagedRegister>& callee_save_regs,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack.
-  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+  void RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> callee_save_regs)
       OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
diff --git a/compiler/utils/mips/managed_register_mips.h b/compiler/utils/mips/managed_register_mips.h
index 5e7ed11..66204e7 100644
--- a/compiler/utils/mips/managed_register_mips.h
+++ b/compiler/utils/mips/managed_register_mips.h
@@ -87,70 +87,70 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class MipsManagedRegister : public ManagedRegister {
  public:
-  Register AsCoreRegister() const {
+  constexpr Register AsCoreRegister() const {
     CHECK(IsCoreRegister());
     return static_cast<Register>(id_);
   }
 
-  FRegister AsFRegister() const {
+  constexpr FRegister AsFRegister() const {
     CHECK(IsFRegister());
     return static_cast<FRegister>(id_ - kNumberOfCoreRegIds);
   }
 
-  DRegister AsDRegister() const {
+  constexpr DRegister AsDRegister() const {
     CHECK(IsDRegister());
     return static_cast<DRegister>(id_ - kNumberOfCoreRegIds - kNumberOfFRegIds);
   }
 
-  FRegister AsOverlappingDRegisterLow() const {
+  constexpr FRegister AsOverlappingDRegisterLow() const {
     CHECK(IsOverlappingDRegister());
     DRegister d_reg = AsDRegister();
     return static_cast<FRegister>(d_reg * 2);
   }
 
-  FRegister AsOverlappingDRegisterHigh() const {
+  constexpr FRegister AsOverlappingDRegisterHigh() const {
     CHECK(IsOverlappingDRegister());
     DRegister d_reg = AsDRegister();
     return static_cast<FRegister>(d_reg * 2 + 1);
   }
 
-  Register AsRegisterPairLow() const {
+  constexpr Register AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdLow().
     return FromRegId(AllocIdLow()).AsCoreRegister();
   }
 
-  Register AsRegisterPairHigh() const {
+  constexpr Register AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCoreRegister();
   }
 
-  bool IsCoreRegister() const {
+  constexpr bool IsCoreRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfCoreRegIds);
   }
 
-  bool IsFRegister() const {
+  constexpr bool IsFRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - kNumberOfCoreRegIds;
     return (0 <= test) && (test < kNumberOfFRegIds);
   }
 
-  bool IsDRegister() const {
+  constexpr bool IsDRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfCoreRegIds + kNumberOfFRegIds);
     return (0 <= test) && (test < kNumberOfDRegIds);
   }
 
   // Returns true if this DRegister overlaps FRegisters.
-  bool IsOverlappingDRegister() const {
+  constexpr bool IsOverlappingDRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfCoreRegIds + kNumberOfFRegIds);
     return (0 <= test) && (test < kNumberOfOverlappingDRegIds);
   }
 
-  bool IsRegisterPair() const {
+  constexpr bool IsRegisterPair() const {
     CHECK(IsValidManagedRegister());
     const int test =
         id_ - (kNumberOfCoreRegIds + kNumberOfFRegIds + kNumberOfDRegIds);
@@ -164,32 +164,32 @@
   // then false is returned.
   bool Overlaps(const MipsManagedRegister& other) const;
 
-  static MipsManagedRegister FromCoreRegister(Register r) {
+  static constexpr MipsManagedRegister FromCoreRegister(Register r) {
     CHECK_NE(r, kNoRegister);
     return FromRegId(r);
   }
 
-  static MipsManagedRegister FromFRegister(FRegister r) {
+  static constexpr MipsManagedRegister FromFRegister(FRegister r) {
     CHECK_NE(r, kNoFRegister);
     return FromRegId(r + kNumberOfCoreRegIds);
   }
 
-  static MipsManagedRegister FromDRegister(DRegister r) {
+  static constexpr MipsManagedRegister FromDRegister(DRegister r) {
     CHECK_NE(r, kNoDRegister);
     return FromRegId(r + kNumberOfCoreRegIds + kNumberOfFRegIds);
   }
 
-  static MipsManagedRegister FromRegisterPair(RegisterPair r) {
+  static constexpr MipsManagedRegister FromRegisterPair(RegisterPair r) {
     CHECK_NE(r, kNoRegisterPair);
     return FromRegId(r + (kNumberOfCoreRegIds + kNumberOfFRegIds + kNumberOfDRegIds));
   }
 
  private:
-  bool IsValidManagedRegister() const {
+  constexpr bool IsValidManagedRegister() const {
     return (0 <= id_) && (id_ < kNumberOfRegIds);
   }
 
-  int RegId() const {
+  constexpr int RegId() const {
     CHECK(!IsNoRegister());
     return id_;
   }
@@ -205,9 +205,9 @@
 
   friend class ManagedRegister;
 
-  explicit MipsManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
+  explicit constexpr MipsManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
 
-  static MipsManagedRegister FromRegId(int reg_id) {
+  static constexpr MipsManagedRegister FromRegId(int reg_id) {
     MipsManagedRegister reg(reg_id);
     CHECK(reg.IsValidManagedRegister());
     return reg;
@@ -218,7 +218,7 @@
 
 }  // namespace mips
 
-inline mips::MipsManagedRegister ManagedRegister::AsMips() const {
+constexpr inline mips::MipsManagedRegister ManagedRegister::AsMips() const {
   mips::MipsManagedRegister reg(id_);
   CHECK(reg.IsNoRegister() || reg.IsValidManagedRegister());
   return reg;
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index ab480ca..447ede5 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -1977,8 +1977,9 @@
 
 constexpr size_t kFramePointerSize = 8;
 
-void Mips64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                                 const std::vector<ManagedRegister>& callee_save_regs,
+void Mips64Assembler::BuildFrame(size_t frame_size,
+                                 ManagedRegister method_reg,
+                                 ArrayRef<const ManagedRegister> callee_save_regs,
                                  const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   DCHECK(!overwriting_);
@@ -1992,7 +1993,7 @@
   cfi_.RelOffset(DWARFReg(RA), stack_offset);
   for (int i = callee_save_regs.size() - 1; i >= 0; --i) {
     stack_offset -= kFramePointerSize;
-    GpuRegister reg = callee_save_regs.at(i).AsMips64().AsGpuRegister();
+    GpuRegister reg = callee_save_regs[i].AsMips64().AsGpuRegister();
     StoreToOffset(kStoreDoubleword, reg, SP, stack_offset);
     cfi_.RelOffset(DWARFReg(reg), stack_offset);
   }
@@ -2003,7 +2004,7 @@
   // Write out entry spills.
   int32_t offset = frame_size + kFramePointerSize;
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    Mips64ManagedRegister reg = entry_spills.at(i).AsMips64();
+    Mips64ManagedRegister reg = entry_spills[i].AsMips64();
     ManagedRegisterSpill spill = entry_spills.at(i);
     int32_t size = spill.getSize();
     if (reg.IsNoRegister()) {
@@ -2022,7 +2023,7 @@
 }
 
 void Mips64Assembler::RemoveFrame(size_t frame_size,
-                                  const std::vector<ManagedRegister>& callee_save_regs) {
+                                  ArrayRef<const ManagedRegister> callee_save_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   DCHECK(!overwriting_);
   cfi_.RememberState();
@@ -2030,7 +2031,7 @@
   // Pop callee saves and return address
   int stack_offset = frame_size - (callee_save_regs.size() * kFramePointerSize) - kFramePointerSize;
   for (size_t i = 0; i < callee_save_regs.size(); ++i) {
-    GpuRegister reg = callee_save_regs.at(i).AsMips64().AsGpuRegister();
+    GpuRegister reg = callee_save_regs[i].AsMips64().AsGpuRegister();
     LoadFromOffset(kLoadDoubleword, reg, SP, stack_offset);
     cfi_.Restore(DWARFReg(reg));
     stack_offset += kFramePointerSize;
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 8acc38a..0cd0708 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -365,13 +365,13 @@
   //
 
   // Emit code that will create an activation on the stack.
-  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                  const std::vector<ManagedRegister>& callee_save_regs,
+  void BuildFrame(size_t frame_size,
+                  ManagedRegister method_reg,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack.
-  void RemoveFrame(size_t frame_size,
-                   const std::vector<ManagedRegister>& callee_save_regs) OVERRIDE;
+  void RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> callee_save_regs) OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
   void DecreaseFrameSize(size_t adjust) OVERRIDE;
diff --git a/compiler/utils/mips64/managed_register_mips64.h b/compiler/utils/mips64/managed_register_mips64.h
index 1d36128..c9f9556 100644
--- a/compiler/utils/mips64/managed_register_mips64.h
+++ b/compiler/utils/mips64/managed_register_mips64.h
@@ -39,22 +39,22 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class Mips64ManagedRegister : public ManagedRegister {
  public:
-  GpuRegister AsGpuRegister() const {
+  constexpr GpuRegister AsGpuRegister() const {
     CHECK(IsGpuRegister());
     return static_cast<GpuRegister>(id_);
   }
 
-  FpuRegister AsFpuRegister() const {
+  constexpr FpuRegister AsFpuRegister() const {
     CHECK(IsFpuRegister());
     return static_cast<FpuRegister>(id_ - kNumberOfGpuRegIds);
   }
 
-  bool IsGpuRegister() const {
+  constexpr bool IsGpuRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfGpuRegIds);
   }
 
-  bool IsFpuRegister() const {
+  constexpr bool IsFpuRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - kNumberOfGpuRegIds;
     return (0 <= test) && (test < kNumberOfFpuRegIds);
@@ -67,22 +67,22 @@
   // then false is returned.
   bool Overlaps(const Mips64ManagedRegister& other) const;
 
-  static Mips64ManagedRegister FromGpuRegister(GpuRegister r) {
+  static constexpr Mips64ManagedRegister FromGpuRegister(GpuRegister r) {
     CHECK_NE(r, kNoGpuRegister);
     return FromRegId(r);
   }
 
-  static Mips64ManagedRegister FromFpuRegister(FpuRegister r) {
+  static constexpr Mips64ManagedRegister FromFpuRegister(FpuRegister r) {
     CHECK_NE(r, kNoFpuRegister);
     return FromRegId(r + kNumberOfGpuRegIds);
   }
 
  private:
-  bool IsValidManagedRegister() const {
+  constexpr bool IsValidManagedRegister() const {
     return (0 <= id_) && (id_ < kNumberOfRegIds);
   }
 
-  int RegId() const {
+  constexpr int RegId() const {
     CHECK(!IsNoRegister());
     return id_;
   }
@@ -98,9 +98,9 @@
 
   friend class ManagedRegister;
 
-  explicit Mips64ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
+  explicit constexpr Mips64ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
 
-  static Mips64ManagedRegister FromRegId(int reg_id) {
+  static constexpr Mips64ManagedRegister FromRegId(int reg_id) {
     Mips64ManagedRegister reg(reg_id);
     CHECK(reg.IsValidManagedRegister());
     return reg;
@@ -111,7 +111,7 @@
 
 }  // namespace mips64
 
-inline mips64::Mips64ManagedRegister ManagedRegister::AsMips64() const {
+constexpr inline mips64::Mips64ManagedRegister ManagedRegister::AsMips64() const {
   mips64::Mips64ManagedRegister reg(id_);
   CHECK(reg.IsNoRegister() || reg.IsValidManagedRegister());
   return reg;
diff --git a/compiler/utils/transform_array_ref.h b/compiler/utils/transform_array_ref.h
new file mode 100644
index 0000000..6297b88
--- /dev/null
+++ b/compiler/utils/transform_array_ref.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_TRANSFORM_ARRAY_REF_H_
+#define ART_COMPILER_UTILS_TRANSFORM_ARRAY_REF_H_
+
+#include <type_traits>
+
+#include "utils/array_ref.h"
+#include "utils/transform_iterator.h"
+
+namespace art {
+
+/**
+ * @brief An ArrayRef<> wrapper that uses a transformation function for element access.
+ */
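+//
+// A minimal usage sketch (hypothetical values):
+//
+//   std::vector<int> packed({ 1, 2, 3 });
+//   auto doubled = MakeTransformArrayRef(packed, [](int v) { return 2 * v; });
+//   int first = doubled[0];    // Applies the lambda on access; yields 2.
+//   size_t n = doubled.size();  // Same size as the underlying container.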
+template <typename BaseType, typename Function>
+class TransformArrayRef {
+ private:
+  using Iter = TransformIterator<typename ArrayRef<BaseType>::iterator, Function>;
+
+  // The Function may take a non-const reference, so const_iterator may not exist.
+  using FallbackConstIter = std::iterator<std::random_access_iterator_tag, void, void, void, void>;
+  using PreferredConstIter =
+      TransformIterator<typename ArrayRef<BaseType>::const_iterator, Function>;
+  template <typename F, typename = typename std::result_of<F(const BaseType&)>::type>
+  static PreferredConstIter ConstIterHelper(int&);
+  template <typename F>
+  static FallbackConstIter ConstIterHelper(const int&);
+
+  using ConstIter = decltype(ConstIterHelper<Function>(*reinterpret_cast<int*>(0)));
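+  // The int& / const int& parameters are an overload-ranking trick: the
+  // lvalue argument in the decltype above prefers the int& overload, so
+  // PreferredConstIter is selected whenever Function is invocable with a
+  // const BaseType& (SFINAE on std::result_of), and overload resolution
+  // falls back to FallbackConstIter otherwise.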
+
+ public:
+  using value_type = typename Iter::value_type;
+  using reference = typename Iter::reference;
+  using const_reference = typename ConstIter::reference;
+  using pointer = typename Iter::pointer;
+  using const_pointer = typename ConstIter::pointer;
+  using iterator = Iter;
+  using const_iterator = typename std::conditional<
+      std::is_same<ConstIter, FallbackConstIter>::value,
+      void,
+      ConstIter>::type;
+  using reverse_iterator = std::reverse_iterator<Iter>;
+  using const_reverse_iterator = typename std::conditional<
+      std::is_same<ConstIter, FallbackConstIter>::value,
+      void,
+      std::reverse_iterator<ConstIter>>::type;
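+  // When Function lacks a const-reference overload, ConstIter degrades to
+  // FallbackConstIter: const_iterator and const_reverse_iterator become void
+  // (as do const_pointer and const_reference), so const traversal of such a
+  // TransformArrayRef simply does not compile.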
+  using difference_type = typename ArrayRef<BaseType>::difference_type;
+  using size_type = typename ArrayRef<BaseType>::size_type;
+
+  // Constructors.
+
+  TransformArrayRef(const TransformArrayRef& other) = default;
+
+  template <typename OtherBT>
+  TransformArrayRef(const ArrayRef<OtherBT>& base, Function fn)
+      : data_(base, fn) { }
+
+  // Assignment operators.
+
+  TransformArrayRef& operator=(const TransformArrayRef& other) = default;
+
+  template <typename OtherBT,
+            typename = typename std::enable_if<std::is_same<BaseType, const OtherBT>::value>::type>
+  TransformArrayRef& operator=(const TransformArrayRef<OtherBT, Function>& other) {
+    return *this = TransformArrayRef(other.base(), other.GetFunction());
+  }
+
+  // Destructor.
+  ~TransformArrayRef() = default;
+
+  // Iterators.
+  iterator begin() { return MakeIterator(base().begin()); }
+  const_iterator begin() const { return MakeIterator(base().cbegin()); }
+  const_iterator cbegin() const { return MakeIterator(base().cbegin()); }
+  iterator end() { return MakeIterator(base().end()); }
+  const_iterator end() const { return MakeIterator(base().cend()); }
+  const_iterator cend() const { return MakeIterator(base().cend()); }
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+  const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); }
+  const_reverse_iterator crbegin() const { return const_reverse_iterator(cend()); }
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+  const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
+  const_reverse_iterator crend() const { return const_reverse_iterator(cbegin()); }
+
+  // Size.
+  size_type size() const { return base().size(); }
+  bool empty() const { return base().empty(); }
+
+  // Element access. NOTE: Not providing data().
+
+  reference operator[](size_type n) { return GetFunction()(base()[n]); }
+  const_reference operator[](size_type n) const { return GetFunction()(base()[n]); }
+
+  reference front() { return GetFunction()(base().front()); }
+  const_reference front() const { return GetFunction()(base().front()); }
+
+  reference back() { return GetFunction()(base().back()); }
+  const_reference back() const { return GetFunction()(base().back()); }
+
+  TransformArrayRef SubArray(size_type pos) {
+    return TransformArrayRef(base().subarray(pos), GetFunction());
+  }
+  TransformArrayRef SubArray(size_type pos) const {
+    return TransformArrayRef(base().subarray(pos), GetFunction());
+  }
+  TransformArrayRef SubArray(size_type pos, size_type length) const {
+    return TransformArrayRef(base().subarray(pos, length), GetFunction());
+  }
+
+  // Retrieve the base ArrayRef<>.
+  ArrayRef<BaseType> base() {
+    return data_.base_;
+  }
+  ArrayRef<const BaseType> base() const {
+    return ArrayRef<const BaseType>(data_.base_);
+  }
+
+ private:
+  // Allow EBO for state-less Function.
+  struct Data : Function {
+   public:
+    Data(ArrayRef<BaseType> base, Function fn) : Function(fn), base_(base) { }
+
+    ArrayRef<BaseType> base_;
+  };
+
+  const Function& GetFunction() const {
+    return static_cast<const Function&>(data_);
+  }
+
+  template <typename BaseIterator>
+  auto MakeIterator(BaseIterator base) const {
+    return MakeTransformIterator(base, GetFunction());
+  }
+
+  // The converting assignment operator above accesses GetFunction() of a
+  // different instantiation, so all instantiations are mutual friends.
+  template <typename OtherBT, typename OtherFunction>
+  friend class TransformArrayRef;
+
+  Data data_;
+};
+
+template <typename BaseType, typename Function>
+bool operator==(const TransformArrayRef<BaseType, Function>& lhs,
+                const TransformArrayRef<BaseType, Function>& rhs) {
+  return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
+}
+
+template <typename BaseType, typename Function>
+bool operator!=(const TransformArrayRef<BaseType, Function>& lhs,
+                const TransformArrayRef<BaseType, Function>& rhs) {
+  return !(lhs == rhs);
+}
+
+template <typename ValueType, typename Function>
+TransformArrayRef<ValueType, Function> MakeTransformArrayRef(
+    ArrayRef<ValueType> container, Function f) {
+  return TransformArrayRef<ValueType, Function>(container, f);
+}
+
+template <typename Container, typename Function>
+TransformArrayRef<typename Container::value_type, Function> MakeTransformArrayRef(
+    Container& container, Function f) {
+  return TransformArrayRef<typename Container::value_type, Function>(
+      ArrayRef<typename Container::value_type>(container.data(), container.size()), f);
+}
+
+template <typename Container, typename Function>
+TransformArrayRef<const typename Container::value_type, Function> MakeTransformArrayRef(
+    const Container& container, Function f) {
+  return TransformArrayRef<const typename Container::value_type, Function>(
+      ArrayRef<const typename Container::value_type>(container.data(), container.size()), f);
+}
+
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_TRANSFORM_ARRAY_REF_H_
diff --git a/compiler/utils/transform_array_ref_test.cc b/compiler/utils/transform_array_ref_test.cc
new file mode 100644
index 0000000..2593fad
--- /dev/null
+++ b/compiler/utils/transform_array_ref_test.cc
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "utils/transform_array_ref.h"
+
+namespace art {
+
+namespace {  // anonymous namespace
+
+struct ValueHolder {
+  // Deliberately not explicit.
+  ValueHolder(int v) : value(v) { }  // NOLINT
+  int value;
+};
+
+ATTRIBUTE_UNUSED bool operator==(const ValueHolder& lhs, const ValueHolder& rhs) {
+  return lhs.value == rhs.value;
+}
+
+}  // anonymous namespace
+
+TEST(TransformArrayRef, ConstRefAdd1) {
+  auto add1 = [](const ValueHolder& h) { return h.value + 1; };  // NOLINT [readability/braces]
+  std::vector<ValueHolder> input({ 7, 6, 4, 0 });
+  std::vector<int> output;
+
+  auto taref = MakeTransformArrayRef(input, add1);
+  using TarefIter = decltype(taref)::iterator;
+  using ConstTarefIter = decltype(taref)::const_iterator;
+  static_assert(std::is_same<int, decltype(taref)::value_type>::value, "value_type");
+  static_assert(std::is_same<TarefIter, decltype(taref)::pointer>::value, "pointer");
+  static_assert(std::is_same<int, decltype(taref)::reference>::value, "reference");
+  static_assert(std::is_same<ConstTarefIter, decltype(taref)::const_pointer>::value,
+                "const_pointer");
+  static_assert(std::is_same<int, decltype(taref)::const_reference>::value, "const_reference");
+
+  std::copy(taref.begin(), taref.end(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 7, 5, 1 }), output);
+  output.clear();
+
+  std::copy(taref.cbegin(), taref.cend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 7, 5, 1 }), output);
+  output.clear();
+
+  std::copy(taref.rbegin(), taref.rend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 5, 7, 8 }), output);
+  output.clear();
+
+  std::copy(taref.crbegin(), taref.crend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 5, 7, 8 }), output);
+  output.clear();
+
+  ASSERT_EQ(input.size(), taref.size());
+  ASSERT_EQ(input.empty(), taref.empty());
+  ASSERT_EQ(input.front().value + 1, taref.front());
+  ASSERT_EQ(input.back().value + 1, taref.back());
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value + 1, taref[i]);
+  }
+}
+
+TEST(TransformArrayRef, NonConstRefSub1) {
+  auto sub1 = [](ValueHolder& h) { return h.value - 1; };  // NOLINT [readability/braces]
+  std::vector<ValueHolder> input({ 4, 4, 5, 7, 10 });
+  std::vector<int> output;
+
+  auto taref = MakeTransformArrayRef(input, sub1);
+  using TarefIter = decltype(taref)::iterator;
+  static_assert(std::is_same<void, decltype(taref)::const_iterator>::value, "const_iterator");
+  static_assert(std::is_same<int, decltype(taref)::value_type>::value, "value_type");
+  static_assert(std::is_same<TarefIter, decltype(taref)::pointer>::value, "pointer");
+  static_assert(std::is_same<int, decltype(taref)::reference>::value, "reference");
+  static_assert(std::is_same<void, decltype(taref)::const_pointer>::value, "const_pointer");
+  static_assert(std::is_same<void, decltype(taref)::const_reference>::value, "const_reference");
+
+  std::copy(taref.begin(), taref.end(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 3, 3, 4, 6, 9 }), output);
+  output.clear();
+
+  std::copy(taref.rbegin(), taref.rend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 9, 6, 4, 3, 3 }), output);
+  output.clear();
+
+  ASSERT_EQ(input.size(), taref.size());
+  ASSERT_EQ(input.empty(), taref.empty());
+  ASSERT_EQ(input.front().value - 1, taref.front());
+  ASSERT_EQ(input.back().value - 1, taref.back());
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value - 1, taref[i]);
+  }
+}
+
+TEST(TransformArrayRef, ConstAndNonConstRef) {
+  struct Ref {
+    int& operator()(ValueHolder& h) const { return h.value; }
+    const int& operator()(const ValueHolder& h) const { return h.value; }
+  };
+  Ref ref;
+  std::vector<ValueHolder> input({ 1, 0, 1, 0, 3, 1 });
+  std::vector<int> output;
+
+  auto taref = MakeTransformArrayRef(input, ref);
+  static_assert(std::is_same<int, decltype(taref)::value_type>::value, "value_type");
+  static_assert(std::is_same<int*, decltype(taref)::pointer>::value, "pointer");
+  static_assert(std::is_same<int&, decltype(taref)::reference>::value, "reference");
+  static_assert(std::is_same<const int*, decltype(taref)::const_pointer>::value, "const_pointer");
+  static_assert(std::is_same<const int&, decltype(taref)::const_reference>::value,
+                "const_reference");
+
+  std::copy(taref.begin(), taref.end(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 0, 1, 0, 3, 1 }), output);
+  output.clear();
+
+  std::copy(taref.cbegin(), taref.cend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 0, 1, 0, 3, 1 }), output);
+  output.clear();
+
+  std::copy(taref.rbegin(), taref.rend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 3, 0, 1, 0, 1 }), output);
+  output.clear();
+
+  std::copy(taref.crbegin(), taref.crend(), std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 3, 0, 1, 0, 1 }), output);
+  output.clear();
+
+  ASSERT_EQ(input.size(), taref.size());
+  ASSERT_EQ(input.empty(), taref.empty());
+  ASSERT_EQ(input.front().value, taref.front());
+  ASSERT_EQ(input.back().value, taref.back());
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value, taref[i]);
+  }
+
+  // Test writing through the transform iterator.
+  std::vector<int> transform_input({ 24, 37, 11, 71 });
+  std::vector<ValueHolder> transformed(transform_input.size(), 0);
+  taref = MakeTransformArrayRef(transformed, ref);
+  for (size_t i = 0; i != transform_input.size(); ++i) {
+    taref[i] = transform_input[i];
+  }
+  ASSERT_EQ(std::vector<ValueHolder>({ 24, 37, 11, 71 }), transformed);
+}
+
+}  // namespace art
diff --git a/compiler/utils/transform_iterator.h b/compiler/utils/transform_iterator.h
new file mode 100644
index 0000000..f0769d4
--- /dev/null
+++ b/compiler/utils/transform_iterator.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_TRANSFORM_ITERATOR_H_
+#define ART_COMPILER_UTILS_TRANSFORM_ITERATOR_H_
+
+#include <iterator>
+#include <type_traits>
+
+#include "base/iteration_range.h"
+
+namespace art {
+
+// The transform iterator transforms values from the base iterator with a given
+// transformation function. It can serve as a replacement for std::transform(), i.e.
+//    std::copy(MakeTransformIterator(begin, f), MakeTransformIterator(end, f), out)
+// is equivalent to
+//    std::transform(begin, end, out, f)
+// If the function returns an l-value reference or a wrapper that supports assignment,
+// the TransformIterator can be used also as an output iterator, i.e.
+//    std::copy(begin, end, MakeTransformIterator(out, f))
+// is equivalent to
+//    for (auto it = begin; it != end; ++it) {
+//      f(*out++) = *it;
+//    }
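+//
+// A minimal usage sketch (hypothetical values):
+//
+//   std::vector<int> nums({ 1, 2, 3 });
+//   auto doubled = MakeTransformRange(nums, [](int v) { return 2 * v; });
+//   for (int v : doubled) { /* visits 2, 4, 6 */ }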
+template <typename BaseIterator, typename Function>
+class TransformIterator {
+ private:
+  static_assert(std::is_base_of<
+                    std::input_iterator_tag,
+                    typename std::iterator_traits<BaseIterator>::iterator_category>::value,
+                "Transform iterator base must be an input iterator.");
+
+  using InputType =
+      typename std::conditional<
+          std::is_same<void, typename std::iterator_traits<BaseIterator>::reference>::value,
+          typename std::iterator_traits<BaseIterator>::value_type,
+          typename std::iterator_traits<BaseIterator>::reference>::type;
+  using ResultType = typename std::result_of<Function(InputType)>::type;
+
+ public:
+  using iterator_category = typename std::iterator_traits<BaseIterator>::iterator_category;
+  using value_type =
+      typename std::remove_const<typename std::remove_reference<ResultType>::type>::type;
+  using difference_type = typename std::iterator_traits<BaseIterator>::difference_type;
+  using pointer = typename std::conditional<
+      std::is_reference<ResultType>::value,
+      typename std::add_pointer<typename std::remove_reference<ResultType>::type>::type,
+      TransformIterator>::type;
+  using reference = ResultType;
+
+  TransformIterator(BaseIterator base, Function fn)
+      : data_(base, fn) { }
+
+  template <typename OtherBI>
+  TransformIterator(const TransformIterator<OtherBI, Function>& other)
+      : data_(other.base(), other.GetFunction()) {
+  }
+
+  TransformIterator& operator++() {
+    ++data_.base_;
+    return *this;
+  }
+
+  TransformIterator operator++(int) {
+    TransformIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+
+  TransformIterator& operator--() {
+    static_assert(
+        std::is_base_of<std::bidirectional_iterator_tag,
+                        typename std::iterator_traits<BaseIterator>::iterator_category>::value,
+        "BaseIterator must be bidirectional iterator to use operator--()");
+    --data_.base_;
+    return *this;
+  }
+
+  TransformIterator operator--(int) {
+    TransformIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  reference operator*() const {
+    return GetFunction()(*base());
+  }
+
+  reference operator[](difference_type n) const {
+    static_assert(
+        std::is_base_of<std::random_access_iterator_tag,
+                        typename std::iterator_traits<BaseIterator>::iterator_category>::value,
+        "BaseIterator must be random access iterator to use operator[]");
+    return GetFunction()(base()[n]);
+  }
+
+  TransformIterator operator+(difference_type n) const {
+    static_assert(
+        std::is_base_of<std::random_access_iterator_tag,
+                        typename std::iterator_traits<BaseIterator>::iterator_category>::value,
+        "BaseIterator must be random access iterator to use operator+");
+    return TransformIterator(base() + n, GetFunction());
+  }
+
+  TransformIterator operator-(difference_type n) const {
+    static_assert(
+        std::is_base_of<std::random_access_iterator_tag,
+                        typename std::iterator_traits<BaseIterator>::iterator_category>::value,
+        "BaseIterator must be random access iterator to use operator-");
+    return TransformIterator(base() - n, GetFunction());
+  }
+
+  difference_type operator-(const TransformIterator& other) const {
+    static_assert(
+        std::is_base_of<std::random_access_iterator_tag,
+                        typename std::iterator_traits<BaseIterator>::iterator_category>::value,
+        "BaseIterator must be random access iterator to use operator-");
+    return base() - other.base();
+  }
+
+  // Retrieve the base iterator.
+  BaseIterator base() const {
+    return data_.base_;
+  }
+
+  // Retrieve the transformation function.
+  const Function& GetFunction() const {
+    return static_cast<const Function&>(data_);
+  }
+
+ private:
+  // Allow EBO for state-less Function.
+  struct Data : Function {
+   public:
+    Data(BaseIterator base, Function fn) : Function(fn), base_(base) { }
+
+    BaseIterator base_;
+  };
+
+  Data data_;
+};
+
+template <typename BaseIterator1, typename BaseIterator2, typename Function>
+bool operator==(const TransformIterator<BaseIterator1, Function>& lhs,
+                const TransformIterator<BaseIterator2, Function>& rhs) {
+  return lhs.base() == rhs.base();
+}
+
+template <typename BaseIterator1, typename BaseIterator2, typename Function>
+bool operator!=(const TransformIterator<BaseIterator1, Function>& lhs,
+                const TransformIterator<BaseIterator2, Function>& rhs) {
+  return !(lhs == rhs);
+}
+
+template <typename BaseIterator, typename Function>
+TransformIterator<BaseIterator, Function> MakeTransformIterator(BaseIterator base, Function f) {
+  return TransformIterator<BaseIterator, Function>(base, f);
+}
+
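+// (The Make* helper functions exist because class template argument
+// deduction is not available in this C++ dialect; they let callers omit the
+// iterator and functor types.)
+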
+template <typename BaseRange, typename Function>
+auto MakeTransformRange(BaseRange& range, Function f) {
+  return MakeIterationRange(MakeTransformIterator(range.begin(), f),
+                            MakeTransformIterator(range.end(), f));
+}
+
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_TRANSFORM_ITERATOR_H_
diff --git a/compiler/utils/transform_iterator_test.cc b/compiler/utils/transform_iterator_test.cc
new file mode 100644
index 0000000..dbb4779
--- /dev/null
+++ b/compiler/utils/transform_iterator_test.cc
@@ -0,0 +1,533 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <forward_list>
+#include <list>
+#include <type_traits>
+#include <vector>
+
+#include <array>
+
+#include "gtest/gtest.h"
+
+#include "utils/transform_iterator.h"
+
+namespace art {
+
+namespace {  // anonymous namespace
+
+struct ValueHolder {
+  // Deliberately not explicit.
+  ValueHolder(int v) : value(v) { }  // NOLINT
+  int value;
+};
+
+bool operator==(const ValueHolder& lhs, const ValueHolder& rhs) {
+  return lhs.value == rhs.value;
+}
+
+}  // anonymous namespace
+
+TEST(TransformIterator, VectorAdd1) {
+  auto add1 = [](const ValueHolder& h) { return h.value + 1; };  // NOLINT [readability/braces]
+  std::vector<ValueHolder> input({ 1, 7, 3, 8 });
+  std::vector<int> output;
+
+  using vector_titer = decltype(MakeTransformIterator(input.begin(), add1));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_titer::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_titer::value_type>::value, "value_type");
+  static_assert(std::is_same<vector_titer, vector_titer::pointer>::value, "pointer");
+  static_assert(std::is_same<int, vector_titer::reference>::value, "reference");
+
+  using vector_ctiter = decltype(MakeTransformIterator(input.cbegin(), add1));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_ctiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_ctiter::value_type>::value, "value_type");
+  static_assert(std::is_same<vector_ctiter, vector_ctiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, vector_ctiter::reference>::value, "reference");
+
+  using vector_rtiter = decltype(MakeTransformIterator(input.rbegin(), add1));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_rtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_rtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<vector_rtiter, vector_rtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, vector_rtiter::reference>::value, "reference");
+
+  using vector_crtiter = decltype(MakeTransformIterator(input.crbegin(), add1));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_crtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_crtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<vector_crtiter, vector_crtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, vector_crtiter::reference>::value, "reference");
+
+  std::copy(MakeTransformIterator(input.begin(), add1),
+            MakeTransformIterator(input.end(), add1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 2, 8, 4, 9 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.cbegin(), add1),
+            MakeTransformIterator(input.cend(), add1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 2, 8, 4, 9 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.rbegin(), add1),
+            MakeTransformIterator(input.rend(), add1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 9, 4, 8, 2 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.crbegin(), add1),
+            MakeTransformIterator(input.crend(), add1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 9, 4, 8, 2 }), output);
+  output.clear();
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.begin(), add1)[i]);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.cbegin(), add1)[i]);
+    ptrdiff_t index_from_rbegin = static_cast<ptrdiff_t>(input.size() - i - 1u);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.rbegin(), add1)[index_from_rbegin]);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.crbegin(), add1)[index_from_rbegin]);
+    ptrdiff_t index_from_end = -static_cast<ptrdiff_t>(input.size() - i);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.end(), add1)[index_from_end]);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.cend(), add1)[index_from_end]);
+    ptrdiff_t index_from_rend = -1 - static_cast<ptrdiff_t>(i);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.rend(), add1)[index_from_rend]);
+    ASSERT_EQ(input[i].value + 1, MakeTransformIterator(input.crend(), add1)[index_from_rend]);
+
+    ASSERT_EQ(MakeTransformIterator(input.begin(), add1) + i,
+              MakeTransformIterator(input.begin() + i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.cbegin(), add1) + i,
+              MakeTransformIterator(input.cbegin() + i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.rbegin(), add1) + i,
+              MakeTransformIterator(input.rbegin() + i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.crbegin(), add1) + i,
+              MakeTransformIterator(input.crbegin() + i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.end(), add1) - i,
+              MakeTransformIterator(input.end() - i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.cend(), add1) - i,
+              MakeTransformIterator(input.cend() - i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.rend(), add1) - i,
+              MakeTransformIterator(input.rend() - i, add1));
+    ASSERT_EQ(MakeTransformIterator(input.crend(), add1) - i,
+              MakeTransformIterator(input.crend() - i, add1));
+  }
+  ASSERT_EQ(input.end(),
+            (MakeTransformIterator(input.begin(), add1) + input.size()).base());
+  ASSERT_EQ(MakeTransformIterator(input.end(), add1) - MakeTransformIterator(input.begin(), add1),
+            static_cast<ptrdiff_t>(input.size()));
+
+  // Test iterator->const_iterator conversion and comparison.
+  auto it = MakeTransformIterator(input.begin(), add1);
+  decltype(MakeTransformIterator(input.cbegin(), add1)) cit = it;
+  static_assert(!std::is_same<decltype(it), decltype(cit)>::value, "Types must be different");
+  ASSERT_EQ(it, cit);
+  auto rit = MakeTransformIterator(input.rbegin(), add1);
+  decltype(MakeTransformIterator(input.crbegin(), add1)) crit(rit);
+  static_assert(!std::is_same<decltype(rit), decltype(crit)>::value, "Types must be different");
+  ASSERT_EQ(rit, crit);
+}
+
+TEST(TransformIterator, ListSub1) {
+  auto sub1 = [](const ValueHolder& h) { return h.value - 1; };  // NOLINT [readability/braces]
+  std::list<ValueHolder> input({ 2, 3, 5, 7, 11 });
+  std::vector<int> output;
+
+  using list_titer = decltype(MakeTransformIterator(input.begin(), sub1));
+  static_assert(std::is_same<std::bidirectional_iterator_tag,
+                             list_titer::iterator_category>::value, "category");
+  static_assert(std::is_same<int, list_titer::value_type>::value, "value_type");
+  static_assert(std::is_same<list_titer, list_titer::pointer>::value, "pointer");
+  static_assert(std::is_same<int, list_titer::reference>::value, "reference");
+
+  using list_ctiter = decltype(MakeTransformIterator(input.cbegin(), sub1));
+  static_assert(std::is_same<std::bidirectional_iterator_tag,
+                             list_ctiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, list_ctiter::value_type>::value, "value_type");
+  static_assert(std::is_same<list_ctiter, list_ctiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, list_ctiter::reference>::value, "reference");
+
+  using list_rtiter = decltype(MakeTransformIterator(input.rbegin(), sub1));
+  static_assert(std::is_same<std::bidirectional_iterator_tag,
+                             list_rtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, list_rtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<list_rtiter, list_rtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, list_rtiter::reference>::value, "reference");
+
+  using list_crtiter = decltype(MakeTransformIterator(input.crbegin(), sub1));
+  static_assert(std::is_same<std::bidirectional_iterator_tag,
+                             list_crtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, list_crtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<list_crtiter, list_crtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, list_crtiter::reference>::value, "reference");
+
+  std::copy(MakeTransformIterator(input.begin(), sub1),
+            MakeTransformIterator(input.end(), sub1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 2, 4, 6, 10 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.cbegin(), sub1),
+            MakeTransformIterator(input.cend(), sub1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 1, 2, 4, 6, 10 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.rbegin(), sub1),
+            MakeTransformIterator(input.rend(), sub1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 10, 6, 4, 2, 1 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.crbegin(), sub1),
+            MakeTransformIterator(input.crend(), sub1),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 10, 6, 4, 2, 1 }), output);
+  output.clear();
+
+  // Test iterator->const_iterator conversion and comparison.
+  auto it = MakeTransformIterator(input.begin(), sub1);
+  decltype(MakeTransformIterator(input.cbegin(), sub1)) cit = it;
+  static_assert(!std::is_same<decltype(it), decltype(cit)>::value, "Types must be different");
+  ASSERT_EQ(it, cit);
+}
+
+TEST(TransformIterator, ForwardListMul3) {
+  auto mul3 = [](const ValueHolder& h) { return h.value * 3; };  // NOLINT [readability/braces]
+  std::forward_list<ValueHolder> input({ 1, 1, 2, 3, 5, 8 });
+  std::vector<int> output;
+
+  using flist_titer = decltype(MakeTransformIterator(input.begin(), mul3));
+  static_assert(std::is_same<std::forward_iterator_tag,
+                             flist_titer::iterator_category>::value, "category");
+  static_assert(std::is_same<int, flist_titer::value_type>::value, "value_type");
+  static_assert(std::is_same<flist_titer, flist_titer::pointer>::value, "pointer");
+  static_assert(std::is_same<int, flist_titer::reference>::value, "reference");
+
+  using flist_ctiter = decltype(MakeTransformIterator(input.cbegin(), mul3));
+  static_assert(std::is_same<std::forward_iterator_tag,
+                             flist_ctiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, flist_ctiter::value_type>::value, "value_type");
+  static_assert(std::is_same<flist_ctiter, flist_ctiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int, flist_ctiter::reference>::value, "reference");
+
+  std::copy(MakeTransformIterator(input.begin(), mul3),
+            MakeTransformIterator(input.end(), mul3),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 3, 3, 6, 9, 15, 24 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.cbegin(), mul3),
+            MakeTransformIterator(input.cend(), mul3),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 3, 3, 6, 9, 15, 24 }), output);
+  output.clear();
+
+  // Test iterator->const_iterator conversion and comparison.
+  auto it = MakeTransformIterator(input.begin(), mul3);
+  decltype(MakeTransformIterator(input.cbegin(), mul3)) cit = it;
+  static_assert(!std::is_same<decltype(it), decltype(cit)>::value, "Types must be different");
+  ASSERT_EQ(it, cit);
+}
+
+TEST(TransformIterator, VectorConstReference) {
+  auto ref = [](const ValueHolder& h) -> const int& { return h.value; };  // NOLINT [readability/braces]
+  std::vector<ValueHolder> input({ 7, 3, 1, 2, 4, 8 });
+  std::vector<int> output;
+
+  using vector_titer = decltype(MakeTransformIterator(input.begin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_titer::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_titer::value_type>::value, "value_type");
+  static_assert(std::is_same<const int*, vector_titer::pointer>::value, "pointer");
+  static_assert(std::is_same<const int&, vector_titer::reference>::value, "reference");
+
+  using vector_ctiter = decltype(MakeTransformIterator(input.cbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_ctiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_ctiter::value_type>::value, "value_type");
+  static_assert(std::is_same<const int*, vector_ctiter::pointer>::value, "pointer");
+  static_assert(std::is_same<const int&, vector_ctiter::reference>::value, "reference");
+
+  using vector_rtiter = decltype(MakeTransformIterator(input.rbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_rtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_rtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<const int*, vector_rtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<const int&, vector_rtiter::reference>::value, "reference");
+
+  using vector_crtiter = decltype(MakeTransformIterator(input.crbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_crtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_crtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<const int*, vector_crtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<const int&, vector_crtiter::reference>::value, "reference");
+
+  std::copy(MakeTransformIterator(input.begin(), ref),
+            MakeTransformIterator(input.end(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 7, 3, 1, 2, 4, 8 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.cbegin(), ref),
+            MakeTransformIterator(input.cend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 7, 3, 1, 2, 4, 8 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.rbegin(), ref),
+            MakeTransformIterator(input.rend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 4, 2, 1, 3, 7 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.crbegin(), ref),
+            MakeTransformIterator(input.crend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 4, 2, 1, 3, 7 }), output);
+  output.clear();
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.begin(), ref)[i]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.cbegin(), ref)[i]);
+    ptrdiff_t index_from_rbegin = static_cast<ptrdiff_t>(input.size() - i - 1u);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.rbegin(), ref)[index_from_rbegin]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.crbegin(), ref)[index_from_rbegin]);
+    ptrdiff_t index_from_end = -static_cast<ptrdiff_t>(input.size() - i);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.end(), ref)[index_from_end]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.cend(), ref)[index_from_end]);
+    ptrdiff_t index_from_rend = -1 - static_cast<ptrdiff_t>(i);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.rend(), ref)[index_from_rend]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.crend(), ref)[index_from_rend]);
+
+    ASSERT_EQ(MakeTransformIterator(input.begin(), ref) + i,
+              MakeTransformIterator(input.begin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.cbegin(), ref) + i,
+              MakeTransformIterator(input.cbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.rbegin(), ref) + i,
+              MakeTransformIterator(input.rbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.crbegin(), ref) + i,
+              MakeTransformIterator(input.crbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.end(), ref) - i,
+              MakeTransformIterator(input.end() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.cend(), ref) - i,
+              MakeTransformIterator(input.cend() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.rend(), ref) - i,
+              MakeTransformIterator(input.rend() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.crend(), ref) - i,
+              MakeTransformIterator(input.crend() - i, ref));
+  }
+  ASSERT_EQ(input.end(),
+            (MakeTransformIterator(input.begin(), ref) + input.size()).base());
+  ASSERT_EQ(MakeTransformIterator(input.end(), ref) - MakeTransformIterator(input.begin(), ref),
+            static_cast<ptrdiff_t>(input.size()));
+}
+
+TEST(TransformIterator, VectorNonConstReference) {
+  auto ref = [](ValueHolder& h) -> int& { return h.value; };  // NOLINT [readability/braces]
+  std::vector<ValueHolder> input({ 7, 3, 1, 2, 4, 8 });
+  std::vector<int> output;
+
+  using vector_titer = decltype(MakeTransformIterator(input.begin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_titer::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_titer::value_type>::value, "value_type");
+  static_assert(std::is_same<int*, vector_titer::pointer>::value, "pointer");
+  static_assert(std::is_same<int&, vector_titer::reference>::value, "reference");
+
+  using vector_rtiter = decltype(MakeTransformIterator(input.rbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_rtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_rtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<int*, vector_rtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int&, vector_rtiter::reference>::value, "reference");
+
+  std::copy(MakeTransformIterator(input.begin(), ref),
+            MakeTransformIterator(input.end(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 7, 3, 1, 2, 4, 8 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.rbegin(), ref),
+            MakeTransformIterator(input.rend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 4, 2, 1, 3, 7 }), output);
+  output.clear();
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.begin(), ref)[i]);
+    ptrdiff_t index_from_rbegin = static_cast<ptrdiff_t>(input.size() - i - 1u);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.rbegin(), ref)[index_from_rbegin]);
+    ptrdiff_t index_from_end = -static_cast<ptrdiff_t>(input.size() - i);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.end(), ref)[index_from_end]);
+    ptrdiff_t index_from_rend = -1 - static_cast<ptrdiff_t>(i);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.rend(), ref)[index_from_rend]);
+
+    ASSERT_EQ(MakeTransformIterator(input.begin(), ref) + i,
+              MakeTransformIterator(input.begin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.rbegin(), ref) + i,
+              MakeTransformIterator(input.rbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.end(), ref) - i,
+              MakeTransformIterator(input.end() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.rend(), ref) - i,
+              MakeTransformIterator(input.rend() - i, ref));
+  }
+  ASSERT_EQ(input.end(),
+            (MakeTransformIterator(input.begin(), ref) + input.size()).base());
+  ASSERT_EQ(MakeTransformIterator(input.end(), ref) - MakeTransformIterator(input.begin(), ref),
+            static_cast<ptrdiff_t>(input.size()));
+
+  // Test writing through the transform iterator.
+  std::list<int> transform_input({ 1, -1, 2, -2, 3, -3 });
+  std::vector<ValueHolder> transformed(transform_input.size(), 0);
+  std::transform(transform_input.begin(),
+                 transform_input.end(),
+                 MakeTransformIterator(transformed.begin(), ref),
+                 [](int v) { return -2 * v; });
+  ASSERT_EQ(std::vector<ValueHolder>({ -2, 2, -4, 4, -6, 6 }), transformed);
+}
+
+TEST(TransformIterator, VectorConstAndNonConstReference) {
+  struct Ref {
+    int& operator()(ValueHolder& h) const { return h.value; }
+    const int& operator()(const ValueHolder& h) const { return h.value; }
+  };
+  Ref ref;
+  std::vector<ValueHolder> input({ 7, 3, 1, 2, 4, 8 });
+  std::vector<int> output;
+
+  using vector_titer = decltype(MakeTransformIterator(input.begin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_titer::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_titer::value_type>::value, "value_type");
+  static_assert(std::is_same<int*, vector_titer::pointer>::value, "pointer");
+  static_assert(std::is_same<int&, vector_titer::reference>::value, "reference");
+
+  using vector_ctiter = decltype(MakeTransformIterator(input.cbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_ctiter::iterator_category>::value, "category");
+  // static_assert(std::is_same<int, vector_ctiter::value_type>::value, "value_type");
+  static_assert(std::is_same<const int*, vector_ctiter::pointer>::value, "pointer");
+  static_assert(std::is_same<const int&, vector_ctiter::reference>::value, "reference");
+
+  using vector_rtiter = decltype(MakeTransformIterator(input.rbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_rtiter::iterator_category>::value, "category");
+  static_assert(std::is_same<int, vector_rtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<int*, vector_rtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<int&, vector_rtiter::reference>::value, "reference");
+
+  using vector_crtiter = decltype(MakeTransformIterator(input.crbegin(), ref));
+  static_assert(std::is_same<std::random_access_iterator_tag,
+                             vector_crtiter::iterator_category>::value, "category");
+  // static_assert(std::is_same<int, vector_crtiter::value_type>::value, "value_type");
+  static_assert(std::is_same<const int*, vector_crtiter::pointer>::value, "pointer");
+  static_assert(std::is_same<const int&, vector_crtiter::reference>::value, "reference");
+
+  std::copy(MakeTransformIterator(input.begin(), ref),
+            MakeTransformIterator(input.end(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 7, 3, 1, 2, 4, 8 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.cbegin(), ref),
+            MakeTransformIterator(input.cend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 7, 3, 1, 2, 4, 8 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.rbegin(), ref),
+            MakeTransformIterator(input.rend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 4, 2, 1, 3, 7 }), output);
+  output.clear();
+
+  std::copy(MakeTransformIterator(input.crbegin(), ref),
+            MakeTransformIterator(input.crend(), ref),
+            std::back_inserter(output));
+  ASSERT_EQ(std::vector<int>({ 8, 4, 2, 1, 3, 7 }), output);
+  output.clear();
+
+  for (size_t i = 0; i != input.size(); ++i) {
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.begin(), ref)[i]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.cbegin(), ref)[i]);
+    ptrdiff_t index_from_rbegin = static_cast<ptrdiff_t>(input.size() - i - 1u);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.rbegin(), ref)[index_from_rbegin]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.crbegin(), ref)[index_from_rbegin]);
+    ptrdiff_t index_from_end = -static_cast<ptrdiff_t>(input.size() - i);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.end(), ref)[index_from_end]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.cend(), ref)[index_from_end]);
+    ptrdiff_t index_from_rend = -1 - static_cast<ptrdiff_t>(i);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.rend(), ref)[index_from_rend]);
+    ASSERT_EQ(input[i].value, MakeTransformIterator(input.crend(), ref)[index_from_rend]);
+
+    ASSERT_EQ(MakeTransformIterator(input.begin(), ref) + i,
+              MakeTransformIterator(input.begin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.cbegin(), ref) + i,
+              MakeTransformIterator(input.cbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.rbegin(), ref) + i,
+              MakeTransformIterator(input.rbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.crbegin(), ref) + i,
+              MakeTransformIterator(input.crbegin() + i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.end(), ref) - i,
+              MakeTransformIterator(input.end() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.cend(), ref) - i,
+              MakeTransformIterator(input.cend() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.rend(), ref) - i,
+              MakeTransformIterator(input.rend() - i, ref));
+    ASSERT_EQ(MakeTransformIterator(input.crend(), ref) - i,
+              MakeTransformIterator(input.crend() - i, ref));
+  }
+  ASSERT_EQ(input.end(),
+            (MakeTransformIterator(input.begin(), ref) + input.size()).base());
+  ASSERT_EQ(MakeTransformIterator(input.end(), ref) - MakeTransformIterator(input.begin(), ref),
+            static_cast<ptrdiff_t>(input.size()));
+
+  // Test iterator->const_iterator conversion and comparison.
+  auto it = MakeTransformIterator(input.begin(), ref);
+  decltype(MakeTransformIterator(input.cbegin(), ref)) cit = it;
+  static_assert(!std::is_same<decltype(it), decltype(cit)>::value, "Types must be different");
+  ASSERT_EQ(it, cit);
+  auto rit = MakeTransformIterator(input.rbegin(), ref);
+  decltype(MakeTransformIterator(input.crbegin(), ref)) crit(rit);
+  static_assert(!std::is_same<decltype(rit), decltype(crit)>::value, "Types must be different");
+  ASSERT_EQ(rit, crit);
+
+  // Test writing through the transform iterator.
+  std::list<int> transform_input({ 42, 73, 11, 17 });
+  std::vector<ValueHolder> transformed(transform_input.size(), 0);
+  std::transform(transform_input.begin(),
+                 transform_input.end(),
+                 MakeTransformIterator(transformed.begin(), ref),
+                 [](int v) { return -v; });
+  ASSERT_EQ(std::vector<ValueHolder>({ -42, -73, -11, -17 }), transformed);
+}
+
+TEST(TransformIterator, TransformRange) {
+  auto ref = [](ValueHolder& h) -> int& { return h.value; };  // NOLINT [readability/braces]
+  std::vector<ValueHolder> data({ 1, 0, 1, 3, 1, 0 });
+
+  for (int& v : MakeTransformRange(data, ref)) {
+    v += 11;
+  }
+  ASSERT_EQ(std::vector<ValueHolder>({ 12, 11, 12, 14, 12, 11 }), data);
+}
+
+}  // namespace art
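
The tests above pin down the adaptor's contract: it preserves the wrapped iterator's category, forwards random-access arithmetic and base() to the underlying iterator, converts iterator to const_iterator, and is writable whenever the transformation returns an lvalue reference. A minimal usage sketch under those assumptions (Entry is a hypothetical stand-in for the tests' ValueHolder):

    #include <algorithm>
    #include <iterator>
    #include <vector>

    struct Entry { int value; };  // Hypothetical stand-in for ValueHolder.

    void CopyThenBump(std::vector<Entry>& entries, std::vector<int>* out) {
      auto ref = [](Entry& e) -> int& { return e.value; };
      // Read through the adaptor: copy each Entry::value into *out.
      std::copy(MakeTransformIterator(entries.begin(), ref),
                MakeTransformIterator(entries.end(), ref),
                std::back_inserter(*out));
      // Write through the adaptor: the lambda yields an lvalue reference.
      for (int& v : MakeTransformRange(entries, ref)) {
        v += 1;
      }
    }
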
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 2203646..b3f7ef9 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1924,15 +1924,16 @@
 
 constexpr size_t kFramePointerSize = 4;
 
-void X86Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                              const std::vector<ManagedRegister>& spill_regs,
+void X86Assembler::BuildFrame(size_t frame_size,
+                              ManagedRegister method_reg,
+                              ArrayRef<const ManagedRegister> spill_regs,
                               const ManagedRegisterEntrySpills& entry_spills) {
   DCHECK_EQ(buffer_.Size(), 0U);  // Nothing emitted yet.
   cfi_.SetCurrentCFAOffset(4);  // Return address on stack.
   CHECK_ALIGNED(frame_size, kStackAlignment);
   int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    Register spill = spill_regs.at(i).AsX86().AsCpuRegister();
+    Register spill = spill_regs[i].AsX86().AsCpuRegister();
     pushl(spill);
     gpr_count++;
     cfi_.AdjustCFAOffset(kFramePointerSize);
@@ -1966,7 +1967,7 @@
   }
 }
 
-void X86Assembler::RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& spill_regs) {
+void X86Assembler::RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   cfi_.RememberState();
   // -kFramePointerSize for ArtMethod*.
@@ -1974,7 +1975,7 @@
   addl(ESP, Immediate(adjust));
   cfi_.AdjustCFAOffset(-adjust);
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    Register spill = spill_regs.at(i).AsX86().AsCpuRegister();
+    Register spill = spill_regs[i].AsX86().AsCpuRegister();
     popl(spill);
     cfi_.AdjustCFAOffset(-static_cast<int>(kFramePointerSize));
     cfi_.Restore(DWARFReg(spill));
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 8567ad2..4fe87b3 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -632,12 +632,13 @@
   //
 
   // Emit code that will create an activation on the stack
-  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                  const std::vector<ManagedRegister>& callee_save_regs,
+  void BuildFrame(size_t frame_size,
+                  ManagedRegister method_reg,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+  void RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> callee_save_regs)
       OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
diff --git a/compiler/utils/x86/managed_register_x86.h b/compiler/utils/x86/managed_register_x86.h
index fc20d7e..c0c2b65 100644
--- a/compiler/utils/x86/managed_register_x86.h
+++ b/compiler/utils/x86/managed_register_x86.h
@@ -89,64 +89,64 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class X86ManagedRegister : public ManagedRegister {
  public:
-  ByteRegister AsByteRegister() const {
+  constexpr ByteRegister AsByteRegister() const {
     CHECK(IsCpuRegister());
     CHECK_LT(AsCpuRegister(), ESP);  // ESP, EBP, ESI and EDI cannot be encoded as byte registers.
     return static_cast<ByteRegister>(id_);
   }
 
-  Register AsCpuRegister() const {
+  constexpr Register AsCpuRegister() const {
     CHECK(IsCpuRegister());
     return static_cast<Register>(id_);
   }
 
-  XmmRegister AsXmmRegister() const {
+  constexpr XmmRegister AsXmmRegister() const {
     CHECK(IsXmmRegister());
     return static_cast<XmmRegister>(id_ - kNumberOfCpuRegIds);
   }
 
-  X87Register AsX87Register() const {
+  constexpr X87Register AsX87Register() const {
     CHECK(IsX87Register());
     return static_cast<X87Register>(id_ -
                                     (kNumberOfCpuRegIds + kNumberOfXmmRegIds));
   }
 
-  Register AsRegisterPairLow() const {
+  constexpr Register AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows the use of AllocIdLow().
     return FromRegId(AllocIdLow()).AsCpuRegister();
   }
 
-  Register AsRegisterPairHigh() const {
+  constexpr Register AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows the use of AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCpuRegister();
   }
 
-  RegisterPair AsRegisterPair() const {
+  constexpr RegisterPair AsRegisterPair() const {
     CHECK(IsRegisterPair());
     return static_cast<RegisterPair>(id_ -
         (kNumberOfCpuRegIds + kNumberOfXmmRegIds + kNumberOfX87RegIds));
   }
 
-  bool IsCpuRegister() const {
+  constexpr bool IsCpuRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfCpuRegIds);
   }
 
-  bool IsXmmRegister() const {
+  constexpr bool IsXmmRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - kNumberOfCpuRegIds;
     return (0 <= test) && (test < kNumberOfXmmRegIds);
   }
 
-  bool IsX87Register() const {
+  constexpr bool IsX87Register() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfCpuRegIds + kNumberOfXmmRegIds);
     return (0 <= test) && (test < kNumberOfX87RegIds);
   }
 
-  bool IsRegisterPair() const {
+  constexpr bool IsRegisterPair() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ -
         (kNumberOfCpuRegIds + kNumberOfXmmRegIds + kNumberOfX87RegIds);
@@ -160,33 +160,33 @@
   // then false is returned.
   bool Overlaps(const X86ManagedRegister& other) const;
 
-  static X86ManagedRegister FromCpuRegister(Register r) {
+  static constexpr X86ManagedRegister FromCpuRegister(Register r) {
     CHECK_NE(r, kNoRegister);
     return FromRegId(r);
   }
 
-  static X86ManagedRegister FromXmmRegister(XmmRegister r) {
+  static constexpr X86ManagedRegister FromXmmRegister(XmmRegister r) {
     CHECK_NE(r, kNoXmmRegister);
     return FromRegId(r + kNumberOfCpuRegIds);
   }
 
-  static X86ManagedRegister FromX87Register(X87Register r) {
+  static constexpr X86ManagedRegister FromX87Register(X87Register r) {
     CHECK_NE(r, kNoX87Register);
     return FromRegId(r + kNumberOfCpuRegIds + kNumberOfXmmRegIds);
   }
 
-  static X86ManagedRegister FromRegisterPair(RegisterPair r) {
+  static constexpr X86ManagedRegister FromRegisterPair(RegisterPair r) {
     CHECK_NE(r, kNoRegisterPair);
     return FromRegId(r + (kNumberOfCpuRegIds + kNumberOfXmmRegIds +
                           kNumberOfX87RegIds));
   }
 
  private:
-  bool IsValidManagedRegister() const {
+  constexpr bool IsValidManagedRegister() const {
     return (0 <= id_) && (id_ < kNumberOfRegIds);
   }
 
-  int RegId() const {
+  constexpr int RegId() const {
     CHECK(!IsNoRegister());
     return id_;
   }
@@ -202,9 +202,9 @@
 
   friend class ManagedRegister;
 
-  explicit X86ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
+  explicit constexpr X86ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
 
-  static X86ManagedRegister FromRegId(int reg_id) {
+  static constexpr X86ManagedRegister FromRegId(int reg_id) {
     X86ManagedRegister reg(reg_id);
     CHECK(reg.IsValidManagedRegister());
     return reg;
@@ -215,7 +215,7 @@
 
 }  // namespace x86
 
-inline x86::X86ManagedRegister ManagedRegister::AsX86() const {
+constexpr inline x86::X86ManagedRegister ManagedRegister::AsX86() const {
   x86::X86ManagedRegister reg(id_);
   CHECK(reg.IsNoRegister() || reg.IsValidManagedRegister());
   return reg;
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 32eb4a3..1170af1 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -2638,15 +2638,16 @@
 
 constexpr size_t kFramePointerSize = 8;
 
-void X86_64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                                 const std::vector<ManagedRegister>& spill_regs,
+void X86_64Assembler::BuildFrame(size_t frame_size,
+                                 ManagedRegister method_reg,
+                                 ArrayRef<const ManagedRegister> spill_regs,
                                  const ManagedRegisterEntrySpills& entry_spills) {
   DCHECK_EQ(buffer_.Size(), 0U);  // Nothing emitted yet.
   cfi_.SetCurrentCFAOffset(8);  // Return address on stack.
   CHECK_ALIGNED(frame_size, kStackAlignment);
   int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsCpuRegister()) {
       pushq(spill.AsCpuRegister());
       gpr_count++;
@@ -2664,7 +2665,7 @@
   // spill xmms
   int64_t offset = rest_of_frame;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsXmmRegister()) {
       offset -= sizeof(double);
       movsd(Address(CpuRegister(RSP), offset), spill.AsXmmRegister());
@@ -2697,15 +2698,14 @@
   }
 }
 
-void X86_64Assembler::RemoveFrame(size_t frame_size,
-                            const std::vector<ManagedRegister>& spill_regs) {
+void X86_64Assembler::RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
   cfi_.RememberState();
   int gpr_count = 0;
   // unspill xmms
   int64_t offset = static_cast<int64_t>(frame_size) - (spill_regs.size() * kFramePointerSize) - 2 * kFramePointerSize;
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsXmmRegister()) {
       offset += sizeof(double);
       movsd(spill.AsXmmRegister(), Address(CpuRegister(RSP), offset));
@@ -2718,7 +2718,7 @@
   addq(CpuRegister(RSP), Immediate(adjust));
   cfi_.AdjustCFAOffset(-adjust);
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    x86_64::X86_64ManagedRegister spill = spill_regs[i].AsX86_64();
     if (spill.IsCpuRegister()) {
       popq(spill.AsCpuRegister());
       cfi_.AdjustCFAOffset(-static_cast<int>(kFramePointerSize));
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 92c7d0a..a1547cc 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -703,12 +703,13 @@
   //
 
   // Emit code that will create an activation on the stack
-  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                  const std::vector<ManagedRegister>& callee_save_regs,
+  void BuildFrame(size_t frame_size,
+                  ManagedRegister method_reg,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
                   const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+  void RemoveFrame(size_t frame_size, ArrayRef<const ManagedRegister> callee_save_regs)
       OVERRIDE;
 
   void IncreaseFrameSize(size_t adjust) OVERRIDE;
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index b19e616..f5b4aa5 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1498,9 +1498,11 @@
   // TODO: more interesting spill registers / entry spills.
 
   // Two random spill regs.
-  std::vector<ManagedRegister> spill_regs;
-  spill_regs.push_back(ManagedFromCpu(x86_64::R10));
-  spill_regs.push_back(ManagedFromCpu(x86_64::RSI));
+  const ManagedRegister raw_spill_regs[] = {
+      ManagedFromCpu(x86_64::R10),
+      ManagedFromCpu(x86_64::RSI)
+  };
+  ArrayRef<const ManagedRegister> spill_regs(raw_spill_regs);
 
   // Three random entry spills.
   ManagedRegisterEntrySpills entry_spills;
@@ -1543,9 +1545,11 @@
   // TODO: more interesting spill registers / entry spills.
 
   // Two random spill regs.
-  std::vector<ManagedRegister> spill_regs;
-  spill_regs.push_back(ManagedFromCpu(x86_64::R10));
-  spill_regs.push_back(ManagedFromCpu(x86_64::RSI));
+  const ManagedRegister raw_spill_regs[] = {
+      ManagedFromCpu(x86_64::R10),
+      ManagedFromCpu(x86_64::RSI)
+  };
+  ArrayRef<const ManagedRegister> spill_regs(raw_spill_regs);
 
   size_t frame_size = 10 * kStackAlignment;
   assembler->RemoveFrame(10 * kStackAlignment, spill_regs);
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index 0c782d4..37db6b1 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -29,15 +29,15 @@
 
 class CpuRegister {
  public:
-  explicit CpuRegister(Register r) : reg_(r) {}
-  explicit CpuRegister(int r) : reg_(Register(r)) {}
-  Register AsRegister() const {
+  explicit constexpr CpuRegister(Register r) : reg_(r) {}
+  explicit constexpr CpuRegister(int r) : reg_(Register(r)) {}
+  constexpr Register AsRegister() const {
     return reg_;
   }
-  uint8_t LowBits() const {
+  constexpr uint8_t LowBits() const {
     return reg_ & 7;
   }
-  bool NeedsRex() const {
+  constexpr bool NeedsRex() const {
     return reg_ > 7;
   }
  private:
@@ -47,15 +47,15 @@
 
 class XmmRegister {
  public:
-  explicit XmmRegister(FloatRegister r) : reg_(r) {}
-  explicit XmmRegister(int r) : reg_(FloatRegister(r)) {}
-  FloatRegister AsFloatRegister() const {
+  explicit constexpr XmmRegister(FloatRegister r) : reg_(r) {}
+  explicit constexpr XmmRegister(int r) : reg_(FloatRegister(r)) {}
+  constexpr FloatRegister AsFloatRegister() const {
     return reg_;
   }
-  uint8_t LowBits() const {
+  constexpr uint8_t LowBits() const {
     return reg_ & 7;
   }
-  bool NeedsRex() const {
+  constexpr bool NeedsRex() const {
     return reg_ > 7;
   }
  private:
diff --git a/compiler/utils/x86_64/managed_register_x86_64.h b/compiler/utils/x86_64/managed_register_x86_64.h
index c4228c1..32af672 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.h
+++ b/compiler/utils/x86_64/managed_register_x86_64.h
@@ -88,52 +88,52 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class X86_64ManagedRegister : public ManagedRegister {
  public:
-  CpuRegister AsCpuRegister() const {
+  constexpr CpuRegister AsCpuRegister() const {
     CHECK(IsCpuRegister());
     return CpuRegister(static_cast<Register>(id_));
   }
 
-  XmmRegister AsXmmRegister() const {
+  constexpr XmmRegister AsXmmRegister() const {
     CHECK(IsXmmRegister());
     return XmmRegister(static_cast<FloatRegister>(id_ - kNumberOfCpuRegIds));
   }
 
-  X87Register AsX87Register() const {
+  constexpr X87Register AsX87Register() const {
     CHECK(IsX87Register());
     return static_cast<X87Register>(id_ -
                                     (kNumberOfCpuRegIds + kNumberOfXmmRegIds));
   }
 
-  CpuRegister AsRegisterPairLow() const {
+  constexpr CpuRegister AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows the use of AllocIdLow().
     return FromRegId(AllocIdLow()).AsCpuRegister();
   }
 
-  CpuRegister AsRegisterPairHigh() const {
+  constexpr CpuRegister AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows the use of AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCpuRegister();
   }
 
-  bool IsCpuRegister() const {
+  constexpr bool IsCpuRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfCpuRegIds);
   }
 
-  bool IsXmmRegister() const {
+  constexpr bool IsXmmRegister() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - kNumberOfCpuRegIds;
     return (0 <= test) && (test < kNumberOfXmmRegIds);
   }
 
-  bool IsX87Register() const {
+  constexpr bool IsX87Register() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ - (kNumberOfCpuRegIds + kNumberOfXmmRegIds);
     return (0 <= test) && (test < kNumberOfX87RegIds);
   }
 
-  bool IsRegisterPair() const {
+  constexpr bool IsRegisterPair() const {
     CHECK(IsValidManagedRegister());
     const int test = id_ -
         (kNumberOfCpuRegIds + kNumberOfXmmRegIds + kNumberOfX87RegIds);
@@ -147,32 +147,32 @@
   // then false is returned.
   bool Overlaps(const X86_64ManagedRegister& other) const;
 
-  static X86_64ManagedRegister FromCpuRegister(Register r) {
+  static constexpr X86_64ManagedRegister FromCpuRegister(Register r) {
     CHECK_NE(r, kNoRegister);
     return FromRegId(r);
   }
 
-  static X86_64ManagedRegister FromXmmRegister(FloatRegister r) {
+  static constexpr X86_64ManagedRegister FromXmmRegister(FloatRegister r) {
     return FromRegId(r + kNumberOfCpuRegIds);
   }
 
-  static X86_64ManagedRegister FromX87Register(X87Register r) {
+  static constexpr X86_64ManagedRegister FromX87Register(X87Register r) {
     CHECK_NE(r, kNoX87Register);
     return FromRegId(r + kNumberOfCpuRegIds + kNumberOfXmmRegIds);
   }
 
-  static X86_64ManagedRegister FromRegisterPair(RegisterPair r) {
+  static constexpr X86_64ManagedRegister FromRegisterPair(RegisterPair r) {
     CHECK_NE(r, kNoRegisterPair);
     return FromRegId(r + (kNumberOfCpuRegIds + kNumberOfXmmRegIds +
                           kNumberOfX87RegIds));
   }
 
  private:
-  bool IsValidManagedRegister() const {
+  constexpr bool IsValidManagedRegister() const {
     return (0 <= id_) && (id_ < kNumberOfRegIds);
   }
 
-  int RegId() const {
+  constexpr int RegId() const {
     CHECK(!IsNoRegister());
     return id_;
   }
@@ -188,9 +188,9 @@
 
   friend class ManagedRegister;
 
-  explicit X86_64ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
+  explicit constexpr X86_64ManagedRegister(int reg_id) : ManagedRegister(reg_id) {}
 
-  static X86_64ManagedRegister FromRegId(int reg_id) {
+  static constexpr X86_64ManagedRegister FromRegId(int reg_id) {
     X86_64ManagedRegister reg(reg_id);
     CHECK(reg.IsValidManagedRegister());
     return reg;
@@ -201,7 +201,7 @@
 
 }  // namespace x86_64
 
-inline x86_64::X86_64ManagedRegister ManagedRegister::AsX86_64() const {
+constexpr inline x86_64::X86_64ManagedRegister ManagedRegister::AsX86_64() const {
   x86_64::X86_64ManagedRegister reg(id_);
   CHECK(reg.IsNoRegister() || reg.IsValidManagedRegister());
   return reg;
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 729d712..f79fced 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -80,6 +80,8 @@
 #include "well_known_classes.h"
 #include "zip_archive.h"
 
+#define WATCHDOG_CLOCK  CLOCK_MONOTONIC
+
 namespace art {
 
 static constexpr size_t kDefaultMinDexFilesForSwap = 2;
@@ -413,7 +415,10 @@
     shutting_down_ = false;
     const char* reason = "dex2oat watch dog thread startup";
     CHECK_WATCH_DOG_PTHREAD_CALL(pthread_mutex_init, (&mutex_, nullptr), reason);
-    CHECK_WATCH_DOG_PTHREAD_CALL(pthread_cond_init, (&cond_, nullptr), reason);
+    CHECK_WATCH_DOG_PTHREAD_CALL(pthread_condattr_init, (&condattr_), reason);
+    CHECK_WATCH_DOG_PTHREAD_CALL(pthread_condattr_setclock, (&condattr_, WATCHDOG_CLOCK), reason);
+    CHECK_WATCH_DOG_PTHREAD_CALL(pthread_cond_init, (&cond_, &condattr_), reason);
+    CHECK_WATCH_DOG_PTHREAD_CALL(pthread_condattr_destroy, (&condattr_), reason);
     CHECK_WATCH_DOG_PTHREAD_CALL(pthread_attr_init, (&attr_), reason);
     CHECK_WATCH_DOG_PTHREAD_CALL(pthread_create, (&pthread_, &attr_, &CallBack, this), reason);
     CHECK_WATCH_DOG_PTHREAD_CALL(pthread_attr_destroy, (&attr_), reason);
@@ -456,7 +461,7 @@
     //       large.
     constexpr int64_t multiplier = kVerifyObjectSupport > kVerifyObjectModeFast ? 100 : 1;
     timespec timeout_ts;
-    InitTimeSpec(true, CLOCK_REALTIME, multiplier * kWatchDogTimeoutSeconds * 1000, 0, &timeout_ts);
+    InitTimeSpec(true, WATCHDOG_CLOCK, multiplier * kWatchDogTimeoutSeconds * 1000, 0, &timeout_ts);
     const char* reason = "dex2oat watch dog thread waiting";
     CHECK_WATCH_DOG_PTHREAD_CALL(pthread_mutex_lock, (&mutex_), reason);
     while (!shutting_down_) {
@@ -486,6 +491,7 @@
   bool shutting_down_;
   // TODO: Switch to Mutex when we can guarantee it won't prevent shutdown in error cases.
   pthread_mutex_t mutex_;
+  pthread_condattr_t condattr_;
   pthread_cond_t cond_;
   pthread_attr_t attr_;
   pthread_t pthread_;
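
The watchdog hunk above switches the condition variable from the default CLOCK_REALTIME to CLOCK_MONOTONIC: a pthread condvar times out on its attribute's clock, so with the realtime clock a wall-clock step (NTP correction, user time change) could fire the watchdog early or postpone it indefinitely. The deadline passed to pthread_cond_timedwait must be computed on the same clock, which is why the InitTimeSpec call now also takes WATCHDOG_CLOCK. A minimal sketch of the pattern using only the POSIX calls from the hunk (error handling elided; the real code wraps each call in CHECK_WATCH_DOG_PTHREAD_CALL):

    #include <errno.h>
    #include <pthread.h>
    #include <time.h>

    void WatchdogWait(pthread_mutex_t* mutex, const bool* shutting_down, int timeout_sec) {
      pthread_condattr_t condattr;
      pthread_cond_t cond;
      pthread_condattr_init(&condattr);
      pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC);  // Default is CLOCK_REALTIME.
      pthread_cond_init(&cond, &condattr);
      pthread_condattr_destroy(&condattr);  // The condvar keeps the clock attribute.

      timespec deadline;
      clock_gettime(CLOCK_MONOTONIC, &deadline);  // Must match the condvar's clock.
      deadline.tv_sec += timeout_sec;

      pthread_mutex_lock(mutex);
      while (!*shutting_down) {
        if (pthread_cond_timedwait(&cond, mutex, &deadline) == ETIMEDOUT) {
          break;  // The real watchdog aborts dex2oat at this point.
        }
      }
      pthread_mutex_unlock(mutex);
      pthread_cond_destroy(&cond);
    }
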
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 286faf2..ee4953f 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -782,23 +782,13 @@
         args << Rm;
 
         // Shift operand.
-        bool noShift = (imm5 == 0 && shift_type != 0x3);
+        bool noShift = (imm5 == 0 && shift_type == 0x0);
         if (!noShift) {
           args << ", ";
-          switch (shift_type) {
-            case 0x0: args << "lsl"; break;
-            case 0x1: args << "lsr"; break;
-            case 0x2: args << "asr"; break;
-            case 0x3:
-              if (imm5 == 0) {
-                args << "rrx";
-              } else {
-                args << "ror #" << imm5;
-              }
-              break;
-          }
-          if (shift_type != 0x3 /* rrx */) {
-            args << StringPrintf(" #%d", (0 != imm5 || 0 == shift_type) ? imm5 : 32);
+          if (shift_type == 0x3u && imm5 == 0u) {
+            args << "rrx";
+          } else {
+            args << kThumb2ShiftOperations[shift_type] << " #" << ((0 != imm5) ? imm5 : 32);
           }
         }
 
@@ -951,17 +941,11 @@
                 opcode << (op != 0 ? "vsqrt" : "vneg") << (S != 0 ? ".f64" : ".f32");
                 args << d << ", " << m;
               } else if (op5 == 4) {
-                opcode << "vcmp" << (S != 0 ? ".f64" : ".f32");
+                opcode << "vcmp" << ((op != 0) ? "e" : "") << (S != 0 ? ".f64" : ".f32");
                 args << d << ", " << m;
-                if (op != 0) {
-                  args << " (quiet nan)";
-                }
               } else if (op5 == 5) {
-                opcode << "vcmpe" << (S != 0 ? ".f64" : ".f32");
+                opcode << "vcmp" << ((op != 0) ? "e" : "") << (S != 0 ? ".f64" : ".f32");
                 args << d << ", #0.0";
-                if (op != 0) {
-                  args << " (quiet nan)";
-                }
                 if ((instr & 0x2f) != 0) {
                   args << " (UNPREDICTABLE)";
                 }
@@ -1497,82 +1481,101 @@
           }
           break;
         }
-      default:      // more formats
-        if ((op2 >> 4) == 2) {      // 010xxxx
-          // data processing (register)
-          if ((instr & 0x0080f0f0) == 0x0000f000) {
-            // LSL, LSR, ASR, ROR
-            uint32_t shift_op = (instr >> 21) & 3;
-            uint32_t S = (instr >> 20) & 1;
-            ArmRegister Rd(instr, 8);
+        case 0x7B: case 0x7F: {
+          FpRegister d(instr, 12, 22);
+          FpRegister m(instr, 0, 5);
+          uint32_t sz = (instr >> 18) & 0x3;  // Decode size bits.
+          uint32_t size = (sz == 0) ? 8 : sz << 4;
+          uint32_t opc2 = (instr >> 7) & 0xF;
+          uint32_t Q = (instr >> 6) & 1;
+          if (Q == 0 && opc2 == 0xA && size == 8) {  // 1010, VCNT
+            opcode << "vcnt." << size;
+            args << d << ", " << m;
+          } else if (Q == 0 && (opc2 == 0x4 || opc2 == 0x5) && size <= 32) {  // 010x, VPADDL
+            bool op = HasBitSet(instr, 7);
+            opcode << "vpaddl." << (op ? "u" : "s") << size;
+            args << d << ", " << m;
+          } else {
+            opcode << "UNKNOWN " << op2;
+          }
+          break;
+        }
+        default:      // more formats
+          if ((op2 >> 4) == 2) {      // 010xxxx
+            // data processing (register)
+            if ((instr & 0x0080f0f0) == 0x0000f000) {
+              // LSL, LSR, ASR, ROR
+              uint32_t shift_op = (instr >> 21) & 3;
+              uint32_t S = (instr >> 20) & 1;
+              ArmRegister Rd(instr, 8);
+              ArmRegister Rn(instr, 16);
+              ArmRegister Rm(instr, 0);
+              opcode << kThumb2ShiftOperations[shift_op] << (S != 0 ? "s" : "");
+              args << Rd << ", " << Rn << ", " << Rm;
+            }
+          } else if ((op2 >> 3) == 6) {       // 0110xxx
+            // Multiply, multiply accumulate, and absolute difference
+            op1 = (instr >> 20) & 0x7;
+            op2 = (instr >> 4) & 0x1;
+            ArmRegister Ra(instr, 12);
             ArmRegister Rn(instr, 16);
             ArmRegister Rm(instr, 0);
-            opcode << kThumb2ShiftOperations[shift_op] << (S != 0 ? "s" : "");
-            args << Rd << ", " << Rn << ", " << Rm;
-          }
-        } else if ((op2 >> 3) == 6) {       // 0110xxx
-          // Multiply, multiply accumulate, and absolute difference
-          op1 = (instr >> 20) & 0x7;
-          op2 = (instr >> 4) & 0x1;
-          ArmRegister Ra(instr, 12);
-          ArmRegister Rn(instr, 16);
-          ArmRegister Rm(instr, 0);
-          ArmRegister Rd(instr, 8);
-          switch (op1) {
-          case 0:
-            if (op2 == 0) {
-              if (Ra.r == 0xf) {
-                opcode << "mul";
-                args << Rd << ", " << Rn << ", " << Rm;
+            ArmRegister Rd(instr, 8);
+            switch (op1) {
+            case 0:
+              if (op2 == 0) {
+                if (Ra.r == 0xf) {
+                  opcode << "mul";
+                  args << Rd << ", " << Rn << ", " << Rm;
+                } else {
+                  opcode << "mla";
+                  args << Rd << ", " << Rn << ", " << Rm << ", " << Ra;
+                }
               } else {
-                opcode << "mla";
+                opcode << "mls";
                 args << Rd << ", " << Rn << ", " << Rm << ", " << Ra;
               }
-            } else {
-              opcode << "mls";
-              args << Rd << ", " << Rn << ", " << Rm << ", " << Ra;
+              break;
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+                break;        // do these sometime
             }
-            break;
-          case 1:
-          case 2:
-          case 3:
-          case 4:
-          case 5:
-          case 6:
-              break;        // do these sometime
+          } else if ((op2 >> 3) == 7) {       // 0111xxx
+            // Long multiply, long multiply accumulate, and divide
+            op1 = (instr >> 20) & 0x7;
+            op2 = (instr >> 4) & 0xf;
+            ArmRegister Rn(instr, 16);
+            ArmRegister Rm(instr, 0);
+            ArmRegister Rd(instr, 8);
+            ArmRegister RdHi(instr, 8);
+            ArmRegister RdLo(instr, 12);
+            switch (op1) {
+            case 0:
+              opcode << "smull";
+              args << RdLo << ", " << RdHi << ", " << Rn << ", " << Rm;
+              break;
+            case 1:
+              opcode << "sdiv";
+              args << Rd << ", " << Rn << ", " << Rm;
+              break;
+            case 2:
+              opcode << "umull";
+              args << RdLo << ", " << RdHi << ", " << Rn << ", " << Rm;
+              break;
+            case 3:
+              opcode << "udiv";
+              args << Rd << ", " << Rn << ", " << Rm;
+              break;
+            case 4:
+            case 5:
+            case 6:
+              break;      // TODO: when we generate these...
+            }
           }
-        } else if ((op2 >> 3) == 7) {       // 0111xxx
-          // Long multiply, long multiply accumulate, and divide
-          op1 = (instr >> 20) & 0x7;
-          op2 = (instr >> 4) & 0xf;
-          ArmRegister Rn(instr, 16);
-          ArmRegister Rm(instr, 0);
-          ArmRegister Rd(instr, 8);
-          ArmRegister RdHi(instr, 8);
-          ArmRegister RdLo(instr, 12);
-          switch (op1) {
-          case 0:
-            opcode << "smull";
-            args << RdLo << ", " << RdHi << ", " << Rn << ", " << Rm;
-            break;
-          case 1:
-            opcode << "sdiv";
-            args << Rd << ", " << Rn << ", " << Rm;
-            break;
-          case 2:
-            opcode << "umull";
-            args << RdLo << ", " << RdHi << ", " << Rn << ", " << Rm;
-            break;
-          case 3:
-            opcode << "udiv";
-            args << Rd << ", " << Rn << ", " << Rm;
-            break;
-          case 4:
-          case 5:
-          case 6:
-            break;      // TODO: when we generate these...
-          }
-        }
       }
       break;
     default:
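
The rewritten shift-operand printing above replaces the per-case switch with a table lookup plus two encoding quirks from the Thumb2 spec: shift_type 0b11 with imm5 == 0 encodes rrx, and for the remaining types an immediate of 0 means a shift by 32. A standalone sketch of that decoding (kShiftOps mirrors the disassembler's kThumb2ShiftOperations table):

    #include <cstdint>
    #include <string>

    std::string DecodeShiftOperand(uint32_t shift_type, uint32_t imm5) {
      static const char* const kShiftOps[] = { "lsl", "lsr", "asr", "ror" };
      if (shift_type == 0u && imm5 == 0u) return "";     // Plain register operand, no shift.
      if (shift_type == 3u && imm5 == 0u) return "rrx";  // "ror #0" encodes rrx.
      uint32_t amount = (imm5 != 0u) ? imm5 : 32u;       // For lsr/asr, #0 encodes #32.
      return std::string(kShiftOps[shift_type]) + " #" + std::to_string(amount);
    }
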
diff --git a/runtime/Android.mk b/runtime/Android.mk
index aa12c83..a4b1a3c 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -389,7 +389,7 @@
 LIBART_TARGET_DEFAULT_INSTRUCTION_SET_FEATURES := default
 2ND_LIBART_TARGET_DEFAULT_INSTRUCTION_SET_FEATURES := default
 ifeq ($(DEX2OAT_TARGET_ARCH),arm)
-  ifneq (,$(filter $(DEX2OAT_TARGET_CPU_VARIANT),cortex-a15 krait denver))
+  ifneq (,$(filter $(DEX2OAT_TARGET_CPU_VARIANT),cortex-a15 krait kryo denver))
     LIBART_TARGET_DEFAULT_INSTRUCTION_SET_FEATURES := atomic_ldrd_strd,div
   else
     ifneq (,$(filter $(DEX2OAT_TARGET_CPU_VARIANT),cortex-a7))
@@ -398,7 +398,7 @@
   endif
 endif
 ifeq ($(2ND_DEX2OAT_TARGET_ARCH),arm)
-  ifneq (,$(filter $(DEX2OAT_TARGET_CPU_VARIANT),cortex-a15 krait denver))
+  ifneq (,$(filter $(DEX2OAT_TARGET_CPU_VARIANT),cortex-a15 krait kryo denver))
     2ND_LIBART_TARGET_DEFAULT_INSTRUCTION_SET_FEATURES := atomic_ldrd_strd,div
   else
     ifneq (,$(filter $(DEX2OAT_TARGET_CPU_VARIANT),cortex-a7))
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index f0e9ac5..4c68862 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -97,7 +97,8 @@
 
   // Intrinsics
   qpoints->pIndexOf = art_quick_indexof;
-  qpoints->pStringCompareTo = art_quick_string_compareto;
+  // The ARM StringCompareTo intrinsic does not call the runtime.
+  qpoints->pStringCompareTo = nullptr;
   qpoints->pMemcpy = memcpy;
 
   // Read barrier.
diff --git a/runtime/arch/arm/instruction_set_features_arm.cc b/runtime/arch/arm/instruction_set_features_arm.cc
index 51f992b..2aa4ee2 100644
--- a/runtime/arch/arm/instruction_set_features_arm.cc
+++ b/runtime/arch/arm/instruction_set_features_arm.cc
@@ -43,14 +43,14 @@
   static const char* arm_variants_with_div[] = {
           "cortex-a7", "cortex-a12", "cortex-a15", "cortex-a17", "cortex-a53", "cortex-a57",
           "cortex-a53.a57", "cortex-m3", "cortex-m4", "cortex-r4", "cortex-r5",
-          "cyclone", "denver", "krait", "swift" };
+          "cyclone", "denver", "krait", "kryo", "swift" };
 
   bool has_div = FindVariantInArray(arm_variants_with_div, arraysize(arm_variants_with_div),
                                     variant);
 
   // Look for variants that have LPAE support.
   static const char* arm_variants_with_lpae[] = {
-      "cortex-a7", "cortex-a15", "krait", "denver", "cortex-a53", "cortex-a57", "cortex-a53.a57"
+      "cortex-a7", "cortex-a15", "krait", "kryo", "denver", "cortex-a53", "cortex-a57", "cortex-a53.a57"
   };
   bool has_lpae = FindVariantInArray(arm_variants_with_lpae, arraysize(arm_variants_with_lpae),
                                      variant);
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index fa8c8f9..e46debb 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1704,145 +1704,6 @@
     pop {r4, r10-r11, pc}
 END art_quick_indexof
 
-   /*
-     * String's compareTo.
-     *
-     * Requires rARG0/rARG1 to have been previously checked for null.  Will
-     * return negative if this's string is < comp, 0 if they are the
-     * same and positive if >.
-     *
-     * On entry:
-     *    r0:   this object pointer
-     *    r1:   comp object pointer
-     *
-     */
-    .extern __memcmp16
-ENTRY art_quick_string_compareto
-    mov    r2, r0         @ this to r2, opening up r0 for return value
-    sub    r0, r2, r1     @ Same?
-    cbnz   r0,1f
-    bx     lr
-1:                        @ Same strings, return.
-
-    push {r4, r7-r12, lr} @ 8 words - keep alignment
-    .cfi_adjust_cfa_offset 32
-    .cfi_rel_offset r4, 0
-    .cfi_rel_offset r7, 4
-    .cfi_rel_offset r8, 8
-    .cfi_rel_offset r9, 12
-    .cfi_rel_offset r10, 16
-    .cfi_rel_offset r11, 20
-    .cfi_rel_offset r12, 24
-    .cfi_rel_offset lr, 28
-
-    ldr    r7, [r2, #MIRROR_STRING_COUNT_OFFSET]
-    ldr    r10, [r1, #MIRROR_STRING_COUNT_OFFSET]
-    add    r2, #MIRROR_STRING_VALUE_OFFSET
-    add    r1, #MIRROR_STRING_VALUE_OFFSET
-
-    /*
-     * At this point, we have:
-     *    value:  r2/r1
-     *    offset: r4/r9
-     *    count:  r7/r10
-     * We're going to compute
-     *    r11 <- countDiff
-     *    r10 <- minCount
-     */
-     subs  r11, r7, r10
-     it    ls
-     movls r10, r7
-
-     /*
-      * Note: data pointers point to previous element so we can use pre-index
-      * mode with base writeback.
-      */
-     subs  r2, #2   @ offset to contents[-1]
-     subs  r1, #2   @ offset to contents[-1]
-
-     /*
-      * At this point we have:
-      *   r2: *this string data
-      *   r1: *comp string data
-      *   r10: iteration count for comparison
-      *   r11: value to return if the first part of the string is equal
-      *   r0: reserved for result
-      *   r3, r4, r7, r8, r9, r12 available for loading string data
-      */
-
-    subs  r10, #2
-    blt   .Ldo_remainder2
-
-      /*
-       * Unroll the first two checks so we can quickly catch early mismatch
-       * on long strings (but preserve incoming alignment)
-       */
-
-    ldrh  r3, [r2, #2]!
-    ldrh  r4, [r1, #2]!
-    ldrh  r7, [r2, #2]!
-    ldrh  r8, [r1, #2]!
-    subs  r0, r3, r4
-    it    eq
-    subseq  r0, r7, r8
-    bne   .Ldone
-    cmp   r10, #28
-    bgt   .Ldo_memcmp16
-    subs  r10, #3
-    blt   .Ldo_remainder
-
-.Lloopback_triple:
-    ldrh  r3, [r2, #2]!
-    ldrh  r4, [r1, #2]!
-    ldrh  r7, [r2, #2]!
-    ldrh  r8, [r1, #2]!
-    ldrh  r9, [r2, #2]!
-    ldrh  r12,[r1, #2]!
-    subs  r0, r3, r4
-    it    eq
-    subseq  r0, r7, r8
-    it    eq
-    subseq  r0, r9, r12
-    bne   .Ldone
-    subs  r10, #3
-    bge   .Lloopback_triple
-
-.Ldo_remainder:
-    adds  r10, #3
-    beq   .Lreturn_diff
-
-.Lloopback_single:
-    ldrh  r3, [r2, #2]!
-    ldrh  r4, [r1, #2]!
-    subs  r0, r3, r4
-    bne   .Ldone
-    subs  r10, #1
-    bne   .Lloopback_single
-
-.Lreturn_diff:
-    mov   r0, r11
-    pop   {r4, r7-r12, pc}
-
-.Ldo_remainder2:
-    adds  r10, #2
-    bne   .Lloopback_single
-    mov   r0, r11
-    pop   {r4, r7-r12, pc}
-
-    /* Long string case */
-.Ldo_memcmp16:
-    mov   r7, r11
-    add   r0, r2, #2
-    add   r1, r1, #2
-    mov   r2, r10
-    bl    __memcmp16
-    cmp   r0, #0
-    it    eq
-    moveq r0, r7
-.Ldone:
-    pop   {r4, r7-r12, pc}
-END art_quick_string_compareto
-
     /* Assembly routines used to handle ABI differences. */
 
     /* double fmod(double a, double b) */
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 1618ced..bf0f647 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -80,7 +80,8 @@
 
   // Intrinsics
   qpoints->pIndexOf = art_quick_indexof;
-  qpoints->pStringCompareTo = art_quick_string_compareto;
+  // The ARM64 StringCompareTo intrinsic does not call the runtime.
+  qpoints->pStringCompareTo = nullptr;
   qpoints->pMemcpy = memcpy;
 
   // Read barrier.
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 209e7f0..8982ace 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1249,8 +1249,18 @@
      * name mismatch between instructions. This macro uses the lower 32b of register when possible.
      * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
      */
-.macro READ_BARRIER xDest, wDest, xObj, offset
+.macro READ_BARRIER xDest, wDest, xObj, xTemp, wTemp, offset, number
 #ifdef USE_READ_BARRIER
+#ifdef USE_BAKER_READ_BARRIER
+    ldr \wTemp, [\xObj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbnz \wTemp, #LOCK_WORD_READ_BARRIER_STATE_SHIFT, .Lrb_slowpath\number
+    // False dependency to avoid needing load/load fence.
+    add \xObj, \xObj, \xTemp, lsr #32
+    ldr \wDest, [\xObj, #\offset]   // Heap reference = 32b. This also zero-extends to \xDest.
+    UNPOISON_HEAP_REF \wDest
+    b .Lrb_exit\number
+#endif
+.Lrb_slowpath\number:
     // Store registers used in art_quick_aput_obj (x0-x4, LR), stack is 16B aligned.
     stp x0, x1, [sp, #-48]!
     .cfi_adjust_cfa_offset 48
@@ -1284,6 +1294,7 @@
     .cfi_restore x30
     add sp, sp, #48
     .cfi_adjust_cfa_offset -48
+.Lrb_exit\number:
 #else
     ldr \wDest, [\xObj, #\offset]   // Heap reference = 32b. This also zero-extends to \xDest.
     UNPOISON_HEAP_REF \wDest
@@ -1322,12 +1333,12 @@
 #endif
 ENTRY art_quick_aput_obj
     cbz x2, .Ldo_aput_null
-    READ_BARRIER x3, w3, x0, MIRROR_OBJECT_CLASS_OFFSET     // Heap reference = 32b
-                                                         // This also zero-extends to x3
-    READ_BARRIER x4, w4, x2, MIRROR_OBJECT_CLASS_OFFSET     // Heap reference = 32b
-                                                         // This also zero-extends to x4
-    READ_BARRIER x3, w3, x3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET // Heap reference = 32b
-                                                         // This also zero-extends to x3
+    READ_BARRIER x3, w3, x0, x3, w3, MIRROR_OBJECT_CLASS_OFFSET, 0  // Heap reference = 32b
+                                                                    // This also zero-extends to x3
+    READ_BARRIER x3, w3, x3, x4, w4, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, 1 // Heap reference = 32b
+    // This also zero-extends to x3
+    READ_BARRIER x4, w4, x2, x4, w4, MIRROR_OBJECT_CLASS_OFFSET, 2  // Heap reference = 32b
+                                                                    // This also zero-extends to x4
     cmp w3, w4  // value's type == array's component type - trivial assignability
     bne .Lcheck_assignability
 .Ldo_aput:
@@ -2208,108 +2219,3 @@
     asr   x0, x0, #1
     ret
 END art_quick_indexof
-
-   /*
-     * String's compareTo.
-     *
-     * TODO: Not very optimized.
-     *
-     * On entry:
-     *    x0:   this object pointer
-     *    x1:   comp object pointer
-     *
-     */
-    .extern __memcmp16
-ENTRY art_quick_string_compareto
-    mov    x2, x0         // x0 is return, use x2 for first input.
-    sub    x0, x2, x1     // Same string object?
-    cbnz   x0,1f
-    ret
-1:                        // Different string objects.
-
-    ldr    w4, [x2, #MIRROR_STRING_COUNT_OFFSET]
-    ldr    w3, [x1, #MIRROR_STRING_COUNT_OFFSET]
-    add    x2, x2, #MIRROR_STRING_VALUE_OFFSET
-    add    x1, x1, #MIRROR_STRING_VALUE_OFFSET
-
-    /*
-     * Now:           Data*  Count
-     *    first arg    x2      w4
-     *   second arg    x1      w3
-     */
-
-    // x0 := str1.length(w4) - str2.length(w3). ldr zero-extended w3/w4 into x3/x4.
-    subs x0, x4, x3
-    // Min(count1, count2) into w3.
-    csel x3, x3, x4, ge
-
-    // TODO: Tune this value.
-    // Check for long string, do memcmp16 for them.
-    cmp w3, #28  // Constant from arm32.
-    bgt .Ldo_memcmp16
-
-    /*
-     * Now:
-     *   x2: *first string data
-     *   x1: *second string data
-     *   w3: iteration count
-     *   x0: return value if comparison equal
-     *   x4, x5, x6, x7: free
-     */
-
-    // Do a simple unrolled loop.
-.Lloop:
-    // At least two more elements?
-    subs w3, w3, #2
-    b.lt .Lremainder_or_done
-
-    ldrh w4, [x2], #2
-    ldrh w5, [x1], #2
-
-    ldrh w6, [x2], #2
-    ldrh w7, [x1], #2
-
-    subs w4, w4, w5
-    b.ne .Lw4_result
-
-    subs w6, w6, w7
-    b.ne .Lw6_result
-
-    b .Lloop
-
-.Lremainder_or_done:
-    adds w3, w3, #1
-    b.eq .Lremainder
-    ret
-
-.Lremainder:
-    ldrh w4, [x2], #2
-    ldrh w5, [x1], #2
-    subs w4, w4, w5
-    b.ne .Lw4_result
-    ret
-
-// Result is in w4
-.Lw4_result:
-    sxtw x0, w4
-    ret
-
-// Result is in w6
-.Lw6_result:
-    sxtw x0, w6
-    ret
-
-.Ldo_memcmp16:
-    mov x14, x0                  // Save x0 and LR. __memcmp16 does not use these temps.
-    mov x15, xLR                 //                 TODO: Codify and check that?
-
-    mov x0, x2
-    uxtw x2, w3
-    bl __memcmp16
-
-    mov xLR, x15                 // Restore LR.
-
-    cmp x0, #0                   // Check the memcmp difference.
-    csel x0, x0, x14, ne         // x0 := x0 != 0 ? x14(prev x0=length diff) : x1.
-    ret
-END art_quick_string_compareto
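
The new READ_BARRIER fast path added above is the Baker-style check: load the object's lock word, branch to the existing slow path if the read-barrier state bit is set, and otherwise load the reference directly, using an add of the lock word's (always zero) upper bits to create an address dependency that orders the two loads without a load/load fence. A C++ rendering of that logic, with illustrative names in place of ART's constants and slow path:

    #include <cstddef>
    #include <cstdint>

    uint32_t ReadRefField(const uint8_t* obj, size_t offset, uint32_t rb_state_bit,
                          uint32_t (*slow_path)(const uint8_t*, size_t)) {
      // Assume the lock word sits at the start of the header for this sketch.
      uint32_t lock_word = *reinterpret_cast<const uint32_t*>(obj);
      if (lock_word & rb_state_bit) {
        return slow_path(obj, offset);  // Gray object: mark/forward via the runtime.
      }
      // Mirrors "add xObj, xObj, xTemp, lsr #32": the addend is always zero for a
      // zero-extended 32-bit load, but the address dependency keeps the field load
      // ordered after the lock-word load on ARM without a fence.
      const uint8_t* base = obj + (uint64_t{lock_word} >> 32);
      return *reinterpret_cast<const uint32_t*>(base + offset);
    }
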
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 3cdff55..a7d6d6f 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1205,8 +1205,9 @@
 
 
 TEST_F(StubTest, StringCompareTo) {
-#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || \
-    defined(__mips__) || (defined(__x86_64__) && !defined(__APPLE__))
+  // There is no StringCompareTo runtime entrypoint for __arm__ or __aarch64__.
+#if defined(__i386__) || defined(__mips__) || \
+    (defined(__x86_64__) && !defined(__APPLE__))
   // TODO: Check the "Unresolved" allocation stubs
 
   Thread* self = Thread::Current();
diff --git a/runtime/base/logging.h b/runtime/base/logging.h
index 3b5b8b5..e9e97b8 100644
--- a/runtime/base/logging.h
+++ b/runtime/base/logging.h
@@ -198,14 +198,14 @@
 // types of LHS and RHS.
 template <typename LHS, typename RHS>
 struct EagerEvaluator {
-  EagerEvaluator(LHS l, RHS r) : lhs(l), rhs(r) { }
+  constexpr EagerEvaluator(LHS l, RHS r) : lhs(l), rhs(r) { }
   LHS lhs;
   RHS rhs;
 };
 
 // Helper function for CHECK_xx.
 template <typename LHS, typename RHS>
-static inline EagerEvaluator<LHS, RHS> MakeEagerEvaluator(LHS lhs, RHS rhs) {
+static inline constexpr EagerEvaluator<LHS, RHS> MakeEagerEvaluator(LHS lhs, RHS rhs) {
   return EagerEvaluator<LHS, RHS>(lhs, rhs);
 }
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index d1713ed..2c66e4f 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -476,7 +476,7 @@
         done =  state_.CompareExchangeWeakSequentiallyConsistent(cur_state, 0 /* new state */);
         if (LIKELY(done)) {  // Spurious fail?
           // Wake a contender.
-          if (UNLIKELY(num_contenders_.LoadRelaxed() > 0)) {
+          if (UNLIKELY(num_contenders_.LoadAcquire() > 0)) {
             futex(state_.Address(), FUTEX_WAKE, 1, nullptr, nullptr, 0);
           }
         }
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 36efca1..a94620d 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -168,8 +168,10 @@
   immune_spaces_.Reset();
   bytes_moved_.StoreRelaxed(0);
   objects_moved_.StoreRelaxed(0);
-  if (GetCurrentIteration()->GetGcCause() == kGcCauseExplicit ||
-      GetCurrentIteration()->GetGcCause() == kGcCauseForNativeAlloc ||
+  GcCause gc_cause = GetCurrentIteration()->GetGcCause();
+  if (gc_cause == kGcCauseExplicit ||
+      gc_cause == kGcCauseForNativeAlloc ||
+      gc_cause == kGcCauseCollectorTransition ||
       GetCurrentIteration()->GetClearSoftReferences()) {
     force_evacuate_all_ = true;
   } else {
@@ -1617,11 +1619,18 @@
 
 // Scan ref fields of an object.
 inline void ConcurrentCopying::Scan(mirror::Object* to_ref) {
+  if (kIsDebugBuild) {
+    // Avoid all read barriers while visiting references, to help performance.
+    Thread::Current()->ModifyDebugDisallowReadBarrier(1);
+  }
   DCHECK(!region_space_->IsInFromSpace(to_ref));
   RefFieldsVisitor visitor(this);
   // Disable the read barrier for a performance reason.
   to_ref->VisitReferences</*kVisitNativeRoots*/true, kDefaultVerifyFlags, kWithoutReadBarrier>(
       visitor, visitor);
+  if (kIsDebugBuild) {
+    Thread::Current()->ModifyDebugDisallowReadBarrier(-1);
+  }
 }
 
 // Process a field.
@@ -1704,7 +1713,7 @@
   mirror::Class* int_array_class = mirror::IntArray::GetArrayClass();
   CHECK(int_array_class != nullptr);
   AssertToSpaceInvariant(nullptr, MemberOffset(0), int_array_class);
-  size_t component_size = int_array_class->GetComponentSize();
+  size_t component_size = int_array_class->GetComponentSize<kWithoutReadBarrier>();
   CHECK_EQ(component_size, sizeof(int32_t));
   size_t data_offset = mirror::Array::DataOffset(component_size).SizeValue();
   if (data_offset > byte_size) {
@@ -1717,13 +1726,14 @@
   } else {
     // Use an int array.
     dummy_obj->SetClass(int_array_class);
-    CHECK(dummy_obj->IsArrayInstance());
+    CHECK((dummy_obj->IsArrayInstance<kVerifyNone, kWithoutReadBarrier>()));
     int32_t length = (byte_size - data_offset) / component_size;
-    dummy_obj->AsArray()->SetLength(length);
-    CHECK_EQ(dummy_obj->AsArray()->GetLength(), length)
+    mirror::Array* dummy_arr = dummy_obj->AsArray<kVerifyNone, kWithoutReadBarrier>();
+    dummy_arr->SetLength(length);
+    CHECK_EQ(dummy_arr->GetLength(), length)
         << "byte_size=" << byte_size << " length=" << length
         << " component_size=" << component_size << " data_offset=" << data_offset;
-    CHECK_EQ(byte_size, dummy_obj->SizeOf())
+    CHECK_EQ(byte_size, (dummy_obj->SizeOf<kVerifyNone, kWithoutReadBarrier>()))
         << "byte_size=" << byte_size << " length=" << length
         << " component_size=" << component_size << " data_offset=" << data_offset;
   }
@@ -1830,13 +1840,23 @@
   }
   DCHECK(to_ref != nullptr);
 
+  // Copy the object excluding the lock word since that is handled in the loop.
+  to_ref->SetClass(from_ref->GetClass<kVerifyNone, kWithoutReadBarrier>());
+  const size_t kObjectHeaderSize = sizeof(mirror::Object);
+  DCHECK_GE(obj_size, kObjectHeaderSize);
+  static_assert(kObjectHeaderSize == sizeof(mirror::HeapReference<mirror::Class>) +
+                    sizeof(LockWord),
+                "Object header size does not match");
+  // Memcpy can tear word-sized values since it may copy byte by byte. This is safe only
+  // because the object in the from space is immutable, other than the lock word. b/31423258
+  memcpy(reinterpret_cast<uint8_t*>(to_ref) + kObjectHeaderSize,
+         reinterpret_cast<const uint8_t*>(from_ref) + kObjectHeaderSize,
+         obj_size - kObjectHeaderSize);
+
   // Attempt to install the forward pointer. This is in a loop as the
   // lock word atomic write can fail.
   while (true) {
-    // Copy the object. TODO: copy only the lockword in the second iteration and on?
-    memcpy(to_ref, from_ref, obj_size);
-
-    LockWord old_lock_word = to_ref->GetLockWord(false);
+    LockWord old_lock_word = from_ref->GetLockWord(false);
 
     if (old_lock_word.GetState() == LockWord::kForwardingAddress) {
       // Lost the race. Another thread (either GC or mutator) stored
@@ -1879,6 +1899,8 @@
       return to_ref;
     }
 
+    // Copy the old lock word over since we did not copy it yet.
+    to_ref->SetLockWord(old_lock_word, false);
     // Set the gray ptr.
     if (kUseBakerReadBarrier) {
       to_ref->SetReadBarrierPointer(ReadBarrier::GrayPtr());
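
The concurrent-copying hunks above restructure object forwarding: the payload is copied once, before the CAS retry loop (safe despite possible memcpy tearing because everything but the lock word is immutable in from-space), and the old lock word is installed into the copy only just before publishing the forwarding address. An abstract sketch of that protocol with std::atomic; the one-word Header and kForwardingTag are simplifications, not ART's real layout:

    #include <atomic>
    #include <cstdint>
    #include <cstring>

    struct Header { std::atomic<uintptr_t> word; };  // Simplified class/lock-word slot.

    void* Forward(Header* from, const void* from_payload, Header* to,
                  size_t payload_size, uintptr_t kForwardingTag) {
      // Copy once, outside the loop; the payload cannot change under us.
      std::memcpy(to + 1, from_payload, payload_size);
      while (true) {
        uintptr_t old_word = from->word.load(std::memory_order_acquire);
        if (old_word & kForwardingTag) {
          // Lost the race: another thread already forwarded this object.
          return reinterpret_cast<void*>(old_word & ~kForwardingTag);
        }
        // Carry the old header word into the copy before publishing
        // (the SetLockWord call in the hunk above).
        to->word.store(old_word, std::memory_order_relaxed);
        uintptr_t fwd = reinterpret_cast<uintptr_t>(to) | kForwardingTag;
        if (from->word.compare_exchange_weak(old_word, fwd,
                                             std::memory_order_release,
                                             std::memory_order_relaxed)) {
          return to;  // Won the race: this to-space copy is canonical.
        }
      }
    }
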
diff --git a/runtime/gc/collector_type.h b/runtime/gc/collector_type.h
index f14d086..5009d7e 100644
--- a/runtime/gc/collector_type.h
+++ b/runtime/gc/collector_type.h
@@ -40,6 +40,8 @@
   kCollectorTypeHeapTrim,
   // A (mostly) concurrent copying collector.
   kCollectorTypeCC,
+  // The background compaction of the concurrent copying collector.
+  kCollectorTypeCCBackground,
   // Instrumentation critical section fake collector.
   kCollectorTypeInstrumentation,
   // Fake collector for adding or removing application image spaces.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index e9c71b4..730fcd7 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -718,6 +718,7 @@
 }
 
 void Heap::DisableMovingGc() {
+  CHECK(!kUseReadBarrier);
   if (IsMovingGc(foreground_collector_type_)) {
     foreground_collector_type_ = kCollectorTypeCMS;
   }
@@ -953,7 +954,8 @@
       // Don't delay for debug builds since we may want to stress test the GC.
       // If background_collector_type_ is kCollectorTypeHomogeneousSpaceCompact then we have
       // special handling which does a homogenous space compaction once but then doesn't transition
-      // the collector.
+      // the collector. Similarly, we invoke a full compaction for kCollectorTypeCC but don't
+      // transition the collector.
       RequestCollectorTransition(background_collector_type_,
                                  kIsDebugBuild ? 0 : kCollectorTransitionWait);
     }
@@ -1367,6 +1369,16 @@
     } else {
       VLOG(gc) << "Homogeneous compaction ignored due to jank perceptible process state";
     }
+  } else if (desired_collector_type == kCollectorTypeCCBackground) {
+    DCHECK(kUseReadBarrier);
+    if (!CareAboutPauseTimes()) {
+      // Invoke CC full compaction.
+      CollectGarbageInternal(collector::kGcTypeFull,
+                             kGcCauseCollectorTransition,
+                             /*clear_soft_references*/false);
+    } else {
+      VLOG(gc) << "CC background compaction ignored due to jank perceptible process state";
+    }
   } else {
     TransitionCollector(desired_collector_type);
   }
@@ -1824,6 +1836,10 @@
         break;
       }
       case kAllocatorTypeNonMoving: {
+        if (kUseReadBarrier) {
+          // DisableMovingGc() isn't compatible with CC.
+          break;
+        }
         // Try to transition the heap if the allocation failure was due to the space being full.
         if (!IsOutOfMemoryOnAllocation<false>(allocator, alloc_size)) {
           // If we aren't out of memory then the OOM was probably from the non moving space being
@@ -2092,6 +2108,8 @@
 }
 
 void Heap::TransitionCollector(CollectorType collector_type) {
+  // Collector transition must not happen with CC.
+  CHECK(!kUseReadBarrier);
   if (collector_type == collector_type_) {
     return;
   }
@@ -3757,6 +3775,12 @@
   if (desired_collector_type_ == collector_type_ || !CanAddHeapTask(self)) {
     return;
   }
+  if (collector_type_ == kCollectorTypeCC) {
+    // For CC, we invoke a full compaction when going to the background, but the collector type
+    // doesn't change.
+    DCHECK_EQ(desired_collector_type_, kCollectorTypeCCBackground);
+  }
+  DCHECK_NE(collector_type_, kCollectorTypeCCBackground);
   CollectorTransitionTask* added_task = nullptr;
   const uint64_t target_time = NanoTime() + delta_time;
   {
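
Taken together, the heap.cc hunks make the CC collector's background behavior a one-off full compaction rather than a collector swap. A much-simplified, stand-alone sketch of that dispatch (hypothetical stubs standing in for the real Heap methods):

#include <iostream>

enum CollectorType { kCollectorTypeCMS, kCollectorTypeCC, kCollectorTypeCCBackground };

bool care_about_pause_times = false;  // False once the app is in the background.

void CollectGarbageFull() { std::cout << "CC full compaction\n"; }
void TransitionCollector(CollectorType t) { std::cout << "transition to " << t << "\n"; }

void DoTransition(CollectorType desired) {
  if (desired == kCollectorTypeCCBackground) {
    // Compact once; the collector type stays kCollectorTypeCC throughout.
    if (!care_about_pause_times) {
      CollectGarbageFull();
    }
  } else {
    // Only non-CC configurations reach a real transition (CHECK(!kUseReadBarrier)).
    TransitionCollector(desired);
  }
}

int main() {
  DoTransition(kCollectorTypeCCBackground);  // Prints "CC full compaction".
}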
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 2a1a4a1..7661439 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -535,7 +535,7 @@
   void DumpForSigQuit(std::ostream& os) REQUIRES(!*gc_complete_lock_, !native_histogram_lock_);
 
   // Do a pending collector transition.
-  void DoPendingCollectorTransition() REQUIRES(!*gc_complete_lock_);
+  void DoPendingCollectorTransition() REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
 
   // Deflate monitors, ... and trim the spaces.
   void Trim(Thread* self) REQUIRES(!*gc_complete_lock_);
@@ -706,8 +706,6 @@
     if (IsGcConcurrent() && IsMovingGc(collector_type_)) {
       // Assume no transition when a concurrent moving collector is used.
       DCHECK_EQ(collector_type_, foreground_collector_type_);
-      DCHECK_EQ(foreground_collector_type_, background_collector_type_)
-          << "Assume no transition such that collector_type_ won't change";
       return true;
     }
     return false;
@@ -826,6 +824,7 @@
         collector_type == kCollectorTypeSS ||
         collector_type == kCollectorTypeGSS ||
         collector_type == kCollectorTypeCC ||
+        collector_type == kCollectorTypeCCBackground ||
         collector_type == kCollectorTypeMC ||
         collector_type == kCollectorTypeHomogeneousSpaceCompact;
   }
@@ -995,7 +994,9 @@
   // What kind of concurrency behavior is the runtime after? Currently true for concurrent
   // mark sweep and concurrent copying GCs, false for other GC types.
   bool IsGcConcurrent() const ALWAYS_INLINE {
-    return collector_type_ == kCollectorTypeCMS || collector_type_ == kCollectorTypeCC;
+    return collector_type_ == kCollectorTypeCMS ||
+        collector_type_ == kCollectorTypeCC ||
+        collector_type_ == kCollectorTypeCCBackground;
   }
 
   // Trim the managed and native spaces by releasing unused memory back to the OS.
diff --git a/runtime/gc/reference_queue.cc b/runtime/gc/reference_queue.cc
index 03ab9a1..52c9093 100644
--- a/runtime/gc/reference_queue.cc
+++ b/runtime/gc/reference_queue.cc
@@ -44,7 +44,9 @@
     // 1 element cyclic queue, ie: Reference ref = ..; ref.pendingNext = ref;
     list_ = ref;
   } else {
-    mirror::Reference* head = list_->GetPendingNext();
+    // The list is owned by the GC; everything that has been inserted must already be at least
+    // gray.
+    mirror::Reference* head = list_->GetPendingNext<kWithoutReadBarrier>();
     DCHECK(head != nullptr);
     ref->SetPendingNext(head);
   }
@@ -54,14 +56,14 @@
 
 mirror::Reference* ReferenceQueue::DequeuePendingReference() {
   DCHECK(!IsEmpty());
-  mirror::Reference* ref = list_->GetPendingNext();
+  mirror::Reference* ref = list_->GetPendingNext<kWithoutReadBarrier>();
   DCHECK(ref != nullptr);
   // Note: the following code is thread-safe because it is only called from ProcessReferences which
   // is single threaded.
   if (list_ == ref) {
     list_ = nullptr;
   } else {
-    mirror::Reference* next = ref->GetPendingNext();
+    mirror::Reference* next = ref->GetPendingNext<kWithoutReadBarrier>();
     list_->SetPendingNext(next);
   }
   ref->SetPendingNext(nullptr);
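
The queue hunks replace barrier'd loads with GetPendingNext<kWithoutReadBarrier>() on the strength of an invariant: the list is GC-owned and everything enqueued on it is already at least gray, so a raw load is safe and cheaper. A very loose stand-alone analogy (toy types, not Baker's actual barrier machinery):

#include <cassert>

struct Ref {
  Ref* pending_next = nullptr;
  bool gray = false;  // Toy stand-in for the read barrier state.
};

// Mutator-style read: the barrier grays the object before exposing it.
Ref* ReadWithBarrier(Ref* r) {
  if (r != nullptr) r->gray = true;
  return r;
}

// GC-owned read: skip the barrier; the enqueue path already grayed the object.
Ref* ReadWithoutBarrier(Ref* r) { return r; }

int main() {
  Ref head, next;
  next.gray = true;  // Invariant: enqueued references are already at least gray.
  head.pending_next = &next;
  assert(ReadWithoutBarrier(head.pending_next)->gray);  // The barrier was redundant here.
  return 0;
}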
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index 9a2d0c6..f890e41 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -344,8 +344,7 @@
     if (r->IsFree()) {
       r->Unfree(time_);
       ++num_non_free_regions_;
-      // TODO: this is buggy. Debug it.
-      // r->SetNewlyAllocated();
+      r->SetNewlyAllocated();
       r->SetTop(r->End());
       r->is_a_tlab_ = true;
       r->thread_ = self;
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index eceb593..6b3dcf1 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -316,8 +316,14 @@
 
 mirror::String* InternTable::InternStrong(int32_t utf16_length, const char* utf8_data) {
   DCHECK(utf8_data != nullptr);
+  Thread* self = Thread::Current();
+  // Try to avoid allocation.
+  mirror::String* s = LookupStrong(self, utf16_length, utf8_data);
+  if (s != nullptr) {
+    return s;
+  }
   return InternStrong(mirror::String::AllocFromModifiedUtf8(
-      Thread::Current(), utf16_length, utf8_data));
+      self, utf16_length, utf8_data));
 }
 
 mirror::String* InternTable::InternStrong(const char* utf8_data) {
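
The InternStrong() change above is a lookup-before-allocate fast path: if an equal string is already interned, it is returned without allocating a new mirror::String. The same shape in generic C++ (std::unordered_set standing in for the intern table):

#include <string>
#include <unordered_set>

const std::string* Intern(std::unordered_set<std::string>& table, const char* utf8_data) {
  std::string key(utf8_data);
  auto it = table.find(key);  // Try to avoid allocating a new entry.
  if (it != table.end()) {
    return &*it;
  }
  return &*table.insert(std::move(key)).first;  // Slow path: allocate and insert.
}

int main() {
  std::unordered_set<std::string> table;
  const std::string* a = Intern(table, "foo");
  const std::string* b = Intern(table, "foo");
  return a == b ? 0 : 1;  // The second call hits the fast path.
}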
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 26dae7c..8cbb582 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -471,7 +471,7 @@
 
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   size_t GetComponentSize() SHARED_REQUIRES(Locks::mutator_lock_) {
-    return 1U << GetComponentSizeShift();
+    return 1U << GetComponentSizeShift<kReadBarrierOption>();
   }
 
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
diff --git a/runtime/mirror/reference.h b/runtime/mirror/reference.h
index 3baa12e..e8ad5fa 100644
--- a/runtime/mirror/reference.h
+++ b/runtime/mirror/reference.h
@@ -76,8 +76,9 @@
     SetFieldObjectVolatile<kTransactionActive>(ReferentOffset(), nullptr);
   }
 
+  template <ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   Reference* GetPendingNext() SHARED_REQUIRES(Locks::mutator_lock_) {
-    return GetFieldObject<Reference>(PendingNextOffset());
+    return GetFieldObject<Reference, kDefaultVerifyFlags, kReadBarrierOption>(PendingNextOffset());
   }
 
   void SetPendingNext(Reference* pending_next)
@@ -102,7 +103,7 @@
   // removed from the list after having determined the reference is not ready
   // to be enqueued on a java ReferenceQueue.
   bool IsUnprocessed() SHARED_REQUIRES(Locks::mutator_lock_) {
-    return GetPendingNext() == nullptr;
+    return GetPendingNext<kWithoutReadBarrier>() == nullptr;
   }
 
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
index 9da44a4..2b3d10c 100644
--- a/runtime/native/dalvik_system_ZygoteHooks.cc
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -34,7 +34,9 @@
 #if defined(__linux__)
 #include <sys/prctl.h>
 #endif
-
+#ifdef __ANDROID__
+#include <cutils/properties.h>
+#endif
 #include <sys/resource.h>
 
 namespace art {
@@ -59,7 +61,18 @@
 #endif
   // We don't want core dumps, though, so set the core dump size to 0.
   rlimit rl;
+#ifdef __ANDROID__
+  char prop_value[PROPERTY_VALUE_MAX];
+  property_get("persist.debug.trace", prop_value, "0");
+  if (prop_value[0] == '1') {
+    LOG(INFO) << "setting RLIMIT_CORE to infinity for process " << getpid();
+    rl.rlim_cur = RLIM_INFINITY;
+  } else {
+    rl.rlim_cur = 0;
+  }
+#else
   rl.rlim_cur = 0;
+#endif
   rl.rlim_max = RLIM_INFINITY;
   if (setrlimit(RLIMIT_CORE, &rl) == -1) {
     PLOG(ERROR) << "setrlimit(RLIMIT_CORE) failed for pid " << getpid();
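
A stand-alone sketch of the conditional core-dump limit the hunk above adds. Here getenv() is only a portable stand-in for the Android-only property_get() from <cutils/properties.h>, and the variable name is hypothetical:

#include <cstdio>
#include <cstdlib>
#include <sys/resource.h>

void SetCoreDumpLimit() {
  rlimit rl;
  const char* trace = std::getenv("PERSIST_DEBUG_TRACE");  // Hypothetical stand-in.
  // Allow core dumps only when tracing is explicitly enabled.
  rl.rlim_cur = (trace != nullptr && trace[0] == '1') ? RLIM_INFINITY : 0;
  rl.rlim_max = RLIM_INFINITY;
  if (setrlimit(RLIMIT_CORE, &rl) == -1) {
    std::perror("setrlimit(RLIMIT_CORE)");
  }
}

int main() {
  SetCoreDumpLimit();
  return 0;
}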
diff --git a/runtime/thread.cc b/runtime/thread.cc
index f3b39c9..fca12ed 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -16,6 +16,10 @@
 
 #include "thread.h"
 
+#if !defined(__APPLE__)
+#include <sched.h>
+#endif
+
 #include <pthread.h>
 #include <signal.h>
 #include <sys/resource.h>
@@ -1293,8 +1297,21 @@
   if (thread != nullptr) {
     int policy;
     sched_param sp;
+#if !defined(__APPLE__)
+    // b/36445592 Don't use pthread_getschedparam since pthread may have exited.
+    policy = sched_getscheduler(tid);
+    if (policy == -1) {
+      PLOG(WARNING) << "sched_getscheduler(" << tid << ")";
+    }
+    int sched_getparam_result = sched_getparam(tid, &sp);
+    if (sched_getparam_result == -1) {
+      PLOG(WARNING) << "sched_getparam(" << tid << ", &sp)";
+      sp.sched_priority = -1;
+    }
+#else
     CHECK_PTHREAD_CALL(pthread_getschedparam, (thread->tlsPtr_.pthread_self, &policy, &sp),
                        __FUNCTION__);
+#endif
     os << " sched=" << policy << "/" << sp.sched_priority
        << " handle=" << reinterpret_cast<void*>(thread->tlsPtr_.pthread_self);
   }
diff --git a/test/004-UnsafeTest/src/Main.java b/test/004-UnsafeTest/src/Main.java
index b2f905e..9d4618a 100644
--- a/test/004-UnsafeTest/src/Main.java
+++ b/test/004-UnsafeTest/src/Main.java
@@ -39,16 +39,24 @@
     }
   }
 
-  private static Unsafe getUnsafe() throws Exception {
+  private static Unsafe getUnsafe() throws NoSuchFieldException, IllegalAccessException {
     Class<?> unsafeClass = Unsafe.class;
     Field f = unsafeClass.getDeclaredField("theUnsafe");
     f.setAccessible(true);
     return (Unsafe) f.get(null);
   }
 
-  public static void main(String[] args) throws Exception {
+  public static void main(String[] args) throws NoSuchFieldException, IllegalAccessException {
     System.loadLibrary(args[0]);
     Unsafe unsafe = getUnsafe();
+
+    testArrayBaseOffset(unsafe);
+    testArrayIndexScale(unsafe);
+    testGetAndPutAndCAS(unsafe);
+    testGetAndPutVolatile(unsafe);
+  }
+
+  private static void testArrayBaseOffset(Unsafe unsafe) {
     check(unsafe.arrayBaseOffset(boolean[].class), vmArrayBaseOffset(boolean[].class),
         "Unsafe.arrayBaseOffset(boolean[])");
     check(unsafe.arrayBaseOffset(byte[].class), vmArrayBaseOffset(byte[].class),
@@ -65,7 +73,9 @@
         "Unsafe.arrayBaseOffset(long[])");
     check(unsafe.arrayBaseOffset(Object[].class), vmArrayBaseOffset(Object[].class),
         "Unsafe.arrayBaseOffset(Object[])");
+  }
 
+  private static void testArrayIndexScale(Unsafe unsafe) {
     check(unsafe.arrayIndexScale(boolean[].class), vmArrayIndexScale(boolean[].class),
         "Unsafe.arrayIndexScale(boolean[])");
     check(unsafe.arrayIndexScale(byte[].class), vmArrayIndexScale(byte[].class),
@@ -82,7 +92,9 @@
         "Unsafe.arrayIndexScale(long[])");
     check(unsafe.arrayIndexScale(Object[].class), vmArrayIndexScale(Object[].class),
         "Unsafe.arrayIndexScale(Object[])");
+  }
 
+  private static void testGetAndPutAndCAS(Unsafe unsafe) throws NoSuchFieldException {
     TestClass t = new TestClass();
 
     int intValue = 12345678;
@@ -185,12 +197,58 @@
     }
   }
 
+  private static void testGetAndPutVolatile(Unsafe unsafe) throws NoSuchFieldException {
+    TestVolatileClass tv = new TestVolatileClass();
+
+    int intValue = 12345678;
+    Field volatileIntField = TestVolatileClass.class.getDeclaredField("volatileIntVar");
+    long volatileIntOffset = unsafe.objectFieldOffset(volatileIntField);
+    check(unsafe.getIntVolatile(tv, volatileIntOffset),
+          0,
+          "Unsafe.getIntVolatile(Object, long) - initial");
+    unsafe.putIntVolatile(tv, volatileIntOffset, intValue);
+    check(tv.volatileIntVar, intValue, "Unsafe.putIntVolatile(Object, long, int)");
+    check(unsafe.getIntVolatile(tv, volatileIntOffset),
+          intValue,
+          "Unsafe.getIntVolatile(Object, long)");
+
+    long longValue = 1234567887654321L;
+    Field volatileLongField = TestVolatileClass.class.getDeclaredField("volatileLongVar");
+    long volatileLongOffset = unsafe.objectFieldOffset(volatileLongField);
+    check(unsafe.getLongVolatile(tv, volatileLongOffset),
+          0,
+          "Unsafe.getLongVolatile(Object, long) - initial");
+    unsafe.putLongVolatile(tv, volatileLongOffset, longValue);
+    check(tv.volatileLongVar, longValue, "Unsafe.putLongVolatile(Object, long, long)");
+    check(unsafe.getLongVolatile(tv, volatileLongOffset),
+          longValue,
+          "Unsafe.getLongVolatile(Object, long)");
+
+    Object objectValue = new Object();
+    Field volatileObjectField = TestVolatileClass.class.getDeclaredField("volatileObjectVar");
+    long volatileObjectOffset = unsafe.objectFieldOffset(volatileObjectField);
+    check(unsafe.getObjectVolatile(tv, volatileObjectOffset),
+          null,
+          "Unsafe.getObjectVolatile(Object, long) - initial");
+    unsafe.putObjectVolatile(tv, volatileObjectOffset, objectValue);
+    check(tv.volatileObjectVar, objectValue, "Unsafe.putObjectVolatile(Object, long, Object)");
+    check(unsafe.getObjectVolatile(tv, volatileObjectOffset),
+          objectValue,
+          "Unsafe.getObjectVolatile(Object, long)");
+  }
+
   private static class TestClass {
     public int intVar = 0;
     public long longVar = 0;
     public Object objectVar = null;
   }
 
+  private static class TestVolatileClass {
+    public volatile int volatileIntVar = 0;
+    public volatile long volatileLongVar = 0;
+    public volatile Object volatileObjectVar = null;
+  }
+
   private static native int vmArrayBaseOffset(Class clazz);
   private static native int vmArrayIndexScale(Class clazz);
 }
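
The new testGetAndPutVolatile() exercises Unsafe's volatile accessors, which carry Java-volatile (sequentially consistent) semantics. Under that assumption, one get/put round trip has this C++ analogue (std::atomic defaults to seq_cst ordering):

#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> volatile_int{0};          // volatileIntVar analogue.
  assert(volatile_int.load() == 0);          // getIntVolatile - initial.
  volatile_int.store(12345678);              // putIntVolatile.
  assert(volatile_int.load() == 12345678);   // getIntVolatile.
  return 0;
}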
diff --git a/test/458-checker-instruction-simplification/src/Main.java b/test/458-checker-instruction-simplification/src/Main.java
index 53c2e0b..fd72fe5 100644
--- a/test/458-checker-instruction-simplification/src/Main.java
+++ b/test/458-checker-instruction-simplification/src/Main.java
@@ -1674,6 +1674,108 @@
     }
   }
 
+  /// CHECK-START: int Main.$noinline$intUnnecessaryShiftMasking(int, int) instruction_simplifier (before)
+  /// CHECK:          <<Value:i\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const31:i\d+>>  IntConstant 31
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const31>>]
+  /// CHECK-DAG:      <<Shl:i\d+>>      Shl [<<Value>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<Shl>>]
+
+  /// CHECK-START: int Main.$noinline$intUnnecessaryShiftMasking(int, int) instruction_simplifier (after)
+  /// CHECK:          <<Value:i\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Shl:i\d+>>      Shl [<<Value>>,<<Shift>>]
+  /// CHECK-DAG:                        Return [<<Shl>>]
+
+  public static int $noinline$intUnnecessaryShiftMasking(int value, int shift) {
+    if (doThrow) { throw new Error(); }
+    return value << (shift & 31);
+  }
+
+  /// CHECK-START: long Main.$noinline$longUnnecessaryShiftMasking(long, int) instruction_simplifier (before)
+  /// CHECK:          <<Value:j\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const63:i\d+>>  IntConstant 63
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const63>>]
+  /// CHECK-DAG:      <<Shr:j\d+>>      Shr [<<Value>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<Shr>>]
+
+  /// CHECK-START: long Main.$noinline$longUnnecessaryShiftMasking(long, int) instruction_simplifier (after)
+  /// CHECK:          <<Value:j\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Shr:j\d+>>      Shr [<<Value>>,<<Shift>>]
+  /// CHECK-DAG:                        Return [<<Shr>>]
+
+  public static long $noinline$longUnnecessaryShiftMasking(long value, int shift) {
+    if (doThrow) { throw new Error(); }
+    return value >> (shift & 63);
+  }
+
+  /// CHECK-START: int Main.$noinline$intUnnecessaryWiderShiftMasking(int, int) instruction_simplifier (before)
+  /// CHECK:          <<Value:i\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const255:i\d+>> IntConstant 255
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const255>>]
+  /// CHECK-DAG:      <<UShr:i\d+>>     UShr [<<Value>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<UShr>>]
+
+  /// CHECK-START: int Main.$noinline$intUnnecessaryWiderShiftMasking(int, int) instruction_simplifier (after)
+  /// CHECK:          <<Value:i\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<UShr:i\d+>>     UShr [<<Value>>,<<Shift>>]
+  /// CHECK-DAG:                        Return [<<UShr>>]
+
+  public static int $noinline$intUnnecessaryWiderShiftMasking(int value, int shift) {
+    if (doThrow) { throw new Error(); }
+    return value >>> (shift & 0xff);
+  }
+
+  /// CHECK-START: long Main.$noinline$longSmallerShiftMasking(long, int) instruction_simplifier (before)
+  /// CHECK:          <<Value:j\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const3:i\d+>>   IntConstant 3
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const3>>]
+  /// CHECK-DAG:      <<Shl:j\d+>>      Shl [<<Value>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<Shl>>]
+
+  /// CHECK-START: long Main.$noinline$longSmallerShiftMasking(long, int) instruction_simplifier (after)
+  /// CHECK:          <<Value:j\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const3:i\d+>>   IntConstant 3
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const3>>]
+  /// CHECK-DAG:      <<Shl:j\d+>>      Shl [<<Value>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<Shl>>]
+
+  public static long $noinline$longSmallerShiftMasking(long value, int shift) {
+    if (doThrow) { throw new Error(); }
+    return value << (shift & 3);
+  }
+
+  /// CHECK-START: int Main.$noinline$otherUseOfUnnecessaryShiftMasking(int, int) instruction_simplifier (before)
+  /// CHECK:          <<Value:i\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const31:i\d+>>  IntConstant 31
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const31>>]
+  /// CHECK-DAG:      <<Shr:i\d+>>      Shr [<<Value>>,<<And>>]
+  /// CHECK-DAG:      <<Add:i\d+>>      Add [<<Shr>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<Add>>]
+
+  /// CHECK-START: int Main.$noinline$otherUseOfUnnecessaryShiftMasking(int, int) instruction_simplifier (after)
+  /// CHECK:          <<Value:i\d+>>    ParameterValue
+  /// CHECK:          <<Shift:i\d+>>    ParameterValue
+  /// CHECK-DAG:      <<Const31:i\d+>>  IntConstant 31
+  /// CHECK-DAG:      <<And:i\d+>>      And [<<Shift>>,<<Const31>>]
+  /// CHECK-DAG:      <<Shr:i\d+>>      Shr [<<Value>>,<<Shift>>]
+  /// CHECK-DAG:      <<Add:i\d+>>      Add [<<Shr>>,<<And>>]
+  /// CHECK-DAG:                        Return [<<Add>>]
+
+  public static int $noinline$otherUseOfUnnecessaryShiftMasking(int value, int shift) {
+    if (doThrow) { throw new Error(); }
+    int temp = shift & 31;
+    return (value >> temp) + temp;
+  }
+
 public static void main(String[] args) {
     int arg = 123456;
 
@@ -1823,6 +1925,17 @@
         }
       }
     }
+
+    assertIntEquals(0x5e6f7808, $noinline$intUnnecessaryShiftMasking(0xabcdef01, 3));
+    assertIntEquals(0x5e6f7808, $noinline$intUnnecessaryShiftMasking(0xabcdef01, 3 + 32));
+    assertLongEquals(0xffffffffffffeaf3L, $noinline$longUnnecessaryShiftMasking(0xabcdef0123456789L, 50));
+    assertLongEquals(0xffffffffffffeaf3L, $noinline$longUnnecessaryShiftMasking(0xabcdef0123456789L, 50 + 64));
+    assertIntEquals(0x2af37b, $noinline$intUnnecessaryWiderShiftMasking(0xabcdef01, 10));
+    assertIntEquals(0x2af37b, $noinline$intUnnecessaryWiderShiftMasking(0xabcdef01, 10 + 128));
+    assertLongEquals(0xaf37bc048d159e24L, $noinline$longSmallerShiftMasking(0xabcdef0123456789L, 2));
+    assertLongEquals(0xaf37bc048d159e24L, $noinline$longSmallerShiftMasking(0xabcdef0123456789L, 2 + 256));
+    assertIntEquals(0xfffd5e7c, $noinline$otherUseOfUnnecessaryShiftMasking(0xabcdef01, 13));
+    assertIntEquals(0xfffd5e7c, $noinline$otherUseOfUnnecessaryShiftMasking(0xabcdef01, 13 + 512));
   }
 
   private static boolean $inline$true() { return true; }
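
The masking in these tests is redundant because Java defines x << s on int as shifting by s & 31 (s & 63 for long), which is exactly the identity the simplifier relies on. A small C++ model of the int case (modeling the Java semantics; a raw C++ shift by >= the bit width would be undefined):

#include <cassert>
#include <cstdint>

// Java's int shift: the shift amount is implicitly masked to 5 bits.
int32_t JavaShl(int32_t value, int32_t shift) {
  return static_cast<int32_t>(static_cast<uint32_t>(value) << (shift & 31));
}

int main() {
  const int32_t v = static_cast<int32_t>(0xABCDEF01u);
  assert(JavaShl(v, 3) == 0x5E6F7808);         // Matches the test's expected value.
  assert(JavaShl(v, 3 + 32) == JavaShl(v, 3)); // Out-of-range amounts wrap mod 32.
  return 0;
}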
diff --git a/test/536-checker-intrinsic-optimization/src/Main.java b/test/536-checker-intrinsic-optimization/src/Main.java
index be666e9..24ed2fe 100644
--- a/test/536-checker-intrinsic-optimization/src/Main.java
+++ b/test/536-checker-intrinsic-optimization/src/Main.java
@@ -16,9 +16,69 @@
 
 
 public class Main {
+  public static boolean doThrow = false;
+
+  public static void assertIntEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void assertBooleanEquals(boolean expected, boolean result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   public static void main(String[] args) {
     stringEqualsSame();
     stringArgumentNotNull("Foo");
+
+    assertIntEquals(0, $opt$noinline$getStringLength(""));
+    assertIntEquals(3, $opt$noinline$getStringLength("abc"));
+    assertIntEquals(10, $opt$noinline$getStringLength("0123456789"));
+
+    assertBooleanEquals(true, $opt$noinline$isStringEmpty(""));
+    assertBooleanEquals(false, $opt$noinline$isStringEmpty("abc"));
+    assertBooleanEquals(false, $opt$noinline$isStringEmpty("0123456789"));
+  }
+
+  /// CHECK-START: int Main.$opt$noinline$getStringLength(java.lang.String) instruction_simplifier (before)
+  /// CHECK-DAG:  <<Length:i\d+>>   InvokeVirtual intrinsic:StringLength
+  /// CHECK-DAG:                    Return [<<Length>>]
+
+  /// CHECK-START: int Main.$opt$noinline$getStringLength(java.lang.String) instruction_simplifier (after)
+  /// CHECK-DAG:  <<String:l\d+>>   ParameterValue
+  /// CHECK-DAG:  <<NullCk:l\d+>>   NullCheck [<<String>>]
+  /// CHECK-DAG:  <<Length:i\d+>>   ArrayLength [<<NullCk>>] is_string_length:true
+  /// CHECK-DAG:                    Return [<<Length>>]
+
+  /// CHECK-START: int Main.$opt$noinline$getStringLength(java.lang.String) instruction_simplifier (after)
+  /// CHECK-NOT:                    InvokeVirtual intrinsic:StringLength
+
+  static public int $opt$noinline$getStringLength(String s) {
+    if (doThrow) { throw new Error(); }
+    return s.length();
+  }
+
+  /// CHECK-START: boolean Main.$opt$noinline$isStringEmpty(java.lang.String) instruction_simplifier (before)
+  /// CHECK-DAG:  <<IsEmpty:z\d+>>  InvokeVirtual intrinsic:StringIsEmpty
+  /// CHECK-DAG:                    Return [<<IsEmpty>>]
+
+  /// CHECK-START: boolean Main.$opt$noinline$isStringEmpty(java.lang.String) instruction_simplifier (after)
+  /// CHECK-DAG:  <<String:l\d+>>   ParameterValue
+  /// CHECK-DAG:  <<Const0:i\d+>>   IntConstant 0
+  /// CHECK-DAG:  <<NullCk:l\d+>>   NullCheck [<<String>>]
+  /// CHECK-DAG:  <<Length:i\d+>>   ArrayLength [<<NullCk>>] is_string_length:true
+  /// CHECK-DAG:  <<IsEmpty:z\d+>>  Equal [<<Length>>,<<Const0>>]
+  /// CHECK-DAG:                    Return [<<IsEmpty>>]
+
+  /// CHECK-START: boolean Main.$opt$noinline$isStringEmpty(java.lang.String) instruction_simplifier (after)
+  /// CHECK-NOT:                    InvokeVirtual intrinsic:StringIsEmpty
+
+  static public boolean $opt$noinline$isStringEmpty(String s) {
+    if (doThrow) { throw new Error(); }
+    return s.isEmpty();
   }
 
   /// CHECK-START: boolean Main.stringEqualsSame() instruction_simplifier (before)
@@ -47,8 +107,28 @@
   }
 
   /// CHECK-START-X86: boolean Main.stringArgumentNotNull(java.lang.Object) disassembly (after)
-  /// CHECK:          InvokeVirtual {{.*\.equals.*}}
+  /// CHECK:          InvokeVirtual {{.*\.equals.*}} intrinsic:StringEquals
   /// CHECK-NOT:      test
+
+  /// CHECK-START-X86_64: boolean Main.stringArgumentNotNull(java.lang.Object) disassembly (after)
+  /// CHECK:          InvokeVirtual {{.*\.equals.*}} intrinsic:StringEquals
+  /// CHECK-NOT:      test
+
+  /// CHECK-START-ARM: boolean Main.stringArgumentNotNull(java.lang.Object) disassembly (after)
+  /// CHECK:          InvokeVirtual {{.*\.equals.*}} intrinsic:StringEquals
+  // CompareAndBranchIfZero() may emit either CBZ or CMP+BEQ.
+  /// CHECK-NOT:      cbz
+  /// CHECK-NOT:      cmp {{r\d+}}, #0
+  // Terminate the scope for the CHECK-NOT search at the reference or length comparison,
+  // whichever comes first.
+  /// CHECK:          cmp {{r\d+}}, {{r\d+}}
+
+  /// CHECK-START-ARM64: boolean Main.stringArgumentNotNull(java.lang.Object) disassembly (after)
+  /// CHECK:          InvokeVirtual {{.*\.equals.*}} intrinsic:StringEquals
+  /// CHECK-NOT:      cbz
+  // Terminate the scope for the CHECK-NOT search at the reference or length comparison,
+  // whichever comes first.
+  /// CHECK:          cmp {{w.*,}} {{w.*}}
   public static boolean stringArgumentNotNull(Object obj) {
     obj.getClass();
     return "foo".equals(obj);
@@ -56,12 +136,53 @@
 
   // Test is very brittle as it depends on the order we emit instructions.
   /// CHECK-START-X86: boolean Main.stringArgumentIsString() disassembly (after)
-  /// CHECK:      InvokeVirtual
-  /// CHECK:      test
-  /// CHECK:      jz/eq
+  /// CHECK:          InvokeVirtual intrinsic:StringEquals
+  /// CHECK:          test
+  /// CHECK:          jz/eq
   // Check that we don't try to compare the classes.
-  /// CHECK-NOT:  mov
-  /// CHECK:      cmp
+  /// CHECK-NOT:      mov
+  /// CHECK:          cmp
+
+  // Test is very brittle as it depends on the order we emit instructions.
+  /// CHECK-START-X86_64: boolean Main.stringArgumentIsString() disassembly (after)
+  /// CHECK:          InvokeVirtual intrinsic:StringEquals
+  /// CHECK:          test
+  /// CHECK:          jz/eq
+  // Check that we don't try to compare the classes.
+  /// CHECK-NOT:      mov
+  /// CHECK:          cmp
+
+  // Test is brittle as it depends on the class offset being 0.
+  /// CHECK-START-ARM: boolean Main.stringArgumentIsString() disassembly (after)
+  /// CHECK:          InvokeVirtual intrinsic:StringEquals
+  /// CHECK:          {{cbz|cmp}}
+  // Check that we don't try to compare the classes.
+  // The disassembler currently emits the offset 0 explicitly, but don't rely on it.
+  // We want to terminate the CHECK-NOT search after two CMPs, one for reference
+  // equality and one for length comparison, but these may be emitted in a different
+  // order, so repeat the check twice.
+  /// CHECK-NOT:      ldr{{(|.w)}} {{r\d+}}, [{{r\d+}}]
+  /// CHECK-NOT:      ldr{{(|.w)}} {{r\d+}}, [{{r\d+}}, #0]
+  /// CHECK:          cmp {{r\d+}}, {{r\d+}}
+  /// CHECK-NOT:      ldr{{(|.w)}} {{r\d+}}, [{{r\d+}}]
+  /// CHECK-NOT:      ldr{{(|.w)}} {{r\d+}}, [{{r\d+}}, #0]
+  /// CHECK:          cmp {{r\d+}}, {{r\d+}}
+
+  // Test is brittle as it depends on the class offset being 0.
+  /// CHECK-START-ARM64: boolean Main.stringArgumentIsString() disassembly (after)
+  /// CHECK:          InvokeVirtual intrinsic:StringEquals
+  /// CHECK:          cbz
+  // Check that we don't try to compare the classes.
+  // The disassembler currently does not emit the offset 0 explicitly, but don't rely on it.
+  // We want to terminate the CHECK-NOT search after two CMPs, one for reference
+  // equality and one for length comparison, but these may be emitted in a different
+  // order, so repeat the check twice.
+  /// CHECK-NOT:      ldr {{w\d+}}, [{{x\d+}}]
+  /// CHECK-NOT:      ldr {{w\d+}}, [{{x\d+}}, #0]
+  /// CHECK:          cmp {{w\d+}}, {{w\d+}}
+  /// CHECK-NOT:      ldr {{w\d+}}, [{{x\d+}}]
+  /// CHECK-NOT:      ldr {{w\d+}}, [{{x\d+}}, #0]
+  /// CHECK:          cmp {{w\d+}}, {{w\d+}}
   public static boolean stringArgumentIsString() {
     return "foo".equals(myString);
   }
diff --git a/test/538-checker-embed-constants/src/Main.java b/test/538-checker-embed-constants/src/Main.java
index f791adf..f6713a2 100644
--- a/test/538-checker-embed-constants/src/Main.java
+++ b/test/538-checker-embed-constants/src/Main.java
@@ -473,7 +473,7 @@
   }
 
   /**
-   * Test that the `-1` constant is not synthesized in a register and that we
+   * ARM/ARM64: Test that the `-1` constant is not synthesized in a register and that we
    * instead simply switch between `add` and `sub` instructions with the
    * constant embedded.
    * We need two uses (or more) of the constant because the compiler always
@@ -491,10 +491,137 @@
   /// CHECK:                        sub x{{\d+}}, x{{\d+}}, #0x1
   /// CHECK:                        add x{{\d+}}, x{{\d+}}, #0x1
 
+  /// CHECK-START-ARM: long Main.addM1(long) register (after)
+  /// CHECK:     <<Arg:j\d+>>       ParameterValue
+  /// CHECK:     <<ConstM1:j\d+>>   LongConstant -1
+  /// CHECK-NOT:                    ParallelMove
+  /// CHECK:                        Add [<<Arg>>,<<ConstM1>>]
+  /// CHECK:                        Sub [<<Arg>>,<<ConstM1>>]
+
+  /// CHECK-START-ARM: long Main.addM1(long) disassembly (after)
+  /// CHECK:     <<Arg:j\d+>>       ParameterValue
+  /// CHECK:     <<ConstM1:j\d+>>   LongConstant -1
+  /// CHECK:                        Add [<<Arg>>,<<ConstM1>>]
+  /// CHECK-NEXT:                   subs r{{\d+}}, #1
+  /// CHECK-NEXT:                   adc r{{\d+}}, r{{\d+}}, #-1
+  /// CHECK:                        Sub [<<Arg>>,<<ConstM1>>]
+  /// CHECK-NEXT:                   adds r{{\d+}}, #1
+  /// CHECK-NEXT:                   adc r{{\d+}}, r{{\d+}}, #0
+
   public static long addM1(long arg) {
     return (arg + (-1)) | (arg - (-1));
   }
 
+  /**
+   * ARM: Test that some long constants are not synthesized in a register for add-long.
+   * Also test some negative cases where we do synthesize constants in registers.
+   */
+
+  /// CHECK-START-ARM: long Main.addLongConstants(long) disassembly (after)
+  /// CHECK:     <<Arg:j\d+>>       ParameterValue
+  /// CHECK-DAG: <<ConstA:j\d+>>    LongConstant 4486007727657233
+  /// CHECK-DAG: <<ConstB:j\d+>>    LongConstant 4486011735248896
+  /// CHECK-DAG: <<ConstC:j\d+>>    LongConstant -1071856711330889728
+  /// CHECK-DAG: <<ConstD:j\d+>>    LongConstant 17587891077120
+  /// CHECK-DAG: <<ConstE:j\d+>>    LongConstant -8808977924096
+  /// CHECK-DAG: <<ConstF:j\d+>>    LongConstant 17587891077121
+  /// CHECK-DAG: <<ConstG:j\d+>>    LongConstant 4095
+  /// CHECK:                        Add [<<Arg>>,<<ConstA>>]
+  /// CHECK-NEXT:                   adds r{{\d+}}, r{{\d+}}, #286331153
+  /// CHECK-NEXT:                   adc r{{\d+}}, r{{\d+}}, #1044480
+  /// CHECK:                        Add [<<Arg>>,<<ConstB>>]
+  /// CHECK-NEXT:                   subs r{{\d+}}, r{{\d+}}, #1044480
+  /// CHECK-NEXT:                   adc r{{\d+}}, r{{\d+}}, #1044480
+  /// CHECK:                        Add [<<Arg>>,<<ConstC>>]
+  /// CHECK-NEXT:                   subs r{{\d+}}, r{{\d+}}, #16711680
+  /// CHECK-NEXT:                   sbc r{{\d+}}, r{{\d+}}, #249561088
+  /// CHECK:                        Add [<<Arg>>,<<ConstD>>]
+  // There may or may not be a MOV here.
+  /// CHECK:                        addw r{{\d+}}, r{{\d+}}, #4095
+  /// CHECK:                        Add [<<Arg>>,<<ConstE>>]
+  // There may or may not be a MOV here.
+  /// CHECK:                        subw r{{\d+}}, r{{\d+}}, #2051
+  /// CHECK:                        Add [<<Arg>>,<<ConstF>>]
+  /// CHECK-NEXT:                   adds{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+  /// CHECK-NEXT:                   adc{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+  /// CHECK:                        Add [<<Arg>>,<<ConstG>>]
+  /// CHECK-NEXT:                   adds{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+  /// CHECK-NEXT:                   adc{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+
+  public static long addLongConstants(long arg) {
+    return
+        // Modified immediates.
+        (arg + 0x000ff00011111111L) ^  // 4486007727657233
+        // Modified immediates high and -low.
+        (arg + 0x000ff000fff01000L) ^  // 4486011735248896
+        // Modified immediates ~high and -low.
+        (arg + 0xf11fffffff010000L) ^  // -1071856711330889728
+        // Low word 0 (no carry), high is imm12.
+        (arg + 0x00000fff00000000L) ^  // 17587891077120
+        // Low word 0 (no carry), -high is imm12.
+        (arg + 0xfffff7fd00000000L) ^  // -8808977924096
+        // Cannot embed imm12 in ADC/SBC for high word.
+        (arg + 0x00000fff00000001L) ^  // 17587891077121
+        // Cannot embed imm12 in ADDS/SUBS for low word (need to set flags).
+        (arg + 0x0000000000000fffL) ^  // 4095
+        arg;
+  }
+
+  /**
+   * ARM: Test that some long constants are not synthesized in a register for sub-long.
+   * Also test some negative cases where we do synthesize constants in registers.
+   */
+
+  /// CHECK-START-ARM: long Main.subLongConstants(long) disassembly (after)
+  /// CHECK:     <<Arg:j\d+>>       ParameterValue
+  /// CHECK-DAG: <<ConstA:j\d+>>    LongConstant 4486007727657233
+  /// CHECK-DAG: <<ConstB:j\d+>>    LongConstant 4486011735248896
+  /// CHECK-DAG: <<ConstC:j\d+>>    LongConstant -1071856711330889728
+  /// CHECK-DAG: <<ConstD:j\d+>>    LongConstant 17587891077120
+  /// CHECK-DAG: <<ConstE:j\d+>>    LongConstant -8808977924096
+  /// CHECK-DAG: <<ConstF:j\d+>>    LongConstant 17587891077121
+  /// CHECK-DAG: <<ConstG:j\d+>>    LongConstant 4095
+  /// CHECK:                        Sub [<<Arg>>,<<ConstA>>]
+  /// CHECK-NEXT:                   subs r{{\d+}}, r{{\d+}}, #286331153
+  /// CHECK-NEXT:                   sbc r{{\d+}}, r{{\d+}}, #1044480
+  /// CHECK:                        Sub [<<Arg>>,<<ConstB>>]
+  /// CHECK-NEXT:                   adds r{{\d+}}, r{{\d+}}, #1044480
+  /// CHECK-NEXT:                   sbc r{{\d+}}, r{{\d+}}, #1044480
+  /// CHECK:                        Sub [<<Arg>>,<<ConstC>>]
+  /// CHECK-NEXT:                   adds r{{\d+}}, r{{\d+}}, #16711680
+  /// CHECK-NEXT:                   adc r{{\d+}}, r{{\d+}}, #249561088
+  /// CHECK:                        Sub [<<Arg>>,<<ConstD>>]
+  // There may or may not be a MOV here.
+  /// CHECK:                        subw r{{\d+}}, r{{\d+}}, #4095
+  /// CHECK:                        Sub [<<Arg>>,<<ConstE>>]
+  // There may or may not be a MOV here.
+  /// CHECK:                        addw r{{\d+}}, r{{\d+}}, #2051
+  /// CHECK:                        Sub [<<Arg>>,<<ConstF>>]
+  /// CHECK-NEXT:                   subs{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+  /// CHECK-NEXT:                   sbc{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+  /// CHECK:                        Sub [<<Arg>>,<<ConstG>>]
+  /// CHECK-NEXT:                   subs{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+  /// CHECK-NEXT:                   sbc{{(\.w)?}} r{{\d+}}, r{{\d+}}, r{{\d+}}
+
+  public static long subLongConstants(long arg) {
+    return
+        // Modified immediates.
+        (arg - 0x000ff00011111111L) ^  // 4486007727657233
+        // Modified immediates high and -low.
+        (arg - 0x000ff000fff01000L) ^  // 4486011735248896
+        // Modified immediates ~high and -low.
+        (arg - 0xf11fffffff010000L) ^  // -1071856711330889728
+        // Low word 0 (no carry), high is imm12.
+        (arg - 0x00000fff00000000L) ^  // 17587891077120
+        // Low word 0 (no carry), -high is imm12.
+        (arg - 0xfffff7fd00000000L) ^  // -8808977924096
+        // Cannot embed imm12 in ADC/SBC for high word.
+        (arg - 0x00000fff00000001L) ^  // 17587891077121
+        // Cannot embed imm12 in ADDS/SUBS for low word (need to set flags).
+        (arg - 0x0000000000000fffL) ^  // 4095
+        arg;
+  }
+
   public static void main(String[] args) {
     int arg = 0x87654321;
     assertIntEquals(and255(arg), 0x21);
@@ -522,7 +649,7 @@
     assertLongEquals(xor0xfffffff00000000f(longArg), 0xedcba9888765432eL);
     assertLongEquals(xor0xf00000000000000f(longArg), 0xe23456788765432eL);
 
-    assertLongEquals(14, addM1(7));
+    assertLongEquals(14L, addM1(7));
 
     assertLongEquals(shl1(longArg), 0x2468acf10eca8642L);
     assertLongEquals(shl2(longArg), 0x48d159e21d950c84L);
@@ -562,5 +689,30 @@
     assertLongEquals(ushr32(~longArg), 0x00000000edcba987L);
     assertLongEquals(ushr33(~longArg), 0x0000000076e5d4c3L);
     assertLongEquals(ushr63(~longArg), 0x0000000000000001L);
+
+    // Test -1, 0, +1 and arbitrary constants just before and after overflow
+    // on the low word in subexpressions of addLongConstants()/subLongConstants(),
+    // so that we check that the overflow is carried correctly to the high word.
+    // For example
+    //    0x111eeeeeeee+0x000ff00011111111 = 0x000ff111ffffffff (carry=0),
+    //    0x111eeeeeeef+0x000ff00011111111 = 0x000ff11200000000 (carry=1).
+    assertLongEquals(0xf11ff7fdee1e1111L, addLongConstants(0xffffffffffffffffL));
+    assertLongEquals(0xee0080211e00eefL, addLongConstants(0x0L));
+    assertLongEquals(0xee0080211e01111L, addLongConstants(0x1L));
+    assertLongEquals(0xedff81c12201113L, addLongConstants(0x111eeeeeeeeL));
+    assertLongEquals(0xedff81feddfeef1L, addLongConstants(0x111eeeeeeefL));
+    assertLongEquals(0xedff83e11c1f111L, addLongConstants(0x222000fefffL));
+    assertLongEquals(0xedff83fee3e0eefL, addLongConstants(0x222000ff000L));
+    assertLongEquals(0xedff805edfe1111L, addLongConstants(0x33300feffffL));
+    assertLongEquals(0xedff80412000eefL, addLongConstants(0x33300ff0000L));
+    assertLongEquals(0xee0080211e00eefL, subLongConstants(0xffffffffffffffffL));
+    assertLongEquals(0xf11ff7fdee1e1111L, subLongConstants(0x0L));
+    assertLongEquals(0xf11ff7fc11e1eef3L, subLongConstants(0x1L));
+    assertLongEquals(0xee0080412201113L, subLongConstants(0x44411111111L));
+    assertLongEquals(0xee0080412201111L, subLongConstants(0x44411111112L));
+    assertLongEquals(0xee0080e11c1f111L, subLongConstants(0x555fff01000L));
+    assertLongEquals(0xee0080e11c1eef3L, subLongConstants(0x555fff01001L));
+    assertLongEquals(0xee0080dedfe1111L, subLongConstants(0x666ff010000L));
+    assertLongEquals(0xee0080dedffeef3L, subLongConstants(0x666ff010001L));
   }
 }
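
The adds/adc pairs checked above implement a 64-bit addition as two 32-bit halves with the carry flag in between; the additions in main() deliberately straddle a low-word overflow to exercise that carry. A stand-alone model of the split, using the example values from the comment in main():

#include <cassert>
#include <cstdint>

// One 64-bit addition done the way the ARM code above does it: a 32-bit ADDS
// on the low words setting the carry flag, then a 32-bit ADC on the high words.
uint64_t Add64ViaHalves(uint64_t a, uint64_t b) {
  uint32_t a_lo = static_cast<uint32_t>(a);
  uint32_t b_lo = static_cast<uint32_t>(b);
  uint32_t lo = a_lo + b_lo;              // adds
  uint32_t carry = lo < a_lo ? 1u : 0u;   // carry flag
  uint32_t hi = static_cast<uint32_t>(a >> 32) +
                static_cast<uint32_t>(b >> 32) + carry;  // adc
  return (static_cast<uint64_t>(hi) << 32) | lo;
}

int main() {
  // Carry clear vs. carry set on the low word.
  assert(Add64ViaHalves(0x111eeeeeeeeULL, 0x000ff00011111111ULL) == 0x000ff111ffffffffULL);
  assert(Add64ViaHalves(0x111eeeeeeefULL, 0x000ff00011111111ULL) == 0x000ff11200000000ULL);
  return 0;
}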
diff --git a/test/615-checker-arm64-zr-parallel-move/expected.txt b/test/615-checker-arm64-zr-parallel-move/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/615-checker-arm64-zr-parallel-move/expected.txt
diff --git a/test/615-checker-arm64-zr-parallel-move/info.txt b/test/615-checker-arm64-zr-parallel-move/info.txt
new file mode 100644
index 0000000..199755d
--- /dev/null
+++ b/test/615-checker-arm64-zr-parallel-move/info.txt
@@ -0,0 +1 @@
+Checker test to verify we correctly use wzr and xzr to synthesize zero constants.
diff --git a/test/615-checker-arm64-zr-parallel-move/src/Main.java b/test/615-checker-arm64-zr-parallel-move/src/Main.java
new file mode 100644
index 0000000..5024f28
--- /dev/null
+++ b/test/615-checker-arm64-zr-parallel-move/src/Main.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static boolean doThrow = false;
+
+  public void $noinline$foo(int in_w1,
+                            int in_w2,
+                            int in_w3,
+                            int in_w4,
+                            int in_w5,
+                            int in_w6,
+                            int in_w7,
+                            int on_stack_int,
+                            long on_stack_long,
+                            float in_s0,
+                            float in_s1,
+                            float in_s2,
+                            float in_s3,
+                            float in_s4,
+                            float in_s5,
+                            float in_s6,
+                            float in_s7,
+                            float on_stack_float,
+                            double on_stack_double) {
+    if (doThrow) throw new Error();
+  }
+
+  // We expect a parallel move that moves the zero constant to four stack locations.
+  /// CHECK-START-ARM64: void Main.bar() register (after)
+  /// CHECK:             ParallelMove {{.*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*}}
+
+  // Those four moves should generate four 'store' instructions using directly the zero register.
+  /// CHECK-START-ARM64: void Main.bar() disassembly (after)
+  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
+
+  public void bar() {
+    $noinline$foo(1, 2, 3, 4, 5, 6, 7,     // Integral values in registers.
+                  0, 0L,                   // Integral values on the stack.
+                  1, 2, 3, 4, 5, 6, 7, 8,  // Floating-point values in registers.
+                  0.0f, 0.0);              // Floating-point values on the stack.
+  }
+
+  public static void main(String args[]) {}
+}