Merge "Revert "Revert "Use IsMarked instead of Mark for profiling info."""
diff --git a/compiler/compiled_class.h b/compiler/compiled_class.h
deleted file mode 100644
index 06ce946..0000000
--- a/compiler/compiled_class.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_COMPILED_CLASS_H_
-#define ART_COMPILER_COMPILED_CLASS_H_
-
-#include "mirror/class.h"
-
-namespace art {
-
-class CompiledClass {
- public:
-  explicit CompiledClass(mirror::Class::Status status) : status_(status) {}
-  ~CompiledClass() {}
-  mirror::Class::Status GetStatus() const {
-    return status_;
-  }
-  void SetStatus(mirror::Class::Status status) {
-    status_ = status;
-  }
- private:
-  mirror::Class::Status status_;
-};
-
-}  // namespace art
-
-#endif  // ART_COMPILER_COMPILED_CLASS_H_
diff --git a/compiler/dex/verification_results.cc b/compiler/dex/verification_results.cc
index 3f0df3b..0338cfd 100644
--- a/compiler/dex/verification_results.cc
+++ b/compiler/dex/verification_results.cc
@@ -82,7 +82,12 @@
     // TODO: Investigate why are we doing the work again for this method and try to avoid it.
     LOG(WARNING) << "Method processed more than once: " << ref.PrettyMethod();
     if (!Runtime::Current()->UseJitCompilation()) {
-      DCHECK_EQ(existing->GetSafeCastSet().size(), verified_method->GetSafeCastSet().size());
+      if (kIsDebugBuild) {
+        auto ex_set = existing->GetSafeCastSet();
+        auto ve_set = verified_method->GetSafeCastSet();
+        CHECK_EQ(ex_set == nullptr, ve_set == nullptr);
+        CHECK((ex_set == nullptr) || (ex_set->size() == ve_set->size()));
+      }
     }
     // Let the unique_ptr delete the new verified method since there was already an existing one
     // registered. It is unsafe to replace the existing one since the JIT may be using it to
diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc
index 608a18a..e46dc59 100644
--- a/compiler/dex/verified_method.cc
+++ b/compiler/dex/verified_method.cc
@@ -49,7 +49,10 @@
 }
 
 bool VerifiedMethod::IsSafeCast(uint32_t pc) const {
-  return std::binary_search(safe_cast_set_.begin(), safe_cast_set_.end(), pc);
+  if (safe_cast_set_ == nullptr) {
+    return false;
+  }
+  return std::binary_search(safe_cast_set_->begin(), safe_cast_set_->end(), pc);
 }
 
 void VerifiedMethod::GenerateSafeCastSet(verifier::MethodVerifier* method_verifier) {
@@ -94,12 +97,16 @@
                                                            /* strict */ true,
                                                            /* assignable */ true);
         }
+        if (safe_cast_set_ == nullptr) {
+          safe_cast_set_.reset(new SafeCastSet());
+        }
         // Verify ordering for push_back() to the sorted vector.
-        DCHECK(safe_cast_set_.empty() || safe_cast_set_.back() < dex_pc);
-        safe_cast_set_.push_back(dex_pc);
+        DCHECK(safe_cast_set_->empty() || safe_cast_set_->back() < dex_pc);
+        safe_cast_set_->push_back(dex_pc);
       }
     }
   }
+  DCHECK(safe_cast_set_ == nullptr || !safe_cast_set_->empty());
 }
 
 }  // namespace art
diff --git a/compiler/dex/verified_method.h b/compiler/dex/verified_method.h
index 439e69e..64b3f44 100644
--- a/compiler/dex/verified_method.h
+++ b/compiler/dex/verified_method.h
@@ -43,8 +43,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
   ~VerifiedMethod() = default;
 
-  const SafeCastSet& GetSafeCastSet() const {
-    return safe_cast_set_;
+  const SafeCastSet* GetSafeCastSet() const {
+    return safe_cast_set_.get();
   }
 
   // Returns true if the cast can statically be verified to be redundant
@@ -69,7 +69,7 @@
   void GenerateSafeCastSet(verifier::MethodVerifier* method_verifier)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  SafeCastSet safe_cast_set_;
+  std::unique_ptr<SafeCastSet> safe_cast_set_;
 
   const uint32_t encountered_error_types_;
   const bool has_runtime_throw_;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index a8ab7c6..c2d792d 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -36,7 +36,6 @@
 #include "base/time_utils.h"
 #include "base/timing_logger.h"
 #include "class_linker-inl.h"
-#include "compiled_class.h"
 #include "compiled_method.h"
 #include "compiler.h"
 #include "compiler_callbacks.h"
@@ -317,11 +316,6 @@
 }
 
 CompilerDriver::~CompilerDriver() {
-  Thread* self = Thread::Current();
-  {
-    MutexLock mu(self, compiled_classes_lock_);
-    STLDeleteValues(&compiled_classes_);
-  }
   compiled_methods_.Visit([this](const MethodReference& ref ATTRIBUTE_UNUSED,
                                  CompiledMethod* method) {
     if (method != nullptr) {
@@ -1978,8 +1972,7 @@
         if (compiler_only_verifies) {
           // Just update the compiled_classes_ map. The compiler doesn't need to resolve
           // the type.
-          compiled_classes_.Overwrite(
-              ClassReference(dex_file, i), new CompiledClass(mirror::Class::kStatusVerified));
+          compiled_classes_.Overwrite(ClassReference(dex_file, i), mirror::Class::kStatusVerified);
         } else {
           // Update the class status, so later compilation stages know they don't need to verify
           // the class.
@@ -2690,14 +2683,15 @@
       << method_ref.dex_file->PrettyMethod(method_ref.dex_method_index);
 }
 
-CompiledClass* CompilerDriver::GetCompiledClass(ClassReference ref) const {
+bool CompilerDriver::GetCompiledClass(ClassReference ref, mirror::Class::Status* status) const {
+  DCHECK(status != nullptr);
   MutexLock mu(Thread::Current(), compiled_classes_lock_);
-  ClassTable::const_iterator it = compiled_classes_.find(ref);
+  ClassStateTable::const_iterator it = compiled_classes_.find(ref);
   if (it == compiled_classes_.end()) {
-    return nullptr;
+    return false;
   }
-  CHECK(it->second != nullptr);
-  return it->second;
+  *status = it->second;
+  return true;
 }
 
 void CompilerDriver::RecordClassStatus(ClassReference ref, mirror::Class::Status status) {
@@ -2719,12 +2713,11 @@
   MutexLock mu(Thread::Current(), compiled_classes_lock_);
   auto it = compiled_classes_.find(ref);
   if (it == compiled_classes_.end()) {
-    CompiledClass* compiled_class = new CompiledClass(status);
-    compiled_classes_.Overwrite(ref, compiled_class);
-  } else if (status > it->second->GetStatus()) {
+    compiled_classes_.Overwrite(ref, status);
+  } else if (status > it->second) {
     // Update the status if we now have a greater one. This happens with vdex,
     // which records a class is verified, but does not resolve it.
-    it->second->SetStatus(status);
+    it->second = status;
   }
 }
 
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index fbab9df..e0d97b7 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -56,7 +56,6 @@
 }  // namespace verifier
 
 class BitVector;
-class CompiledClass;
 class CompiledMethod;
 class CompilerOptions;
 class DexCompilationUnit;
@@ -164,7 +163,7 @@
   std::unique_ptr<const std::vector<uint8_t>> CreateQuickResolutionTrampoline() const;
   std::unique_ptr<const std::vector<uint8_t>> CreateQuickToInterpreterBridge() const;
 
-  CompiledClass* GetCompiledClass(ClassReference ref) const
+  bool GetCompiledClass(ClassReference ref, mirror::Class::Status* status) const
       REQUIRES(!compiled_classes_lock_);
 
   CompiledMethod* GetCompiledMethod(MethodReference ref) const;
@@ -505,10 +504,10 @@
   std::map<ClassReference, bool> requires_constructor_barrier_
       GUARDED_BY(requires_constructor_barrier_lock_);
 
-  typedef SafeMap<const ClassReference, CompiledClass*> ClassTable;
+  using ClassStateTable = SafeMap<const ClassReference, mirror::Class::Status>;
   // All class references that this compiler has compiled.
   mutable Mutex compiled_classes_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  ClassTable compiled_classes_ GUARDED_BY(compiled_classes_lock_);
+  ClassStateTable compiled_classes_ GUARDED_BY(compiled_classes_lock_);
 
   typedef AtomicMethodRefMap<CompiledMethod*> MethodTable;
 
diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc
index 17854fd..26ea39f 100644
--- a/compiler/driver/compiler_driver_test.cc
+++ b/compiler/driver/compiler_driver_test.cc
@@ -23,7 +23,6 @@
 #include "art_method-inl.h"
 #include "class_linker-inl.h"
 #include "common_compiler_test.h"
-#include "compiled_class.h"
 #include "dex_file.h"
 #include "dex_file_types.h"
 #include "gc/heap.h"
@@ -339,10 +338,11 @@
     ASSERT_NE(klass, nullptr);
     EXPECT_TRUE(klass->IsVerified());
 
-    CompiledClass* compiled_class = compiler_driver_->GetCompiledClass(
-        ClassReference(&klass->GetDexFile(), klass->GetDexTypeIndex().index_));
-    ASSERT_NE(compiled_class, nullptr);
-    EXPECT_EQ(compiled_class->GetStatus(), mirror::Class::kStatusVerified);
+    mirror::Class::Status status;
+    bool found = compiler_driver_->GetCompiledClass(
+        ClassReference(&klass->GetDexFile(), klass->GetDexTypeIndex().index_), &status);
+    ASSERT_TRUE(found);
+    EXPECT_EQ(status, mirror::Class::kStatusVerified);
   }
 };
 
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index e9d579d..c1ac230 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -311,24 +311,22 @@
 }
 
 ArmBaseRelativePatcher::ThunkKey ArmBaseRelativePatcher::GetMethodCallKey() {
-  return ThunkKey(ThunkType::kMethodCall, ThunkParams{{ 0u, 0u }});  // NOLINT(whitespace/braces)
+  return ThunkKey(ThunkType::kMethodCall);
 }
 
 ArmBaseRelativePatcher::ThunkKey ArmBaseRelativePatcher::GetBakerThunkKey(
     const LinkerPatch& patch) {
   DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kBakerReadBarrierBranch);
-  ThunkParams params;
-  params.baker_params.custom_value1 = patch.GetBakerCustomValue1();
-  params.baker_params.custom_value2 = patch.GetBakerCustomValue2();
-  ThunkKey key(ThunkType::kBakerReadBarrier, params);
-  return key;
+  return ThunkKey(ThunkType::kBakerReadBarrier,
+                  patch.GetBakerCustomValue1(),
+                  patch.GetBakerCustomValue2());
 }
 
 void ArmBaseRelativePatcher::ProcessPatches(const CompiledMethod* compiled_method,
                                             uint32_t code_offset) {
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
     uint32_t patch_offset = code_offset + patch.LiteralOffset();
-    ThunkKey key(static_cast<ThunkType>(-1), ThunkParams{{ 0u, 0u }});  // NOLINT(whitespace/braces)
+    ThunkKey key(static_cast<ThunkType>(-1));
     ThunkData* old_data = nullptr;
     if (patch.GetType() == LinkerPatch::Type::kCallRelative) {
       key = GetMethodCallKey();
diff --git a/compiler/linker/arm/relative_patcher_arm_base.h b/compiler/linker/arm/relative_patcher_arm_base.h
index fd204c0..5197ce2 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.h
+++ b/compiler/linker/arm/relative_patcher_arm_base.h
@@ -45,42 +45,27 @@
     kBakerReadBarrier,        // Baker read barrier.
   };
 
-  struct BakerReadBarrierParams {
-    uint32_t custom_value1;
-    uint32_t custom_value2;
-  };
-
-  struct RawThunkParams {
-    uint32_t first;
-    uint32_t second;
-  };
-
-  union ThunkParams {
-    RawThunkParams raw_params;
-    BakerReadBarrierParams baker_params;
-    static_assert(sizeof(raw_params) == sizeof(baker_params), "baker_params size check");
-  };
-
   class ThunkKey {
    public:
-    ThunkKey(ThunkType type, ThunkParams params) : type_(type), params_(params) { }
+    explicit ThunkKey(ThunkType type, uint32_t custom_value1 = 0u, uint32_t custom_value2 = 0u)
+        : type_(type), custom_value1_(custom_value1), custom_value2_(custom_value2) { }
 
     ThunkType GetType() const {
       return type_;
     }
 
-    BakerReadBarrierParams GetBakerReadBarrierParams() const {
-      DCHECK(type_ == ThunkType::kBakerReadBarrier);
-      return params_.baker_params;
+    uint32_t GetCustomValue1() const {
+      return custom_value1_;
     }
 
-    RawThunkParams GetRawParams() const {
-      return params_.raw_params;
+    uint32_t GetCustomValue2() const {
+      return custom_value2_;
     }
 
    private:
     ThunkType type_;
-    ThunkParams params_;
+    uint32_t custom_value1_;
+    uint32_t custom_value2_;
   };
 
   class ThunkKeyCompare {
@@ -89,10 +74,10 @@
       if (lhs.GetType() != rhs.GetType()) {
         return lhs.GetType() < rhs.GetType();
       }
-      if (lhs.GetRawParams().first != rhs.GetRawParams().first) {
-        return lhs.GetRawParams().first < rhs.GetRawParams().first;
+      if (lhs.GetCustomValue1() != rhs.GetCustomValue1()) {
+        return lhs.GetCustomValue1() < rhs.GetCustomValue1();
       }
-      return lhs.GetRawParams().second < rhs.GetRawParams().second;
+      return lhs.GetCustomValue2() < rhs.GetCustomValue2();
     }
   };
 
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index a98aedf..aa5a945 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -108,7 +108,7 @@
   DCHECK_EQ(insn, 0xf0408000);  // BNE +0 (unpatched)
   ThunkKey key = GetBakerThunkKey(patch);
   if (kIsDebugBuild) {
-    const uint32_t encoded_data = key.GetBakerReadBarrierParams().custom_value1;
+    const uint32_t encoded_data = key.GetCustomValue1();
     BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
     // Check that the next instruction matches the expected LDR.
     switch (kind) {
@@ -346,7 +346,7 @@
       __ Bkpt(0);
       break;
     case ThunkType::kBakerReadBarrier:
-      CompileBakerReadBarrierThunk(assembler, key.GetBakerReadBarrierParams().custom_value1);
+      CompileBakerReadBarrierThunk(assembler, key.GetCustomValue1());
       break;
   }
 
diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h
index 7e787d2..183e5e6 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.h
+++ b/compiler/linker/arm/relative_patcher_thumb2.h
@@ -94,13 +94,13 @@
     kField,   // Field get or array get with constant offset (i.e. constant index).
     kArray,   // Array get with index in register.
     kGcRoot,  // GC root load.
-    kLast
+    kLast = kGcRoot
   };
 
   enum class BakerReadBarrierWidth : uint8_t {
     kWide,          // 32-bit LDR (and 32-bit NEG if heap poisoning is enabled).
     kNarrow,        // 16-bit LDR (and 16-bit NEG if heap poisoning is enabled).
-    kLast
+    kLast = kNarrow
   };
 
   static constexpr size_t kBitsForBakerReadBarrierKind =
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index 2b06e3f..e99687a 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -307,7 +307,7 @@
   DCHECK_EQ(insn & 0xffffffe0u, 0xb5000000);  // CBNZ Xt, +0 (unpatched)
   ThunkKey key = GetBakerThunkKey(patch);
   if (kIsDebugBuild) {
-    const uint32_t encoded_data = key.GetBakerReadBarrierParams().custom_value1;
+    const uint32_t encoded_data = key.GetCustomValue1();
     BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
     // Check that the next instruction matches the expected LDR.
     switch (kind) {
@@ -500,7 +500,7 @@
       break;
     }
     case ThunkType::kBakerReadBarrier: {
-      CompileBakerReadBarrierThunk(assembler, key.GetBakerReadBarrierParams().custom_value1);
+      CompileBakerReadBarrierThunk(assembler, key.GetCustomValue1());
       break;
     }
   }
diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h
index 02a5b1e..b00dd08 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.h
+++ b/compiler/linker/arm64/relative_patcher_arm64.h
@@ -86,7 +86,7 @@
     kField,   // Field get or array get with constant offset (i.e. constant index).
     kArray,   // Array get with index in register.
     kGcRoot,  // GC root load.
-    kLast
+    kLast = kGcRoot
   };
 
   static constexpr size_t kBitsForBakerReadBarrierKind =
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 6b5387a..9b22334 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -28,7 +28,6 @@
 #include "base/stl_util.h"
 #include "base/unix_file/fd_file.h"
 #include "class_linker.h"
-#include "compiled_class.h"
 #include "compiled_method.h"
 #include "debug/method_debug_info.h"
 #include "dex/verification_results.h"
@@ -712,17 +711,17 @@
 
   bool EndClass() {
     ClassReference class_ref(dex_file_, class_def_index_);
-    CompiledClass* compiled_class = writer_->compiler_driver_->GetCompiledClass(class_ref);
     mirror::Class::Status status;
-    if (compiled_class != nullptr) {
-      status = compiled_class->GetStatus();
-    } else if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) {
-      // The oat class status is used only for verification of resolved classes,
-      // so use kStatusErrorResolved whether the class was resolved or unresolved
-      // during compile-time verification.
-      status = mirror::Class::kStatusErrorResolved;
-    } else {
-      status = mirror::Class::kStatusNotReady;
+    bool found = writer_->compiler_driver_->GetCompiledClass(class_ref, &status);
+    if (!found) {
+      if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) {
+        // The oat class status is used only for verification of resolved classes,
+        // so use kStatusErrorResolved whether the class was resolved or unresolved
+        // during compile-time verification.
+        status = mirror::Class::kStatusErrorResolved;
+      } else {
+        status = mirror::Class::kStatusNotReady;
+      }
     }
 
     writer_->oat_classes_.emplace_back(offset_,
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cef5fdc..ab3d499 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -6374,6 +6374,15 @@
   }
 }
 
+void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
 void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) {
   RegisterSet caller_saves = RegisterSet::Empty();
   InvokeRuntimeCallingConvention calling_convention;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index cbe52aa..fa39b79 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -2661,6 +2661,38 @@
          Operand(InputOperandAt(instruction, 1)));
 }
 
+void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+
+  HIntConstant* shift = instruction->GetShift()->AsIntConstant();
+
+  locations->SetInAt(0, Location::RequiresRegister());
+  // For the byte case we don't need to shift the index variable, so we can encode the data offset
+  // into the ADD instruction. For other cases we prefer the data offset to be in a register; that
+  // hoists the data offset constant generation out of the loop and reduces the critical path
+  // length in the loop.
+  locations->SetInAt(1, shift->GetValue() == 0
+                        ? Location::ConstantLocation(instruction->GetOffset()->AsIntConstant())
+                        : Location::RequiresRegister());
+  locations->SetInAt(2, Location::ConstantLocation(shift));
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  Register index_reg = InputRegisterAt(instruction, 0);
+  uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2));
+  uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue();
+
+  if (shift == 0) {
+    __ Add(OutputRegister(instruction), index_reg, offset);
+  } else {
+    Register offset_reg = InputRegisterAt(instruction, 1);
+    __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift));
+  }
+}
+
 void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall);
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index ebf8d21..1759c68 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -6447,6 +6447,16 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex(
+    HIntermediateAddressIndex* instruction) {
+  LOG(FATAL) << "Unreachable " << instruction->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) {
   RegisterSet caller_saves = RegisterSet::Empty();
   InvokeRuntimeCallingConventionARMVIXL calling_convention;
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index fdfa4ee..503026e 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -8415,6 +8415,23 @@
     }
   } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) {
     CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
+
+    // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum
+    // value of the output type if the input is outside of the range after the truncation or
+    // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct
+    // results. This matches the desired float/double-to-int/long conversion exactly.
+    //
+    // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive
+    // value when the input is either a NaN or is outside of the range of the output type
+    // after the truncation. IOW, the three special cases (NaN, too small, too big) produce
+    // the same result.
+    //
+    // The code takes care of the different behaviors by first comparing the input to the
+    // minimum output value (-2**63 for truncating to long, -2**31 for truncating to int).
+    // If the input is greater than or equal to the minimum, it proceeds to the truncate
+    // instruction, which will handle such an input the same way irrespective of NAN2008.
+    // Otherwise the input is compared to itself to determine whether it is a NaN or not
+    // in order to return either zero or the minimum value.
     if (result_type == Primitive::kPrimLong) {
       if (isR6) {
         // trunc.l.s/trunc.l.d requires MIPSR2+ with FR=1. MIPS32R6 is implemented as a secondary
@@ -8422,62 +8439,6 @@
         FRegister src = locations->InAt(0).AsFpuRegister<FRegister>();
         Register dst_high = locations->Out().AsRegisterPairHigh<Register>();
         Register dst_low = locations->Out().AsRegisterPairLow<Register>();
-        MipsLabel truncate;
-        MipsLabel done;
-
-        // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive
-        // value when the input is either a NaN or is outside of the range of the output type
-        // after the truncation. IOW, the three special cases (NaN, too small, too big) produce
-        // the same result.
-        //
-        // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum
-        // value of the output type if the input is outside of the range after the truncation or
-        // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct
-        // results. This matches the desired float/double-to-int/long conversion exactly.
-        //
-        // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction.
-        //
-        // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate
-        // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6,
-        // even though it must be NAN2008=1 on R6.
-        //
-        // The code takes care of the different behaviors by first comparing the input to the
-        // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int).
-        // If the input is greater than or equal to the minimum, it procedes to the truncate
-        // instruction, which will handle such an input the same way irrespective of NAN2008.
-        // Otherwise the input is compared to itself to determine whether it is a NaN or not
-        // in order to return either zero or the minimum value.
-        //
-        // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the
-        // truncate instruction for MIPS64R6.
-        if (input_type == Primitive::kPrimFloat) {
-          uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min());
-          __ LoadConst32(TMP, min_val);
-          __ Mtc1(TMP, FTMP);
-          __ CmpLeS(FTMP, FTMP, src);
-        } else {
-          uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min());
-          __ LoadConst32(TMP, High32Bits(min_val));
-          __ Mtc1(ZERO, FTMP);
-          __ Mthc1(TMP, FTMP);
-          __ CmpLeD(FTMP, FTMP, src);
-        }
-
-        __ Bc1nez(FTMP, &truncate);
-
-        if (input_type == Primitive::kPrimFloat) {
-          __ CmpEqS(FTMP, src, src);
-        } else {
-          __ CmpEqD(FTMP, src, src);
-        }
-        __ Move(dst_low, ZERO);
-        __ LoadConst32(dst_high, std::numeric_limits<int32_t>::min());
-        __ Mfc1(TMP, FTMP);
-        __ And(dst_high, dst_high, TMP);
-
-        __ B(&done);
-
-        __ Bind(&truncate);
 
         if (input_type == Primitive::kPrimFloat) {
           __ TruncLS(FTMP, src);
@@ -8486,8 +8447,6 @@
         }
         __ Mfc1(dst_low, FTMP);
         __ Mfhc1(dst_high, FTMP);
-
-        __ Bind(&done);
       } else {
         QuickEntrypointEnum entrypoint = (input_type == Primitive::kPrimFloat) ? kQuickF2l
                                                                                : kQuickD2l;
@@ -8504,43 +8463,19 @@
       MipsLabel truncate;
       MipsLabel done;
 
-      // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate
-      // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6,
-      // even though it must be NAN2008=1 on R6.
-      //
-      // For details see the large comment above for the truncation of float/double to long on R6.
-      //
-      // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the
-      // truncate instruction for MIPS64R6.
-      if (input_type == Primitive::kPrimFloat) {
-        uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min());
-        __ LoadConst32(TMP, min_val);
-        __ Mtc1(TMP, FTMP);
-      } else {
-        uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min());
-        __ LoadConst32(TMP, High32Bits(min_val));
-        __ Mtc1(ZERO, FTMP);
-        __ MoveToFpuHigh(TMP, FTMP);
-      }
-
-      if (isR6) {
+      if (!isR6) {
         if (input_type == Primitive::kPrimFloat) {
-          __ CmpLeS(FTMP, FTMP, src);
+          uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min());
+          __ LoadConst32(TMP, min_val);
+          __ Mtc1(TMP, FTMP);
         } else {
-          __ CmpLeD(FTMP, FTMP, src);
+          uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min());
+          __ LoadConst32(TMP, High32Bits(min_val));
+          __ Mtc1(ZERO, FTMP);
+          __ MoveToFpuHigh(TMP, FTMP);
         }
-        __ Bc1nez(FTMP, &truncate);
 
         if (input_type == Primitive::kPrimFloat) {
-          __ CmpEqS(FTMP, src, src);
-        } else {
-          __ CmpEqD(FTMP, src, src);
-        }
-        __ LoadConst32(dst, std::numeric_limits<int32_t>::min());
-        __ Mfc1(TMP, FTMP);
-        __ And(dst, dst, TMP);
-      } else {
-        if (input_type == Primitive::kPrimFloat) {
           __ ColeS(0, FTMP, src);
         } else {
           __ ColeD(0, FTMP, src);
@@ -8554,12 +8489,12 @@
         }
         __ LoadConst32(dst, std::numeric_limits<int32_t>::min());
         __ Movf(dst, ZERO, 0);
+
+        __ B(&done);
+
+        __ Bind(&truncate);
       }
 
-      __ B(&done);
-
-      __ Bind(&truncate);
-
       if (input_type == Primitive::kPrimFloat) {
         __ TruncWS(FTMP, src);
       } else {
@@ -8567,7 +8502,9 @@
       }
       __ Mfc1(dst, FTMP);
 
-      __ Bind(&done);
+      if (!isR6) {
+        __ Bind(&done);
+      }
     }
   } else if (Primitive::IsFloatingPointType(result_type) &&
              Primitive::IsFloatingPointType(input_type)) {
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index d3ae3a7..e0dba21 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -302,10 +302,13 @@
       : SlowPathCodeMIPS64(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);     // Only saves live vector registers for SIMD.
     mips64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // Only restores live vector registers for SIMD.
     if (successor_ == nullptr) {
       __ Bc(GetReturnLabel());
     } else {
@@ -1647,13 +1650,19 @@
 }
 
 size_t CodeGeneratorMIPS64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ StoreFpuToOffset(kStoreDoubleword, FpuRegister(reg_id), SP, stack_index);
-  return kMips64DoublewordSize;
+  __ StoreFpuToOffset(GetGraph()->HasSIMD() ? kStoreQuadword : kStoreDoubleword,
+                      FpuRegister(reg_id),
+                      SP,
+                      stack_index);
+  return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorMIPS64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ LoadFpuFromOffset(kLoadDoubleword, FpuRegister(reg_id), SP, stack_index);
-  return kMips64DoublewordSize;
+  __ LoadFpuFromOffset(GetGraph()->HasSIMD() ? kLoadQuadword : kLoadDoubleword,
+                       FpuRegister(reg_id),
+                       SP,
+                       stack_index);
+  return GetFloatingPointSpillSlotSize();
 }
 
 void CodeGeneratorMIPS64::DumpCoreRegister(std::ostream& stream, int reg) const {
@@ -5852,7 +5861,11 @@
 void LocationsBuilderMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // In suspend check slow path, usually there are no caller-save registers at all.
+  // If SIMD instructions are present, however, we force spilling all live SIMD
+  // registers in full width (since the runtime only saves/restores lower part).
+  locations->SetCustomSlowPathCallerSaves(
+      GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5979,68 +5992,6 @@
     CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
     GpuRegister dst = locations->Out().AsRegister<GpuRegister>();
     FpuRegister src = locations->InAt(0).AsFpuRegister<FpuRegister>();
-    Mips64Label truncate;
-    Mips64Label done;
-
-    // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive
-    // value when the input is either a NaN or is outside of the range of the output type
-    // after the truncation. IOW, the three special cases (NaN, too small, too big) produce
-    // the same result.
-    //
-    // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum
-    // value of the output type if the input is outside of the range after the truncation or
-    // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct
-    // results. This matches the desired float/double-to-int/long conversion exactly.
-    //
-    // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction.
-    //
-    // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate
-    // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6,
-    // even though it must be NAN2008=1 on R6.
-    //
-    // The code takes care of the different behaviors by first comparing the input to the
-    // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int).
-    // If the input is greater than or equal to the minimum, it procedes to the truncate
-    // instruction, which will handle such an input the same way irrespective of NAN2008.
-    // Otherwise the input is compared to itself to determine whether it is a NaN or not
-    // in order to return either zero or the minimum value.
-    //
-    // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the
-    // truncate instruction for MIPS64R6.
-    if (input_type == Primitive::kPrimFloat) {
-      uint32_t min_val = (result_type == Primitive::kPrimLong)
-          ? bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min())
-          : bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min());
-      __ LoadConst32(TMP, min_val);
-      __ Mtc1(TMP, FTMP);
-      __ CmpLeS(FTMP, FTMP, src);
-    } else {
-      uint64_t min_val = (result_type == Primitive::kPrimLong)
-          ? bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min())
-          : bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min());
-      __ LoadConst64(TMP, min_val);
-      __ Dmtc1(TMP, FTMP);
-      __ CmpLeD(FTMP, FTMP, src);
-    }
-
-    __ Bc1nez(FTMP, &truncate);
-
-    if (input_type == Primitive::kPrimFloat) {
-      __ CmpEqS(FTMP, src, src);
-    } else {
-      __ CmpEqD(FTMP, src, src);
-    }
-    if (result_type == Primitive::kPrimLong) {
-      __ LoadConst64(dst, std::numeric_limits<int64_t>::min());
-    } else {
-      __ LoadConst32(dst, std::numeric_limits<int32_t>::min());
-    }
-    __ Mfc1(TMP, FTMP);
-    __ And(dst, dst, TMP);
-
-    __ Bc(&done);
-
-    __ Bind(&truncate);
 
     if (result_type == Primitive::kPrimLong) {
       if (input_type == Primitive::kPrimFloat) {
@@ -6057,8 +6008,6 @@
       }
       __ Mfc1(dst, FTMP);
     }
-
-    __ Bind(&done);
   } else if (Primitive::IsFloatingPointType(result_type) &&
              Primitive::IsFloatingPointType(input_type)) {
     FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>();
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 200e884..4c83766 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -336,7 +336,11 @@
 
   size_t GetWordSize() const OVERRIDE { return kMips64DoublewordSize; }
 
-  size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64DoublewordSize; }
+  size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
+    return GetGraph()->HasSIMD()
+        ? 2 * kMips64DoublewordSize   // 16 bytes for each spill.
+        : 1 * kMips64DoublewordSize;  //  8 bytes for each spill.
+  }
 
   uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE {
     return assembler_.GetLabelLocation(GetLabelOf(block));
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 57f7e6b..478bd24 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -783,6 +783,12 @@
     /*out*/ Register* scratch) {
   LocationSummary* locations = instruction->GetLocations();
   Register base = InputRegisterAt(instruction, 0);
+
+  if (instruction->InputAt(1)->IsIntermediateAddressIndex()) {
+    DCHECK(!is_string_char_at);
+    return MemOperand(base.X(), InputRegisterAt(instruction, 1).X());
+  }
+
   Location index = locations->InAt(1);
   uint32_t offset = is_string_char_at
       ? mirror::String::ValueOffset().Uint32Value()
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index f16e372..311be1f 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -216,5 +216,18 @@
   }
 }
 
+void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) {
+  // Skip string char-at loads; any other vector load may get a shared index address.
+  const bool extracted = !instruction->IsStringCharAt() &&
+      TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex());
+  if (extracted) { RecordSimplification(); }
+}
+
+void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) {
+  // Count a simplification only when the index address was actually extracted.
+  HInstruction* const index = instruction->GetIndex();
+  if (TryExtractVecArrayAccessAddress(instruction, index)) { RecordSimplification(); }
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index eec4e49..8596f6a 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -75,6 +75,8 @@
   void VisitUShr(HUShr* instruction) OVERRIDE;
   void VisitXor(HXor* instruction) OVERRIDE;
   void VisitVecMul(HVecMul* instruction) OVERRIDE;
+  void VisitVecLoad(HVecLoad* instruction) OVERRIDE;
+  void VisitVecStore(HVecStore* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index c39e5f4..e5a8499 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -16,6 +16,8 @@
 
 #include "instruction_simplifier_shared.h"
 
+#include "mirror/array-inl.h"
+
 namespace art {
 
 namespace {
@@ -346,4 +348,59 @@
   return false;
 }
 
+bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) {
+  if (index->IsConstant()) {
+    // If index is constant the whole address calculation often can be done by LDR/STR themselves.
+    // TODO: Treat the case of a non-embeddable constant.
+    return false;
+  }
+  // Below: try to feed `access` a shareable HIntermediateAddressIndex in place of raw `index`.
+  HGraph* graph = access->GetBlock()->GetGraph();
+  ArenaAllocator* arena = graph->GetArena();
+  Primitive::Type packed_type = access->GetPackedType();
+  uint32_t data_offset = mirror::Array::DataOffset(
+      Primitive::ComponentSize(packed_type)).Uint32Value();
+  size_t component_shift = Primitive::ComponentSizeShift(packed_type);
+
+  bool is_extracting_beneficial = false;
+  // Extracting pays off only if another user of `index` needs the same offset and shift.
+  for (const HUseListNode<HInstruction*>& use : index->GetUses()) {
+    HInstruction* user = use.GetUser();
+    if (user->IsVecMemoryOperation() && user != access) {  // A different vector memory access.
+      HVecMemoryOperation* another_access = user->AsVecMemoryOperation();
+      Primitive::Type another_packed_type = another_access->GetPackedType();
+      uint32_t another_data_offset = mirror::Array::DataOffset(
+          Primitive::ComponentSize(another_packed_type)).Uint32Value();
+      size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type);
+      if (another_data_offset == data_offset && another_component_shift == component_shift) {
+        is_extracting_beneficial = true;
+        break;
+      }
+    } else if (user->IsIntermediateAddressIndex()) {  // An already existing extracted address.
+      HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex();
+      uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue();
+      size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue();
+      if (another_data_offset == data_offset && another_component_shift == component_shift) {
+        is_extracting_beneficial = true;
+        break;
+      }
+    }
+  }
+
+  if (!is_extracting_beneficial) {
+    return false;
+  }
+
+  // Proceed to extract the index + data_offset address computation.
+  HIntConstant* offset = graph->GetIntConstant(data_offset);
+  HIntConstant* shift = graph->GetIntConstant(component_shift);
+  HIntermediateAddressIndex* address =
+      new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc);
+
+  access->GetBlock()->InsertInstructionBefore(address, access);
+  access->ReplaceInput(address, 1);  // Input 1 is the index operand.
+
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index 2ea103a..371619f 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -59,6 +59,7 @@
                                   size_t data_offset);
 
 bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa);
+bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index);
 
 }  // namespace art
 
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index abf5b12..eb28742 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2555,101 +2555,110 @@
   Register out = locations->Out().AsRegister<Register>();
 
   MipsLabel done;
-  MipsLabel finite;
-  MipsLabel add;
 
-  // if (in.isNaN) {
-  //   return 0;
-  // }
-  //
-  // out = floor.w.s(in);
-  //
-  // /*
-  //  * This "if" statement is only needed for the pre-R6 version of floor.w.s
-  //  * which outputs Integer.MAX_VALUE for negative numbers with magnitudes
-  //  * too large to fit in a 32-bit integer.
-  //  *
-  //  * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative
-  //  * numbers which are too large to be represented in a 32-bit signed
-  //  * integer will be processed by floor.w.s to output Integer.MIN_VALUE,
-  //  * and will no longer be processed by this "if" statement.
-  //  */
-  // if (out == Integer.MAX_VALUE) {
-  //   TMP = (in < 0.0f) ? 1 : 0;
-  //   /*
-  //    * If TMP is 1, then adding it to out will wrap its value from
-  //    * Integer.MAX_VALUE to Integer.MIN_VALUE.
-  //    */
-  //   return out += TMP;
-  // }
-  //
-  // /*
-  //  * For negative values not handled by the previous "if" statement the
-  //  * test here will correctly set the value of TMP.
-  //  */
-  // TMP = ((in - out) >= 0.5f) ? 1 : 0;
-  // return out += TMP;
-
-  // Test for NaN.
   if (IsR6()) {
-    __ CmpUnS(FTMP, in, in);
-  } else {
-    __ CunS(in, in);
-  }
+    // out = floor(in);
+    //
+    // if (out != MAX_VALUE && out != MIN_VALUE) {
+    //     TMP = ((in - out) >= 0.5) ? 1 : 0;
+    //     return out += TMP;
+    // }
+    // return out;
 
-  // Return zero for NaN.
-  __ Move(out, ZERO);
-  if (IsR6()) {
-    __ Bc1nez(FTMP, &done);
-  } else {
-    __ Bc1t(&done);
-  }
+    // out = floor(in);
+    __ FloorWS(FTMP, in);
+    __ Mfc1(out, FTMP);
 
-  // out = floor(in);
-  __ FloorWS(FTMP, in);
-  __ Mfc1(out, FTMP);
+    // if (out != MAX_VALUE && out != MIN_VALUE)
+    __ Addiu(TMP, out, 1);
+    __ Aui(TMP, TMP, 0x8000);  // TMP = out + 0x8000 0001
+                               // or    out - 0x7FFF FFFF.
+                               // IOW, TMP = 1 if out = Int.MIN_VALUE
+                               // or   TMP = 0 if out = Int.MAX_VALUE.
+    __ Srl(TMP, TMP, 1);       // TMP = 0 if out = Int.MIN_VALUE
+                               //         or out = Int.MAX_VALUE.
+    __ Beqz(TMP, &done);
 
-  if (!IsR6()) {
-    __ LoadConst32(TMP, -1);
-  }
+    // TMP = (0.5f <= (in - out)) ? -1 : 0;
+    __ Cvtsw(FTMP, FTMP);      // Convert output of floor.w.s back to "float".
+    __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
+    __ SubS(FTMP, in, FTMP);
+    __ Mtc1(AT, half);
 
-  // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0;
-  __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
-  __ Bne(AT, out, &finite);
-
-  __ Mtc1(ZERO, FTMP);
-  if (IsR6()) {
-    __ CmpLtS(FTMP, in, FTMP);
-    __ Mfc1(TMP, FTMP);
-  } else {
-    __ ColtS(in, FTMP);
-  }
-
-  __ B(&add);
-
-  __ Bind(&finite);
-
-  // TMP = (0.5f <= (in - out)) ? -1 : 0;
-  __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
-  __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
-  __ SubS(FTMP, in, FTMP);
-  __ Mtc1(AT, half);
-  if (IsR6()) {
     __ CmpLeS(FTMP, half, FTMP);
     __ Mfc1(TMP, FTMP);
+
+    // Return out -= TMP.
+    __ Subu(out, out, TMP);
   } else {
+    // if (in.isNaN) {
+    //   return 0;
+    // }
+    //
+    // out = floor.w.s(in);
+    //
+    // /*
+    //  * This "if" statement is only needed for the pre-R6 version of floor.w.s
+    //  * which outputs Integer.MAX_VALUE for negative numbers with magnitudes
+    //  * too large to fit in a 32-bit integer.
+    //  */
+    // if (out == Integer.MAX_VALUE) {
+    //   TMP = (in < 0.0f) ? 1 : 0;
+    //   /*
+    //    * If TMP is 1, then adding it to out will wrap its value from
+    //    * Integer.MAX_VALUE to Integer.MIN_VALUE.
+    //    */
+    //   return out += TMP;
+    // }
+    //
+    // /*
+    //  * For negative values not handled by the previous "if" statement the
+    //  * test here will correctly set the value of TMP.
+    //  */
+    // TMP = ((in - out) >= 0.5f) ? 1 : 0;
+    // return out += TMP;
+
+    MipsLabel finite;
+    MipsLabel add;
+
+    // Test for NaN.
+    __ CunS(in, in);
+
+    // Return zero for NaN.
+    __ Move(out, ZERO);
+    __ Bc1t(&done);
+
+    // out = floor(in);
+    __ FloorWS(FTMP, in);
+    __ Mfc1(out, FTMP);
+
+    __ LoadConst32(TMP, -1);
+
+    // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0;
+    __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
+    __ Bne(AT, out, &finite);
+
+    __ Mtc1(ZERO, FTMP);
+    __ ColtS(in, FTMP);
+
+    __ B(&add);
+
+    __ Bind(&finite);
+
+    // TMP = (0.5f <= (in - out)) ? -1 : 0;
+    __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
+    __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
+    __ SubS(FTMP, in, FTMP);
+    __ Mtc1(AT, half);
     __ ColeS(half, FTMP);
-  }
 
-  __ Bind(&add);
+    __ Bind(&add);
 
-  if (!IsR6()) {
     __ Movf(TMP, ZERO);
+
+    // Return out -= TMP.
+    __ Subu(out, out, TMP);
   }
-
-  // Return out -= TMP.
-  __ Subu(out, out, TMP);
-
   __ Bind(&done);
 }
 
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 9dce59b..a476b2b 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -890,54 +890,14 @@
   DCHECK(type == Primitive::kPrimFloat || type == Primitive::kPrimDouble);
 
   Mips64Label done;
-  Mips64Label finite;
-  Mips64Label add;
 
-  // if (in.isNaN) {
-  //   return 0;
-  // }
-  //
   // out = floor(in);
   //
-  // /*
-  //  * TODO: Amend this code when emulator FCSR.NAN2008=1 bug is fixed.
-  //  *
-  //  * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative
-  //  * numbers which are too large to be represented in a 32-/64-bit
-  //  * signed integer will be processed by floor.X.Y to output
-  //  * Integer.MIN_VALUE/Long.MIN_VALUE, and will no longer be
-  //  * processed by this "if" statement.
-  //  *
-  //  * However, this bug in the 64-bit MIPS emulator causes the
-  //  * behavior of floor.X.Y to be the same as pre-R6 implementations
-  //  * of MIPS64.  When that bug is fixed this logic should be amended.
-  //  */
-  // if (out == MAX_VALUE) {
-  //   TMP = (in < 0.0) ? 1 : 0;
-  //   /*
-  //    * If TMP is 1, then adding it to out will wrap its value from
-  //    * MAX_VALUE to MIN_VALUE.
-  //    */
+  // if (out != MAX_VALUE && out != MIN_VALUE) {
+  //   TMP = ((in - out) >= 0.5) ? 1 : 0;
   //   return out += TMP;
   // }
-  //
-  // /*
-  //  * For negative values not handled by the previous "if" statement the
-  //  * test here will correctly set the value of TMP.
-  //  */
-  // TMP = ((in - out) >= 0.5) ? 1 : 0;
-  // return out += TMP;
-
-  // Test for NaN.
-  if (type == Primitive::kPrimDouble) {
-    __ CmpUnD(FTMP, in, in);
-  } else {
-    __ CmpUnS(FTMP, in, in);
-  }
-
-  // Return zero for NaN.
-  __ Move(out, ZERO);
-  __ Bc1nez(FTMP, &done);
+  // return out;
 
   // out = floor(in);
   if (type == Primitive::kPrimDouble) {
@@ -948,27 +908,26 @@
     __ Mfc1(out, FTMP);
   }
 
-  // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0;
+  // if (out != MAX_VALUE && out != MIN_VALUE)
   if (type == Primitive::kPrimDouble) {
-    __ LoadConst64(AT, std::numeric_limits<int64_t>::max());
+    __ Daddiu(TMP, out, 1);
+    __ Dati(TMP, 0x8000);  // TMP = out + 0x8000 0000 0000 0001
+                           // or    out - 0x7FFF FFFF FFFF FFFF.
+                           // IOW, TMP = 1 if out = Long.MIN_VALUE
+                           // or   TMP = 0 if out = Long.MAX_VALUE.
+    __ Dsrl(TMP, TMP, 1);  // TMP = 0 if out = Long.MIN_VALUE
+                           //         or out = Long.MAX_VALUE.
+    __ Beqzc(TMP, &done);
   } else {
-    __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
+    __ Addiu(TMP, out, 1);
+    __ Aui(TMP, TMP, 0x8000);  // TMP = out + 0x8000 0001
+                               // or    out - 0x7FFF FFFF.
+                               // IOW, TMP = 1 if out = Int.MIN_VALUE
+                               // or   TMP = 0 if out = Int.MAX_VALUE.
+    __ Srl(TMP, TMP, 1);       // TMP = 0 if out = Int.MIN_VALUE
+                               //         or out = Int.MAX_VALUE.
+    __ Beqzc(TMP, &done);
   }
-  __ Bnec(AT, out, &finite);
-
-  if (type == Primitive::kPrimDouble) {
-    __ Dmtc1(ZERO, FTMP);
-    __ CmpLtD(FTMP, in, FTMP);
-    __ Dmfc1(AT, FTMP);
-  } else {
-    __ Mtc1(ZERO, FTMP);
-    __ CmpLtS(FTMP, in, FTMP);
-    __ Mfc1(AT, FTMP);
-  }
-
-  __ Bc(&add);
-
-  __ Bind(&finite);
 
   // TMP = (0.5 <= (in - out)) ? -1 : 0;
   if (type == Primitive::kPrimDouble) {
@@ -977,23 +936,21 @@
     __ SubD(FTMP, in, FTMP);
     __ Dmtc1(AT, half);
     __ CmpLeD(FTMP, half, FTMP);
-    __ Dmfc1(AT, FTMP);
+    __ Dmfc1(TMP, FTMP);
   } else {
     __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
     __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
     __ SubS(FTMP, in, FTMP);
     __ Mtc1(AT, half);
     __ CmpLeS(FTMP, half, FTMP);
-    __ Mfc1(AT, FTMP);
+    __ Mfc1(TMP, FTMP);
   }
 
-  __ Bind(&add);
-
   // Return out -= TMP.
   if (type == Primitive::kPrimDouble) {
-    __ Dsubu(out, out, AT);
+    __ Dsubu(out, out, TMP);
   } else {
-    __ Subu(out, out, AT);
+    __ Subu(out, out, TMP);
   }
 
   __ Bind(&done);
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index b4da20b..5229624 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1406,7 +1406,8 @@
   M(BitwiseNegatedRight, Instruction)                                   \
   M(DataProcWithShifterOp, Instruction)                                 \
   M(MultiplyAccumulate, Instruction)                                    \
-  M(IntermediateAddress, Instruction)
+  M(IntermediateAddress, Instruction)                                   \
+  M(IntermediateAddressIndex, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_arm
diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h
index c6bfbcc..075a816 100644
--- a/compiler/optimizing/nodes_shared.h
+++ b/compiler/optimizing/nodes_shared.h
@@ -150,6 +150,49 @@
   DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress);
 };
 
+// This instruction computes part of the array access offset (data and index offset).
+//
+// For array accesses the element address has the following structure:
+// Address = CONST_OFFSET + base_addr + (index << ELEM_SHIFT). Given the LDR/STR addressing modes,
+// the part (CONST_OFFSET + (index << ELEM_SHIFT)) can be shared across array accesses with the
+// same data type and index. For example, in the following loop 5 accesses can share the address
+// computation:
+//
+// void foo(int[] a, int[] b, int[] c) {
+//   for (i...) {
+//     a[i] = a[i] + 5;
+//     b[i] = b[i] + c[i];
+//   }
+// }
+//
+// Note: as the instruction doesn't involve the base array address in its computation, it has no
+// side effects (in contrast to HIntermediateAddress).
+class HIntermediateAddressIndex FINAL : public HExpression<3> {
+ public:
+  HIntermediateAddressIndex(
+      HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc)
+      : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) {
+    SetRawInputAt(0, index);
+    SetRawInputAt(1, offset);
+    SetRawInputAt(2, shift);
+  }
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;  // No data members to compare.
+  }
+  bool IsActualObject() const OVERRIDE { return false; }
+  // Input accessors: 0 = index, 1 = offset, 2 = shift (as set by the constructor).
+  HInstruction* GetIndex() const { return InputAt(0); }
+  HInstruction* GetOffset() const { return InputAt(1); }
+  HInstruction* GetShift() const { return InputAt(2); }
+
+  DECLARE_INSTRUCTION(IntermediateAddressIndex);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex);
+};
+
 class HDataProcWithShifterOp FINAL : public HExpression<2> {
  public:
   enum OpKind {
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 52c247b..92fe9bf 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -178,12 +178,17 @@
                       size_t vector_length,
                       uint32_t dex_pc)
       : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc),
-        alignment_(Primitive::ComponentSize(packed_type), 0) { }
+        alignment_(Primitive::ComponentSize(packed_type), 0) {
+    DCHECK_GE(number_of_inputs, 2u);
+  }
 
   void SetAlignment(Alignment alignment) { alignment_ = alignment; }
 
   Alignment GetAlignment() const { return alignment_; }
 
+  HInstruction* GetArray() const { return InputAt(0); }
+  HInstruction* GetIndex() const { return InputAt(1); }
+
   DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation);
 
  private:
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index f4afb33..c03b98c 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -1775,6 +1775,26 @@
   EmitMsaMI10((offset >> TIMES_8) & kMsaS10Mask, rs, wd, 0x9, 0x3);
 }
 
+void Mips64Assembler::IlvrB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());  // Emitting this instruction requires MSA support.
+  EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x14);  // 0x0: byte variant (IlvrH/W/D pass 0x1/0x2/0x3).
+}
+
+void Mips64Assembler::IlvrH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());  // Emitting this instruction requires MSA support.
+  EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x14);  // 0x1: halfword variant (IlvrB/W/D pass 0x0/0x2/0x3).
+}
+
+void Mips64Assembler::IlvrW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());  // Emitting this instruction requires MSA support.
+  EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x14);  // 0x2: word variant (IlvrB/H/D pass 0x0/0x1/0x3).
+}
+
+void Mips64Assembler::IlvrD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
+  CHECK(HasMsa());  // Emitting this instruction requires MSA support.
+  EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x14);  // 0x3: doubleword variant (IlvrB/H/W pass 0x0/0x1/0x2).
+}
+
 void Mips64Assembler::LoadConst32(GpuRegister rd, int32_t value) {
   TemplateLoadConst32(this, rd, value);
 }
@@ -2802,6 +2822,94 @@
   CHECK_EQ(misalignment, offset & (kMips64DoublewordSize - 1));
 }
 
+void Mips64Assembler::AdjustBaseOffsetAndElementSizeShift(GpuRegister& base,
+                                                          int32_t& offset,
+                                                          int& element_size_shift) {
+  // This method is used to adjust the base register, offset and element_size_shift
+  // for a vector load/store when the offset doesn't fit into the allowed number of bits.
+  // MSA ld.df and st.df instructions take signed offsets as arguments, but the maximum
+  // offset depends on the size of the data format df (10-bit offsets for ld.b,
+  // 11-bit for ld.h, 12-bit for ld.w and 13-bit for ld.d).
+  // If element_size_shift is non-negative at entry, it won't be changed, but offset
+  // will be checked for appropriate alignment. If negative at entry, it will be
+  // adjusted based on offset for maximum fit.
+  // It's assumed that `base` is a multiple of 8.
+
+  CHECK_NE(base, AT);  // Must not overwrite the register `base` while loading `offset`.
+
+  if (element_size_shift >= 0) {
+    CHECK_LE(element_size_shift, TIMES_8);
+    CHECK_GE(JAVASTYLE_CTZ(offset), element_size_shift);
+  } else if (IsAligned<kMips64DoublewordSize>(offset)) {
+    element_size_shift = TIMES_8;
+  } else if (IsAligned<kMips64WordSize>(offset)) {
+    element_size_shift = TIMES_4;
+  } else if (IsAligned<kMips64HalfwordSize>(offset)) {
+    element_size_shift = TIMES_2;
+  } else {
+    element_size_shift = TIMES_1;
+  }
+
+  const int low_len = 10 + element_size_shift;  // How many low bits of `offset` ld.df/st.df
+                                                // will take.
+  int16_t low = offset & ((1 << low_len) - 1);  // Isolate these bits.
+  low -= (low & (1 << (low_len - 1))) << 1;     // Sign-extend these bits.
+  if (low == offset) {
+    return;  // `offset` fits into ld.df/st.df.
+  }
+
+  // First, see if `offset` can be represented as a sum of two signed offsets.
+  // This can save an instruction.
+
+  // Max int16_t that's a multiple of element size.
+  const int32_t kMaxDeltaForSimpleAdjustment = 0x8000 - (1 << element_size_shift);
+  // Max ld.df/st.df offset that's a multiple of element size.
+  const int32_t kMaxLoadStoreOffset = 0x1ff << element_size_shift;
+  const int32_t kMaxOffsetForSimpleAdjustment = kMaxDeltaForSimpleAdjustment + kMaxLoadStoreOffset;
+
+  if (IsInt<16>(offset)) {
+    Daddiu(AT, base, offset);
+    offset = 0;
+  } else if (0 <= offset && offset <= kMaxOffsetForSimpleAdjustment) {
+    Daddiu(AT, base, kMaxDeltaForSimpleAdjustment);
+    offset -= kMaxDeltaForSimpleAdjustment;
+  } else if (-kMaxOffsetForSimpleAdjustment <= offset && offset < 0) {
+    Daddiu(AT, base, -kMaxDeltaForSimpleAdjustment);
+    offset += kMaxDeltaForSimpleAdjustment;
+  } else {
+    // Let's treat `offset` as 64-bit to simplify handling of sign
+    // extensions in the instructions that supply its smaller signed parts.
+    //
+    // 16-bit or smaller parts of `offset`:
+    // |63  top  48|47  hi  32|31  upper  16|15  mid  13-10|12-9  low  0|
+    //
+    // Instructions that supply each part as a signed integer addend:
+    // |dati       |dahi      |daui         |daddiu        |ld.df/st.df |
+    //
+    // `top` is always 0, so dati isn't used.
+    // `hi` is 1 when `offset` is close to +2GB and 0 otherwise.
+    uint64_t tmp = static_cast<uint64_t>(offset) - low;  // Exclude `low` from the rest of `offset`
+                                                         // (accounts for sign of `low`).
+    tmp += (tmp & (UINT64_C(1) << 15)) << 1;  // Account for sign extension in daddiu.
+    tmp += (tmp & (UINT64_C(1) << 31)) << 1;  // Account for sign extension in daui.
+    int16_t mid = Low16Bits(tmp);
+    int16_t upper = High16Bits(tmp);
+    int16_t hi = Low16Bits(High32Bits(tmp));
+    Daui(AT, base, upper);
+    if (hi != 0) {
+      CHECK_EQ(hi, 1);
+      Dahi(AT, hi);
+    }
+    if (mid != 0) {
+      Daddiu(AT, AT, mid);
+    }
+    offset = low;
+  }
+  base = AT;  // Every adjusting branch above left the new base address in AT.
+  CHECK_GE(JAVASTYLE_CTZ(offset), element_size_shift);
+  CHECK(IsInt<10>(offset >> element_size_shift));
+}
+
 void Mips64Assembler::LoadFromOffset(LoadOperandType type,
                                      GpuRegister reg,
                                      GpuRegister base,
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 6ac3361..c92cf4c 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -278,14 +278,16 @@
   kLoadUnsignedHalfword,
   kLoadWord,
   kLoadUnsignedWord,
-  kLoadDoubleword
+  kLoadDoubleword,
+  kLoadQuadword
 };
 
 enum StoreOperandType {
   kStoreByte,
   kStoreHalfword,
   kStoreWord,
-  kStoreDoubleword
+  kStoreDoubleword,
+  kStoreQuadword
 };
 
 // Used to test the values returned by ClassS/ClassD.
@@ -767,6 +769,11 @@
   void StW(VectorRegister wd, GpuRegister rs, int offset);
   void StD(VectorRegister wd, GpuRegister rs, int offset);
 
+  void IlvrB(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void IlvrH(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void IlvrW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+  void IlvrD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+
   // Higher level composite instructions.
   int InstrCountForLoadReplicatedConst32(int64_t);
   void LoadConst32(GpuRegister rd, int32_t value);
@@ -896,6 +903,10 @@
 
   void EmitLoad(ManagedRegister m_dst, GpuRegister src_register, int32_t src_offset, size_t size);
   void AdjustBaseAndOffset(GpuRegister& base, int32_t& offset, bool is_doubleword);
+  // If element_size_shift is negative at entry, its value will be calculated based on the offset.
+  void AdjustBaseOffsetAndElementSizeShift(GpuRegister& base,
+                                           int32_t& offset,
+                                           int& element_size_shift);
 
  private:
   // This will be used as an argument for loads/stores
@@ -1019,6 +1030,8 @@
           null_checker();
         }
         break;
+      default:
+        LOG(FATAL) << "UNREACHABLE";
     }
     if (type != kLoadDoubleword) {
       null_checker();
@@ -1031,7 +1044,12 @@
                          GpuRegister base,
                          int32_t offset,
                          ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
-    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
+    int element_size_shift = -1;
+    if (type != kLoadQuadword) {
+      AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
+    } else {
+      AdjustBaseOffsetAndElementSizeShift(base, offset, element_size_shift);
+    }
 
     switch (type) {
       case kLoadWord:
@@ -1051,6 +1069,17 @@
           null_checker();
         }
         break;
+      case kLoadQuadword:
+        switch (element_size_shift) {
+          case TIMES_1: LdB(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_2: LdH(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_4: LdW(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_8: LdD(static_cast<VectorRegister>(reg), base, offset); break;
+          default:
+            LOG(FATAL) << "UNREACHABLE";
+        }
+        null_checker();
+        break;
       default:
         LOG(FATAL) << "UNREACHABLE";
     }
@@ -1104,7 +1133,12 @@
                         GpuRegister base,
                         int32_t offset,
                         ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
-    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
+    int element_size_shift = -1;
+    if (type != kStoreQuadword) {
+      AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
+    } else {
+      AdjustBaseOffsetAndElementSizeShift(base, offset, element_size_shift);
+    }
 
     switch (type) {
       case kStoreWord:
@@ -1124,6 +1158,17 @@
           null_checker();
         }
         break;
+      case kStoreQuadword:
+        switch (element_size_shift) {
+          case TIMES_1: StB(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_2: StH(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_4: StW(static_cast<VectorRegister>(reg), base, offset); break;
+          case TIMES_8: StD(static_cast<VectorRegister>(reg), base, offset); break;
+          default:
+            LOG(FATAL) << "UNREACHABLE";
+        }
+        null_checker();
+        break;
       default:
         LOG(FATAL) << "UNREACHABLE";
     }
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index 084ce6f..fbebe0c 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -1970,6 +1970,50 @@
   __ LoadFpuFromOffset(mips64::kLoadDoubleword, mips64::F0, mips64::A0, -32768);
   __ LoadFpuFromOffset(mips64::kLoadDoubleword, mips64::F0, mips64::A0, 0xABCDEF00);
 
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 8);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 511);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 512);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 513);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 514);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 516);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1022);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1024);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1025);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1026);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1028);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2044);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2048);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2049);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2050);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2052);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4088);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4096);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4097);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4098);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4100);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4104);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x7FFC);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x8000);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x10000);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x12345678);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x12350078);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -256);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -511);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -513);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -1022);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -1026);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -2044);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -2052);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -4096);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -4104);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -32768);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0xABCDEF00);
+  __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x7FFFABCD);
+
   const char* expected =
       "lwc1 $f0, 0($a0)\n"
       "lwc1 $f0, 4($a0)\n"
@@ -2010,7 +2054,78 @@
       "ldc1 $f0, -256($a0)\n"
       "ldc1 $f0, -32768($a0)\n"
       "daui $at, $a0, 0xABCE\n"
-      "ldc1 $f0, -0x1100($at) # 0xEF00\n";
+      "ldc1 $f0, -0x1100($at) # 0xEF00\n"
+
+      "ld.d $w0, 0($a0)\n"
+      "ld.b $w0, 1($a0)\n"
+      "ld.h $w0, 2($a0)\n"
+      "ld.w $w0, 4($a0)\n"
+      "ld.d $w0, 8($a0)\n"
+      "ld.b $w0, 511($a0)\n"
+      "ld.d $w0, 512($a0)\n"
+      "daddiu $at, $a0, 513\n"
+      "ld.b $w0, 0($at)\n"
+      "ld.h $w0, 514($a0)\n"
+      "ld.w $w0, 516($a0)\n"
+      "ld.h $w0, 1022($a0)\n"
+      "ld.d $w0, 1024($a0)\n"
+      "daddiu $at, $a0, 1025\n"
+      "ld.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 1026\n"
+      "ld.h $w0, 0($at)\n"
+      "ld.w $w0, 1028($a0)\n"
+      "ld.w $w0, 2044($a0)\n"
+      "ld.d $w0, 2048($a0)\n"
+      "daddiu $at, $a0, 2049\n"
+      "ld.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 2050\n"
+      "ld.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 2052\n"
+      "ld.w $w0, 0($at)\n"
+      "ld.d $w0, 4088($a0)\n"
+      "daddiu $at, $a0, 4096\n"
+      "ld.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 4097\n"
+      "ld.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 4098\n"
+      "ld.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 4100\n"
+      "ld.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 4104\n"
+      "ld.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FFC\n"
+      "ld.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FF8\n"
+      "ld.d $w0, 8($at)\n"
+      "daui $at, $a0, 0x1\n"
+      "ld.d $w0, 0($at)\n"
+      "daui $at, $a0, 0x1234\n"
+      "daddiu $at, $at, 0x6000\n"
+      "ld.d $w0, -2440($at) # 0xF678\n"
+      "daui $at, $a0, 0x1235\n"
+      "ld.d $w0, 0x78($at)\n"
+      "ld.d $w0, -256($a0)\n"
+      "ld.b $w0, -511($a0)\n"
+      "daddiu $at, $a0, -513\n"
+      "ld.b $w0, 0($at)\n"
+      "ld.h $w0, -1022($a0)\n"
+      "daddiu $at, $a0, -1026\n"
+      "ld.h $w0, 0($at)\n"
+      "ld.w $w0, -2044($a0)\n"
+      "daddiu $at, $a0, -2052\n"
+      "ld.w $w0, 0($at)\n"
+      "ld.d $w0, -4096($a0)\n"
+      "daddiu $at, $a0, -4104\n"
+      "ld.d $w0, 0($at)\n"
+      "daddiu $at, $a0, -32768\n"
+      "ld.d $w0, 0($at)\n"
+      "daui $at, $a0, 0xABCE\n"
+      "daddiu $at, $at, -8192 # 0xE000\n"
+      "ld.d $w0, 0xF00($at)\n"
+      "daui $at, $a0, 0x8000\n"
+      "dahi $at, $at, 1\n"
+      "daddiu $at, $at, -21504 # 0xAC00\n"
+      "ld.b $w0, -51($at) # 0xFFCD\n";
   DriverStr(expected, "LoadFpuFromOffset");
 }
 
@@ -2200,6 +2315,50 @@
   __ StoreFpuToOffset(mips64::kStoreDoubleword, mips64::F0, mips64::A0, -32768);
   __ StoreFpuToOffset(mips64::kStoreDoubleword, mips64::F0, mips64::A0, 0xABCDEF00);
 
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 8);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 511);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 512);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 513);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 514);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 516);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1022);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1024);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1025);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1026);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1028);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2044);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2048);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2049);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2050);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2052);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4088);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4096);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4097);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4098);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4100);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4104);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x7FFC);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x8000);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x10000);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x12345678);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x12350078);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -256);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -511);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -513);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -1022);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -1026);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -2044);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -2052);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -4096);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -4104);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -32768);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0xABCDEF00);
+  __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x7FFFABCD);
+
   const char* expected =
       "swc1 $f0, 0($a0)\n"
       "swc1 $f0, 4($a0)\n"
@@ -2240,7 +2399,78 @@
       "sdc1 $f0, -256($a0)\n"
       "sdc1 $f0, -32768($a0)\n"
       "daui $at, $a0, 0xABCE\n"
-      "sdc1 $f0, -0x1100($at)\n";
+      "sdc1 $f0, -0x1100($at)\n"
+
+      "st.d $w0, 0($a0)\n"
+      "st.b $w0, 1($a0)\n"
+      "st.h $w0, 2($a0)\n"
+      "st.w $w0, 4($a0)\n"
+      "st.d $w0, 8($a0)\n"
+      "st.b $w0, 511($a0)\n"
+      "st.d $w0, 512($a0)\n"
+      "daddiu $at, $a0, 513\n"
+      "st.b $w0, 0($at)\n"
+      "st.h $w0, 514($a0)\n"
+      "st.w $w0, 516($a0)\n"
+      "st.h $w0, 1022($a0)\n"
+      "st.d $w0, 1024($a0)\n"
+      "daddiu $at, $a0, 1025\n"
+      "st.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 1026\n"
+      "st.h $w0, 0($at)\n"
+      "st.w $w0, 1028($a0)\n"
+      "st.w $w0, 2044($a0)\n"
+      "st.d $w0, 2048($a0)\n"
+      "daddiu $at, $a0, 2049\n"
+      "st.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 2050\n"
+      "st.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 2052\n"
+      "st.w $w0, 0($at)\n"
+      "st.d $w0, 4088($a0)\n"
+      "daddiu $at, $a0, 4096\n"
+      "st.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 4097\n"
+      "st.b $w0, 0($at)\n"
+      "daddiu $at, $a0, 4098\n"
+      "st.h $w0, 0($at)\n"
+      "daddiu $at, $a0, 4100\n"
+      "st.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 4104\n"
+      "st.d $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FFC\n"
+      "st.w $w0, 0($at)\n"
+      "daddiu $at, $a0, 0x7FF8\n"
+      "st.d $w0, 8($at)\n"
+      "daui $at, $a0, 0x1\n"
+      "st.d $w0, 0($at)\n"
+      "daui $at, $a0, 0x1234\n"
+      "daddiu $at, $at, 0x6000\n"
+      "st.d $w0, -2440($at) # 0xF678\n"
+      "daui $at, $a0, 0x1235\n"
+      "st.d $w0, 0x78($at)\n"
+      "st.d $w0, -256($a0)\n"
+      "st.b $w0, -511($a0)\n"
+      "daddiu $at, $a0, -513\n"
+      "st.b $w0, 0($at)\n"
+      "st.h $w0, -1022($a0)\n"
+      "daddiu $at, $a0, -1026\n"
+      "st.h $w0, 0($at)\n"
+      "st.w $w0, -2044($a0)\n"
+      "daddiu $at, $a0, -2052\n"
+      "st.w $w0, 0($at)\n"
+      "st.d $w0, -4096($a0)\n"
+      "daddiu $at, $a0, -4104\n"
+      "st.d $w0, 0($at)\n"
+      "daddiu $at, $a0, -32768\n"
+      "st.d $w0, 0($at)\n"
+      "daui $at, $a0, 0xABCE\n"
+      "daddiu $at, $at, -8192 # 0xE000\n"
+      "st.d $w0, 0xF00($at)\n"
+      "daui $at, $a0, 0x8000\n"
+      "dahi $at, $at, 1\n"
+      "daddiu $at, $at, -21504 # 0xAC00\n"
+      "st.b $w0, -51($at) # 0xFFCD\n";
   DriverStr(expected, "StoreFpuToOffset");
 }
 
@@ -2990,6 +3220,26 @@
             "st.d");
 }
 
+TEST_F(AssemblerMIPS64Test, IlvrB) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrB, "ilvr.b ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.b");
+}
+
+TEST_F(AssemblerMIPS64Test, IlvrH) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrH, "ilvr.h ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.h");
+}
+
+TEST_F(AssemblerMIPS64Test, IlvrW) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrW, "ilvr.w ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.w");
+}
+
+TEST_F(AssemblerMIPS64Test, IlvrD) {
+  DriverStr(RepeatVVV(&mips64::Mips64Assembler::IlvrD, "ilvr.d ${reg1}, ${reg2}, ${reg3}"),
+            "ilvr.d");
+}
+
 #undef __
 
 }  // namespace art
diff --git a/dexlayout/dexlayout_test.cc b/dexlayout/dexlayout_test.cc
index 877ea92..5a6a20d 100644
--- a/dexlayout/dexlayout_test.cc
+++ b/dexlayout/dexlayout_test.cc
@@ -41,7 +41,7 @@
     "AAAAdQEAAAAQAAABAAAAjAEAAA==";
 
 static const char kDexFileLayoutInputProfile[] =
-    "cHJvADAwNQABCwABAAAAAAD1KW3+Y2xhc3Nlcy5kZXgBAA==";
+    "cHJvADAwNwAAAAAAAAgAAAB4AQMAAAAAAQ==";
 
 // Dex file with catch handler unreferenced by try blocks.
 // Constructed by building a dex file with try/catch blocks and hex editing.
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index 3c60bf4..8894cc9 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -456,6 +456,7 @@
   { kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x7, "ldi", "kx" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x8 << 2), "ld", "kw" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x9 << 2), "st", "kw" },
+  { kMsaMask | (0x7 << 23), kMsa | (0x5 << 23) | 0x14, "ilvr", "Vkmn" },
 };
 
 static uint32_t ReadU32(const uint8_t* ptr) {
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 81b87f1..2bd1d64 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -322,12 +322,19 @@
   return data - ComputeRootTableSize(roots);
 }
 
+// Use a sentinel for marking entries in the JIT table that have been cleared.
+// This helps diagnose cases where the compiled code wrongly tries to access
+// such entries.
+static mirror::Class* const weak_sentinel = reinterpret_cast<mirror::Class*>(0x1);
+
 // Helper for the GC to process a weak class in a JIT root table.
-static inline void ProcessWeakClass(GcRoot<mirror::Class>* root_ptr, IsMarkedVisitor* visitor)
+static inline void ProcessWeakClass(GcRoot<mirror::Class>* root_ptr,
+                                    IsMarkedVisitor* visitor,
+                                    mirror::Class* update)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   // This does not need a read barrier because this is called by GC.
   mirror::Class* cls = root_ptr->Read<kWithoutReadBarrier>();
-  if (cls != nullptr) {
+  if (cls != nullptr && cls != weak_sentinel) {
     DCHECK((cls->IsClass<kDefaultVerifyFlags, kWithoutReadBarrier>()));
     // Look at the classloader of the class to know if it has been unloaded.
     // This does not need a read barrier because this is called by GC.
@@ -342,7 +349,7 @@
       }
     } else {
       // The class loader is not live, clear the entry.
-      *root_ptr = GcRoot<mirror::Class>(nullptr);
+      *root_ptr = GcRoot<mirror::Class>(update);
     }
   }
 }
@@ -356,7 +363,7 @@
     for (uint32_t i = 0; i < number_of_roots; ++i) {
       // This does not need a read barrier because this is called by GC.
       mirror::Object* object = roots[i].Read<kWithoutReadBarrier>();
-      if (object == nullptr) {
+      if (object == nullptr || object == weak_sentinel) {
         // entry got deleted in a previous sweep.
       } else if (object->IsString<kDefaultVerifyFlags, kWithoutReadBarrier>()) {
         mirror::Object* new_object = visitor->IsMarked(object);
@@ -371,7 +378,8 @@
           roots[i] = GcRoot<mirror::Object>(new_object);
         }
       } else {
-        ProcessWeakClass(reinterpret_cast<GcRoot<mirror::Class>*>(&roots[i]), visitor);
+        ProcessWeakClass(
+            reinterpret_cast<GcRoot<mirror::Class>*>(&roots[i]), visitor, weak_sentinel);
       }
     }
   }
@@ -380,7 +388,7 @@
     for (size_t i = 0; i < info->number_of_inline_caches_; ++i) {
       InlineCache* cache = &info->cache_[i];
       for (size_t j = 0; j < InlineCache::kIndividualCacheSize; ++j) {
-        ProcessWeakClass(&cache->classes_[j], visitor);
+        ProcessWeakClass(&cache->classes_[j], visitor, nullptr);
       }
     }
   }
diff --git a/runtime/jit/profile_compilation_info.cc b/runtime/jit/profile_compilation_info.cc
index 0acce1e..9ea5ece 100644
--- a/runtime/jit/profile_compilation_info.cc
+++ b/runtime/jit/profile_compilation_info.cc
@@ -18,11 +18,18 @@
 
 #include "errno.h"
 #include <limits.h>
+#include <string>
 #include <vector>
 #include <stdlib.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <zlib.h>
+#include <base/time_utils.h>
 
 #include "base/mutex.h"
 #include "base/scoped_flock.h"
@@ -33,13 +40,14 @@
 #include "os.h"
 #include "safe_map.h"
 #include "utils.h"
+#include "android-base/file.h"
 
 namespace art {
 
 const uint8_t ProfileCompilationInfo::kProfileMagic[] = { 'p', 'r', 'o', '\0' };
-// Last profile version: fix profman merges. Update profile version to force
-// regeneration of possibly faulty profiles.
-const uint8_t ProfileCompilationInfo::kProfileVersion[] = { '0', '0', '5', '\0' };
+// Last profile version: Instead of method index, put the difference with the last
+// method's index.
+const uint8_t ProfileCompilationInfo::kProfileVersion[] = { '0', '0', '7', '\0' };
 
 static constexpr uint16_t kMaxDexFileKeyLength = PATH_MAX;
 
@@ -209,12 +217,12 @@
 
 /**
  * Serialization format:
- *    magic,version,number_of_dex_files
- *    dex_location1,number_of_classes1,methods_region_size,dex_location_checksum1, \
+ *    magic,version,number_of_dex_files,uncompressed_size_of_zipped_data,compressed_data_size,
+ *    zipped[dex_location1,number_of_classes1,methods_region_size,dex_location_checksum1, \
  *        method_encoding_11,method_encoding_12...,class_id1,class_id2...
  *    dex_location2,number_of_classes2,methods_region_size,dex_location_checksum2, \
  *        method_encoding_21,method_encoding_22...,,class_id1,class_id2...
- *    .....
+ *    .....]
  * The method_encoding is:
  *    method_id,number_of_inline_caches,inline_cache1,inline_cache2...
  * The inline_cache is:
@@ -228,28 +236,53 @@
  *    When present, there will be no class ids following.
  **/
 bool ProfileCompilationInfo::Save(int fd) {
+  uint64_t start = NanoTime();
   ScopedTrace trace(__PRETTY_FUNCTION__);
   DCHECK_GE(fd, 0);
 
-  // Cache at most 50KB before writing.
-  static constexpr size_t kMaxSizeToKeepBeforeWriting = 50 * KB;
   // Use a vector wrapper to avoid keeping track of offsets when we add elements.
   std::vector<uint8_t> buffer;
-  WriteBuffer(fd, kProfileMagic, sizeof(kProfileMagic));
-  WriteBuffer(fd, kProfileVersion, sizeof(kProfileVersion));
+  if (!WriteBuffer(fd, kProfileMagic, sizeof(kProfileMagic))) {
+    return false;
+  }
+  if (!WriteBuffer(fd, kProfileVersion, sizeof(kProfileVersion))) {
+    return false;
+  }
   DCHECK_LE(info_.size(), std::numeric_limits<uint8_t>::max());
   AddUintToBuffer(&buffer, static_cast<uint8_t>(info_.size()));
 
+  uint32_t required_capacity = 0;
+  for (const DexFileData* dex_data_ptr : info_) {
+    const DexFileData& dex_data = *dex_data_ptr;
+    uint32_t methods_region_size = GetMethodsRegionSize(dex_data);
+    required_capacity += kLineHeaderSize +
+        dex_data.profile_key.size() +
+        sizeof(uint16_t) * dex_data.class_set.size() +
+        methods_region_size;
+  }
+  if (required_capacity > kProfileSizeErrorThresholdInBytes) {
+    LOG(ERROR) << "Profile data size exceeds "
+               << std::to_string(kProfileSizeErrorThresholdInBytes)
+               << " bytes. Profile will not be written to disk.";
+    return false;
+  }
+  if (required_capacity > kProfileSizeWarningThresholdInBytes) {
+    LOG(WARNING) << "Profile data size exceeds "
+                 << std::to_string(kProfileSizeWarningThresholdInBytes);
+  }
+  AddUintToBuffer(&buffer, required_capacity);
+  if (!WriteBuffer(fd, buffer.data(), buffer.size())) {
+    return false;
+  }
+  // Make sure that the buffer has enough capacity to avoid repeated resizings
+  // while we add data.
+  buffer.reserve(required_capacity);
+  buffer.clear();
+
   // Dex files must be written in the order of their profile index. This
   // avoids writing the index in the output file and simplifies the parsing logic.
   for (const DexFileData* dex_data_ptr : info_) {
     const DexFileData& dex_data = *dex_data_ptr;
-    if (buffer.size() > kMaxSizeToKeepBeforeWriting) {
-      if (!WriteBuffer(fd, buffer.data(), buffer.size())) {
-        return false;
-      }
-      buffer.clear();
-    }
 
     // Note that we allow dex files without any methods or classes, so that
     // inline caches can refer valid dex files.
@@ -259,16 +292,8 @@
       return false;
     }
 
-    // Make sure that the buffer has enough capacity to avoid repeated resizings
-    // while we add data.
     uint32_t methods_region_size = GetMethodsRegionSize(dex_data);
-    size_t required_capacity = buffer.size() +
-        kLineHeaderSize +
-        dex_data.profile_key.size() +
-        sizeof(uint16_t) * dex_data.class_set.size() +
-        methods_region_size;
 
-    buffer.reserve(required_capacity);
     DCHECK_LE(dex_data.profile_key.size(), std::numeric_limits<uint16_t>::max());
     DCHECK_LE(dex_data.class_set.size(), std::numeric_limits<uint16_t>::max());
     AddUintToBuffer(&buffer, static_cast<uint16_t>(dex_data.profile_key.size()));
@@ -278,19 +303,49 @@
 
     AddStringToBuffer(&buffer, dex_data.profile_key);
 
+    uint16_t last_method_index = 0;
     for (const auto& method_it : dex_data.method_map) {
-      AddUintToBuffer(&buffer, method_it.first);
+      // Store the difference between the method indices. The SafeMap is ordered by
+      // method_id, so the difference will always be non negative.
+      DCHECK_GE(method_it.first, last_method_index);
+      uint16_t diff_with_last_method_index = method_it.first - last_method_index;
+      last_method_index = method_it.first;
+      AddUintToBuffer(&buffer, diff_with_last_method_index);
       AddInlineCacheToBuffer(&buffer, method_it.second);
     }
-    for (const auto& class_id : dex_data.class_set) {
-      AddUintToBuffer(&buffer, class_id.index_);
-    }
 
-    DCHECK_LE(required_capacity, buffer.size())
-        << "Failed to add the expected number of bytes in the buffer";
+    uint16_t last_class_index = 0;
+    for (const auto& class_id : dex_data.class_set) {
+      // Store the difference between the class indices. The set is ordered by
+      // class_id, so the difference will always be non negative.
+      DCHECK_GE(class_id.index_, last_class_index);
+      uint16_t diff_with_last_class_index = class_id.index_ - last_class_index;
+      last_class_index = class_id.index_;
+      AddUintToBuffer(&buffer, diff_with_last_class_index);
+    }
   }
 
-  return WriteBuffer(fd, buffer.data(), buffer.size());
+  uint32_t output_size = 0;
+  std::unique_ptr<uint8_t[]> compressed_buffer = DeflateBuffer(buffer.data(),
+                                                               required_capacity,
+                                                               &output_size);
+
+  buffer.clear();
+  AddUintToBuffer(&buffer, output_size);
+
+  if (!WriteBuffer(fd, buffer.data(), buffer.size())) {
+    return false;
+  }
+  if (!WriteBuffer(fd, compressed_buffer.get(), output_size)) {
+    return false;
+  }
+  uint64_t total_time = NanoTime() - start;
+  VLOG(profiler) << "Compressed from "
+                 << std::to_string(required_capacity)
+                 << " to "
+                 << std::to_string(output_size);
+  VLOG(profiler) << "Time to save profile: " << std::to_string(total_time);
+  return true;
 }
 
 void ProfileCompilationInfo::AddInlineCacheToBuffer(std::vector<uint8_t>* buffer,
@@ -584,33 +639,60 @@
                                          uint8_t number_of_dex_files,
                                          const ProfileLineHeader& line_header,
                                          /*out*/std::string* error) {
-  while (buffer.HasMoreData()) {
+  uint32_t unread_bytes_before_operation = buffer.CountUnreadBytes();
+  if (unread_bytes_before_operation < line_header.method_region_size_bytes) {
+    *error += "Profile EOF reached prematurely for ReadMethod";
+    return false;  // This function returns bool; do not return a ProfileLoadSatus enumerator.
+  }
+  size_t expected_unread_bytes_after_operation = buffer.CountUnreadBytes()
+      - line_header.method_region_size_bytes;
+  uint16_t last_method_index = 0;  // Method indices are stored as deltas from the previous one.
+  while (buffer.CountUnreadBytes() > expected_unread_bytes_after_operation) {
     DexFileData* const data = GetOrAddDexFileData(line_header.dex_location, line_header.checksum);
-    uint16_t method_index;
-    READ_UINT(uint16_t, buffer, method_index, error);
-
+    uint16_t diff_with_last_method_index;
+    READ_UINT(uint16_t, buffer, diff_with_last_method_index, error);
+    uint16_t method_index = last_method_index + diff_with_last_method_index;
+    last_method_index = method_index;
     auto it = data->method_map.FindOrAdd(method_index);
     if (!ReadInlineCache(buffer, number_of_dex_files, &(it->second), error)) {
       return false;
     }
   }
-
+  uint32_t total_bytes_read = unread_bytes_before_operation - buffer.CountUnreadBytes();
+  if (total_bytes_read != line_header.method_region_size_bytes) {  // Over-read past the region.
+    *error += "Profile data inconsistent for ReadMethods";
+    return false;
+  }
   return true;
 }
 
 bool ProfileCompilationInfo::ReadClasses(SafeBuffer& buffer,
-                                         uint16_t classes_to_read,
                                          const ProfileLineHeader& line_header,
                                          /*out*/std::string* error) {
-  for (uint16_t i = 0; i < classes_to_read; i++) {
-    uint16_t type_index;
-    READ_UINT(uint16_t, buffer, type_index, error);
+  size_t unread_bytes_before_op = buffer.CountUnreadBytes();
+  if (unread_bytes_before_op < line_header.class_set_size * sizeof(uint16_t)) {  // Bytes, not count.
+    *error += "Profile EOF reached prematurely for ReadClasses";
+    return false;  // This function returns bool; do not return a ProfileLoadSatus enumerator.
+  }
+
+  uint16_t last_class_index = 0;  // Class indices are stored as deltas from the previous one.
+  for (uint16_t i = 0; i < line_header.class_set_size; i++) {
+    uint16_t diff_with_last_class_index;
+    READ_UINT(uint16_t, buffer, diff_with_last_class_index, error);
+    uint16_t type_index = last_class_index + diff_with_last_class_index;
+    last_class_index = type_index;
     if (!AddClassIndex(line_header.dex_location,
                        line_header.checksum,
                        dex::TypeIndex(type_index))) {
       return false;
     }
   }
+  size_t total_bytes_read = unread_bytes_before_op - buffer.CountUnreadBytes();
+  uint32_t expected_bytes_read = line_header.class_set_size * sizeof(uint16_t);
+  if (total_bytes_read != expected_bytes_read) {
+    *error += "Profile data inconsistent for ReadClasses";
+    return false;
+  }
   return true;
 }
 
@@ -650,15 +732,11 @@
   return false;
 }
 
-bool ProfileCompilationInfo::SafeBuffer::HasMoreData() {
-  return ptr_current_ < ptr_end_;
-}
-
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::SafeBuffer::FillFromFd(
       int fd,
       const std::string& source,
       /*out*/std::string* error) {
-  size_t byte_count = ptr_end_ - ptr_current_;
+  size_t byte_count = (ptr_end_ - ptr_current_) * sizeof(*ptr_current_);
   uint8_t* buffer = ptr_current_;
   while (byte_count > 0) {
     int bytes_read = TEMP_FAILURE_RETRY(read(fd, buffer, byte_count));
@@ -675,15 +753,31 @@
   return kProfileLoadSuccess;
 }
 
+size_t ProfileCompilationInfo::SafeBuffer::CountUnreadBytes() {
+  return (ptr_end_ - ptr_current_) * sizeof(*ptr_current_);  // Bytes from the cursor to the end.
+}
+
+const uint8_t* ProfileCompilationInfo::SafeBuffer::GetCurrentPtr() {
+  return ptr_current_;  // Current read cursor; this call does not advance it.
+}
+
+void ProfileCompilationInfo::SafeBuffer::Advance(size_t data_size) {
+  ptr_current_ += data_size;  // No bounds check here; compare with CountUnreadBytes() first.
+}
+
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::ReadProfileHeader(
       int fd,
       /*out*/uint8_t* number_of_dex_files,
+      /*out*/uint32_t* uncompressed_data_size,
+      /*out*/uint32_t* compressed_data_size,
       /*out*/std::string* error) {
   // Read magic and version
   const size_t kMagicVersionSize =
     sizeof(kProfileMagic) +
     sizeof(kProfileVersion) +
-    sizeof(uint8_t);  // number of dex files
+    sizeof(uint8_t) +  // number of dex files
+    sizeof(uint32_t) +  // size of uncompressed profile data
+    sizeof(uint32_t);  // size of compressed profile data
 
   SafeBuffer safe_buffer(kMagicVersionSize);
 
@@ -704,6 +798,14 @@
     *error = "Cannot read the number of dex files";
     return kProfileLoadBadData;
   }
+  if (!safe_buffer.ReadUintAndAdvance<uint32_t>(uncompressed_data_size)) {
+    *error = "Cannot read the size of uncompressed data";
+    return kProfileLoadBadData;
+  }
+  if (!safe_buffer.ReadUintAndAdvance<uint32_t>(compressed_data_size)) {
+    *error = "Cannot read the size of compressed data";
+    return kProfileLoadBadData;
+  }
   return kProfileLoadSuccess;
 }
 
@@ -719,17 +821,16 @@
 }
 
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::ReadProfileLineHeader(
-      int fd,
-      /*out*/ProfileLineHeader* line_header,
-      /*out*/std::string* error) {
-  SafeBuffer header_buffer(kLineHeaderSize);
-  ProfileLoadSatus status = header_buffer.FillFromFd(fd, "ReadProfileLineHeader", error);
-  if (status != kProfileLoadSuccess) {
-    return status;
+    SafeBuffer& buffer,
+    /*out*/ProfileLineHeader* line_header,
+    /*out*/std::string* error) {
+  if (buffer.CountUnreadBytes() < kLineHeaderSize) {
+    *error += "Profile EOF reached prematurely for ReadProfileLineHeader";
+    return kProfileLoadBadData;
   }
 
   uint16_t dex_location_size;
-  if (!ReadProfileLineHeaderElements(header_buffer, &dex_location_size, line_header, error)) {
+  if (!ReadProfileLineHeaderElements(buffer, &dex_location_size, line_header, error)) {
     return kProfileLoadBadData;
   }
 
@@ -739,18 +840,19 @@
     return kProfileLoadBadData;
   }
 
-  SafeBuffer location_buffer(dex_location_size);
-  status = location_buffer.FillFromFd(fd, "ReadProfileHeaderDexLocation", error);
-  if (status != kProfileLoadSuccess) {
-    return status;
+  if (buffer.CountUnreadBytes() < dex_location_size) {
+    *error += "Profile EOF reached prematurely for ReadProfileHeaderDexLocation";
+    return kProfileLoadBadData;
   }
+  const uint8_t* base_ptr = buffer.GetCurrentPtr();
   line_header->dex_location.assign(
-      reinterpret_cast<char*>(location_buffer.Get()), dex_location_size);
+      reinterpret_cast<const char*>(base_ptr), dex_location_size);
+  buffer.Advance(dex_location_size);
   return kProfileLoadSuccess;
 }
 
 ProfileCompilationInfo::ProfileLoadSatus ProfileCompilationInfo::ReadProfileLine(
-      int fd,
+      SafeBuffer& buffer,
       uint8_t number_of_dex_files,
       const ProfileLineHeader& line_header,
       /*out*/std::string* error) {
@@ -760,29 +862,13 @@
     return kProfileLoadBadData;
   }
 
-  {
-    SafeBuffer buffer(line_header.method_region_size_bytes);
-    ProfileLoadSatus status = buffer.FillFromFd(fd, "ReadProfileLineMethods", error);
-    if (status != kProfileLoadSuccess) {
-      return status;
-    }
-
-    if (!ReadMethods(buffer, number_of_dex_files, line_header, error)) {
-      return kProfileLoadBadData;
-    }
+  if (!ReadMethods(buffer, number_of_dex_files, line_header, error)) {
+    return kProfileLoadBadData;
   }
 
-  {
-    SafeBuffer buffer(sizeof(uint16_t) * line_header.class_set_size);
-    ProfileLoadSatus status = buffer.FillFromFd(fd, "ReadProfileLineClasses", error);
-    if (status != kProfileLoadSuccess) {
-      return status;
-    }
-    if (!ReadClasses(buffer, line_header.class_set_size, line_header, error)) {
-      return kProfileLoadBadData;
-    }
+  if (!ReadClasses(buffer, line_header, error)) {
+    return kProfileLoadBadData;
   }
-
   return kProfileLoadSuccess;
 }
 
@@ -821,39 +907,135 @@
   }
   // Read profile header: magic + version + number_of_dex_files.
   uint8_t number_of_dex_files;
-  ProfileLoadSatus status = ReadProfileHeader(fd, &number_of_dex_files, error);
+  uint32_t uncompressed_data_size;
+  uint32_t compressed_data_size;
+  ProfileLoadSatus status = ReadProfileHeader(fd,
+                                              &number_of_dex_files,
+                                              &uncompressed_data_size,
+                                              &compressed_data_size,
+                                              error);
+
   if (status != kProfileLoadSuccess) {
     return status;
   }
 
+  if (uncompressed_data_size > kProfileSizeErrorThresholdInBytes) {
+    LOG(ERROR) << "Profile data size exceeds "
+               << std::to_string(kProfileSizeErrorThresholdInBytes)
+               << " bytes";
+    return kProfileLoadBadData;
+  }
+  if (uncompressed_data_size > kProfileSizeWarningThresholdInBytes) {
+    LOG(WARNING) << "Profile data size exceeds "
+                 << std::to_string(kProfileSizeWarningThresholdInBytes)
+                 << " bytes";
+  }
+
+  std::unique_ptr<uint8_t[]> compressed_data(new uint8_t[compressed_data_size]);
+  bool bytes_read_success =
+      android::base::ReadFully(fd, compressed_data.get(), compressed_data_size);
+
+  if (testEOF(fd) != 0) {
+    *error += "Unexpected data in the profile file.";
+    return kProfileLoadBadData;
+  }
+
+  if (!bytes_read_success) {
+    *error += "Unable to read compressed profile data";
+    return kProfileLoadBadData;
+  }
+
+  SafeBuffer uncompressed_data(uncompressed_data_size);
+
+  int ret = InflateBuffer(compressed_data.get(),
+                          compressed_data_size,
+                          uncompressed_data_size,
+                          uncompressed_data.Get());
+
+  if (ret != Z_STREAM_END) {
+    *error += "Error reading uncompressed profile data";
+    return kProfileLoadBadData;
+  }
+
   for (uint8_t k = 0; k < number_of_dex_files; k++) {
     ProfileLineHeader line_header;
 
     // First, read the line header to get the amount of data we need to read.
-    status = ReadProfileLineHeader(fd, &line_header, error);
+    status = ReadProfileLineHeader(uncompressed_data, &line_header, error);
     if (status != kProfileLoadSuccess) {
       return status;
     }
 
     // Now read the actual profile line.
-    status = ReadProfileLine(fd, number_of_dex_files, line_header, error);
+    status = ReadProfileLine(uncompressed_data, number_of_dex_files, line_header, error);
     if (status != kProfileLoadSuccess) {
       return status;
     }
   }
 
   // Check that we read everything and that profiles don't contain junk data.
-  int result = testEOF(fd);
-  if (result == 0) {
-    return kProfileLoadSuccess;
-  } else if (result < 0) {
-    return kProfileLoadIOError;
-  } else {
+  if (uncompressed_data.CountUnreadBytes() > 0) {
     *error = "Unexpected content in the profile file";
     return kProfileLoadBadData;
+  } else {
+    return kProfileLoadSuccess;
   }
 }
 
+std::unique_ptr<uint8_t[]> ProfileCompilationInfo::DeflateBuffer(const uint8_t* in_buffer,
+                                                                 uint32_t in_size,
+                                                                 uint32_t* compressed_data_size) {
+  // Compresses in_size bytes of in_buffer with zlib level 1. Returns the compressed data and
+  // sets *compressed_data_size; returns nullptr (size left untouched) if zlib reports a failure.
+  z_stream strm;
+  strm.zalloc = Z_NULL;
+  strm.zfree = Z_NULL;
+  strm.opaque = Z_NULL;
+  if (deflateInit(&strm, 1) != Z_OK) {
+    return nullptr;
+  }
+  const uint32_t out_size = deflateBound(&strm, in_size);  // Upper bound on the output size.
+  std::unique_ptr<uint8_t[]> compressed_buffer(new uint8_t[out_size]);
+  strm.avail_in = in_size;
+  strm.next_in = const_cast<uint8_t*>(in_buffer);
+  strm.avail_out = out_size;
+  strm.next_out = compressed_buffer.get();
+  const int ret = deflate(&strm, Z_FINISH);
+  const uint32_t bytes_written = out_size - strm.avail_out;
+  deflateEnd(&strm);  // Always release the zlib state, including on the error path.
+  if (ret != Z_STREAM_END) {  // Anything else means the stream was not fully written.
+    return nullptr;
+  }
+  *compressed_data_size = bytes_written;
+  return compressed_buffer;
+}
+
+int ProfileCompilationInfo::InflateBuffer(const uint8_t* in_buffer,
+                                          uint32_t in_size,
+                                          uint32_t expected_uncompressed_data_size,
+                                          uint8_t* out_buffer) {
+  // Inflates in_size bytes of in_buffer into out_buffer, which must hold exactly
+  // expected_uncompressed_data_size bytes. Returns Z_STREAM_END on success, Z_DATA_ERROR if
+  // input or output space is left over, and otherwise the code reported by zlib.
+  z_stream strm;
+  strm.zalloc = Z_NULL;
+  strm.zfree = Z_NULL;
+  strm.opaque = Z_NULL;
+  strm.avail_in = in_size;
+  strm.next_in = const_cast<uint8_t*>(in_buffer);
+  strm.avail_out = expected_uncompressed_data_size;
+  strm.next_out = out_buffer;
+
+  int ret = inflateInit(&strm);
+  if (ret != Z_OK) {
+    return ret;  // Initialization failed, so there is no inflate state to release.
+  }
+  ret = inflate(&strm, Z_NO_FLUSH);
+  const bool leftovers = (strm.avail_in != 0 || strm.avail_out != 0);
+  inflateEnd(&strm);  // Release the zlib state on every path, including the error ones.
+  return leftovers ? Z_DATA_ERROR : ret;
+}
+
 bool ProfileCompilationInfo::MergeWith(const ProfileCompilationInfo& other) {
   // First verify that all checksums match. This will avoid adding garbage to
   // the current profile info.
diff --git a/runtime/jit/profile_compilation_info.h b/runtime/jit/profile_compilation_info.h
index f68ed5d..9e47cc1 100644
--- a/runtime/jit/profile_compilation_info.h
+++ b/runtime/jit/profile_compilation_info.h
@@ -284,6 +284,9 @@
     kProfileLoadSuccess
   };
 
+  const uint32_t kProfileSizeWarningThresholdInBytes = 500000U;
+  const uint32_t kProfileSizeErrorThresholdInBytes = 1000000U;
+
   // Internal representation of the profile information belonging to a dex file.
   // Note that we could do without profile_key (the key used to encode the dex
   // file in the profile) and profile_index (the index of the dex file in the
@@ -353,6 +356,21 @@
   // Checks if the profile is empty.
   bool IsEmpty() const;
 
+  // Deflate the input buffer (in_buffer) of size in_size. It returns a buffer of
+  // compressed data for the input buffer of "compressed_data_size" size.
+  std::unique_ptr<uint8_t[]> DeflateBuffer(const uint8_t* in_buffer,
+                                           uint32_t in_size,
+                                           /*out*/uint32_t* compressed_data_size);
+
+  // Inflate the input buffer(in_buffer) of size in_size. out_size is the expected output
+  // size of the buffer. It puts the output in out_buffer. It returns Z_STREAM_END on
+  // success. On error, it returns Z_STREAM_ERROR if the compressed data is inconsistent
+  // and Z_DATA_ERROR if the stream ended prematurely or the stream has extra data.
+  int InflateBuffer(const uint8_t* in_buffer,
+                    uint32_t in_size,
+                    uint32_t out_size,
+                    /*out*/uint8_t* out_buffer);
+
   // Parsing functionality.
 
   // The information present in the header of each profile line.
@@ -376,6 +394,10 @@
                                 const std::string& source,
                                 /*out*/std::string* error);
 
+    ProfileLoadSatus FillFromBuffer(uint8_t* buffer_ptr,
+                                    const std::string& source,
+                                    /*out*/std::string* error);
+
     // Reads an uint value (high bits to low bits) and advances the current pointer
     // with the number of bits read.
     template <typename T> bool ReadUintAndAdvance(/*out*/ T* value);
@@ -384,16 +406,22 @@
     // equal it advances the current pointer by data_size.
     bool CompareAndAdvance(const uint8_t* data, size_t data_size);
 
-    // Returns true if the buffer has more data to read.
-    bool HasMoreData();
+    // Advances current pointer by data_size.
+    void Advance(size_t data_size);
+
+    // Returns the count of unread bytes.
+    size_t CountUnreadBytes();
+
+    // Returns the current pointer.
+    const uint8_t* GetCurrentPtr();
 
     // Get the underlying raw buffer.
     uint8_t* Get() { return storage_.get(); }
 
    private:
     std::unique_ptr<uint8_t[]> storage_;
-    uint8_t* ptr_current_;
     uint8_t* ptr_end_;
+    uint8_t* ptr_current_;
   };
 
   // Entry point for profile loding functionality.
@@ -403,10 +431,12 @@
   // lines into number_of_dex_files.
   ProfileLoadSatus ReadProfileHeader(int fd,
                                      /*out*/uint8_t* number_of_dex_files,
+                                     /*out*/uint32_t* size_uncompressed_data,
+                                     /*out*/uint32_t* size_compressed_data,
                                      /*out*/std::string* error);
 
   // Read the header of a profile line from the given fd.
-  ProfileLoadSatus ReadProfileLineHeader(int fd,
+  ProfileLoadSatus ReadProfileLineHeader(SafeBuffer& buffer,
                                          /*out*/ProfileLineHeader* line_header,
                                          /*out*/std::string* error);
 
@@ -417,14 +447,13 @@
                                      /*out*/std::string* error);
 
   // Read a single profile line from the given fd.
-  ProfileLoadSatus ReadProfileLine(int fd,
+  ProfileLoadSatus ReadProfileLine(SafeBuffer& buffer,
                                    uint8_t number_of_dex_files,
                                    const ProfileLineHeader& line_header,
                                    /*out*/std::string* error);
 
   // Read all the classes from the buffer into the profile `info_` structure.
   bool ReadClasses(SafeBuffer& buffer,
-                   uint16_t classes_to_read,
                    const ProfileLineHeader& line_header,
                    /*out*/std::string* error);
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 0bc0869..b169373 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1185,12 +1185,6 @@
 
   if (!no_sig_chain_) {
     // Dex2Oat's Runtime does not need the signal chain or the fault handler.
-
-    // Initialize the signal chain so that any calls to sigaction get
-    // correctly routed to the next in the chain regardless of whether we
-    // have claimed the signal or not.
-    InitializeSignalChain();
-
     if (implicit_null_checks_ || implicit_so_checks_ || implicit_suspend_checks_) {
       fault_manager.Init();
 
diff --git a/sigchainlib/sigchain.cc b/sigchainlib/sigchain.cc
index f4799d2..13c03e8 100644
--- a/sigchainlib/sigchain.cc
+++ b/sigchainlib/sigchain.cc
@@ -93,6 +93,35 @@
 static decltype(&sigaction) linked_sigaction;
 static decltype(&sigprocmask) linked_sigprocmask;
 
+// Resolves the next `sigaction`/`sigprocmask` symbols and caches them in
+// `linked_sigaction`/`linked_sigprocmask`. Declared as a constructor and guarded by
+// std::call_once, so the lookup body runs at most once however often this is called.
+__attribute__((constructor)) static void InitializeSignalChain() {
+  static std::once_flag init_flag;
+  std::call_once(init_flag, []() {
+    void* next_sigaction = dlsym(RTLD_NEXT, "sigaction");
+    if (next_sigaction == nullptr) {
+      next_sigaction = dlsym(RTLD_DEFAULT, "sigaction");
+      if (next_sigaction == nullptr ||
+          next_sigaction == reinterpret_cast<void*>(sigaction)) {
+        fatal("Unable to find next sigaction in signal chain");
+      }
+    }
+
+    void* next_sigprocmask = dlsym(RTLD_NEXT, "sigprocmask");
+    if (next_sigprocmask == nullptr) {
+      next_sigprocmask = dlsym(RTLD_DEFAULT, "sigprocmask");
+      if (next_sigprocmask == nullptr ||
+          next_sigprocmask == reinterpret_cast<void*>(sigprocmask)) {
+        fatal("Unable to find next sigprocmask in signal chain");
+      }
+    }
+
+    linked_sigaction = reinterpret_cast<decltype(linked_sigaction)>(next_sigaction);
+    linked_sigprocmask = reinterpret_cast<decltype(linked_sigprocmask)>(next_sigprocmask);
+  });
+}
+
 static pthread_key_t GetHandlingSignalKey() {
   static pthread_key_t key;
   static std::once_flag once;
@@ -263,6 +292,8 @@
 }
 
 extern "C" int sigaction(int signal, const struct sigaction* new_action, struct sigaction* old_action) {
+  InitializeSignalChain();
+
   // If this signal has been claimed as a signal chain, record the user's
   // action but don't pass it on to the kernel.
   // Note that we check that the signal number is in range here.  An out of range signal
@@ -285,11 +316,12 @@
 
   // Will only get here if the signal chain has not been claimed.  We want
   // to pass the sigaction on to the kernel via the real sigaction in libc.
-  InitializeSignalChain();
   return linked_sigaction(signal, new_action, old_action);
 }
 
 extern "C" sighandler_t signal(int signo, sighandler_t handler) {
+  InitializeSignalChain();
+
   if (signo < 0 || signo > _NSIG) {
     errno = EINVAL;
     return SIG_ERR;
@@ -311,7 +343,6 @@
 
   // Will only get here if the signal chain has not been claimed.  We want
   // to pass the sigaction on to the kernel via the real sigaction in libc.
-  InitializeSignalChain();
   if (linked_sigaction(signo, &sa, &sa) == -1) {
     return SIG_ERR;
   }
@@ -321,11 +352,15 @@
 
 #if !defined(__LP64__)
 extern "C" sighandler_t bsd_signal(int signo, sighandler_t handler) {
+  InitializeSignalChain();
+
   return signal(signo, handler);
 }
 #endif
 
 extern "C" int sigprocmask(int how, const sigset_t* bionic_new_set, sigset_t* bionic_old_set) {
+  InitializeSignalChain();
+
   // When inside a signal handler, forward directly to the actual sigprocmask.
   if (GetHandlingSignal()) {
     return linked_sigprocmask(how, bionic_new_set, bionic_old_set);
@@ -348,47 +383,12 @@
     new_set_ptr = &tmpset;
   }
 
-  InitializeSignalChain();
   return linked_sigprocmask(how, new_set_ptr, bionic_old_set);
 }
 
-extern "C" void InitializeSignalChain() {
-  // Warning.
-  // Don't call this from within a signal context as it makes calls to
-  // dlsym.  Calling into the dynamic linker will result in locks being
-  // taken and if it so happens that a signal occurs while one of these
-  // locks is already taken, dlsym will block trying to reenter a
-  // mutex and we will never get out of it.
-  static bool initialized = false;
-  if (initialized) {
-    // Don't initialize twice.
-    return;
-  }
-
-  void* linked_sigaction_sym = dlsym(RTLD_NEXT, "sigaction");
-  if (linked_sigaction_sym == nullptr) {
-    linked_sigaction_sym = dlsym(RTLD_DEFAULT, "sigaction");
-    if (linked_sigaction_sym == nullptr ||
-        linked_sigaction_sym == reinterpret_cast<void*>(sigaction)) {
-      fatal("Unable to find next sigaction in signal chain");
-    }
-  }
-
-  void* linked_sigprocmask_sym = dlsym(RTLD_NEXT, "sigprocmask");
-  if (linked_sigprocmask_sym == nullptr) {
-    linked_sigprocmask_sym = dlsym(RTLD_DEFAULT, "sigprocmask");
-    if (linked_sigprocmask_sym == nullptr ||
-        linked_sigprocmask_sym == reinterpret_cast<void*>(sigprocmask)) {
-      fatal("Unable to find next sigprocmask in signal chain");
-    }
-  }
-
-  linked_sigaction = reinterpret_cast<decltype(linked_sigaction)>(linked_sigaction_sym);
-  linked_sigprocmask = reinterpret_cast<decltype(linked_sigprocmask)>(linked_sigprocmask_sym);
-  initialized = true;
-}
-
 extern "C" void AddSpecialSignalHandlerFn(int signal, SpecialSignalHandlerFn fn) {
+  InitializeSignalChain();
+
   if (signal <= 0 || signal >= _NSIG) {
     fatal("Invalid signal %d", signal);
   }
@@ -399,6 +399,8 @@
 }
 
 extern "C" void RemoveSpecialSignalHandlerFn(int signal, SpecialSignalHandlerFn fn) {
+  InitializeSignalChain();
+
   if (signal <= 0 || signal >= _NSIG) {
     fatal("Invalid signal %d", signal);
   }
@@ -407,14 +409,16 @@
 }
 
 extern "C" void EnsureFrontOfChain(int signal) {
+  InitializeSignalChain();
+
   if (signal <= 0 || signal >= _NSIG) {
     fatal("Invalid signal %d", signal);
   }
 
   // Read the current action without looking at the chain, it should be the expected action.
   struct sigaction current_action;
-  InitializeSignalChain();
   linked_sigaction(signal, nullptr, &current_action);
+
   // If the sigactions don't match then we put the current action on the chain and make ourself as
   // the main action.
   if (current_action.sa_sigaction != SignalChain::Handler) {
diff --git a/sigchainlib/sigchain.h b/sigchainlib/sigchain.h
index 960d221..0bed117 100644
--- a/sigchainlib/sigchain.h
+++ b/sigchainlib/sigchain.h
@@ -21,8 +21,6 @@
 
 namespace art {
 
-extern "C" void InitializeSignalChain();
-
 typedef bool (*SpecialSignalHandlerFn)(int, siginfo_t*, void*);
 extern "C" void AddSpecialSignalHandlerFn(int signal, SpecialSignalHandlerFn fn);
 extern "C" void RemoveSpecialSignalHandlerFn(int signal, SpecialSignalHandlerFn fn);
diff --git a/sigchainlib/sigchain_dummy.cc b/sigchainlib/sigchain_dummy.cc
index d6a5e12..2d7985a 100644
--- a/sigchainlib/sigchain_dummy.cc
+++ b/sigchainlib/sigchain_dummy.cc
@@ -48,11 +48,6 @@
 
 namespace art {
 
-extern "C" void InitializeSignalChain() {
-  log("InitializeSignalChain is not exported by the main executable.");
-  abort();
-}
-
 extern "C" void EnsureFrontOfChain(int signal ATTRIBUTE_UNUSED) {
   log("EnsureFrontOfChain is not exported by the main executable.");
   abort();
diff --git a/sigchainlib/version-script32.txt b/sigchainlib/version-script32.txt
index f360efa..2340785 100644
--- a/sigchainlib/version-script32.txt
+++ b/sigchainlib/version-script32.txt
@@ -1,6 +1,5 @@
 {
 global:
-  InitializeSignalChain;
   EnsureFrontOfChain;
   AddSpecialSignalHandlerFn;
   RemoveSpecialSignalHandlerFn;
diff --git a/sigchainlib/version-script64.txt b/sigchainlib/version-script64.txt
index 319d1c6..acf3630 100644
--- a/sigchainlib/version-script64.txt
+++ b/sigchainlib/version-script64.txt
@@ -1,6 +1,5 @@
 {
 global:
-  InitializeSignalChain;
   EnsureFrontOfChain;
   AddSpecialSignalHandlerFn;
   RemoveSpecialSignalHandlerFn;
diff --git a/test/527-checker-array-access-simd/expected.txt b/test/527-checker-array-access-simd/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/527-checker-array-access-simd/expected.txt
diff --git a/test/527-checker-array-access-simd/info.txt b/test/527-checker-array-access-simd/info.txt
new file mode 100644
index 0000000..f147943
--- /dev/null
+++ b/test/527-checker-array-access-simd/info.txt
@@ -0,0 +1 @@
+Test arm- and arm64-specific array access optimization for simd loops.
diff --git a/test/527-checker-array-access-simd/src/Main.java b/test/527-checker-array-access-simd/src/Main.java
new file mode 100644
index 0000000..8af5465
--- /dev/null
+++ b/test/527-checker-array-access-simd/src/Main.java
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static void assertIntEquals(int expected, int result) {
+    if (result != expected) {  // Any mismatch aborts the test with an Error.
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Index>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address2>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address1>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkIntCase(int[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2
+  public static void checkIntCase(int[] a) {
+    for (int i = 0; i < 128; i++) {  // CHECK lines above expect VecLoad/VecAdd/VecStore here.
+      a[i] += 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Index>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address2>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array>>,<<Address1>>]
+  /// CHECK-DAG:             <<Add:d\d+>>           VecAdd [<<Load>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Address1>>,<<Add>>]
+
+  /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK:                                        VecLoad
+  /// CHECK-NEXT:                                   ldr q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
+  /// CHECK:                                        VecStore
+  /// CHECK-NEXT:                                   str q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
+  public static void checkByteCase(byte[] a) {
+    for (int i = 0; i < 128; i++) {  // CHECK lines above expect VecLoad/VecAdd/VecStore here.
+      a[i] += 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Repl>>]
+
+  /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array:l\d+>>         ParameterValue
+  /// CHECK-DAG:             <<Const0:i\d+>>        IntConstant 0
+  /// CHECK-DAG:             <<Const5:i\d+>>        IntConstant 5
+  /// CHECK-DAG:             <<Repl:d\d+>>          VecReplicateScalar [<<Const5>>]
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:                                    VecStore [<<Array>>,<<Index>>,<<Repl>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  public static void checkSingleAccess(int[] a) {
+    for (int i = 0; i < 128; i++) {  // Store-only loop; CHECK-NOT above forbids IntermediateAddress.
+      a[i] = 5;
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Index>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Address1>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-DAG:             <<Address2:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Address2>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) GVN$after_arch (after)
+  /// CHECK-DAG:             <<Array1:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<Array2:l\d+>>        ParameterValue
+  /// CHECK-DAG:             <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK-DAG:             <<Const2:i\d+>>        IntConstant 2
+  //  -------------- Loop
+  /// CHECK-DAG:             <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                    If
+  /// CHECK-DAG:             <<Address1:i\d+>>      IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>]
+  /// CHECK-DAG:             <<Load:d\d+>>          VecLoad [<<Array1>>,<<Address1>>]
+  /// CHECK-DAG:             <<Cnv:d\d+>>           VecCnv [<<Load>>]
+  /// CHECK-NOT:                                    IntermediateAddress
+  /// CHECK-DAG:                                    VecStore [<<Array2>>,<<Address1>>,<<Cnv>>]
+
+  /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) disassembly (after)
+  /// CHECK:                                        IntermediateAddressIndex
+  /// CHECK-NEXT:                                   add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2
+  public static void checkInt2Float(int[] a, float[] b) {
+    for (int i = 0; i < 128; i++) {  // CHECK lines above expect VecLoad/VecCnv/VecStore here.
+      b[i] = (float) a[i];
+    }
+  }
+
+  public static final int ARRAY_SIZE = 1024;
+
+  public static int calcArraySum(int[] a, byte[] b, float[] c) {
+    int total = 0;  // Accumulates over the first 128 slots of each array only.
+    for (int idx = 0; idx != 128; ++idx) {
+      total = total + a[idx] + b[idx] + (int) c[idx];
+    }
+    return total;
+  }
+
+  public static void main(String[] args) {
+    int[] ints = new int[ARRAY_SIZE];
+    byte[] bytes = new byte[ARRAY_SIZE];
+    float[] floats = new float[ARRAY_SIZE];
+
+    checkByteCase(bytes);          // bytes[0..127] become 5.
+    checkSingleAccess(ints);       // ints[0..127] become 5.
+    checkIntCase(ints);            // ints[0..127] become 10.
+    checkInt2Float(ints, floats);  // floats[0..127] become 10.0f.
+    // 128 * (10 + 5 + 10) == 3200.
+    assertIntEquals(3200, calcArraySum(ints, bytes, floats));
+  }
+}
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index f750556..c6ef8ed 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -668,7 +668,7 @@
     adb root > /dev/null
     adb wait-for-device
     if [ "$QUIET" = "n" ]; then
-      adb shell rm -r $DEX_LOCATION
+      adb shell rm -rf $DEX_LOCATION
       adb shell mkdir -p $DEX_LOCATION
       adb push $TEST_NAME.jar $DEX_LOCATION
       adb push $TEST_NAME-ex.jar $DEX_LOCATION
@@ -736,6 +736,7 @@
 
     rm -f $cmdfile
 else
+    # Host run.
     export ANDROID_PRINTF_LOG=brief
 
     # By default, and for prebuild dex2oat, we are interested in errors being logged. In dev mode
@@ -792,7 +793,12 @@
 
     cd $ANDROID_BUILD_TOP
 
+    # Make sure we delete any existing compiler artifacts.
+    # This enables tests to call the RUN script multiple times in a row
+    # without worrying about interference.
+    rm -rf ${DEX_LOCATION}/oat
     rm -rf ${DEX_LOCATION}/dalvik-cache/
+
     mkdir -p ${mkdir_locations} || exit 1
     $profman_cmdline || { echo "Profman failed." >&2 ; exit 2; }
     $dex2oat_cmdline || { echo "Dex2oat failed." >&2 ; exit 2; }
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 659b814..366396f 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -336,11 +336,6 @@
         "variant": "no-image & jit"
     },
     {
-        "tests": ["597-deopt-new-string"],
-        "bug": "http://b/36467228",
-        "variant": "no-image & jit"
-    },
-    {
         "tests": ["530-checker-lse",
                   "530-checker-lse2",
                   "030-bad-finalizer",