diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 7cb23f8..b3d1380 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -220,6 +220,14 @@
 #  -Wmissing-declarations \
 
 
+
+ifndef ART_IMT_SIZE
+  # No ART_IMT_SIZE override was supplied, so IMT_SIZE is defined as the default of 64.
+  art_cflags += -DIMT_SIZE=64
+else
+  art_cflags += -DIMT_SIZE=$(ART_IMT_SIZE)
+endif
+
 ifeq ($(ART_SMALL_MODE),true)
   art_cflags += -DART_SMALL_MODE=1
 endif
@@ -228,6 +236,10 @@
   art_cflags += -DART_SEA_IR_MODE=1
 endif
 
+ifeq ($(ART_USE_OPTIMIZING_COMPILER),true)
+  art_cflags += -DART_USE_OPTIMIZING_COMPILER=1
+endif
+
 # Cflags for non-debug ART and ART tools.
 art_non_debug_cflags := \
   -O3
@@ -236,6 +248,7 @@
 art_debug_cflags := \
   -O2 \
   -DDYNAMIC_ANNOTATIONS_ENABLED=1 \
+  -DVIXL_DEBUG \
   -UNDEBUG
 
 art_host_non_debug_cflags := $(art_non_debug_cflags)
@@ -244,7 +257,7 @@
 ifeq ($(HOST_OS),linux)
   # Larger frame-size for host clang builds today
   ifndef SANITIZE_HOST
-    art_host_non_debug_cflags += -Wframe-larger-than=2600
+    art_host_non_debug_cflags += -Wframe-larger-than=2700
   endif
   art_target_non_debug_cflags += -Wframe-larger-than=1728
 endif
@@ -252,7 +265,7 @@
 ifndef LIBART_IMG_HOST_BASE_ADDRESS
   $(error LIBART_IMG_HOST_BASE_ADDRESS unset)
 endif
-ART_HOST_CFLAGS += $(art_cflags) -DANDROID_SMP=1 -DART_BASE_ADDRESS=$(LIBART_IMG_HOST_BASE_ADDRESS)
+ART_HOST_CFLAGS += $(art_cflags) -DART_BASE_ADDRESS=$(LIBART_IMG_HOST_BASE_ADDRESS)
 ART_HOST_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=default
 
 ifndef LIBART_IMG_TARGET_BASE_ADDRESS
@@ -283,18 +296,6 @@
 ART_TARGET_CFLAGS += -DART_BASE_ADDRESS_MIN_DELTA=$(LIBART_IMG_TARGET_MIN_BASE_ADDRESS_DELTA)
 ART_TARGET_CFLAGS += -DART_BASE_ADDRESS_MAX_DELTA=$(LIBART_IMG_TARGET_MAX_BASE_ADDRESS_DELTA)
 
-ifeq ($(TARGET_CPU_SMP),true)
-  ART_TARGET_CFLAGS += -DANDROID_SMP=1
-else
-  ifeq ($(TARGET_CPU_SMP),false)
-    ART_TARGET_CFLAGS += -DANDROID_SMP=0
-  else
-    $(warning TARGET_CPU_SMP should be (true|false), found $(TARGET_CPU_SMP))
-    # Make sure we emit barriers for the worst case.
-    ART_TARGET_CFLAGS += -DANDROID_SMP=1
-  endif
-endif
-
 # To use oprofile_android --callgraph, uncomment this and recompile with "mmm art -B -j16"
 # ART_TARGET_CFLAGS += -fno-omit-frame-pointer -marm -mapcs
 
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 7e28b37..10b0400 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -368,7 +368,7 @@
   ifeq ($$(art_target_or_host),target)
     $$(eval $$(call set-target-local-clang-vars))
     $$(eval $$(call set-target-local-cflags-vars,debug))
-    LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixl
+    LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixld
     LOCAL_MODULE_PATH_32 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_32)
     LOCAL_MODULE_PATH_64 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_64)
     LOCAL_MULTILIB := both
@@ -404,7 +404,7 @@
   else # host
     LOCAL_CLANG := $$(ART_HOST_CLANG)
     LOCAL_CFLAGS += $$(ART_HOST_CFLAGS) $$(ART_HOST_DEBUG_CFLAGS)
-    LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixl
+    LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixld
     LOCAL_LDLIBS := $(ART_HOST_LDLIBS) -lpthread -ldl
     LOCAL_IS_HOST_MODULE := true
     LOCAL_MULTILIB := both
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 610f453..70c7e52 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -178,8 +178,10 @@
   dex/pass_me.h \
   driver/compiler_driver.h \
   driver/compiler_options.h \
+  image_writer.h \
   optimizing/locations.h \
-  utils/arm/constants_arm.h
+  utils/arm/constants_arm.h \
+  utils/dex_instruction_utils.h
 
 # $(1): target or host
 # $(2): ndebug or debug
@@ -206,7 +208,9 @@
   ifeq ($$(art_ndebug_or_debug),ndebug)
     LOCAL_MODULE := libart-compiler
     LOCAL_SHARED_LIBRARIES += libart
-    LOCAL_FDO_SUPPORT := true
+    ifeq ($$(art_target_or_host),target)
+      LOCAL_FDO_SUPPORT := true
+    endif
   else # debug
     LOCAL_MODULE := libartd-compiler
     LOCAL_SHARED_LIBRARIES += libartd
@@ -275,7 +279,11 @@
   LOCAL_ADDITIONAL_DEPENDENCIES := art/build/Android.common_build.mk
   LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.mk
   # Vixl assembly support for ARM64 targets.
-  LOCAL_SHARED_LIBRARIES += libvixl
+  ifeq ($$(art_ndebug_or_debug),debug)
+    LOCAL_SHARED_LIBRARIES += libvixld
+  else
+    LOCAL_SHARED_LIBRARIES += libvixl
+  endif
   ifeq ($$(art_target_or_host),target)
     # For atrace.
     LOCAL_SHARED_LIBRARIES += libcutils
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index 97387a1..a3d9a0b 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -60,14 +60,18 @@
       const std::vector<uint8_t>& mapping_table = compiled_method->GetMappingTable();
       uint32_t mapping_table_offset = mapping_table.empty() ? 0u
           : sizeof(OatQuickMethodHeader) + vmap_table.size() + mapping_table.size();
-      OatQuickMethodHeader method_header(mapping_table_offset, vmap_table_offset,
+      const std::vector<uint8_t>& gc_map = *compiled_method->GetGcMap();
+      uint32_t gc_map_offset = gc_map.empty() ? 0u
+          : sizeof(OatQuickMethodHeader) + vmap_table.size() + mapping_table.size() + gc_map.size();
+      OatQuickMethodHeader method_header(mapping_table_offset, vmap_table_offset, gc_map_offset,
                                          compiled_method->GetFrameSizeInBytes(),
                                          compiled_method->GetCoreSpillMask(),
                                          compiled_method->GetFpSpillMask(), code_size);
 
       header_code_and_maps_chunks_.push_back(std::vector<uint8_t>());
       std::vector<uint8_t>* chunk = &header_code_and_maps_chunks_.back();
-      size_t size = sizeof(method_header) + code_size + vmap_table.size() + mapping_table.size();
+      size_t size = sizeof(method_header) + code_size + vmap_table.size() + mapping_table.size() +
+          gc_map.size();
       size_t code_offset = compiled_method->AlignCode(size - code_size);
       size_t padding = code_offset - (size - code_size);
       chunk->reserve(padding + size);
@@ -75,6 +79,7 @@
       memcpy(&(*chunk)[0], &method_header, sizeof(method_header));
       chunk->insert(chunk->begin(), vmap_table.begin(), vmap_table.end());
       chunk->insert(chunk->begin(), mapping_table.begin(), mapping_table.end());
+      chunk->insert(chunk->begin(), gc_map.begin(), gc_map.end());
       chunk->insert(chunk->begin(), padding, 0);
       chunk->insert(chunk->end(), code->begin(), code->end());
       CHECK_EQ(padding + size, chunk->size());
diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h
index 20b750c..9cffbc8 100644
--- a/compiler/common_compiler_test.h
+++ b/compiler/common_compiler_test.h
@@ -42,7 +42,7 @@
   ~CommonCompilerTest();
 
   // Create an OatMethod based on pointers (for unit tests).
-  OatFile::OatMethod CreateOatMethod(const void* code, const uint8_t* gc_map);
+  OatFile::OatMethod CreateOatMethod(const void* code);
 
   void MakeExecutable(mirror::ArtMethod* method) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
diff --git a/compiler/compilers.cc b/compiler/compilers.cc
deleted file mode 100644
index 2481128..0000000
--- a/compiler/compilers.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "compilers.h"
-
-#include "dex/mir_graph.h"
-#include "dex/quick/mir_to_lir.h"
-#include "elf_writer_quick.h"
-#include "mirror/art_method-inl.h"
-
-namespace art {
-
-extern "C" void ArtInitQuickCompilerContext(art::CompilerDriver* driver);
-extern "C" void ArtUnInitQuickCompilerContext(art::CompilerDriver* driver);
-extern "C" art::CompiledMethod* ArtQuickCompileMethod(art::CompilerDriver* driver,
-                                                      const art::DexFile::CodeItem* code_item,
-                                                      uint32_t access_flags,
-                                                      art::InvokeType invoke_type,
-                                                      uint16_t class_def_idx,
-                                                      uint32_t method_idx,
-                                                      jobject class_loader,
-                                                      const art::DexFile& dex_file);
-
-extern "C" art::CompiledMethod* ArtQuickJniCompileMethod(art::CompilerDriver* driver,
-                                                         uint32_t access_flags, uint32_t method_idx,
-                                                         const art::DexFile& dex_file);
-
-// Hack for CFI CIE initialization
-extern std::vector<uint8_t>* X86CFIInitialization(bool is_x86_64);
-
-void QuickCompiler::Init() const {
-  ArtInitQuickCompilerContext(GetCompilerDriver());
-}
-
-void QuickCompiler::UnInit() const {
-  ArtUnInitQuickCompilerContext(GetCompilerDriver());
-}
-
-CompiledMethod* QuickCompiler::Compile(const DexFile::CodeItem* code_item,
-                                       uint32_t access_flags,
-                                       InvokeType invoke_type,
-                                       uint16_t class_def_idx,
-                                       uint32_t method_idx,
-                                       jobject class_loader,
-                                       const DexFile& dex_file) const {
-  CompiledMethod* method = TryCompileWithSeaIR(code_item,
-                                               access_flags,
-                                               invoke_type,
-                                               class_def_idx,
-                                               method_idx,
-                                               class_loader,
-                                               dex_file);
-  if (method != nullptr) {
-    return method;
-  }
-
-  return ArtQuickCompileMethod(GetCompilerDriver(),
-                               code_item,
-                               access_flags,
-                               invoke_type,
-                               class_def_idx,
-                               method_idx,
-                               class_loader,
-                               dex_file);
-}
-
-CompiledMethod* QuickCompiler::JniCompile(uint32_t access_flags,
-                                          uint32_t method_idx,
-                                          const DexFile& dex_file) const {
-  return ArtQuickJniCompileMethod(GetCompilerDriver(), access_flags, method_idx, dex_file);
-}
-
-uintptr_t QuickCompiler::GetEntryPointOf(mirror::ArtMethod* method) const {
-  size_t pointer_size = InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet());
-  return reinterpret_cast<uintptr_t>(method->GetEntryPointFromQuickCompiledCodePtrSize(
-      pointer_size));
-}
-
-bool QuickCompiler::WriteElf(art::File* file,
-                             OatWriter* oat_writer,
-                             const std::vector<const art::DexFile*>& dex_files,
-                             const std::string& android_root,
-                             bool is_host) const {
-  return art::ElfWriterQuick::Create(file, oat_writer, dex_files, android_root, is_host,
-                                     *GetCompilerDriver());
-}
-
-Backend* QuickCompiler::GetCodeGenerator(CompilationUnit* cu, void* compilation_unit) const {
-  Mir2Lir* mir_to_lir = nullptr;
-  switch (cu->instruction_set) {
-    case kThumb2:
-      mir_to_lir = ArmCodeGenerator(cu, cu->mir_graph.get(), &cu->arena);
-      break;
-    case kArm64:
-      mir_to_lir = Arm64CodeGenerator(cu, cu->mir_graph.get(), &cu->arena);
-      break;
-    case kMips:
-      mir_to_lir = MipsCodeGenerator(cu, cu->mir_graph.get(), &cu->arena);
-      break;
-    case kX86:
-      // Fall-through.
-    case kX86_64:
-      mir_to_lir = X86CodeGenerator(cu, cu->mir_graph.get(), &cu->arena);
-      break;
-    default:
-      LOG(FATAL) << "Unexpected instruction set: " << cu->instruction_set;
-  }
-
-  /* The number of compiler temporaries depends on backend so set it up now if possible */
-  if (mir_to_lir) {
-    size_t max_temps = mir_to_lir->GetMaxPossibleCompilerTemps();
-    bool set_max = cu->mir_graph->SetMaxAvailableNonSpecialCompilerTemps(max_temps);
-    CHECK(set_max);
-  }
-  return mir_to_lir;
-}
-
-std::vector<uint8_t>* QuickCompiler::GetCallFrameInformationInitialization(
-    const CompilerDriver& driver) const {
-  if (driver.GetInstructionSet() == kX86) {
-    return X86CFIInitialization(false);
-  }
-  if (driver.GetInstructionSet() == kX86_64) {
-    return X86CFIInitialization(true);
-  }
-  return nullptr;
-}
-
-CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
-                                            uint32_t access_flags,
-                                            InvokeType invoke_type,
-                                            uint16_t class_def_idx,
-                                            uint32_t method_idx,
-                                            jobject class_loader,
-                                            const DexFile& dex_file) const {
-  CompiledMethod* method = TryCompile(code_item, access_flags, invoke_type, class_def_idx,
-                                      method_idx, class_loader, dex_file);
-  if (method != nullptr) {
-    return method;
-  }
-
-  return QuickCompiler::Compile(code_item, access_flags, invoke_type, class_def_idx, method_idx,
-                                class_loader, dex_file);
-}
-
-}  // namespace art
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 5d877fd..3b3170e 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -316,9 +316,8 @@
 
 enum MIROptimizationFlagPositions {
   kMIRIgnoreNullCheck = 0,
-  kMIRNullCheckOnly,
   kMIRIgnoreRangeCheck,
-  kMIRRangeCheckOnly,
+  kMIRStoreNonNullValue,              // Storing non-null value, always mark GC card.
   kMIRClassIsInitialized,
   kMIRClassIsInDexCache,
   kMirIgnoreDivZeroCheck,
@@ -610,21 +609,21 @@
 // LIR fixup kinds for Arm
 enum FixupKind {
   kFixupNone,
-  kFixupLabel,       // For labels we just adjust the offset.
-  kFixupLoad,        // Mostly for immediates.
-  kFixupVLoad,       // FP load which *may* be pc-relative.
-  kFixupCBxZ,        // Cbz, Cbnz.
-  kFixupTBxZ,        // Tbz, Tbnz.
-  kFixupPushPop,     // Not really pc relative, but changes size based on args.
-  kFixupCondBranch,  // Conditional branch
-  kFixupT1Branch,    // Thumb1 Unconditional branch
-  kFixupT2Branch,    // Thumb2 Unconditional branch
-  kFixupBlx1,        // Blx1 (start of Blx1/Blx2 pair).
-  kFixupBl1,         // Bl1 (start of Bl1/Bl2 pair).
-  kFixupAdr,         // Adr.
-  kFixupMovImmLST,   // kThumb2MovImm16LST.
-  kFixupMovImmHST,   // kThumb2MovImm16HST.
-  kFixupAlign4,      // Align to 4-byte boundary.
+  kFixupLabel,             // For labels we just adjust the offset.
+  kFixupLoad,              // Mostly for immediates.
+  kFixupVLoad,             // FP load which *may* be pc-relative.
+  kFixupCBxZ,              // Cbz, Cbnz.
+  kFixupTBxZ,              // Tbz, Tbnz.
+  kFixupCondBranch,        // Conditional branch
+  kFixupT1Branch,          // Thumb1 Unconditional branch
+  kFixupT2Branch,          // Thumb2 Unconditional branch
+  kFixupBlx1,              // Blx1 (start of Blx1/Blx2 pair).
+  kFixupBl1,               // Bl1 (start of Bl1/Bl2 pair).
+  kFixupAdr,               // Adr.
+  kFixupMovImmLST,         // kThumb2MovImm16LST.
+  kFixupMovImmHST,         // kThumb2MovImm16HST.
+  kFixupAlign4,            // Align to 4-byte boundary.
+  kFixupA53Erratum835769,  // Cortex A53 Erratum 835769.
 };
 std::ostream& operator<<(std::ostream& os, const FixupKind& kind);
 
diff --git a/compiler/dex/global_value_numbering.cc b/compiler/dex/global_value_numbering.cc
index d311bc7..578952b 100644
--- a/compiler/dex/global_value_numbering.cc
+++ b/compiler/dex/global_value_numbering.cc
@@ -15,7 +15,6 @@
  */
 
 #include "global_value_numbering.h"
-
 #include "local_value_numbering.h"
 
 namespace art {
@@ -31,8 +30,6 @@
       modifications_allowed_(true),
       mode_(mode),
       global_value_map_(std::less<uint64_t>(), allocator->Adapter()),
-      field_index_map_(FieldReferenceComparator(), allocator->Adapter()),
-      field_index_reverse_map_(allocator->Adapter()),
       array_location_map_(ArrayLocationComparator(), allocator->Adapter()),
       array_location_reverse_map_(allocator->Adapter()),
       ref_set_map_(std::less<ValueNameSet>(), allocator->Adapter()),
@@ -107,15 +104,8 @@
     if (bb->catch_entry) {
       merge_type = LocalValueNumbering::kCatchMerge;
     } else if (bb->last_mir_insn != nullptr &&
-        (bb->last_mir_insn->dalvikInsn.opcode == Instruction::RETURN_VOID ||
-         bb->last_mir_insn->dalvikInsn.opcode == Instruction::RETURN ||
-         bb->last_mir_insn->dalvikInsn.opcode == Instruction::RETURN_OBJECT ||
-         bb->last_mir_insn->dalvikInsn.opcode == Instruction::RETURN_WIDE) &&
-        (bb->first_mir_insn == bb->last_mir_insn ||
-         (static_cast<int>(bb->first_mir_insn->dalvikInsn.opcode) == kMirOpPhi &&
-          (bb->first_mir_insn->next == bb->last_mir_insn ||
-           (static_cast<int>(bb->first_mir_insn->next->dalvikInsn.opcode) == kMirOpPhi &&
-            bb->first_mir_insn->next->next == bb->last_mir_insn))))) {
+        IsInstructionReturn(bb->last_mir_insn->dalvikInsn.opcode) &&
+        bb->GetFirstNonPhiInsn() == bb->last_mir_insn) {
       merge_type = LocalValueNumbering::kReturnMerge;
     }
     // At least one predecessor must have been processed before this bb.
@@ -145,19 +135,6 @@
   return change;
 }
 
-uint16_t GlobalValueNumbering::GetFieldId(const MirFieldInfo& field_info, uint16_t type) {
-  FieldReference key = { field_info.DeclaringDexFile(), field_info.DeclaringFieldIndex(), type };
-  auto lb = field_index_map_.lower_bound(key);
-  if (lb != field_index_map_.end() && !field_index_map_.key_comp()(key, lb->first)) {
-    return lb->second;
-  }
-  DCHECK_LT(field_index_map_.size(), kNoValue);
-  uint16_t id = field_index_map_.size();
-  auto it = field_index_map_.PutBefore(lb, key, id);
-  field_index_reverse_map_.push_back(&*it);
-  return id;
-}
-
 uint16_t GlobalValueNumbering::GetArrayLocation(uint16_t base, uint16_t index) {
   auto cmp = array_location_map_.key_comp();
   ArrayLocation key = { base, index };
@@ -207,4 +184,20 @@
   return true;
 }
 
+bool GlobalValueNumbering::DivZeroCheckedInAllPredecessors(
+    const ScopedArenaVector<uint16_t>& merge_names) const {
+  // merge_names[i] is the value name queried in predecessor LVN merge_lvns_[i], so the
+  // two containers are expected to have the same number of entries.
+  DCHECK_EQ(merge_lvns_.size(), merge_names.size());
+  size_t pred_idx = 0u;
+  for (const LocalValueNumbering* pred_lvn : merge_lvns_) {
+    // Any predecessor that lacks the check for its value name fails the whole query.
+    if (!pred_lvn->IsValueDivZeroChecked(merge_names[pred_idx])) {
+      return false;
+    }
+    ++pred_idx;
+  }
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/dex/global_value_numbering.h b/compiler/dex/global_value_numbering.h
index 72d1112..d72144a 100644
--- a/compiler/dex/global_value_numbering.h
+++ b/compiler/dex/global_value_numbering.h
@@ -39,6 +39,12 @@
         cu->mir_graph->GetMaxNestedLoops() > kMaxAllowedNestedLoops;
   }
 
+  // The instance and static field id maps are held by MIRGraph so that they are not
+  // recalculated each time LVN runs.
+  template <typename Container>  // Container of MirIFieldLoweringInfo or MirSFieldLoweringInfo.
+  static uint16_t* PrepareGvnFieldIds(ScopedArenaAllocator* allocator,
+                                      const Container& field_infos);
+
   GlobalValueNumbering(CompilationUnit* cu, ScopedArenaAllocator* allocator, Mode mode);
   ~GlobalValueNumbering();
 
@@ -114,34 +120,24 @@
     return (it != global_value_map_.end() && it->second == value);
   }
 
-  // FieldReference represents a unique resolved field.
-  struct FieldReference {
-    const DexFile* dex_file;
-    uint16_t field_idx;
-    uint16_t type;  // See comments for LocalValueNumbering::kFieldTypeCount.
-  };
+  // Get an instance field id.
+  uint16_t GetIFieldId(MIR* mir) {
+    return GetMirGraph()->GetGvnIFieldId(mir);
+  }
 
-  struct FieldReferenceComparator {
-    bool operator()(const FieldReference& lhs, const FieldReference& rhs) const {
-      if (lhs.field_idx != rhs.field_idx) {
-        return lhs.field_idx < rhs.field_idx;
-      }
-      // If the field_idx and dex_file match, the type must also match.
-      DCHECK(lhs.dex_file != rhs.dex_file || lhs.type == rhs.type);
-      return lhs.dex_file < rhs.dex_file;
-    }
-  };
+  // Get a static field id.
+  uint16_t GetSFieldId(MIR* mir) {
+    return GetMirGraph()->GetGvnSFieldId(mir);
+  }
 
-  // Maps field key to field id for resolved fields.
-  typedef ScopedArenaSafeMap<FieldReference, uint32_t, FieldReferenceComparator> FieldIndexMap;
+  // Get an instance field type based on field id.
+  uint16_t GetIFieldType(uint16_t field_id) {
+    return static_cast<uint16_t>(GetMirGraph()->GetIFieldLoweringInfo(field_id).MemAccessType());
+  }
 
-  // Get a field id.
-  uint16_t GetFieldId(const MirFieldInfo& field_info, uint16_t type);
-
-  // Get a field type based on field id.
-  uint16_t GetFieldType(uint16_t field_id) {
-    DCHECK_LT(field_id, field_index_reverse_map_.size());
-    return field_index_reverse_map_[field_id]->first.type;
+  // Get a static field type based on field id.
+  uint16_t GetSFieldType(uint16_t field_id) {
+    return static_cast<uint16_t>(GetMirGraph()->GetSFieldLoweringInfo(field_id).MemAccessType());
   }
 
   struct ArrayLocation {
@@ -199,6 +195,8 @@
 
   bool NullCheckedInAllPredecessors(const ScopedArenaVector<uint16_t>& merge_names) const;
 
+  bool DivZeroCheckedInAllPredecessors(const ScopedArenaVector<uint16_t>& merge_names) const;
+
   CompilationUnit* GetCompilationUnit() const {
     return cu_;
   }
@@ -239,8 +237,6 @@
   Mode mode_;
 
   ValueMap global_value_map_;
-  FieldIndexMap field_index_map_;
-  ScopedArenaVector<const FieldIndexMap::value_type*> field_index_reverse_map_;
   ArrayLocationMap array_location_map_;
   ScopedArenaVector<const ArrayLocationMap::value_type*> array_location_reverse_map_;
   RefSetIdMap ref_set_map_;
@@ -268,6 +264,32 @@
   return last_value_;
 }
 
+template <typename Container>  // Container of MirIFieldLoweringInfo or MirSFieldLoweringInfo.
+uint16_t* GlobalValueNumbering::PrepareGvnFieldIds(ScopedArenaAllocator* allocator,
+                                                   const Container& field_infos) {
+  // Give each field info the index of the first resolved info naming the same declared field.
+  const size_t count = field_infos.size();
+  uint16_t* ids =
+      reinterpret_cast<uint16_t*>(allocator->Alloc(count * sizeof(uint16_t), kArenaAllocMisc));
+  for (size_t cur = 0u; cur != count; ++cur) {
+    const MirFieldInfo& cur_info = field_infos[cur];
+    const bool resolved = cur_info.IsResolved();
+    size_t id = cur;  // Unresolved and first-seen fields keep their own index.
+    for (size_t prev = 0u; resolved && prev != cur; ++prev) {
+      const MirFieldInfo& prev_info = field_infos[prev];
+      if (prev_info.IsResolved() &&
+          prev_info.DeclaringDexFile() == cur_info.DeclaringDexFile() &&
+          prev_info.DeclaringFieldIndex() == cur_info.DeclaringFieldIndex()) {
+        DCHECK_EQ(cur_info.MemAccessType(), prev_info.MemAccessType());
+        id = prev;
+        break;
+      }
+    }
+    ids[cur] = id;
+  }
+  return ids;
+}
+
 }  // namespace art
 
 #endif  // ART_COMPILER_DEX_GLOBAL_VALUE_NUMBERING_H_
diff --git a/compiler/dex/global_value_numbering_test.cc b/compiler/dex/global_value_numbering_test.cc
index 35d5b99..7e3b4d8 100644
--- a/compiler/dex/global_value_numbering_test.cc
+++ b/compiler/dex/global_value_numbering_test.cc
@@ -17,6 +17,7 @@
 #include "compiler_internals.h"
 #include "dataflow_iterator.h"
 #include "dataflow_iterator-inl.h"
+#include "dex/mir_field_info.h"
 #include "global_value_numbering.h"
 #include "local_value_numbering.h"
 #include "gtest/gtest.h"
@@ -32,6 +33,7 @@
     uintptr_t declaring_dex_file;
     uint16_t declaring_field_idx;
     bool is_volatile;
+    DexMemAccessType type;
   };
 
   struct SFieldDef {
@@ -39,6 +41,7 @@
     uintptr_t declaring_dex_file;
     uint16_t declaring_field_idx;
     bool is_volatile;
+    DexMemAccessType type;
   };
 
   struct BBDef {
@@ -131,18 +134,19 @@
     { bb, opcode, 0u, 0u, 2, { src, src + 1 }, 2, { reg, reg + 1 } }
 #define DEF_PHI2(bb, reg, src1, src2) \
     { bb, static_cast<Instruction::Code>(kMirOpPhi), 0, 0u, 2u, { src1, src2 }, 1, { reg } }
+#define DEF_DIV_REM(bb, opcode, result, dividend, divisor) \
+    { bb, opcode, 0u, 0u, 2, { dividend, divisor }, 1, { result } }
 
   void DoPrepareIFields(const IFieldDef* defs, size_t count) {
     cu_.mir_graph->ifield_lowering_infos_.clear();
     cu_.mir_graph->ifield_lowering_infos_.reserve(count);
     for (size_t i = 0u; i != count; ++i) {
       const IFieldDef* def = &defs[i];
-      MirIFieldLoweringInfo field_info(def->field_idx);
+      MirIFieldLoweringInfo field_info(def->field_idx, def->type);
       if (def->declaring_dex_file != 0u) {
         field_info.declaring_dex_file_ = reinterpret_cast<const DexFile*>(def->declaring_dex_file);
         field_info.declaring_field_idx_ = def->declaring_field_idx;
-        field_info.flags_ = 0u |  // Without kFlagIsStatic.
-            (def->is_volatile ? MirIFieldLoweringInfo::kFlagIsVolatile : 0u);
+        field_info.flags_ &= ~(def->is_volatile ? 0u : MirIFieldLoweringInfo::kFlagIsVolatile);
       }
       cu_.mir_graph->ifield_lowering_infos_.push_back(field_info);
     }
@@ -158,15 +162,14 @@
     cu_.mir_graph->sfield_lowering_infos_.reserve(count);
     for (size_t i = 0u; i != count; ++i) {
       const SFieldDef* def = &defs[i];
-      MirSFieldLoweringInfo field_info(def->field_idx);
+      MirSFieldLoweringInfo field_info(def->field_idx, def->type);
       // Mark even unresolved fields as initialized.
-      field_info.flags_ = MirSFieldLoweringInfo::kFlagIsStatic |
-          MirSFieldLoweringInfo::kFlagClassIsInitialized;
+      field_info.flags_ |= MirSFieldLoweringInfo::kFlagClassIsInitialized;
       // NOTE: MirSFieldLoweringInfo::kFlagClassIsInDexCache isn't used by GVN.
       if (def->declaring_dex_file != 0u) {
         field_info.declaring_dex_file_ = reinterpret_cast<const DexFile*>(def->declaring_dex_file);
         field_info.declaring_field_idx_ = def->declaring_field_idx;
-        field_info.flags_ |= (def->is_volatile ? MirSFieldLoweringInfo::kFlagIsVolatile : 0u);
+        field_info.flags_ &= ~(def->is_volatile ? 0u : MirSFieldLoweringInfo::kFlagIsVolatile);
       }
       cu_.mir_graph->sfield_lowering_infos_.push_back(field_info);
     }
@@ -238,12 +241,16 @@
       mir->dalvikInsn.opcode = def->opcode;
       mir->dalvikInsn.vB = static_cast<int32_t>(def->value);
       mir->dalvikInsn.vB_wide = def->value;
-      if (def->opcode >= Instruction::IGET && def->opcode <= Instruction::IPUT_SHORT) {
+      if (IsInstructionIGetOrIPut(def->opcode)) {
         ASSERT_LT(def->field_info, cu_.mir_graph->ifield_lowering_infos_.size());
         mir->meta.ifield_lowering_info = def->field_info;
-      } else if (def->opcode >= Instruction::SGET && def->opcode <= Instruction::SPUT_SHORT) {
+        ASSERT_EQ(cu_.mir_graph->ifield_lowering_infos_[def->field_info].MemAccessType(),
+                  IGetOrIPutMemAccessType(def->opcode));
+      } else if (IsInstructionSGetOrSPut(def->opcode)) {
         ASSERT_LT(def->field_info, cu_.mir_graph->sfield_lowering_infos_.size());
         mir->meta.sfield_lowering_info = def->field_info;
+        ASSERT_EQ(cu_.mir_graph->sfield_lowering_infos_[def->field_info].MemAccessType(),
+                  SGetOrSPutMemAccessType(def->opcode));
       } else if (def->opcode == static_cast<Instruction::Code>(kMirOpPhi)) {
         mir->meta.phi_incoming = static_cast<BasicBlockId*>(
             allocator_->Alloc(def->num_uses * sizeof(BasicBlockId), kArenaAllocDFInfo));
@@ -288,6 +295,10 @@
     cu_.mir_graph->ComputeDominators();
     cu_.mir_graph->ComputeTopologicalSortOrder();
     cu_.mir_graph->SSATransformationEnd();
+    cu_.mir_graph->temp_.gvn.ifield_ids_ =  GlobalValueNumbering::PrepareGvnFieldIds(
+        allocator_.get(), cu_.mir_graph->ifield_lowering_infos_);
+    cu_.mir_graph->temp_.gvn.sfield_ids_ =  GlobalValueNumbering::PrepareGvnFieldIds(
+        allocator_.get(), cu_.mir_graph->sfield_lowering_infos_);
     ASSERT_TRUE(gvn_ == nullptr);
     gvn_.reset(new (allocator_.get()) GlobalValueNumbering(&cu_, allocator_.get(),
                                                            GlobalValueNumbering::kModeGvn));
@@ -498,18 +509,18 @@
 
 TEST_F(GlobalValueNumberingTestDiamond, NonAliasingIFields) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Int.
-      { 4u, 1u, 4u, false },  // Short.
-      { 5u, 1u, 5u, false },  // Char.
-      { 6u, 0u, 0u, false },  // Unresolved, Short.
-      { 7u, 1u, 7u, false },  // Int.
-      { 8u, 0u, 0u, false },  // Unresolved, Int.
-      { 9u, 1u, 9u, false },  // Int.
-      { 10u, 1u, 10u, false },  // Int.
-      { 11u, 1u, 11u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
+      { 4u, 1u, 4u, false, kDexMemAccessShort },
+      { 5u, 1u, 5u, false, kDexMemAccessChar },
+      { 6u, 0u, 0u, false, kDexMemAccessShort },   // Unresolved.
+      { 7u, 1u, 7u, false, kDexMemAccessWord },
+      { 8u, 0u, 0u, false, kDexMemAccessWord },    // Unresolved.
+      { 9u, 1u, 9u, false, kDexMemAccessWord },
+      { 10u, 1u, 10u, false, kDexMemAccessWord },
+      { 11u, 1u, 11u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -604,15 +615,15 @@
 
 TEST_F(GlobalValueNumberingTestDiamond, AliasingIFieldsSingleObject) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Int.
-      { 4u, 1u, 4u, false },  // Short.
-      { 5u, 1u, 5u, false },  // Char.
-      { 6u, 0u, 0u, false },  // Unresolved, Short.
-      { 7u, 1u, 7u, false },  // Int.
-      { 8u, 1u, 8u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
+      { 4u, 1u, 4u, false, kDexMemAccessShort },
+      { 5u, 1u, 5u, false, kDexMemAccessChar },
+      { 6u, 0u, 0u, false, kDexMemAccessShort },  // Unresolved.
+      { 7u, 1u, 7u, false, kDexMemAccessWord },
+      { 8u, 1u, 8u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -671,15 +682,15 @@
 
 TEST_F(GlobalValueNumberingTestDiamond, AliasingIFieldsTwoObjects) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Int.
-      { 4u, 1u, 4u, false },  // Short.
-      { 5u, 1u, 5u, false },  // Char.
-      { 6u, 0u, 0u, false },  // Unresolved, Short.
-      { 7u, 1u, 7u, false },  // Int.
-      { 8u, 1u, 8u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
+      { 4u, 1u, 4u, false, kDexMemAccessShort },
+      { 5u, 1u, 5u, false, kDexMemAccessChar },
+      { 6u, 0u, 0u, false, kDexMemAccessShort },   // Unresolved.
+      { 7u, 1u, 7u, false, kDexMemAccessWord },
+      { 8u, 1u, 8u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -740,15 +751,15 @@
 
 TEST_F(GlobalValueNumberingTestDiamond, SFields) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Int.
-      { 4u, 1u, 4u, false },  // Short.
-      { 5u, 1u, 5u, false },  // Char.
-      { 6u, 0u, 0u, false },  // Unresolved, Short.
-      { 7u, 1u, 7u, false },  // Int.
-      { 8u, 1u, 8u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
+      { 4u, 1u, 4u, false, kDexMemAccessShort },
+      { 5u, 1u, 5u, false, kDexMemAccessChar },
+      { 6u, 0u, 0u, false, kDexMemAccessShort },   // Unresolved.
+      { 7u, 1u, 7u, false, kDexMemAccessWord },
+      { 8u, 1u, 8u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -1078,18 +1089,18 @@
 
 TEST_F(GlobalValueNumberingTestLoop, NonAliasingIFields) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Int.
-      { 4u, 1u, 4u, false },  // Int.
-      { 5u, 1u, 5u, false },  // Short.
-      { 6u, 1u, 6u, false },  // Char.
-      { 7u, 0u, 0u, false },  // Unresolved, Short.
-      { 8u, 1u, 8u, false },  // Int.
-      { 9u, 0u, 0u, false },  // Unresolved, Int.
-      { 10u, 1u, 10u, false },  // Int.
-      { 11u, 1u, 11u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
+      { 4u, 1u, 4u, false, kDexMemAccessWord },
+      { 5u, 1u, 5u, false, kDexMemAccessShort },
+      { 6u, 1u, 6u, false, kDexMemAccessChar },
+      { 7u, 0u, 0u, false, kDexMemAccessShort },   // Unresolved.
+      { 8u, 1u, 8u, false, kDexMemAccessWord },
+      { 9u, 0u, 0u, false, kDexMemAccessWord },    // Unresolved.
+      { 10u, 1u, 10u, false, kDexMemAccessWord },
+      { 11u, 1u, 11u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -1201,14 +1212,14 @@
 
 TEST_F(GlobalValueNumberingTestLoop, AliasingIFieldsSingleObject) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Int.
-      { 4u, 1u, 4u, false },  // Int.
-      { 5u, 1u, 5u, false },  // Short.
-      { 6u, 1u, 6u, false },  // Char.
-      { 7u, 0u, 0u, false },  // Unresolved, Short.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
+      { 4u, 1u, 4u, false, kDexMemAccessWord },
+      { 5u, 1u, 5u, false, kDexMemAccessShort },
+      { 6u, 1u, 6u, false, kDexMemAccessChar },
+      { 7u, 0u, 0u, false, kDexMemAccessShort },   // Unresolved.
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -1272,14 +1283,14 @@
 
 TEST_F(GlobalValueNumberingTestLoop, AliasingIFieldsTwoObjects) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
-      { 3u, 1u, 3u, false },  // Short.
-      { 4u, 1u, 4u, false },  // Char.
-      { 5u, 0u, 0u, false },  // Unresolved, Short.
-      { 6u, 1u, 6u, false },  // Int.
-      { 7u, 1u, 7u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
+      { 3u, 1u, 3u, false, kDexMemAccessShort },
+      { 4u, 1u, 4u, false, kDexMemAccessChar },
+      { 5u, 0u, 0u, false, kDexMemAccessShort },   // Unresolved.
+      { 6u, 1u, 6u, false, kDexMemAccessWord },
+      { 7u, 1u, 7u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -1341,7 +1352,7 @@
 
 TEST_F(GlobalValueNumberingTestLoop, IFieldToBaseDependency) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // For the IGET that loads sreg 3u using base 2u, the following IPUT creates a dependency
@@ -1366,9 +1377,9 @@
 
 TEST_F(GlobalValueNumberingTestLoop, SFields) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
-      { 2u, 1u, 2u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -1562,8 +1573,8 @@
 
 TEST_F(GlobalValueNumberingTestCatch, IFields) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },
-      { 1u, 1u, 1u, false },
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(3, Instruction::NEW_INSTANCE, 200u),
@@ -1608,8 +1619,8 @@
 
 TEST_F(GlobalValueNumberingTestCatch, SFields) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },
-      { 1u, 1u, 1u, false },
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_SGET(3, Instruction::SGET, 0u, 0u),
@@ -1731,8 +1742,8 @@
 
 TEST_F(GlobalValueNumberingTest, NullCheckIFields) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Object.
-      { 1u, 1u, 1u, false },  // Object.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },  // Object.
+      { 1u, 1u, 1u, false, kDexMemAccessObject },  // Object.
   };
   static const BBDef bbs[] = {
       DEF_BB(kNullBlock, DEF_SUCC0(), DEF_PRED0()),
@@ -1780,8 +1791,8 @@
 
 TEST_F(GlobalValueNumberingTest, NullCheckSFields) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },  // Object.
-      { 1u, 1u, 1u, false },  // Object.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
+      { 1u, 1u, 1u, false, kDexMemAccessObject },
   };
   static const BBDef bbs[] = {
       DEF_BB(kNullBlock, DEF_SUCC0(), DEF_PRED0()),
@@ -1907,12 +1918,12 @@
 
 TEST_F(GlobalValueNumberingTestDiamond, MergeSameValueInDifferentMemoryLocations) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },  // Int.
-      { 1u, 1u, 1u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessWord },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(3, Instruction::NEW_INSTANCE, 100u),
@@ -1977,7 +1988,7 @@
   // LVN's aliasing_array_value_map_'s load_value_map for BBs #9, #4, #5, #7 because of the
   // DFS ordering of LVN evaluation.
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Object.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
   };
   static const BBDef bbs[] = {
       DEF_BB(kNullBlock, DEF_SUCC0(), DEF_PRED0()),
@@ -2015,7 +2026,7 @@
 
 TEST_F(GlobalValueNumberingTestTwoConsecutiveLoops, IFieldAndPhi) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
   };
   static const MIRDef mirs[] = {
       DEF_MOVE(3, Instruction::MOVE_OBJECT, 0u, 100u),
@@ -2052,10 +2063,10 @@
 
 TEST_F(GlobalValueNumberingTestTwoConsecutiveLoops, NullCheck) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
   };
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
   };
   static const MIRDef mirs[] = {
       DEF_MOVE(3, Instruction::MOVE_OBJECT, 0u, 100u),
@@ -2143,7 +2154,7 @@
 
 TEST_F(GlobalValueNumberingTestTwoNestedLoops, IFieldAndPhi) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, false },  // Int.
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
   };
   static const MIRDef mirs[] = {
       DEF_MOVE(3, Instruction::MOVE_OBJECT, 0u, 100u),
@@ -2213,4 +2224,45 @@
   PerformGVN();
 }
 
+TEST_F(GlobalValueNumberingTestDiamond, DivZeroCheckDiamond) {
+  static const MIRDef mirs[] = {
+      DEF_DIV_REM(3u, Instruction::DIV_INT, 1u, 20u, 21u),
+      DEF_DIV_REM(3u, Instruction::DIV_INT, 2u, 24u, 21u),
+      DEF_DIV_REM(3u, Instruction::DIV_INT, 3u, 20u, 23u),
+      DEF_DIV_REM(4u, Instruction::DIV_INT, 4u, 24u, 22u),
+      DEF_DIV_REM(4u, Instruction::DIV_INT, 9u, 24u, 25u),
+      DEF_DIV_REM(5u, Instruction::DIV_INT, 5u, 24u, 21u),
+      DEF_DIV_REM(5u, Instruction::DIV_INT, 10u, 24u, 26u),
+      DEF_PHI2(6u, 27u, 25u, 26u),
+      DEF_DIV_REM(6u, Instruction::DIV_INT, 12u, 20u, 27u),
+      DEF_DIV_REM(6u, Instruction::DIV_INT, 6u, 24u, 21u),
+      DEF_DIV_REM(6u, Instruction::DIV_INT, 7u, 20u, 23u),
+      DEF_DIV_REM(6u, Instruction::DIV_INT, 8u, 20u, 22u),
+  };
+
+  static const bool expected_ignore_div_zero_check[] = {
+      false,  // New divisor seen.
+      true,   // Eliminated since it has the same divisor as the first one.
+      false,  // New divisor seen.
+      false,  // New divisor seen.
+      false,  // New divisor seen.
+      true,   // Eliminated in dominating block.
+      false,  // New divisor seen.
+      false,  // Phi node.
+      true,   // Eliminated on both sides of diamond and merged via phi.
+      true,   // Eliminated in dominating block.
+      true,   // Eliminated in dominating block.
+      false,  // Only eliminated on one path of diamond.
+  };
+
+  PrepareMIRs(mirs);
+  PerformGVN();
+  PerformGVNCodeModifications();
+  ASSERT_EQ(arraysize(expected_ignore_div_zero_check), mir_count_);
+  for (size_t i = 0u; i != mir_count_; ++i) {
+    int expected = expected_ignore_div_zero_check[i] ? MIR_IGNORE_DIV_ZERO_CHECK : 0u;
+    EXPECT_EQ(expected, mirs_[i].optimization_flags) << i;
+  }
+}
+
 }  // namespace art
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index c1ce2ac..114346d 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -56,7 +56,7 @@
  public:
   static uint16_t StartMemoryVersion(GlobalValueNumbering* gvn, const LocalValueNumbering* lvn,
                                      uint16_t field_id) {
-    uint16_t type = gvn->GetFieldType(field_id);
+    uint16_t type = gvn->GetIFieldType(field_id);
     return gvn->LookupValue(kAliasingIFieldStartVersionOp, field_id,
                             lvn->global_memory_version_, lvn->unresolved_ifield_version_[type]);
   }
@@ -75,7 +75,7 @@
   static uint16_t LookupMergeValue(GlobalValueNumbering* gvn, const LocalValueNumbering* lvn,
                                    uint16_t field_id, uint16_t base) {
     // If the base/field_id is non-aliasing in lvn, use the non-aliasing value.
-    uint16_t type = gvn->GetFieldType(field_id);
+    uint16_t type = gvn->GetIFieldType(field_id);
     if (lvn->IsNonAliasingIField(base, field_id, type)) {
       uint16_t loc = gvn->LookupValue(kNonAliasingIFieldLocOp, base, field_id, type);
       auto lb = lvn->non_aliasing_ifield_value_map_.find(loc);
@@ -89,7 +89,7 @@
 
   static bool HasNewBaseVersion(GlobalValueNumbering* gvn, const LocalValueNumbering* lvn,
                                 uint16_t field_id) {
-    uint16_t type = gvn->GetFieldType(field_id);
+    uint16_t type = gvn->GetIFieldType(field_id);
     return lvn->unresolved_ifield_version_[type] == lvn->merge_new_memory_version_ ||
         lvn->global_memory_version_ == lvn->merge_new_memory_version_;
   }
@@ -339,11 +339,12 @@
       escaped_array_clobber_set_(EscapedArrayClobberKeyComparator(), allocator->Adapter()),
       range_checked_(RangeCheckKeyComparator() , allocator->Adapter()),
       null_checked_(std::less<uint16_t>(), allocator->Adapter()),
+      div_zero_checked_(std::less<uint16_t>(), allocator->Adapter()),
       merge_names_(allocator->Adapter()),
       merge_map_(std::less<ScopedArenaVector<BasicBlockId>>(), allocator->Adapter()),
       merge_new_memory_version_(kNoValue) {
-  std::fill_n(unresolved_sfield_version_, kFieldTypeCount, 0u);
-  std::fill_n(unresolved_ifield_version_, kFieldTypeCount, 0u);
+  std::fill_n(unresolved_sfield_version_, arraysize(unresolved_sfield_version_), 0u);
+  std::fill_n(unresolved_ifield_version_, arraysize(unresolved_ifield_version_), 0u);
 }
 
 bool LocalValueNumbering::Equals(const LocalValueNumbering& other) const {
@@ -362,7 +363,8 @@
       escaped_ifield_clobber_set_ == other.escaped_ifield_clobber_set_ &&
       escaped_array_clobber_set_ == other.escaped_array_clobber_set_ &&
       range_checked_ == other.range_checked_ &&
-      null_checked_ == other.null_checked_;
+      null_checked_ == other.null_checked_ &&
+      div_zero_checked_ == other.div_zero_checked_;
 }
 
 void LocalValueNumbering::MergeOne(const LocalValueNumbering& other, MergeType merge_type) {
@@ -379,6 +381,7 @@
   non_aliasing_refs_ = other.non_aliasing_refs_;
   range_checked_ = other.range_checked_;
   null_checked_ = other.null_checked_;
+  div_zero_checked_ = other.div_zero_checked_;
 
   const BasicBlock* pred_bb = gvn_->GetBasicBlock(other.Id());
   if (GlobalValueNumbering::HasNullCheckLastInsn(pred_bb, Id())) {
@@ -389,16 +392,20 @@
   if (merge_type == kCatchMerge) {
     // Memory is clobbered. Use new memory version and don't merge aliasing locations.
     global_memory_version_ = NewMemoryVersion(&merge_new_memory_version_);
-    std::fill_n(unresolved_sfield_version_, kFieldTypeCount, global_memory_version_);
-    std::fill_n(unresolved_ifield_version_, kFieldTypeCount, global_memory_version_);
+    std::fill_n(unresolved_sfield_version_, arraysize(unresolved_sfield_version_),
+                global_memory_version_);
+    std::fill_n(unresolved_ifield_version_, arraysize(unresolved_ifield_version_),
+                global_memory_version_);
     PruneNonAliasingRefsForCatch();
     return;
   }
 
   DCHECK(merge_type == kNormalMerge);
   global_memory_version_ = other.global_memory_version_;
-  std::copy_n(other.unresolved_ifield_version_, kFieldTypeCount, unresolved_ifield_version_);
-  std::copy_n(other.unresolved_sfield_version_, kFieldTypeCount, unresolved_sfield_version_);
+  std::copy_n(other.unresolved_ifield_version_, arraysize(unresolved_sfield_version_),
+              unresolved_ifield_version_);
+  std::copy_n(other.unresolved_sfield_version_, arraysize(unresolved_ifield_version_),
+              unresolved_sfield_version_);
   sfield_value_map_ = other.sfield_value_map_;
   CopyAliasingValuesMap(&aliasing_ifield_value_map_, other.aliasing_ifield_value_map_);
   CopyAliasingValuesMap(&aliasing_array_value_map_, other.aliasing_array_value_map_);
@@ -410,9 +417,11 @@
 bool LocalValueNumbering::SameMemoryVersion(const LocalValueNumbering& other) const {
   return
       global_memory_version_ == other.global_memory_version_ &&
-      std::equal(unresolved_ifield_version_, unresolved_ifield_version_ + kFieldTypeCount,
+      std::equal(unresolved_ifield_version_,
+                 unresolved_ifield_version_ + arraysize(unresolved_ifield_version_),
                  other.unresolved_ifield_version_) &&
-      std::equal(unresolved_sfield_version_, unresolved_sfield_version_ + kFieldTypeCount,
+      std::equal(unresolved_sfield_version_,
+                 unresolved_sfield_version_ + arraysize(unresolved_sfield_version_),
                  other.unresolved_sfield_version_);
 }
 
@@ -439,18 +448,22 @@
   }
   if (new_global_version) {
     global_memory_version_ = NewMemoryVersion(&merge_new_memory_version_);
-    std::fill_n(unresolved_sfield_version_, kFieldTypeCount, merge_new_memory_version_);
-    std::fill_n(unresolved_ifield_version_, kFieldTypeCount, merge_new_memory_version_);
+    std::fill_n(unresolved_sfield_version_, arraysize(unresolved_sfield_version_),
+                merge_new_memory_version_);
+    std::fill_n(unresolved_ifield_version_, arraysize(unresolved_ifield_version_),
+                merge_new_memory_version_);
   } else {
     // Initialize with a copy of memory versions from the comparison LVN.
     global_memory_version_ = cmp->global_memory_version_;
-    std::copy_n(cmp->unresolved_ifield_version_, kFieldTypeCount, unresolved_ifield_version_);
-    std::copy_n(cmp->unresolved_sfield_version_, kFieldTypeCount, unresolved_sfield_version_);
+    std::copy_n(cmp->unresolved_ifield_version_, arraysize(unresolved_ifield_version_),
+                unresolved_ifield_version_);
+    std::copy_n(cmp->unresolved_sfield_version_, arraysize(unresolved_sfield_version_),
+                unresolved_sfield_version_);
     for (const LocalValueNumbering* lvn : gvn_->merge_lvns_) {
       if (lvn == cmp) {
         continue;
       }
-      for (size_t i = 0; i != kFieldTypeCount; ++i) {
+      for (size_t i = 0; i != kDexMemAccessTypeCount; ++i) {
         if (lvn->unresolved_ifield_version_[i] != cmp->unresolved_ifield_version_[i]) {
           unresolved_ifield_version_[i] = NewMemoryVersion(&merge_new_memory_version_);
         }
@@ -699,6 +712,28 @@
   }
 }
 
+void LocalValueNumbering::MergeDivZeroChecked() {
+  DCHECK_GE(gvn_->merge_lvns_.size(), 2u);
+
+  // Find the LVN with the least entries in the set.
+  const LocalValueNumbering* least_entries_lvn = gvn_->merge_lvns_[0];
+  for (const LocalValueNumbering* lvn : gvn_->merge_lvns_) {
+    if (lvn->div_zero_checked_.size() < least_entries_lvn->div_zero_checked_.size()) {
+      least_entries_lvn = lvn;
+    }
+  }
+
+  // For each div-zero value name check if it's div-zero checked in all the LVNs.
+  for (const auto& value_name : least_entries_lvn->div_zero_checked_) {
+    // Merge div_zero_checked_ for this value name.
+    merge_names_.clear();
+    merge_names_.resize(gvn_->merge_lvns_.size(), value_name);
+    if (gvn_->DivZeroCheckedInAllPredecessors(merge_names_)) {
+      div_zero_checked_.insert(div_zero_checked_.end(), value_name);
+    }
+  }
+}
+
 void LocalValueNumbering::MergeSFieldValues(const SFieldToValueMap::value_type& entry,
                                             SFieldToValueMap::iterator hint) {
   uint16_t field_id = entry.first;
 void LocalValueNumbering::MergeSFieldValues(const SFieldToValueMap::value_type& entry,
                                             SFieldToValueMap::iterator hint) {
   uint16_t field_id = entry.first;
@@ -711,7 +746,7 @@
     if (it != lvn->sfield_value_map_.end()) {
       value_name = it->second;
     } else {
-      uint16_t type = gvn_->GetFieldType(field_id);
+      uint16_t type = gvn_->GetSFieldType(field_id);
       value_name = gvn_->LookupValue(kResolvedSFieldOp, field_id,
                                      lvn->unresolved_sfield_version_[type],
                                      lvn->global_memory_version_);
@@ -931,6 +966,9 @@
   // Merge null_checked_. We may later insert more, such as merged object field values.
   MergeNullChecked();
 
+  // Now merge the div_zero_checked_.
+  MergeDivZeroChecked();
+
   if (merge_type == kCatchMerge) {
     // Memory is clobbered. New memory version already created, don't merge aliasing locations.
     return;
@@ -1054,10 +1092,30 @@
   }
 }
 
+void LocalValueNumbering::HandleDivZeroCheck(MIR* mir, uint16_t reg) {
+  auto lb = div_zero_checked_.lower_bound(reg);  // Also the insertion hint used below.
+  if (lb != div_zero_checked_.end() && *lb == reg) {
+    if (LIKELY(gvn_->CanModify())) {
+      if (gvn_->GetCompilationUnit()->verbose) {
+        LOG(INFO) << "Removing div zero check for 0x" << std::hex << mir->offset;
+      }
+      mir->optimization_flags |= MIR_IGNORE_DIV_ZERO_CHECK;
+    }
+  } else {  // Not recorded as checked yet; remember this divisor.
+    div_zero_checked_.insert(lb, reg);
+  }
+}
+
 void LocalValueNumbering::HandlePutObject(MIR* mir) {
   // If we're storing a non-aliasing reference, stop tracking it as non-aliasing now.
   uint16_t base = GetOperandValue(mir->ssa_rep->uses[0]);
   HandleEscapingRef(base);
+  if (gvn_->CanModify() && null_checked_.count(base) != 0u) {
+    if (gvn_->GetCompilationUnit()->verbose) {
+      LOG(INFO) << "Removing GC card mark value null check for 0x" << std::hex << mir->offset;
+    }
+    mir->optimization_flags |= MIR_STORE_NON_NULL_VALUE;
+  }
 }
 
 void LocalValueNumbering::HandleEscapingRef(uint16_t base) {
 
 void LocalValueNumbering::HandleEscapingRef(uint16_t base) {
@@ -1139,6 +1197,9 @@
       if (!wide && gvn_->NullCheckedInAllPredecessors(merge_names_)) {
         null_checked_.insert(value_name);
       }
+      if (gvn_->DivZeroCheckedInAllPredecessors(merge_names_)) {
+        div_zero_checked_.insert(value_name);
+      }
     }
   }
   if (wide) {
@@ -1150,12 +1211,11 @@
 }
 
 uint16_t LocalValueNumbering::HandleAGet(MIR* mir, uint16_t opcode) {
-  // uint16_t type = opcode - Instruction::AGET;
   uint16_t array = GetOperandValue(mir->ssa_rep->uses[0]);
   HandleNullCheck(mir, array);
   uint16_t index = GetOperandValue(mir->ssa_rep->uses[1]);
   HandleRangeCheck(mir, array, index);
-  uint16_t type = opcode - Instruction::AGET;
+  uint16_t type = AGetMemAccessType(static_cast<Instruction::Code>(opcode));
   // Establish value number for loaded register.
   uint16_t res;
   if (IsNonAliasingArray(array, type)) {
@@ -1182,7 +1242,7 @@
   uint16_t index = GetOperandValue(mir->ssa_rep->uses[index_idx]);
   HandleRangeCheck(mir, array, index);
 
-  uint16_t type = opcode - Instruction::APUT;
+  uint16_t type = APutMemAccessType(static_cast<Instruction::Code>(opcode));
   uint16_t value = (opcode == Instruction::APUT_WIDE)
                    ? GetOperandValueWide(mir->ssa_rep->uses[0])
                    : GetOperandValue(mir->ssa_rep->uses[0]);
@@ -1224,8 +1284,8 @@
     // Use result s_reg - will be unique.
     res = gvn_->LookupValue(kNoValue, mir->ssa_rep->defs[0], kNoValue, kNoValue);
   } else {
-    uint16_t type = opcode - Instruction::IGET;
-    uint16_t field_id = gvn_->GetFieldId(field_info, type);
+    uint16_t type = IGetMemAccessType(static_cast<Instruction::Code>(opcode));
+    uint16_t field_id = gvn_->GetIFieldId(mir);
     if (IsNonAliasingIField(base, field_id, type)) {
       uint16_t loc = gvn_->LookupValue(kNonAliasingIFieldLocOp, base, field_id, type);
       auto lb = non_aliasing_ifield_value_map_.lower_bound(loc);
@@ -1249,10 +1309,10 @@
 }
 
 void LocalValueNumbering::HandleIPut(MIR* mir, uint16_t opcode) {
-  uint16_t type = opcode - Instruction::IPUT;
   int base_reg = (opcode == Instruction::IPUT_WIDE) ? 2 : 1;
   uint16_t base = GetOperandValue(mir->ssa_rep->uses[base_reg]);
   HandleNullCheck(mir, base);
+  uint16_t type = IPutMemAccessType(static_cast<Instruction::Code>(opcode));
   const MirFieldInfo& field_info = gvn_->GetMirGraph()->GetIFieldLoweringInfo(mir);
   if (!field_info.IsResolved()) {
     // Unresolved fields always alias with everything of the same type.
@@ -1272,7 +1332,7 @@
     // Aliasing fields of the same type may have been overwritten.
     auto it = aliasing_ifield_value_map_.begin(), end = aliasing_ifield_value_map_.end();
     while (it != end) {
-      if (gvn_->GetFieldType(it->first) != type) {
+      if (gvn_->GetIFieldType(it->first) != type) {
         ++it;
       } else {
         it = aliasing_ifield_value_map_.erase(it);
@@ -1282,7 +1342,7 @@
     // Nothing to do, resolved volatile fields always get a new memory version anyway and
     // can't alias with resolved non-volatile fields.
   } else {
-    uint16_t field_id = gvn_->GetFieldId(field_info, type);
+    uint16_t field_id = gvn_->GetIFieldId(mir);
     uint16_t value = (opcode == Instruction::IPUT_WIDE)
                      ? GetOperandValueWide(mir->ssa_rep->uses[0])
                      : GetOperandValue(mir->ssa_rep->uses[0]);
@@ -1333,8 +1393,8 @@
     // Use result s_reg - will be unique.
     res = gvn_->LookupValue(kNoValue, mir->ssa_rep->defs[0], kNoValue, kNoValue);
   } else {
-    uint16_t type = opcode - Instruction::SGET;
-    uint16_t field_id = gvn_->GetFieldId(field_info, type);
+    uint16_t type = SGetMemAccessType(static_cast<Instruction::Code>(opcode));
+    uint16_t field_id = gvn_->GetSFieldId(mir);
     auto lb = sfield_value_map_.lower_bound(field_id);
     if (lb != sfield_value_map_.end() && lb->first == field_id) {
       res = lb->second;
@@ -1362,7 +1422,7 @@
     // Class initialization can call arbitrary functions, we need to wipe aliasing values.
     HandleInvokeOrClInitOrAcquireOp(mir);
   }
-  uint16_t type = opcode - Instruction::SPUT;
+  uint16_t type = SPutMemAccessType(static_cast<Instruction::Code>(opcode));
   if (!field_info.IsResolved()) {
     // Unresolved fields always alias with everything of the same type.
     // Use mir->offset as modifier; without elaborate inlining, it will be unique.
@@ -1373,7 +1433,7 @@
     // Nothing to do, resolved volatile fields always get a new memory version anyway and
     // can't alias with resolved non-volatile fields.
   } else {
-    uint16_t field_id = gvn_->GetFieldId(field_info, type);
+    uint16_t field_id = gvn_->GetSFieldId(mir);
     uint16_t value = (opcode == Instruction::SPUT_WIDE)
                      ? GetOperandValueWide(mir->ssa_rep->uses[0])
                      : GetOperandValue(mir->ssa_rep->uses[0]);
@@ -1397,7 +1457,7 @@
 void LocalValueNumbering::RemoveSFieldsForType(uint16_t type) {
   // Erase all static fields of this type from the sfield_value_map_.
   for (auto it = sfield_value_map_.begin(), end = sfield_value_map_.end(); it != end; ) {
-    if (gvn_->GetFieldType(it->first) == type) {
+    if (gvn_->GetSFieldType(it->first) == type) {
       it = sfield_value_map_.erase(it);
     } else {
       ++it;
@@ -1696,6 +1756,13 @@
       }
       break;
 
+    case Instruction::DIV_INT:
+    case Instruction::DIV_INT_2ADDR:
+    case Instruction::REM_INT:
+    case Instruction::REM_INT_2ADDR:
+      HandleDivZeroCheck(mir, GetOperandValue(mir->ssa_rep->uses[1]));
+      FALLTHROUGH_INTENDED;
+
     case Instruction::CMPG_FLOAT:
     case Instruction::CMPL_FLOAT:
     case Instruction::ADD_INT:
@@ -1710,10 +1777,6 @@
     case Instruction::XOR_INT_2ADDR:
     case Instruction::SUB_INT:
     case Instruction::SUB_INT_2ADDR:
-    case Instruction::DIV_INT:
-    case Instruction::DIV_INT_2ADDR:
-    case Instruction::REM_INT:
-    case Instruction::REM_INT_2ADDR:
     case Instruction::SHL_INT:
     case Instruction::SHL_INT_2ADDR:
     case Instruction::SHR_INT:
@@ -1728,19 +1791,22 @@
       }
       break;
 
+    case Instruction::DIV_LONG:
+    case Instruction::REM_LONG:
+    case Instruction::DIV_LONG_2ADDR:
+    case Instruction::REM_LONG_2ADDR:
+      HandleDivZeroCheck(mir, GetOperandValueWide(mir->ssa_rep->uses[2]));
+      FALLTHROUGH_INTENDED;
+
     case Instruction::ADD_LONG:
     case Instruction::SUB_LONG:
     case Instruction::MUL_LONG:
-    case Instruction::DIV_LONG:
-    case Instruction::REM_LONG:
     case Instruction::AND_LONG:
     case Instruction::OR_LONG:
     case Instruction::XOR_LONG:
     case Instruction::ADD_LONG_2ADDR:
     case Instruction::SUB_LONG_2ADDR:
     case Instruction::MUL_LONG_2ADDR:
-    case Instruction::DIV_LONG_2ADDR:
-    case Instruction::REM_LONG_2ADDR:
     case Instruction::AND_LONG_2ADDR:
     case Instruction::OR_LONG_2ADDR:
     case Instruction::XOR_LONG_2ADDR:
diff --git a/compiler/dex/local_value_numbering.h b/compiler/dex/local_value_numbering.h
index 979fd5a..9b89c95 100644
--- a/compiler/dex/local_value_numbering.h
+++ b/compiler/dex/local_value_numbering.h
@@ -22,6 +22,7 @@
 #include "compiler_internals.h"
 #include "global_value_numbering.h"
 #include "utils/arena_object.h"
+#include "utils/dex_instruction_utils.h"
 
 namespace art {
 
@@ -47,6 +48,10 @@
     return null_checked_.find(value_name) != null_checked_.end();
   }
 
+  bool IsValueDivZeroChecked(uint16_t value_name) const {
+    return div_zero_checked_.find(value_name) != div_zero_checked_.end();
+  }
+
   bool IsSregValue(uint16_t s_reg, uint16_t value_name) const {
     auto it = sreg_value_map_.find(s_reg);
     if (it != sreg_value_map_.end()) {
@@ -72,17 +77,6 @@
   // A set of value names.
   typedef GlobalValueNumbering::ValueNameSet ValueNameSet;
 
-  // Field types correspond to the ordering of GET/PUT instructions; this order is the same
-  // for IGET, IPUT, SGET, SPUT, AGET and APUT:
-  // op         0
-  // op_WIDE    1
-  // op_OBJECT  2
-  // op_BOOLEAN 3
-  // op_BYTE    4
-  // op_CHAR    5
-  // op_SHORT   6
-  static constexpr size_t kFieldTypeCount = 7;
-
   // Key is s_reg, value is value name.
   typedef ScopedArenaSafeMap<uint16_t, uint16_t> SregValueMap;
 
@@ -286,6 +280,7 @@
   bool IsNonAliasingArray(uint16_t reg, uint16_t type) const;
   void HandleNullCheck(MIR* mir, uint16_t reg);
   void HandleRangeCheck(MIR* mir, uint16_t array, uint16_t index);
+  void HandleDivZeroCheck(MIR* mir, uint16_t reg);
   void HandlePutObject(MIR* mir);
   void HandleEscapingRef(uint16_t base);
   void HandleInvokeArgs(const MIR* mir, const LocalValueNumbering* mir_lvn);
@@ -337,6 +332,7 @@
   void MergeNonAliasingIFieldValues(const IFieldLocToValueMap::value_type& entry,
                                     IFieldLocToValueMap::iterator hint);
   void MergeNullChecked();
+  void MergeDivZeroChecked();
 
   template <typename Map, Map LocalValueNumbering::*map_ptr, typename Versions>
   void MergeAliasingValues(const typename Map::value_type& entry, typename Map::iterator hint);
@@ -358,8 +354,8 @@
 
   // Data for dealing with memory clobbering and store/load aliasing.
   uint16_t global_memory_version_;
-  uint16_t unresolved_sfield_version_[kFieldTypeCount];
-  uint16_t unresolved_ifield_version_[kFieldTypeCount];
+  uint16_t unresolved_sfield_version_[kDexMemAccessTypeCount];
+  uint16_t unresolved_ifield_version_[kDexMemAccessTypeCount];
   // Value names of references to objects that cannot be reached through a different value name.
   ValueNameSet non_aliasing_refs_;
   // Previously non-aliasing refs that escaped but can still be used for non-aliasing AGET/IGET.
@@ -371,6 +367,7 @@
   // Range check and null check elimination.
   RangeCheckSet range_checked_;
   ValueNameSet null_checked_;
+  ValueNameSet div_zero_checked_;
 
   // Reuse one vector for all merges to avoid leaking too much memory on the ArenaStack.
   ScopedArenaVector<BasicBlockId> merge_names_;
diff --git a/compiler/dex/local_value_numbering_test.cc b/compiler/dex/local_value_numbering_test.cc
index 824c323..0fcb584 100644
--- a/compiler/dex/local_value_numbering_test.cc
+++ b/compiler/dex/local_value_numbering_test.cc
@@ -15,6 +15,7 @@
  */
 
 #include "compiler_internals.h"
+#include "dex/mir_field_info.h"
 #include "global_value_numbering.h"
 #include "local_value_numbering.h"
 #include "gtest/gtest.h"
@@ -28,6 +29,7 @@
     uintptr_t declaring_dex_file;
     uint16_t declaring_field_idx;
     bool is_volatile;
+    DexMemAccessType type;
   };
 
   struct SFieldDef {
@@ -35,6 +37,7 @@
     uintptr_t declaring_dex_file;
     uint16_t declaring_field_idx;
     bool is_volatile;
+    DexMemAccessType type;
   };
 
   struct MIRDef {
@@ -84,18 +87,21 @@
     { opcode, 0u, 0u, 1, { reg }, 0, { } }
 #define DEF_UNIQUE_REF(opcode, reg) \
     { opcode, 0u, 0u, 0, { }, 1, { reg } }  // CONST_CLASS, CONST_STRING, NEW_ARRAY, ...
+#define DEF_DIV_REM(opcode, result, dividend, divisor) \
+    { opcode, 0u, 0u, 2, { dividend, divisor }, 1, { result } }
+#define DEF_DIV_REM_WIDE(opcode, result, dividend, divisor) \
+    { opcode, 0u, 0u, 4, { dividend, dividend + 1, divisor, divisor + 1 }, 2, { result, result + 1 } }
 
   void DoPrepareIFields(const IFieldDef* defs, size_t count) {
     cu_.mir_graph->ifield_lowering_infos_.clear();
     cu_.mir_graph->ifield_lowering_infos_.reserve(count);
     for (size_t i = 0u; i != count; ++i) {
       const IFieldDef* def = &defs[i];
-      MirIFieldLoweringInfo field_info(def->field_idx);
+      MirIFieldLoweringInfo field_info(def->field_idx, def->type);
       if (def->declaring_dex_file != 0u) {
         field_info.declaring_dex_file_ = reinterpret_cast<const DexFile*>(def->declaring_dex_file);
         field_info.declaring_field_idx_ = def->declaring_field_idx;
-        field_info.flags_ = 0u |  // Without kFlagIsStatic.
-            (def->is_volatile ? MirIFieldLoweringInfo::kFlagIsVolatile : 0u);
+        field_info.flags_ &= ~(def->is_volatile ? 0u : MirIFieldLoweringInfo::kFlagIsVolatile);
       }
       cu_.mir_graph->ifield_lowering_infos_.push_back(field_info);
     }
@@ -111,15 +117,14 @@
     cu_.mir_graph->sfield_lowering_infos_.reserve(count);
     for (size_t i = 0u; i != count; ++i) {
       const SFieldDef* def = &defs[i];
-      MirSFieldLoweringInfo field_info(def->field_idx);
+      MirSFieldLoweringInfo field_info(def->field_idx, def->type);
       // Mark even unresolved fields as initialized.
-      field_info.flags_ = MirSFieldLoweringInfo::kFlagIsStatic |
-          MirSFieldLoweringInfo::kFlagClassIsInitialized;
+      field_info.flags_ |= MirSFieldLoweringInfo::kFlagClassIsInitialized;
       // NOTE: MirSFieldLoweringInfo::kFlagClassIsInDexCache isn't used by LVN.
       if (def->declaring_dex_file != 0u) {
         field_info.declaring_dex_file_ = reinterpret_cast<const DexFile*>(def->declaring_dex_file);
         field_info.declaring_field_idx_ = def->declaring_field_idx;
-        field_info.flags_ |= (def->is_volatile ? MirSFieldLoweringInfo::kFlagIsVolatile : 0u);
+        field_info.flags_ &= ~(def->is_volatile ? 0u : MirSFieldLoweringInfo::kFlagIsVolatile);
       }
       cu_.mir_graph->sfield_lowering_infos_.push_back(field_info);
     }
@@ -140,12 +145,16 @@
       mir->dalvikInsn.opcode = def->opcode;
       mir->dalvikInsn.vB = static_cast<int32_t>(def->value);
       mir->dalvikInsn.vB_wide = def->value;
-      if (def->opcode >= Instruction::IGET && def->opcode <= Instruction::IPUT_SHORT) {
+      if (IsInstructionIGetOrIPut(def->opcode)) {
         ASSERT_LT(def->field_info, cu_.mir_graph->ifield_lowering_infos_.size());
         mir->meta.ifield_lowering_info = def->field_info;
-      } else if (def->opcode >= Instruction::SGET && def->opcode <= Instruction::SPUT_SHORT) {
+        ASSERT_EQ(cu_.mir_graph->ifield_lowering_infos_[def->field_info].MemAccessType(),
+                  IGetOrIPutMemAccessType(def->opcode));
+      } else if (IsInstructionSGetOrSPut(def->opcode)) {
         ASSERT_LT(def->field_info, cu_.mir_graph->sfield_lowering_infos_.size());
         mir->meta.sfield_lowering_info = def->field_info;
+        ASSERT_EQ(cu_.mir_graph->sfield_lowering_infos_[def->field_info].MemAccessType(),
+                  SGetOrSPutMemAccessType(def->opcode));
       }
       mir->ssa_rep = &ssa_reps_[i];
       mir->ssa_rep->num_uses = def->num_uses;
@@ -177,6 +186,13 @@
   }
 
   void PerformLVN() {
+    cu_.mir_graph->temp_.gvn.ifield_ids_ =  GlobalValueNumbering::PrepareGvnFieldIds(
+        allocator_.get(), cu_.mir_graph->ifield_lowering_infos_);
+    cu_.mir_graph->temp_.gvn.sfield_ids_ =  GlobalValueNumbering::PrepareGvnFieldIds(
+        allocator_.get(), cu_.mir_graph->sfield_lowering_infos_);
+    gvn_.reset(new (allocator_.get()) GlobalValueNumbering(&cu_, allocator_.get(),
+                                                           GlobalValueNumbering::kModeLvn));
+    lvn_.reset(new (allocator_.get()) LocalValueNumbering(gvn_.get(), 0u, allocator_.get()));
     value_names_.resize(mir_count_);
     for (size_t i = 0; i != mir_count_; ++i) {
       value_names_[i] =  lvn_->GetValueNumber(&mirs_[i]);
@@ -196,9 +212,6 @@
         value_names_() {
     cu_.mir_graph.reset(new MIRGraph(&cu_, &cu_.arena));
     allocator_.reset(ScopedArenaAllocator::Create(&cu_.arena_stack));
-    gvn_.reset(new (allocator_.get()) GlobalValueNumbering(&cu_, allocator_.get(),
-                                                           GlobalValueNumbering::kModeLvn));
-    lvn_.reset(new (allocator_.get()) LocalValueNumbering(gvn_.get(), 0u, allocator_.get()));
   }
 
   ArenaPool pool_;
@@ -214,7 +227,7 @@
 
 TEST_F(LocalValueNumberingTest, IGetIGetInvokeIGet) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_IGET(Instruction::IGET, 0u, 10u, 0u),
@@ -237,8 +250,8 @@
 
 TEST_F(LocalValueNumberingTest, IGetIPutIGetIGetIGet) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
-      { 2u, 1u, 2u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessObject },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_IGET(Instruction::IGET_OBJECT, 0u, 10u, 0u),
@@ -262,7 +275,7 @@
 
 TEST_F(LocalValueNumberingTest, UniquePreserve1) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 10u),
@@ -284,7 +297,7 @@
 
 TEST_F(LocalValueNumberingTest, UniquePreserve2) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 11u),
@@ -306,7 +319,7 @@
 
 TEST_F(LocalValueNumberingTest, UniquePreserveAndEscape) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 10u),
@@ -331,8 +344,8 @@
 
 TEST_F(LocalValueNumberingTest, Volatile) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
-      { 2u, 1u, 2u, true },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, true, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_IGET(Instruction::IGET, 0u, 10u, 1u),  // Volatile.
@@ -358,9 +371,9 @@
 
 TEST_F(LocalValueNumberingTest, UnresolvedIField) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },  // Resolved field #1.
-      { 2u, 1u, 2u, false },  // Resolved field #2.
-      { 3u, 0u, 0u, false },  // Unresolved field.
+      { 1u, 1u, 1u, false, kDexMemAccessWord },  // Resolved field #1.
+      { 2u, 1u, 2u, false, kDexMemAccessWide },  // Resolved field #2.
+      { 3u, 0u, 0u, false, kDexMemAccessWord },  // Unresolved field.
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 20u),
@@ -407,9 +420,9 @@
 
 TEST_F(LocalValueNumberingTest, UnresolvedSField) {
   static const SFieldDef sfields[] = {
-      { 1u, 1u, 1u, false },  // Resolved field #1.
-      { 2u, 1u, 2u, false },  // Resolved field #2.
-      { 3u, 0u, 0u, false },  // Unresolved field.
+      { 1u, 1u, 1u, false, kDexMemAccessWord },  // Resolved field #1.
+      { 2u, 1u, 2u, false, kDexMemAccessWide },  // Resolved field #2.
+      { 3u, 0u, 0u, false, kDexMemAccessWord },  // Unresolved field.
   };
   static const MIRDef mirs[] = {
       DEF_SGET(Instruction::SGET, 0u, 0u),            // Resolved field #1.
@@ -438,11 +451,11 @@
 
 TEST_F(LocalValueNumberingTest, UninitializedSField) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },  // Resolved field #1.
+      { 1u, 1u, 1u, false, kDexMemAccessWord },  // Resolved field #1.
   };
   static const SFieldDef sfields[] = {
-      { 1u, 1u, 1u, false },  // Resolved field #1.
-      { 2u, 1u, 2u, false },  // Resolved field #2; uninitialized.
+      { 1u, 1u, 1u, false, kDexMemAccessWord },  // Resolved field #1.
+      { 2u, 1u, 2u, false, kDexMemAccessWord },  // Resolved field #2; uninitialized.
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 200u),
@@ -487,11 +500,11 @@
 
 TEST_F(LocalValueNumberingTest, SameValueInDifferentMemoryLocations) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
-      { 2u, 1u, 2u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
   };
   static const SFieldDef sfields[] = {
-      { 3u, 1u, 3u, false },
+      { 3u, 1u, 3u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_ARRAY, 201u),
@@ -551,12 +564,12 @@
 
 TEST_F(LocalValueNumberingTest, EscapingRefs) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },  // Field #1.
-      { 2u, 1u, 2u, false },  // Field #2.
-      { 3u, 1u, 3u, false },  // Reference field for storing escaping refs.
-      { 4u, 1u, 4u, false },  // Wide.
-      { 5u, 0u, 0u, false },  // Unresolved field, int.
-      { 6u, 0u, 0u, false },  // Unresolved field, wide.
+      { 1u, 1u, 1u, false, kDexMemAccessWord },    // Field #1.
+      { 2u, 1u, 2u, false, kDexMemAccessWord },    // Field #2.
+      { 3u, 1u, 3u, false, kDexMemAccessObject },  // For storing escaping refs.
+      { 4u, 1u, 4u, false, kDexMemAccessWide },    // Wide.
+      { 5u, 0u, 0u, false, kDexMemAccessWord },    // Unresolved field, int.
+      { 6u, 0u, 0u, false, kDexMemAccessWide },    // Unresolved field, wide.
   };
   static const MIRDef mirs[] = {
       DEF_UNIQUE_REF(Instruction::NEW_INSTANCE, 20u),
@@ -594,7 +607,9 @@
   EXPECT_NE(value_names_[13], value_names_[16]);  // New value.
   EXPECT_NE(value_names_[14], value_names_[17]);  // New value.
   for (size_t i = 0u; i != mir_count_; ++i) {
-    int expected = (i != 0u && i != 3u && i != 6u) ? MIR_IGNORE_NULL_CHECK : 0;
+    int expected =
+        ((i != 0u && i != 3u && i != 6u) ? MIR_IGNORE_NULL_CHECK : 0) |
+        ((i == 3u) ? MIR_STORE_NON_NULL_VALUE: 0);
     EXPECT_EQ(expected, mirs_[i].optimization_flags) << i;
   }
 }
@@ -627,18 +642,19 @@
   for (size_t i = 0u; i != mir_count_; ++i) {
     int expected =
         ((i != 0u && i != 3u && i != 6u && i != 9u) ? MIR_IGNORE_NULL_CHECK : 0u) |
-        ((i >= 4 && i != 6u && i != 9u) ? MIR_IGNORE_RANGE_CHECK : 0u);
+        ((i >= 4 && i != 6u && i != 9u) ? MIR_IGNORE_RANGE_CHECK : 0u) |
+        ((i == 3u) ? MIR_STORE_NON_NULL_VALUE: 0);
     EXPECT_EQ(expected, mirs_[i].optimization_flags) << i;
   }
 }
 
 TEST_F(LocalValueNumberingTest, StoringSameValueKeepsMemoryVersion) {
   static const IFieldDef ifields[] = {
-      { 1u, 1u, 1u, false },
-      { 2u, 1u, 2u, false },
+      { 1u, 1u, 1u, false, kDexMemAccessWord },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
   };
   static const SFieldDef sfields[] = {
-      { 2u, 1u, 2u, false },
+      { 2u, 1u, 2u, false, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_IGET(Instruction::IGET, 0u, 30u, 0u),
@@ -716,8 +732,8 @@
 
 TEST_F(LocalValueNumberingTest, ClInitOnSget) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, false },
-      { 1u, 2u, 1u, false },
+      { 0u, 1u, 0u, false, kDexMemAccessObject },
+      { 1u, 2u, 1u, false, kDexMemAccessObject },
   };
   static const MIRDef mirs[] = {
       DEF_SGET(Instruction::SGET_OBJECT, 0u, 0u),
@@ -735,4 +751,26 @@
   EXPECT_NE(value_names_[0], value_names_[3]);
 }
 
+TEST_F(LocalValueNumberingTest, DivZeroCheck) {
+  static const MIRDef mirs[] = {
+      DEF_DIV_REM(Instruction::DIV_INT, 1u, 10u, 20u),              // First use of divisor 20u.
+      DEF_DIV_REM(Instruction::DIV_INT, 2u, 20u, 20u),              // Divisor 20u seen before.
+      DEF_DIV_REM(Instruction::DIV_INT_2ADDR, 3u, 10u, 1u),         // First use of divisor 1u.
+      DEF_DIV_REM(Instruction::REM_INT, 4u, 30u, 20u),              // Divisor 20u seen before.
+      DEF_DIV_REM_WIDE(Instruction::REM_LONG, 5u, 12u, 14u),        // First use of wide divisor 14u.
+      DEF_DIV_REM_WIDE(Instruction::DIV_LONG_2ADDR, 7u, 16u, 14u),  // Wide divisor 14u seen before.
+  };
+
+  static const bool expected_ignore_div_zero_check[] = {
+      false, true, false, true, false, true,  // One per MIR: true iff its divisor was used before.
+  };
+
+  PrepareMIRs(mirs);
+  PerformLVN();
+  for (size_t i = 0u; i != mir_count_; ++i) {
+    int expected = expected_ignore_div_zero_check[i] ? MIR_IGNORE_DIV_ZERO_CHECK : 0u;
+    EXPECT_EQ(expected, mirs_[i].optimization_flags) << i;  // No other flag may be set.
+  }
+}
+
 }  // namespace art
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index 44f69ba..7b53b14 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -21,6 +21,7 @@
 #include "dataflow_iterator-inl.h"
 #include "dex_instruction.h"
 #include "dex_instruction-inl.h"
+#include "dex/mir_field_info.h"
 #include "dex/verified_method.h"
 #include "dex/quick/dex_file_method_inliner.h"
 #include "dex/quick/dex_file_to_method_inliner_map.h"
@@ -1204,6 +1205,8 @@
   ScopedArenaAllocator allocator(&cu_->arena_stack);
   uint16_t* field_idxs =
       reinterpret_cast<uint16_t*>(allocator.Alloc(max_refs * sizeof(uint16_t), kArenaAllocMisc));
+  DexMemAccessType* field_types = reinterpret_cast<DexMemAccessType*>(
+      allocator.Alloc(max_refs * sizeof(DexMemAccessType), kArenaAllocMisc));
 
   // Find IGET/IPUT/SGET/SPUT insns, store IGET/IPUT fields at the beginning, SGET/SPUT at the end.
   size_t ifield_pos = 0u;
@@ -1214,38 +1217,41 @@
       continue;
     }
     for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
-      if (mir->dalvikInsn.opcode >= Instruction::IGET &&
-          mir->dalvikInsn.opcode <= Instruction::SPUT_SHORT) {
-        // Get field index and try to find it among existing indexes. If found, it's usually among
-        // the last few added, so we'll start the search from ifield_pos/sfield_pos. Though this
-        // is a linear search, it actually performs much better than map based approach.
-        if (mir->dalvikInsn.opcode <= Instruction::IPUT_SHORT) {
-          uint16_t field_idx = mir->dalvikInsn.vC;
-          size_t i = ifield_pos;
-          while (i != 0u && field_idxs[i - 1] != field_idx) {
-            --i;
-          }
-          if (i != 0u) {
-            mir->meta.ifield_lowering_info = i - 1;
-          } else {
-            mir->meta.ifield_lowering_info = ifield_pos;
-            field_idxs[ifield_pos++] = field_idx;
-          }
-        } else {
-          uint16_t field_idx = mir->dalvikInsn.vB;
-          size_t i = sfield_pos;
-          while (i != max_refs && field_idxs[i] != field_idx) {
-            ++i;
-          }
-          if (i != max_refs) {
-            mir->meta.sfield_lowering_info = max_refs - i - 1u;
-          } else {
-            mir->meta.sfield_lowering_info = max_refs - sfield_pos;
-            field_idxs[--sfield_pos] = field_idx;
-          }
+      // Get field index and try to find it among existing indexes. If found, it's usually among
+      // the last few added, so we'll start the search from ifield_pos/sfield_pos. Though this
+      // is a linear search, it actually performs much better than map based approach.
+      if (IsInstructionIGetOrIPut(mir->dalvikInsn.opcode)) {
+        uint16_t field_idx = mir->dalvikInsn.vC;
+        size_t i = ifield_pos;
+        while (i != 0u && field_idxs[i - 1] != field_idx) {
+          --i;
         }
-        DCHECK_LE(ifield_pos, sfield_pos);
+        if (i != 0u) {
+          mir->meta.ifield_lowering_info = i - 1;
+          DCHECK_EQ(field_types[i - 1], IGetOrIPutMemAccessType(mir->dalvikInsn.opcode));
+        } else {
+          mir->meta.ifield_lowering_info = ifield_pos;
+          field_idxs[ifield_pos] = field_idx;
+          field_types[ifield_pos] = IGetOrIPutMemAccessType(mir->dalvikInsn.opcode);
+          ++ifield_pos;
+        }
+      } else if (IsInstructionSGetOrSPut(mir->dalvikInsn.opcode)) {
+        uint16_t field_idx = mir->dalvikInsn.vB;
+        size_t i = sfield_pos;
+        while (i != max_refs && field_idxs[i] != field_idx) {
+          ++i;
+        }
+        if (i != max_refs) {
+          mir->meta.sfield_lowering_info = max_refs - i - 1u;
+          DCHECK_EQ(field_types[i], SGetOrSPutMemAccessType(mir->dalvikInsn.opcode));
+        } else {
+          mir->meta.sfield_lowering_info = max_refs - sfield_pos;
+          --sfield_pos;
+          field_idxs[sfield_pos] = field_idx;
+          field_types[sfield_pos] = SGetOrSPutMemAccessType(mir->dalvikInsn.opcode);
+        }
       }
+      DCHECK_LE(ifield_pos, sfield_pos);
     }
   }
 
@@ -1254,7 +1260,7 @@
     DCHECK_EQ(ifield_lowering_infos_.size(), 0u);
     ifield_lowering_infos_.reserve(ifield_pos);
     for (size_t pos = 0u; pos != ifield_pos; ++pos) {
-      ifield_lowering_infos_.push_back(MirIFieldLoweringInfo(field_idxs[pos]));
+      ifield_lowering_infos_.push_back(MirIFieldLoweringInfo(field_idxs[pos], field_types[pos]));
     }
     MirIFieldLoweringInfo::Resolve(cu_->compiler_driver, GetCurrentDexCompilationUnit(),
                                    ifield_lowering_infos_.data(), ifield_pos);
@@ -1266,7 +1272,7 @@
     sfield_lowering_infos_.reserve(max_refs - sfield_pos);
     for (size_t pos = max_refs; pos != sfield_pos;) {
       --pos;
-      sfield_lowering_infos_.push_back(MirSFieldLoweringInfo(field_idxs[pos]));
+      sfield_lowering_infos_.push_back(MirSFieldLoweringInfo(field_idxs[pos], field_types[pos]));
     }
     MirSFieldLoweringInfo::Resolve(cu_->compiler_driver, GetCurrentDexCompilationUnit(),
                                    sfield_lowering_infos_.data(), max_refs - sfield_pos);
@@ -1329,19 +1335,10 @@
       continue;
     }
     for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
-      if (mir->dalvikInsn.opcode >= Instruction::INVOKE_VIRTUAL &&
-          mir->dalvikInsn.opcode <= Instruction::INVOKE_INTERFACE_RANGE &&
-          mir->dalvikInsn.opcode != Instruction::RETURN_VOID_BARRIER) {
+      if (IsInstructionInvoke(mir->dalvikInsn.opcode)) {
         // Decode target method index and invoke type.
-        uint16_t target_method_idx;
-        uint16_t invoke_type_idx;
-        if (mir->dalvikInsn.opcode <= Instruction::INVOKE_INTERFACE) {
-          target_method_idx = mir->dalvikInsn.vB;
-          invoke_type_idx = mir->dalvikInsn.opcode - Instruction::INVOKE_VIRTUAL;
-        } else {
-          target_method_idx = mir->dalvikInsn.vB;
-          invoke_type_idx = mir->dalvikInsn.opcode - Instruction::INVOKE_VIRTUAL_RANGE;
-        }
+        uint16_t target_method_idx = mir->dalvikInsn.vB;
+        DexInvokeType invoke_type_idx = InvokeInstructionType(mir->dalvikInsn.opcode);
 
         // Find devirtualization target.
         // TODO: The devirt map is ordered by the dex pc here. Is there a way to get INVOKEs
diff --git a/compiler/dex/mir_field_info.cc b/compiler/dex/mir_field_info.cc
index 1db3b5b..53afcad 100644
--- a/compiler/dex/mir_field_info.cc
+++ b/compiler/dex/mir_field_info.cc
@@ -35,7 +35,7 @@
     DCHECK(field_infos != nullptr);
     DCHECK_NE(count, 0u);
     for (auto it = field_infos, end = field_infos + count; it != end; ++it) {
-      MirIFieldLoweringInfo unresolved(it->field_idx_);
+      MirIFieldLoweringInfo unresolved(it->field_idx_, it->MemAccessType());
       DCHECK_EQ(memcmp(&unresolved, &*it, sizeof(*it)), 0);
     }
   }
@@ -66,6 +66,7 @@
     std::pair<bool, bool> fast_path = compiler_driver->IsFastInstanceField(
         dex_cache.Get(), referrer_class.Get(), resolved_field, field_idx);
     it->flags_ = 0u |  // Without kFlagIsStatic.
+        (it->flags_ & (kMemAccessTypeMask << kBitMemAccessTypeBegin)) |
         (is_volatile ? kFlagIsVolatile : 0u) |
         (fast_path.first ? kFlagFastGet : 0u) |
         (fast_path.second ? kFlagFastPut : 0u);
@@ -79,7 +80,7 @@
     DCHECK(field_infos != nullptr);
     DCHECK_NE(count, 0u);
     for (auto it = field_infos, end = field_infos + count; it != end; ++it) {
-      MirSFieldLoweringInfo unresolved(it->field_idx_);
+      MirSFieldLoweringInfo unresolved(it->field_idx_, it->MemAccessType());
       // In 64-bit builds, there's padding after storage_index_, don't include it in memcmp.
       size_t size = OFFSETOF_MEMBER(MirSFieldLoweringInfo, storage_index_) +
           sizeof(it->storage_index_);
@@ -114,6 +115,7 @@
     std::pair<bool, bool> fast_path = compiler_driver->IsFastStaticField(
         dex_cache.Get(), referrer_class, resolved_field, field_idx, &it->storage_index_);
     uint16_t flags = kFlagIsStatic |
+        (it->flags_ & (kMemAccessTypeMask << kBitMemAccessTypeBegin)) |
         (is_volatile ? kFlagIsVolatile : 0u) |
         (fast_path.first ? kFlagFastGet : 0u) |
         (fast_path.second ? kFlagFastPut : 0u);
diff --git a/compiler/dex/mir_field_info.h b/compiler/dex/mir_field_info.h
index e97f7a0..ff427f8 100644
--- a/compiler/dex/mir_field_info.h
+++ b/compiler/dex/mir_field_info.h
@@ -20,6 +20,7 @@
 #include "base/macros.h"
 #include "dex_file.h"
 #include "offsets.h"
+#include "utils/dex_instruction_utils.h"
 
 namespace art {
 
@@ -63,18 +64,27 @@
     return (flags_ & kFlagIsVolatile) != 0u;
   }
 
+  DexMemAccessType MemAccessType() const {
+    return static_cast<DexMemAccessType>((flags_ >> kBitMemAccessTypeBegin) & kMemAccessTypeMask);
+  }
+
  protected:
   enum {
     kBitIsStatic = 0,
     kBitIsVolatile,
-    kFieldInfoBitEnd
+    kBitMemAccessTypeBegin,
+    kBitMemAccessTypeEnd = kBitMemAccessTypeBegin + 3,  // 3 bits for the DexMemAccessType.
+    kFieldInfoBitEnd = kBitMemAccessTypeEnd
   };
   static constexpr uint16_t kFlagIsVolatile = 1u << kBitIsVolatile;
   static constexpr uint16_t kFlagIsStatic = 1u << kBitIsStatic;
+  static constexpr uint16_t kMemAccessTypeMask = 7u;
+  static_assert((1u << (kBitMemAccessTypeEnd - kBitMemAccessTypeBegin)) - 1u == kMemAccessTypeMask,
+                "Invalid raw type mask");
 
-  MirFieldInfo(uint16_t field_idx, uint16_t flags)
+  MirFieldInfo(uint16_t field_idx, uint16_t flags, DexMemAccessType type)
       : field_idx_(field_idx),
-        flags_(flags),
+        flags_(flags | static_cast<uint16_t>(type) << kBitMemAccessTypeBegin),
         declaring_field_idx_(0u),
         declaring_class_idx_(0u),
         declaring_dex_file_(nullptr) {
@@ -107,8 +117,8 @@
       LOCKS_EXCLUDED(Locks::mutator_lock_);
 
   // Construct an unresolved instance field lowering info.
-  explicit MirIFieldLoweringInfo(uint16_t field_idx)
-      : MirFieldInfo(field_idx, kFlagIsVolatile),  // Without kFlagIsStatic.
+  explicit MirIFieldLoweringInfo(uint16_t field_idx, DexMemAccessType type)
+      : MirFieldInfo(field_idx, kFlagIsVolatile, type),  // Without kFlagIsStatic.
         field_offset_(0u) {
   }
 
@@ -155,8 +165,8 @@
       LOCKS_EXCLUDED(Locks::mutator_lock_);
 
   // Construct an unresolved static field lowering info.
-  explicit MirSFieldLoweringInfo(uint16_t field_idx)
-      : MirFieldInfo(field_idx, kFlagIsVolatile | kFlagIsStatic),
+  explicit MirSFieldLoweringInfo(uint16_t field_idx, DexMemAccessType type)
+      : MirFieldInfo(field_idx, kFlagIsVolatile | kFlagIsStatic, type),
         field_offset_(0u),
         storage_index_(DexFile::kDexNoIndex) {
   }
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index b87ab66..023abca 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -97,11 +97,6 @@
       max_nested_loops_(0u),
       i_dom_list_(NULL),
       temp_scoped_alloc_(),
-      temp_insn_data_(nullptr),
-      temp_bit_vector_size_(0u),
-      temp_bit_vector_(nullptr),
-      temp_bit_matrix_(nullptr),
-      temp_gvn_(),
       block_list_(arena->Adapter(kArenaAllocBBList)),
       try_block_addr_(NULL),
       entry_block_(NULL),
@@ -133,6 +128,7 @@
       sfield_lowering_infos_(arena->Adapter(kArenaAllocLoweringInfo)),
       method_lowering_infos_(arena->Adapter(kArenaAllocLoweringInfo)),
       gen_suspend_test_list_(arena->Adapter()) {
+  memset(&temp_, 0, sizeof(temp_));
   use_counts_.reserve(256);
   raw_use_counts_.reserve(256);
   block_list_.reserve(100);
@@ -262,8 +258,6 @@
   DCHECK(insn != orig_block->first_mir_insn);
   DCHECK(insn == bottom_block->first_mir_insn);
   DCHECK_EQ(insn->offset, bottom_block->start_offset);
-  DCHECK(static_cast<int>(insn->dalvikInsn.opcode) == kMirOpCheck ||
-         !MIR::DecodedInstruction::IsPseudoMirOp(insn->dalvikInsn.opcode));
   DCHECK_EQ(dex_pc_to_block_map_[insn->offset], orig_block->id);
   // Scan the "bottom" instructions, remapping them to the
   // newly created "bottom" block.
@@ -771,8 +765,9 @@
       } else {
         DCHECK(cur_block->fall_through == NullBasicBlockId);
         DCHECK(cur_block->taken == NullBasicBlockId);
-        // Unreachable instruction, mark for no continuation.
+        // Unreachable instruction, mark for no continuation and end basic block.
         flags &= ~Instruction::kContinue;
+        FindBlock(current_offset_ + width, /* create */ true, /* immed_pred_block_p */ nullptr);
       }
     } else {
       cur_block->AppendMIR(insn);
@@ -919,7 +914,7 @@
                 bb->first_mir_insn ? " | " : " ");
         for (mir = bb->first_mir_insn; mir; mir = mir->next) {
             int opcode = mir->dalvikInsn.opcode;
-            fprintf(file, "    {%04x %s %s %s %s %s %s %s %s\\l}%s\\\n", mir->offset,
+            fprintf(file, "    {%04x %s %s %s %s %s %s %s %s %s\\l}%s\\\n", mir->offset,
                       mir->ssa_rep ? GetDalvikDisassembly(mir) :
                       !MIR::DecodedInstruction::IsPseudoMirOp(opcode) ?
                         Instruction::Name(mir->dalvikInsn.opcode) :
@@ -931,6 +926,7 @@
                       (mir->optimization_flags & MIR_CALLEE) != 0 ? " inlined" : " ",
                       (mir->optimization_flags & MIR_CLASS_IS_INITIALIZED) != 0 ? " cl_inited" : " ",
                       (mir->optimization_flags & MIR_CLASS_IS_IN_DEX_CACHE) != 0 ? " cl_in_cache" : " ",
+                      (mir->optimization_flags & MIR_IGNORE_DIV_ZERO_CHECK) != 0 ? " no_div_check" : " ",
                       mir->next ? " | " : " ");
         }
         fprintf(file, "  }\"];\n\n");
@@ -1173,6 +1169,14 @@
   return true;
 }
 
+MIR* BasicBlock::GetFirstNonPhiInsn() {
+  // Returns the first MIR whose opcode is not kMirOpPhi, or nullptr if there is none.
+  for (MIR* insn = first_mir_insn; insn != nullptr; insn = insn->next) {
+    if (static_cast<int>(insn->dalvikInsn.opcode) != kMirOpPhi) { return insn; }
+  }
+  return nullptr;
+}
+
 MIR* BasicBlock::GetNextUnconditionalMir(MIRGraph* mir_graph, MIR* current) {
   MIR* next_mir = nullptr;
 
@@ -1213,6 +1217,10 @@
   int defs = (ssa_rep != nullptr) ? ssa_rep->num_defs : 0;
   int uses = (ssa_rep != nullptr) ? ssa_rep->num_uses : 0;
 
+  if (opcode < kMirOpFirst) {
+    return;  // It is not an extended instruction.
+  }
+
   decoded_mir->append(extended_mir_op_names_[opcode - kMirOpFirst]);
 
   switch (opcode) {
@@ -1348,9 +1356,10 @@
           decoded_mir->append(", ");
           decoded_mir->append(GetSSANameWithConst(ssa_rep->defs[1], false));
         }
-        decoded_mir->append(StringPrintf(" = vect%d", mir->dalvikInsn.vB));
+        decoded_mir->append(StringPrintf(" = vect%d (extr_idx:%d)", mir->dalvikInsn.vB, mir->dalvikInsn.arg[0]));
       } else {
-        decoded_mir->append(StringPrintf(" v%d = vect%d", mir->dalvikInsn.vA, mir->dalvikInsn.vB));
+        decoded_mir->append(StringPrintf(" v%d = vect%d (extr_idx:%d)", mir->dalvikInsn.vA,
+                                         mir->dalvikInsn.vB, mir->dalvikInsn.arg[0]));
       }
       FillTypeSizeString(mir->dalvikInsn.vC, decoded_mir);
       break;
@@ -1681,9 +1690,9 @@
 void MIRGraph::SSATransformationStart() {
   DCHECK(temp_scoped_alloc_.get() == nullptr);
   temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
-  temp_bit_vector_size_ = GetNumOfCodeAndTempVRs();
-  temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
-      temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapRegisterV);
+  temp_.ssa.num_vregs = GetNumOfCodeAndTempVRs();
+  temp_.ssa.work_live_vregs = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), temp_.ssa.num_vregs, false, kBitMapRegisterV);
 }
 
 void MIRGraph::SSATransformationEnd() {
@@ -1692,9 +1701,9 @@
     VerifyDataflow();
   }
 
-  temp_bit_vector_size_ = 0u;
-  temp_bit_vector_ = nullptr;
-  temp_bit_matrix_ = nullptr;  // Def block matrix.
+  temp_.ssa.num_vregs = 0u;
+  temp_.ssa.work_live_vregs = nullptr;
+  temp_.ssa.def_block_matrix = nullptr;
   DCHECK(temp_scoped_alloc_.get() != nullptr);
   temp_scoped_alloc_.reset();
 
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index a1d24e2..1a18841 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -145,9 +145,8 @@
 #define INVALID_OFFSET (0xDEADF00FU)
 
 #define MIR_IGNORE_NULL_CHECK           (1 << kMIRIgnoreNullCheck)
-#define MIR_NULL_CHECK_ONLY             (1 << kMIRNullCheckOnly)
 #define MIR_IGNORE_RANGE_CHECK          (1 << kMIRIgnoreRangeCheck)
-#define MIR_RANGE_CHECK_ONLY            (1 << kMIRRangeCheckOnly)
+#define MIR_STORE_NON_NULL_VALUE        (1 << kMIRStoreNonNullValue)
 #define MIR_CLASS_IS_INITIALIZED        (1 << kMIRClassIsInitialized)
 #define MIR_CLASS_IS_IN_DEX_CACHE       (1 << kMIRClassIsInDexCache)
 #define MIR_IGNORE_DIV_ZERO_CHECK       (1 << kMirIgnoreDivZeroCheck)
@@ -444,6 +443,11 @@
   void UpdatePredecessor(BasicBlockId old_pred, BasicBlockId new_pred);
 
   /**
+   * @brief Return first non-Phi insn.
+   */
+  MIR* GetFirstNonPhiInsn();
+
+  /**
    * @brief Used to obtain the next MIR that follows unconditionally.
    * @details The implementation does not guarantee that a MIR does not
    * follow even if this method returns nullptr.
@@ -661,13 +665,29 @@
   void DoCacheFieldLoweringInfo();
 
   const MirIFieldLoweringInfo& GetIFieldLoweringInfo(MIR* mir) const {
-    DCHECK_LT(mir->meta.ifield_lowering_info, ifield_lowering_infos_.size());
-    return ifield_lowering_infos_[mir->meta.ifield_lowering_info];
+    return GetIFieldLoweringInfo(mir->meta.ifield_lowering_info);
+  }
+
+  const MirIFieldLoweringInfo& GetIFieldLoweringInfo(uint32_t lowering_info) const {
+    DCHECK_LT(lowering_info, ifield_lowering_infos_.size());
+    return ifield_lowering_infos_[lowering_info];
+  }
+
+  size_t GetIFieldLoweringInfoCount() const {
+    return ifield_lowering_infos_.size();
   }
 
   const MirSFieldLoweringInfo& GetSFieldLoweringInfo(MIR* mir) const {
-    DCHECK_LT(mir->meta.sfield_lowering_info, sfield_lowering_infos_.size());
-    return sfield_lowering_infos_[mir->meta.sfield_lowering_info];
+    return GetSFieldLoweringInfo(mir->meta.sfield_lowering_info);
+  }
+
+  const MirSFieldLoweringInfo& GetSFieldLoweringInfo(uint32_t lowering_info) const {
+    DCHECK_LT(lowering_info, sfield_lowering_infos_.size());
+    return sfield_lowering_infos_[lowering_info];
+  }
+
+  size_t GetSFieldLoweringInfoCount() const {
+    return sfield_lowering_infos_.size();
   }
 
   void DoCacheMethodLoweringInfo();
@@ -1035,6 +1055,21 @@
   bool ApplyGlobalValueNumberingGate();
   bool ApplyGlobalValueNumbering(BasicBlock* bb);
   void ApplyGlobalValueNumberingEnd();
+
+  uint16_t GetGvnIFieldId(MIR* mir) const {
+    DCHECK(IsInstructionIGetOrIPut(mir->dalvikInsn.opcode));
+    DCHECK_LT(mir->meta.ifield_lowering_info, ifield_lowering_infos_.size());
+    DCHECK(temp_.gvn.ifield_ids_ != nullptr);
+    return temp_.gvn.ifield_ids_[mir->meta.ifield_lowering_info];
+  }
+
+  uint16_t GetGvnSFieldId(MIR* mir) const {
+    DCHECK(IsInstructionSGetOrSPut(mir->dalvikInsn.opcode));
+    DCHECK_LT(mir->meta.sfield_lowering_info, sfield_lowering_infos_.size());
+    DCHECK(temp_.gvn.sfield_ids_ != nullptr);
+    return temp_.gvn.sfield_ids_[mir->meta.sfield_lowering_info];
+  }
+
   /*
    * Type inference handling helpers.  Because Dalvik's bytecode is not fully typed,
    * we have to do some work to figure out the sreg type.  For some operations it is
@@ -1270,15 +1305,40 @@
   size_t max_nested_loops_;
   int* i_dom_list_;
   std::unique_ptr<ScopedArenaAllocator> temp_scoped_alloc_;
-  uint16_t* temp_insn_data_;
-  uint32_t temp_bit_vector_size_;
-  ArenaBitVector* temp_bit_vector_;
-  // temp_bit_matrix_ used as one of
-  //   - def_block_matrix: original num registers x num_blocks_,
-  //   - ending_null_check_matrix: num_blocks_ x original num registers,
-  //   - ending_clinit_check_matrix: num_blocks_ x unique class count.
-  ArenaBitVector** temp_bit_matrix_;
-  std::unique_ptr<GlobalValueNumbering> temp_gvn_;
+  // Union of temporaries used by different passes (the members share storage).
+  union {
+    // Class init check elimination.
+    struct {
+      size_t num_class_bits;  // 2 bits per class: class initialized and class in dex cache.
+      ArenaBitVector* work_classes_to_check;  // Scratch bits for the block being processed.
+      ArenaBitVector** ending_classes_to_check_matrix;  // num_blocks_ x num_class_bits.
+      uint16_t* indexes;  // Class index by insn offset/2; 0xffff if none.
+    } cice;
+    // Null check elimination.
+    struct {
+      size_t num_vregs;  // GetNumOfCodeAndTempVRs() at the start of the pass.
+      ArenaBitVector* work_vregs_to_check;  // Scratch bits for the block being processed.
+      ArenaBitVector** ending_vregs_to_check_matrix;  // num_blocks_ x num_vregs.
+    } nce;
+    // Special method inlining.
+    struct {
+      size_t num_indexes;  // method_lowering_infos_.size() at the start of the pass.
+      ArenaBitVector* processed_indexes;  // Bit set once a method index has been resolved.
+      uint16_t* lowering_infos;  // Field lowering info index per processed method index.
+    } smi;
+    // SSA transformation.
+    struct {
+      size_t num_vregs;
+      ArenaBitVector* work_live_vregs;
+      ArenaBitVector** def_block_matrix;  // num_vregs x num_blocks_.
+    } ssa;
+    // Global value numbering.
+    struct {
+      GlobalValueNumbering* gvn;  // Allocated from temp_scoped_alloc_.
+      uint16_t* ifield_ids_;  // Part of GVN/LVN but cached here for LVN to avoid recalculation.
+      uint16_t* sfield_ids_;  // Likewise, built from sfield_lowering_infos_.
+    } gvn;
+  } temp_;
   static const int kInvalidEntry = -1;
   ArenaVector<BasicBlock*> block_list_;
   ArenaBitVector* try_block_addr_;
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index a0ad213..55f2abc 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -19,6 +19,7 @@
 #include "dataflow_iterator-inl.h"
 #include "global_value_numbering.h"
 #include "local_value_numbering.h"
+#include "mir_field_info.h"
 #include "quick/dex_file_method_inliner.h"
 #include "quick/dex_file_to_method_inliner_map.h"
 #include "stack.h"
@@ -217,10 +218,6 @@
 static_assert(arraysize(kIfCcZConditionCodes) == Instruction::IF_LEZ - Instruction::IF_EQZ + 1,
               "if_ccz_ccodes_size1");
 
-static constexpr bool IsInstructionIfCcZ(Instruction::Code opcode) {
-  return Instruction::IF_EQZ <= opcode && opcode <= Instruction::IF_LEZ;
-}
-
 static constexpr ConditionCode ConditionCodeForIfCcZ(Instruction::Code opcode) {
   return kIfCcZConditionCodes[opcode - Instruction::IF_EQZ];
 }
@@ -402,6 +399,28 @@
   return compiler_temp;
 }
 
+static bool EvaluateBranch(Instruction::Code opcode, int32_t src1, int32_t src2) {
+  // Decides whether a conditional branch on known operand values is taken.
+  // The compare-with-zero forms (IF_xxZ) look only at src1; src2 is unused for them.
+  switch (opcode) {
+    case Instruction::IF_EQ: return src1 == src2;
+    case Instruction::IF_NE: return src1 != src2;
+    case Instruction::IF_LT: return src1 < src2;
+    case Instruction::IF_GE: return src1 >= src2;
+    case Instruction::IF_GT: return src1 > src2;
+    case Instruction::IF_LE: return src1 <= src2;
+    case Instruction::IF_EQZ: return src1 == 0;
+    case Instruction::IF_NEZ: return src1 != 0;
+    case Instruction::IF_LTZ: return src1 < 0;
+    case Instruction::IF_GEZ: return src1 >= 0;
+    case Instruction::IF_GTZ: return src1 > 0;
+    case Instruction::IF_LEZ: return src1 <= 0;
+    default:
+      LOG(FATAL) << "Unexpected opcode " << opcode;
+      UNREACHABLE();
+  }
+}
+
 /* Do some MIR-level extended basic block optimizations */
 bool MIRGraph::BasicBlockOpt(BasicBlock* bb) {
   if (bb->block_type == kDead) {
@@ -427,6 +446,46 @@
       // Look for interesting opcodes, skip otherwise
       Instruction::Code opcode = mir->dalvikInsn.opcode;
       switch (opcode) {
+        case Instruction::IF_EQ:
+        case Instruction::IF_NE:
+        case Instruction::IF_LT:
+        case Instruction::IF_GE:
+        case Instruction::IF_GT:
+        case Instruction::IF_LE:
+          if (!IsConst(mir->ssa_rep->uses[1])) {
+            break;
+          }
+          FALLTHROUGH_INTENDED;
+        case Instruction::IF_EQZ:
+        case Instruction::IF_NEZ:
+        case Instruction::IF_LTZ:
+        case Instruction::IF_GEZ:
+        case Instruction::IF_GTZ:
+        case Instruction::IF_LEZ:
+          // Result known at compile time?
+          if (IsConst(mir->ssa_rep->uses[0])) {
+            int32_t rhs = (mir->ssa_rep->num_uses == 2) ? ConstantValue(mir->ssa_rep->uses[1]) : 0;
+            bool is_taken = EvaluateBranch(opcode, ConstantValue(mir->ssa_rep->uses[0]), rhs);
+            BasicBlockId edge_to_kill = is_taken ? bb->fall_through : bb->taken;
+            if (is_taken) {
+              // Replace with GOTO.
+              bb->fall_through = NullBasicBlockId;
+              mir->dalvikInsn.opcode = Instruction::GOTO;
+              mir->dalvikInsn.vA =
+                  IsInstructionIfCc(opcode) ? mir->dalvikInsn.vC : mir->dalvikInsn.vB;
+            } else {
+              // Make NOP.
+              bb->taken = NullBasicBlockId;
+              mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+            }
+            mir->ssa_rep->num_uses = 0;
+            BasicBlock* successor_to_unlink = GetBasicBlock(edge_to_kill);
+            successor_to_unlink->ErasePredecessor(bb->id);
+            if (successor_to_unlink->predecessors.empty()) {
+              successor_to_unlink->KillUnreachable(this);
+            }
+          }
+          break;
         case Instruction::CMPL_FLOAT:
         case Instruction::CMPL_DOUBLE:
         case Instruction::CMPG_FLOAT:
@@ -480,29 +539,25 @@
             }
           }
           break;
-        case Instruction::GOTO:
-        case Instruction::GOTO_16:
-        case Instruction::GOTO_32:
-        case Instruction::IF_EQ:
-        case Instruction::IF_NE:
-        case Instruction::IF_LT:
-        case Instruction::IF_GE:
-        case Instruction::IF_GT:
-        case Instruction::IF_LE:
-        case Instruction::IF_EQZ:
-        case Instruction::IF_NEZ:
-        case Instruction::IF_LTZ:
-        case Instruction::IF_GEZ:
-        case Instruction::IF_GTZ:
-        case Instruction::IF_LEZ:
-          // If we've got a backwards branch to return, no need to suspend check.
-          if ((IsBackedge(bb, bb->taken) && GetBasicBlock(bb->taken)->dominates_return) ||
-              (IsBackedge(bb, bb->fall_through) &&
-                          GetBasicBlock(bb->fall_through)->dominates_return)) {
-            mir->optimization_flags |= MIR_IGNORE_SUSPEND_CHECK;
-            if (cu_->verbose) {
-              LOG(INFO) << "Suppressed suspend check on branch to return at 0x" << std::hex
-                        << mir->offset;
+        case Instruction::RETURN_VOID:
+        case Instruction::RETURN:
+        case Instruction::RETURN_WIDE:
+        case Instruction::RETURN_OBJECT:
+          if (bb->GetFirstNonPhiInsn() == mir) {
+            // This is a simple return BB. Eliminate suspend checks on predecessor back-edges.
+            for (BasicBlockId pred_id : bb->predecessors) {
+              BasicBlock* pred_bb = GetBasicBlock(pred_id);
+              DCHECK(pred_bb != nullptr);
+              if (IsBackedge(pred_bb, bb->id) && pred_bb->last_mir_insn != nullptr &&
+                  (IsInstructionIfCc(pred_bb->last_mir_insn->dalvikInsn.opcode) ||
+                   IsInstructionIfCcZ(pred_bb->last_mir_insn->dalvikInsn.opcode) ||
+                   IsInstructionGoto(pred_bb->last_mir_insn->dalvikInsn.opcode))) {
+                pred_bb->last_mir_insn->optimization_flags |= MIR_IGNORE_SUSPEND_CHECK;
+                if (cu_->verbose) {
+                  LOG(INFO) << "Suppressed suspend check on branch to return at 0x" << std::hex
+                            << pred_bb->last_mir_insn->offset;
+                }
+              }
             }
           }
           break;
@@ -801,17 +856,18 @@
     BasicBlock* bb_next = GetBasicBlock(bb->fall_through);
     DCHECK(!bb_next->catch_entry);
     DCHECK_EQ(bb_next->predecessors.size(), 1u);
-    // Overwrite the kMirOpCheck insn with the paired opcode.
+
+    // Now move instructions from bb_next to bb. First, sanity-check that the throwing
+    // instruction paired with kMirOpCheck is the first instruction in bb_next.
     DCHECK_EQ(bb_next->first_mir_insn, throw_insn);
-    *bb->last_mir_insn = *throw_insn;
-    // And grab the rest of the instructions from bb_next.
-    bb->last_mir_insn = bb_next->last_mir_insn;
-    throw_insn->next = nullptr;
-    bb_next->last_mir_insn = throw_insn;
-    // Mark acquired instructions as belonging to bb.
-    for (MIR* insn = mir; insn != nullptr; insn = insn->next) {
-      insn->bb = bb->id;
-    }
+    // Now move all instructions (throw instruction to last one) from bb_next to bb.
+    MIR* last_to_move = bb_next->last_mir_insn;
+    bb_next->RemoveMIRList(throw_insn, last_to_move);
+    bb->InsertMIRListAfter(bb->last_mir_insn, throw_insn, last_to_move);
+    // The kMirOpCheck instruction is not needed anymore.
+    mir->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+    bb->RemoveMIR(mir);
+
     // Before we overwrite successors, remove their predecessor links to bb.
     bb_next->ErasePredecessor(bb->id);
     if (bb->taken != NullBasicBlockId) {
@@ -891,12 +947,12 @@
 
   DCHECK(temp_scoped_alloc_.get() == nullptr);
   temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
-  temp_bit_vector_size_ = GetNumOfCodeVRs();
-  temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
-      temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapNullCheck);
-  temp_bit_matrix_ = static_cast<ArenaBitVector**>(
+  temp_.nce.num_vregs = GetNumOfCodeAndTempVRs();
+  temp_.nce.work_vregs_to_check = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), temp_.nce.num_vregs, false, kBitMapNullCheck);
+  temp_.nce.ending_vregs_to_check_matrix = static_cast<ArenaBitVector**>(
       temp_scoped_alloc_->Alloc(sizeof(ArenaBitVector*) * GetNumBlocks(), kArenaAllocMisc));
-  std::fill_n(temp_bit_matrix_, GetNumBlocks(), nullptr);
+  std::fill_n(temp_.nce.ending_vregs_to_check_matrix, GetNumBlocks(), nullptr);
 
   // reset MIR_MARK
   AllNodesIterator iter(this);
@@ -919,7 +975,7 @@
     return false;
   }
 
-  ArenaBitVector* vregs_to_check = temp_bit_vector_;
+  ArenaBitVector* vregs_to_check = temp_.nce.work_vregs_to_check;
   /*
    * Set initial state. Catch blocks don't need any special treatment.
    */
@@ -940,7 +996,7 @@
     // Starting state is union of all incoming arcs.
     bool copied_first = false;
     for (BasicBlockId pred_id : bb->predecessors) {
-      if (temp_bit_matrix_[pred_id] == nullptr) {
+      if (temp_.nce.ending_vregs_to_check_matrix[pred_id] == nullptr) {
         continue;
       }
       BasicBlock* pred_bb = GetBasicBlock(pred_id);
@@ -962,9 +1018,9 @@
       }
       if (!copied_first) {
         copied_first = true;
-        vregs_to_check->Copy(temp_bit_matrix_[pred_id]);
+        vregs_to_check->Copy(temp_.nce.ending_vregs_to_check_matrix[pred_id]);
       } else {
-        vregs_to_check->Union(temp_bit_matrix_[pred_id]);
+        vregs_to_check->Union(temp_.nce.ending_vregs_to_check_matrix[pred_id]);
       }
       if (null_check_insn != nullptr) {
         vregs_to_check->ClearBit(null_check_insn->dalvikInsn.vA);
@@ -979,7 +1035,10 @@
   for (MIR* mir = bb->first_mir_insn; mir != NULL; mir = mir->next) {
     uint64_t df_attributes = GetDataFlowAttributes(mir);
 
-    DCHECK_EQ(df_attributes & DF_NULL_TRANSFER_N, 0u);  // No Phis yet.
+    if ((df_attributes & DF_NULL_TRANSFER_N) != 0u) {
+      // The algorithm was written in a phi agnostic way.
+      continue;
+    }
 
     // Might need a null check?
     if (df_attributes & DF_HAS_NULL_CHKS) {
@@ -1057,27 +1116,27 @@
 
   // Did anything change?
   bool nce_changed = false;
-  ArenaBitVector* old_ending_ssa_regs_to_check = temp_bit_matrix_[bb->id];
+  ArenaBitVector* old_ending_ssa_regs_to_check = temp_.nce.ending_vregs_to_check_matrix[bb->id];
   if (old_ending_ssa_regs_to_check == nullptr) {
     DCHECK(temp_scoped_alloc_.get() != nullptr);
     nce_changed = vregs_to_check->GetHighestBitSet() != -1;
-    temp_bit_matrix_[bb->id] = vregs_to_check;
+    temp_.nce.ending_vregs_to_check_matrix[bb->id] = vregs_to_check;
     // Create a new vregs_to_check for next BB.
-    temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
-        temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapNullCheck);
+    temp_.nce.work_vregs_to_check = new (temp_scoped_alloc_.get()) ArenaBitVector(
+        temp_scoped_alloc_.get(), temp_.nce.num_vregs, false, kBitMapNullCheck);
   } else if (!vregs_to_check->SameBitsSet(old_ending_ssa_regs_to_check)) {
     nce_changed = true;
-    temp_bit_matrix_[bb->id] = vregs_to_check;
-    temp_bit_vector_ = old_ending_ssa_regs_to_check;  // Reuse for vregs_to_check for next BB.
+    temp_.nce.ending_vregs_to_check_matrix[bb->id] = vregs_to_check;
+    temp_.nce.work_vregs_to_check = old_ending_ssa_regs_to_check;  // Reuse for next BB.
   }
   return nce_changed;
 }
 
 void MIRGraph::EliminateNullChecksEnd() {
   // Clean up temporaries.
-  temp_bit_vector_size_ = 0u;
-  temp_bit_vector_ = nullptr;
-  temp_bit_matrix_ = nullptr;
+  temp_.nce.num_vregs = 0u;
+  temp_.nce.work_vregs_to_check = nullptr;
+  temp_.nce.ending_vregs_to_check_matrix = nullptr;
   DCHECK(temp_scoped_alloc_.get() != nullptr);
   temp_scoped_alloc_.reset();
 
@@ -1124,9 +1183,9 @@
 
   // Each insn we use here has at least 2 code units, offset/2 will be a unique index.
   const size_t end = (GetNumDalvikInsns() + 1u) / 2u;
-  temp_insn_data_ = static_cast<uint16_t*>(
-      temp_scoped_alloc_->Alloc(end * sizeof(*temp_insn_data_), kArenaAllocGrowableArray));
-  std::fill_n(temp_insn_data_, end, 0xffffu);
+  temp_.cice.indexes = static_cast<uint16_t*>(
+      temp_scoped_alloc_->Alloc(end * sizeof(*temp_.cice.indexes), kArenaAllocGrowableArray));
+  std::fill_n(temp_.cice.indexes, end, 0xffffu);
 
   uint32_t unique_class_count = 0u;
   {
@@ -1159,8 +1218,7 @@
     for (BasicBlock* bb = iter.Next(); bb != nullptr; bb = iter.Next()) {
       if (bb->block_type == kDalvikByteCode) {
         for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
-          if (mir->dalvikInsn.opcode >= Instruction::SGET &&
-              mir->dalvikInsn.opcode <= Instruction::SPUT_SHORT) {
+          if (IsInstructionSGetOrSPut(mir->dalvikInsn.opcode)) {
             const MirSFieldLoweringInfo& field_info = GetSFieldLoweringInfo(mir);
             if (!field_info.IsReferrersClass()) {
               DCHECK_LT(class_to_index_map.size(), 0xffffu);
@@ -1173,11 +1231,10 @@
                   static_cast<uint16_t>(class_to_index_map.size())
               };
               uint16_t index = class_to_index_map.insert(entry).first->index;
-              // Using offset/2 for index into temp_insn_data_.
-              temp_insn_data_[mir->offset / 2u] = index;
+              // Using offset/2 for index into temp_.cice.indexes.
+              temp_.cice.indexes[mir->offset / 2u] = index;
             }
-          } else if (mir->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-              mir->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE) {
+          } else if (IsInstructionInvokeStatic(mir->dalvikInsn.opcode)) {
             const MirMethodLoweringInfo& method_info = GetMethodLoweringInfo(mir);
             DCHECK(method_info.IsStatic());
             if (method_info.FastPath() && !method_info.IsReferrersClass()) {
@@ -1187,8 +1244,8 @@
                   static_cast<uint16_t>(class_to_index_map.size())
               };
               uint16_t index = class_to_index_map.insert(entry).first->index;
-              // Using offset/2 for index into temp_insn_data_.
-              temp_insn_data_[mir->offset / 2u] = index;
+              // Using offset/2 for index into temp_.cice.indexes.
+              temp_.cice.indexes[mir->offset / 2u] = index;
             }
           }
         }
@@ -1199,19 +1256,19 @@
 
   if (unique_class_count == 0u) {
     // All SGET/SPUTs refer to initialized classes. Nothing to do.
-    temp_insn_data_ = nullptr;
+    temp_.cice.indexes = nullptr;
     temp_scoped_alloc_.reset();
     return false;
   }
 
   // 2 bits for each class: is class initialized, is class in dex cache.
-  temp_bit_vector_size_ = 2u * unique_class_count;
-  temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
-      temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapClInitCheck);
-  temp_bit_matrix_ = static_cast<ArenaBitVector**>(
+  temp_.cice.num_class_bits = 2u * unique_class_count;
+  temp_.cice.work_classes_to_check = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), temp_.cice.num_class_bits, false, kBitMapClInitCheck);
+  temp_.cice.ending_classes_to_check_matrix = static_cast<ArenaBitVector**>(
       temp_scoped_alloc_->Alloc(sizeof(ArenaBitVector*) * GetNumBlocks(), kArenaAllocMisc));
-  std::fill_n(temp_bit_matrix_, GetNumBlocks(), nullptr);
-  DCHECK_GT(temp_bit_vector_size_, 0u);
+  std::fill_n(temp_.cice.ending_classes_to_check_matrix, GetNumBlocks(), nullptr);
+  DCHECK_GT(temp_.cice.num_class_bits, 0u);
   return true;
 }
 
@@ -1229,22 +1286,22 @@
   /*
    * Set initial state.  Catch blocks don't need any special treatment.
    */
-  ArenaBitVector* classes_to_check = temp_bit_vector_;
+  ArenaBitVector* classes_to_check = temp_.cice.work_classes_to_check;
   DCHECK(classes_to_check != nullptr);
   if (bb->block_type == kEntryBlock) {
-    classes_to_check->SetInitialBits(temp_bit_vector_size_);
+    classes_to_check->SetInitialBits(temp_.cice.num_class_bits);
   } else {
     // Starting state is union of all incoming arcs.
     bool copied_first = false;
     for (BasicBlockId pred_id : bb->predecessors) {
-      if (temp_bit_matrix_[pred_id] == nullptr) {
+      if (temp_.cice.ending_classes_to_check_matrix[pred_id] == nullptr) {
         continue;
       }
       if (!copied_first) {
         copied_first = true;
-        classes_to_check->Copy(temp_bit_matrix_[pred_id]);
+        classes_to_check->Copy(temp_.cice.ending_classes_to_check_matrix[pred_id]);
       } else {
-        classes_to_check->Union(temp_bit_matrix_[pred_id]);
+        classes_to_check->Union(temp_.cice.ending_classes_to_check_matrix[pred_id]);
       }
     }
     DCHECK(copied_first);  // At least one predecessor must have been processed before this bb.
@@ -1253,7 +1310,7 @@
 
   // Walk through the instruction in the block, updating as necessary
   for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
-    uint16_t index = temp_insn_data_[mir->offset / 2u];
+    uint16_t index = temp_.cice.indexes[mir->offset / 2u];
     if (index != 0xffffu) {
       bool check_initialization = false;
       bool check_dex_cache = false;
@@ -1261,12 +1318,10 @@
       // NOTE: index != 0xffff does not guarantee that this is an SGET/SPUT/INVOKE_STATIC.
       // Dex instructions with width 1 can have the same offset/2.
 
-      if (mir->dalvikInsn.opcode >= Instruction::SGET &&
-          mir->dalvikInsn.opcode <= Instruction::SPUT_SHORT) {
+      if (IsInstructionSGetOrSPut(mir->dalvikInsn.opcode)) {
         check_initialization = true;
         check_dex_cache = true;
-      } else if (mir->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-               mir->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE) {
+      } else if (IsInstructionInvokeStatic(mir->dalvikInsn.opcode)) {
         check_initialization = true;
         // NOTE: INVOKE_STATIC doesn't guarantee that the type will be in the dex cache.
       }
@@ -1299,29 +1354,29 @@
 
   // Did anything change?
   bool changed = false;
-  ArenaBitVector* old_ending_classes_to_check = temp_bit_matrix_[bb->id];
+  ArenaBitVector* old_ending_classes_to_check = temp_.cice.ending_classes_to_check_matrix[bb->id];
   if (old_ending_classes_to_check == nullptr) {
     DCHECK(temp_scoped_alloc_.get() != nullptr);
     changed = classes_to_check->GetHighestBitSet() != -1;
-    temp_bit_matrix_[bb->id] = classes_to_check;
+    temp_.cice.ending_classes_to_check_matrix[bb->id] = classes_to_check;
     // Create a new classes_to_check for next BB.
-    temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
-        temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapClInitCheck);
+    temp_.cice.work_classes_to_check = new (temp_scoped_alloc_.get()) ArenaBitVector(
+        temp_scoped_alloc_.get(), temp_.cice.num_class_bits, false, kBitMapClInitCheck);
   } else if (!classes_to_check->Equal(old_ending_classes_to_check)) {
     changed = true;
-    temp_bit_matrix_[bb->id] = classes_to_check;
-    temp_bit_vector_ = old_ending_classes_to_check;  // Reuse for classes_to_check for next BB.
+    temp_.cice.ending_classes_to_check_matrix[bb->id] = classes_to_check;
+    temp_.cice.work_classes_to_check = old_ending_classes_to_check;  // Reuse for next BB.
   }
   return changed;
 }
 
 void MIRGraph::EliminateClassInitChecksEnd() {
   // Clean up temporaries.
-  temp_bit_vector_size_ = 0u;
-  temp_bit_vector_ = nullptr;
-  temp_bit_matrix_ = nullptr;
-  DCHECK(temp_insn_data_ != nullptr);
-  temp_insn_data_ = nullptr;
+  temp_.cice.num_class_bits = 0u;
+  temp_.cice.work_classes_to_check = nullptr;
+  temp_.cice.ending_classes_to_check_matrix = nullptr;
+  DCHECK(temp_.cice.indexes != nullptr);
+  temp_.cice.indexes = nullptr;
   DCHECK(temp_scoped_alloc_.get() != nullptr);
   temp_scoped_alloc_.reset();
 }
@@ -1333,39 +1388,43 @@
 
   DCHECK(temp_scoped_alloc_ == nullptr);
   temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
-  DCHECK(temp_gvn_ == nullptr);
-  temp_gvn_.reset(
-      new (temp_scoped_alloc_.get()) GlobalValueNumbering(cu_, temp_scoped_alloc_.get(),
-                                                          GlobalValueNumbering::kModeGvn));
+  temp_.gvn.ifield_ids_ =
+      GlobalValueNumbering::PrepareGvnFieldIds(temp_scoped_alloc_.get(), ifield_lowering_infos_);
+  temp_.gvn.sfield_ids_ =
+      GlobalValueNumbering::PrepareGvnFieldIds(temp_scoped_alloc_.get(), sfield_lowering_infos_);
+  DCHECK(temp_.gvn.gvn == nullptr);
+  temp_.gvn.gvn = new (temp_scoped_alloc_.get()) GlobalValueNumbering(
+      cu_, temp_scoped_alloc_.get(), GlobalValueNumbering::kModeGvn);
   return true;
 }
 
 bool MIRGraph::ApplyGlobalValueNumbering(BasicBlock* bb) {
-  DCHECK(temp_gvn_ != nullptr);
-  LocalValueNumbering* lvn = temp_gvn_->PrepareBasicBlock(bb);
+  DCHECK(temp_.gvn.gvn != nullptr);
+  LocalValueNumbering* lvn = temp_.gvn.gvn->PrepareBasicBlock(bb);
   if (lvn != nullptr) {
     for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
       lvn->GetValueNumber(mir);
     }
   }
-  bool change = (lvn != nullptr) && temp_gvn_->FinishBasicBlock(bb);
+  bool change = (lvn != nullptr) && temp_.gvn.gvn->FinishBasicBlock(bb);
   return change;
 }
 
 void MIRGraph::ApplyGlobalValueNumberingEnd() {
   // Perform modifications.
-  if (temp_gvn_->Good()) {
+  DCHECK(temp_.gvn.gvn != nullptr);
+  if (temp_.gvn.gvn->Good()) {
     if (max_nested_loops_ != 0u) {
-      temp_gvn_->StartPostProcessing();
+      temp_.gvn.gvn->StartPostProcessing();
       TopologicalSortIterator iter(this);
       for (BasicBlock* bb = iter.Next(); bb != nullptr; bb = iter.Next()) {
         ScopedArenaAllocator allocator(&cu_->arena_stack);  // Reclaim memory after each LVN.
-        LocalValueNumbering* lvn = temp_gvn_->PrepareBasicBlock(bb, &allocator);
+        LocalValueNumbering* lvn = temp_.gvn.gvn->PrepareBasicBlock(bb, &allocator);
         if (lvn != nullptr) {
           for (MIR* mir = bb->first_mir_insn; mir != nullptr; mir = mir->next) {
             lvn->GetValueNumber(mir);
           }
-          bool change = temp_gvn_->FinishBasicBlock(bb);
+          bool change = temp_.gvn.gvn->FinishBasicBlock(bb);
           DCHECK(!change) << PrettyMethod(cu_->method_idx, *cu_->dex_file);
         }
       }
@@ -1376,16 +1435,18 @@
     LOG(WARNING) << "GVN failed for " << PrettyMethod(cu_->method_idx, *cu_->dex_file);
   }
 
-  DCHECK(temp_gvn_ != nullptr);
-  temp_gvn_.reset();
+  delete temp_.gvn.gvn;
+  temp_.gvn.gvn = nullptr;
+  temp_.gvn.ifield_ids_ = nullptr;
+  temp_.gvn.sfield_ids_ = nullptr;
   DCHECK(temp_scoped_alloc_ != nullptr);
   temp_scoped_alloc_.reset();
 }
 
 void MIRGraph::ComputeInlineIFieldLoweringInfo(uint16_t field_idx, MIR* invoke, MIR* iget_or_iput) {
   uint32_t method_index = invoke->meta.method_lowering_info;
-  if (temp_bit_vector_->IsBitSet(method_index)) {
-    iget_or_iput->meta.ifield_lowering_info = temp_insn_data_[method_index];
+  if (temp_.smi.processed_indexes->IsBitSet(method_index)) {
+    iget_or_iput->meta.ifield_lowering_info = temp_.smi.lowering_infos[method_index];
     DCHECK_EQ(field_idx, GetIFieldLoweringInfo(iget_or_iput).FieldIndex());
     return;
   }
@@ -1396,14 +1457,15 @@
       cu_, cu_->class_loader, cu_->class_linker, *target.dex_file,
       nullptr /* code_item not used */, 0u /* class_def_idx not used */, target.dex_method_index,
       0u /* access_flags not used */, nullptr /* verified_method not used */);
-  MirIFieldLoweringInfo inlined_field_info(field_idx);
+  DexMemAccessType type = IGetOrIPutMemAccessType(iget_or_iput->dalvikInsn.opcode);
+  MirIFieldLoweringInfo inlined_field_info(field_idx, type);
   MirIFieldLoweringInfo::Resolve(cu_->compiler_driver, &inlined_unit, &inlined_field_info, 1u);
   DCHECK(inlined_field_info.IsResolved());
 
   uint32_t field_info_index = ifield_lowering_infos_.size();
   ifield_lowering_infos_.push_back(inlined_field_info);
-  temp_bit_vector_->SetBit(method_index);
-  temp_insn_data_[method_index] = field_info_index;
+  temp_.smi.processed_indexes->SetBit(method_index);
+  temp_.smi.lowering_infos[method_index] = field_info_index;
   iget_or_iput->meta.ifield_lowering_info = field_info_index;
 }
 
@@ -1425,12 +1487,12 @@
 
   DCHECK(temp_scoped_alloc_.get() == nullptr);
   temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
-  temp_bit_vector_size_ = method_lowering_infos_.size();
-  temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
-      temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapMisc);
-  temp_bit_vector_->ClearAllBits();
-  temp_insn_data_ = static_cast<uint16_t*>(temp_scoped_alloc_->Alloc(
-      temp_bit_vector_size_ * sizeof(*temp_insn_data_), kArenaAllocGrowableArray));
+  temp_.smi.num_indexes = method_lowering_infos_.size();
+  temp_.smi.processed_indexes = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), temp_.smi.num_indexes, false, kBitMapMisc);
+  temp_.smi.processed_indexes->ClearAllBits();
+  temp_.smi.lowering_infos = static_cast<uint16_t*>(temp_scoped_alloc_->Alloc(
+      temp_.smi.num_indexes * sizeof(*temp_.smi.lowering_infos), kArenaAllocGrowableArray));
 }
 
 void MIRGraph::InlineSpecialMethods(BasicBlock* bb) {
@@ -1477,10 +1539,12 @@
 }
 
 void MIRGraph::InlineSpecialMethodsEnd() {
-  DCHECK(temp_insn_data_ != nullptr);
-  temp_insn_data_ = nullptr;
-  DCHECK(temp_bit_vector_ != nullptr);
-  temp_bit_vector_ = nullptr;
+  // Clean up temporaries.
+  DCHECK(temp_.smi.lowering_infos != nullptr);
+  temp_.smi.lowering_infos = nullptr;
+  temp_.smi.num_indexes = 0u;
+  DCHECK(temp_.smi.processed_indexes != nullptr);
+  temp_.smi.processed_indexes = nullptr;
   DCHECK(temp_scoped_alloc_.get() != nullptr);
   temp_scoped_alloc_.reset();
 }
@@ -1542,6 +1606,14 @@
 }
 
 void MIRGraph::BasicBlockOptimization() {
+  if ((cu_->disable_opt & (1 << kLocalValueNumbering)) == 0) {
+    temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
+    temp_.gvn.ifield_ids_ =
+        GlobalValueNumbering::PrepareGvnFieldIds(temp_scoped_alloc_.get(), ifield_lowering_infos_);
+    temp_.gvn.sfield_ids_ =
+        GlobalValueNumbering::PrepareGvnFieldIds(temp_scoped_alloc_.get(), sfield_lowering_infos_);
+  }
+
   if ((cu_->disable_opt & (1 << kSuppressExceptionEdges)) != 0) {
     ClearAllVisitedFlags();
     PreOrderDfsIterator iter2(this);
@@ -1558,6 +1630,11 @@
       BasicBlockOpt(bb);
     }
   }
+
+  // Clean up after LVN.
+  temp_.gvn.ifield_ids_ = nullptr;
+  temp_.gvn.sfield_ids_ = nullptr;
+  temp_scoped_alloc_.reset();
 }
 
 }  // namespace art
diff --git a/compiler/dex/mir_optimization_test.cc b/compiler/dex/mir_optimization_test.cc
index 8874faf..c794cc6 100644
--- a/compiler/dex/mir_optimization_test.cc
+++ b/compiler/dex/mir_optimization_test.cc
@@ -19,6 +19,7 @@
 #include "compiler_internals.h"
 #include "dataflow_iterator.h"
 #include "dataflow_iterator-inl.h"
+#include "dex/mir_field_info.h"
 #include "gtest/gtest.h"
 
 namespace art {
@@ -236,15 +237,17 @@
       ASSERT_LT(def->bbid, cu_.mir_graph->block_list_.size());
       BasicBlock* bb = cu_.mir_graph->block_list_[def->bbid];
       bb->AppendMIR(mir);
-      if (def->opcode >= Instruction::SGET && def->opcode <= Instruction::SPUT_SHORT) {
-        ASSERT_LT(def->field_or_method_info, cu_.mir_graph->sfield_lowering_infos_.size());
-        mir->meta.sfield_lowering_info = def->field_or_method_info;
-      } else if (def->opcode >= Instruction::IGET && def->opcode <= Instruction::IPUT_SHORT) {
+      if (IsInstructionIGetOrIPut(def->opcode)) {
         ASSERT_LT(def->field_or_method_info, cu_.mir_graph->ifield_lowering_infos_.size());
         mir->meta.ifield_lowering_info = def->field_or_method_info;
-      } else if (def->opcode >= Instruction::INVOKE_VIRTUAL &&
-          def->opcode < Instruction::INVOKE_INTERFACE_RANGE &&
-          def->opcode != Instruction::RETURN_VOID_BARRIER) {
+        ASSERT_EQ(cu_.mir_graph->ifield_lowering_infos_[def->field_or_method_info].MemAccessType(),
+                  IGetOrIPutMemAccessType(def->opcode));
+      } else if (IsInstructionSGetOrSPut(def->opcode)) {
+        ASSERT_LT(def->field_or_method_info, cu_.mir_graph->sfield_lowering_infos_.size());
+        mir->meta.sfield_lowering_info = def->field_or_method_info;
+        ASSERT_EQ(cu_.mir_graph->sfield_lowering_infos_[def->field_or_method_info].MemAccessType(),
+                  SGetOrSPutMemAccessType(def->opcode));
+      } else if (IsInstructionInvoke(def->opcode)) {
         ASSERT_LT(def->field_or_method_info, cu_.mir_graph->method_lowering_infos_.size());
         mir->meta.method_lowering_info = def->field_or_method_info;
       }
@@ -294,6 +297,7 @@
     uintptr_t declaring_dex_file;
     uint16_t declaring_class_idx;
     uint16_t declaring_field_idx;
+    DexMemAccessType type;
   };
 
   void DoPrepareSFields(const SFieldDef* defs, size_t count) {
@@ -301,12 +305,12 @@
     cu_.mir_graph->sfield_lowering_infos_.reserve(count);
     for (size_t i = 0u; i != count; ++i) {
       const SFieldDef* def = &defs[i];
-      MirSFieldLoweringInfo field_info(def->field_idx);
+      MirSFieldLoweringInfo field_info(def->field_idx, def->type);
       if (def->declaring_dex_file != 0u) {
         field_info.declaring_dex_file_ = reinterpret_cast<const DexFile*>(def->declaring_dex_file);
         field_info.declaring_class_idx_ = def->declaring_class_idx;
         field_info.declaring_field_idx_ = def->declaring_field_idx;
-        field_info.flags_ = MirSFieldLoweringInfo::kFlagIsStatic;
+        // We don't care about the volatile flag in these tests.
       }
       ASSERT_EQ(def->declaring_dex_file != 0u, field_info.IsResolved());
       ASSERT_FALSE(field_info.IsClassInitialized());
@@ -343,6 +347,7 @@
     uintptr_t declaring_dex_file;
     uint16_t declaring_class_idx;
     uint16_t declaring_field_idx;
+    DexMemAccessType type;
   };
 
   void DoPrepareIFields(const IFieldDef* defs, size_t count) {
@@ -350,11 +355,12 @@
     cu_.mir_graph->ifield_lowering_infos_.reserve(count);
     for (size_t i = 0u; i != count; ++i) {
       const IFieldDef* def = &defs[i];
-      MirIFieldLoweringInfo field_info(def->field_idx);
+      MirIFieldLoweringInfo field_info(def->field_idx, def->type);
       if (def->declaring_dex_file != 0u) {
         field_info.declaring_dex_file_ = reinterpret_cast<const DexFile*>(def->declaring_dex_file);
         field_info.declaring_class_idx_ = def->declaring_class_idx;
         field_info.declaring_field_idx_ = def->declaring_field_idx;
+        // We don't care about the volatile flag in these tests.
       }
       ASSERT_EQ(def->declaring_dex_file != 0u, field_info.IsResolved());
       cu_.mir_graph->ifield_lowering_infos_.push_back(field_info);
@@ -393,12 +399,12 @@
 
 TEST_F(ClassInitCheckEliminationTest, SingleBlock) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
-      { 2u, 1u, 2u, 2u },
-      { 3u, 1u, 3u, 3u },  // Same declaring class as sfield[4].
-      { 4u, 1u, 3u, 4u },  // Same declaring class as sfield[3].
-      { 5u, 0u, 0u, 0u },  // Unresolved.
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 2u, 2u, kDexMemAccessWord },
+      { 3u, 1u, 3u, 3u, kDexMemAccessWord },  // Same declaring class as sfield[4].
+      { 4u, 1u, 3u, 4u, kDexMemAccessWord },  // Same declaring class as sfield[3].
+      { 5u, 0u, 0u, 0u, kDexMemAccessWord },  // Unresolved.
   };
   static const MIRDef mirs[] = {
       DEF_SGET_SPUT(3u, Instruction::SPUT, 0u, 5u),  // Unresolved.
@@ -432,9 +438,9 @@
 
 TEST_F(ClassInitCheckEliminationTest, SingleBlockWithInvokes) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
-      { 2u, 1u, 2u, 2u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 2u, 2u, kDexMemAccessWord },
   };
   static const MethodDef methods[] = {
       { 0u, 1u, 0u, 0u, kStatic, kStatic, false, false },
@@ -473,17 +479,17 @@
 
 TEST_F(ClassInitCheckEliminationTest, Diamond) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
-      { 2u, 1u, 2u, 2u },
-      { 3u, 1u, 3u, 3u },
-      { 4u, 1u, 4u, 4u },
-      { 5u, 1u, 5u, 5u },
-      { 6u, 1u, 6u, 6u },
-      { 7u, 1u, 7u, 7u },
-      { 8u, 1u, 8u, 8u },  // Same declaring class as sfield[9].
-      { 9u, 1u, 8u, 9u },  // Same declaring class as sfield[8].
-      { 10u, 0u, 0u, 0u },  // Unresolved.
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 2u, 2u, kDexMemAccessWord },
+      { 3u, 1u, 3u, 3u, kDexMemAccessWord },
+      { 4u, 1u, 4u, 4u, kDexMemAccessWord },
+      { 5u, 1u, 5u, 5u, kDexMemAccessWord },
+      { 6u, 1u, 6u, 6u, kDexMemAccessWord },
+      { 7u, 1u, 7u, 7u, kDexMemAccessWord },
+      { 8u, 1u, 8u, 8u, kDexMemAccessWord },   // Same declaring class as sfield[9].
+      { 9u, 1u, 8u, 9u, kDexMemAccessWord },   // Same declaring class as sfield[8].
+      { 10u, 0u, 0u, 0u, kDexMemAccessWord },  // Unresolved.
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -539,11 +545,11 @@
 
 TEST_F(ClassInitCheckEliminationTest, DiamondWithInvokes) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
-      { 2u, 1u, 2u, 2u },
-      { 3u, 1u, 3u, 3u },
-      { 4u, 1u, 4u, 4u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 2u, 2u, kDexMemAccessWord },
+      { 3u, 1u, 3u, 3u, kDexMemAccessWord },
+      { 4u, 1u, 4u, 4u, kDexMemAccessWord },
   };
   static const MethodDef methods[] = {
       { 0u, 1u, 0u, 0u, kStatic, kStatic, false, false },
@@ -600,9 +606,9 @@
 
 TEST_F(ClassInitCheckEliminationTest, Loop) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
-      { 2u, 1u, 2u, 2u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 2u, 2u, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_SGET_SPUT(3u, Instruction::SGET, 0u, 0u),
@@ -631,7 +637,7 @@
 
 TEST_F(ClassInitCheckEliminationTest, LoopWithInvokes) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
   };
   static const MethodDef methods[] = {
       { 0u, 1u, 0u, 0u, kStatic, kStatic, false, false },
@@ -671,10 +677,10 @@
 
 TEST_F(ClassInitCheckEliminationTest, Catch) {
   static const SFieldDef sfields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
-      { 2u, 1u, 2u, 2u },
-      { 3u, 1u, 3u, 3u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 2u, 2u, kDexMemAccessWord },
+      { 3u, 1u, 3u, 3u, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_SGET_SPUT(3u, Instruction::SGET, 0u, 0u),  // Before the exception edge.
@@ -707,9 +713,9 @@
 
 TEST_F(NullCheckEliminationTest, SingleBlock) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 0u, 1u },
-      { 2u, 1u, 0u, 2u },  // Object.
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 0u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 0u, 2u, kDexMemAccessObject },
   };
   static const MIRDef mirs[] = {
       DEF_IGET_IPUT(3u, Instruction::IGET_OBJECT, 0u, 100u, 2u),
@@ -768,9 +774,9 @@
 
 TEST_F(NullCheckEliminationTest, Diamond) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 0u, 1u },
-      { 2u, 1u, 0u, 2u },  // int[].
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 0u, 1u, kDexMemAccessWord },
+      { 2u, 1u, 0u, 2u, kDexMemAccessObject },  // int[].
   };
   static const MIRDef mirs[] = {
       // NOTE: MIRs here are ordered by unique tests. They will be put into appropriate blocks.
@@ -816,8 +822,8 @@
 
 TEST_F(NullCheckEliminationTest, Loop) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_IGET_IPUT(3u, Instruction::IGET, 0u, 100u, 0u),
@@ -846,8 +852,8 @@
 
 TEST_F(NullCheckEliminationTest, Catch) {
   static const IFieldDef ifields[] = {
-      { 0u, 1u, 0u, 0u },
-      { 1u, 1u, 1u, 1u },
+      { 0u, 1u, 0u, 0u, kDexMemAccessWord },
+      { 1u, 1u, 1u, 1u, kDexMemAccessWord },
   };
   static const MIRDef mirs[] = {
       DEF_IGET_IPUT(3u, Instruction::IGET, 0u, 100u, 0u),  // Before the exception edge.
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index 4e20d76..76ec9df 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -671,12 +671,12 @@
                  kFmtBitBlt, 15, 0, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
                  IS_UNARY_OP | REG_DEF_SP | REG_USE_SP | REG_DEF_LIST0
-                 | IS_LOAD | NEEDS_FIXUP, "pop", "<!0R>", 4, kFixupPushPop),
+                 | IS_LOAD, "pop", "<!0R>", 4, kFixupNone),
     ENCODING_MAP(kThumb2Push,          0xe92d0000,
                  kFmtBitBlt, 15, 0, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
                  IS_UNARY_OP | REG_DEF_SP | REG_USE_SP | REG_USE_LIST0
-                 | IS_STORE | NEEDS_FIXUP, "push", "<!0R>", 4, kFixupPushPop),
+                 | IS_STORE, "push", "<!0R>", 4, kFixupNone),
     ENCODING_MAP(kThumb2CmpRI8M, 0xf1b00f00,
                  kFmtBitBlt, 19, 16, kFmtModImm, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
@@ -1396,31 +1396,6 @@
           }
           break;
         }
-        case kFixupPushPop: {
-          if (__builtin_popcount(lir->operands[0]) == 1) {
-            /*
-             * The standard push/pop multiple instruction
-             * requires at least two registers in the list.
-             * If we've got just one, switch to the single-reg
-             * encoding.
-             */
-            lir->opcode = (lir->opcode == kThumb2Push) ? kThumb2Push1 :
-                kThumb2Pop1;
-            int reg = 0;
-            while (lir->operands[0]) {
-              if (lir->operands[0] & 0x1) {
-                break;
-              } else {
-                reg++;
-                lir->operands[0] >>= 1;
-              }
-            }
-            lir->operands[0] = reg;
-            // This won't change again, don't bother unlinking, just reset fixup kind
-            lir->flags.fixup = kFixupNone;
-          }
-          break;
-        }
         case kFixupCondBranch: {
           LIR *target_lir = lir->target;
           int32_t delta = 0;
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index a3b4df3..99b2166 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -288,18 +288,12 @@
   StoreValue(rl_dest, rl_result);
 }
 
-/*
- * Mark garbage collection card. Skip if the value we're storing is null.
- */
-void ArmMir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) {
+void ArmMir2Lir::UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) {
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
-  LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
   LoadWordDisp(rs_rARM_SELF, Thread::CardTableOffset<4>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
-  LIR* target = NewLIR0(kPseudoTargetLabel);
-  branch_over->target = target;
   FreeTemp(reg_card_base);
   FreeTemp(reg_card_no);
 }
@@ -353,7 +347,20 @@
     }
   }
   /* Spill core callee saves */
-  NewLIR1(kThumb2Push, core_spill_mask_);
+  if (core_spill_mask_ == 0u) {
+    // Nothing to spill.
+  } else if ((core_spill_mask_ & ~(0xffu | (1u << rs_rARM_LR.GetRegNum()))) == 0u) {
+    // Spilling only low regs and/or LR, use 16-bit PUSH with the LR bit moved down to bit 8.
+    constexpr int lr_bit_shift = rs_rARM_LR.GetRegNum() - 8;
+    NewLIR1(kThumbPush,
+            (core_spill_mask_ & ~(1u << rs_rARM_LR.GetRegNum())) |
+            ((core_spill_mask_ & (1u << rs_rARM_LR.GetRegNum())) >> lr_bit_shift));
+  } else if (IsPowerOfTwo(core_spill_mask_)) {
+    // kThumb2Push cannot be used to spill a single register.
+    NewLIR1(kThumb2Push1, CTZ(core_spill_mask_));
+  } else {
+    NewLIR1(kThumb2Push, core_spill_mask_);
+  }
   /* Need to spill any FP regs? */
   if (num_fp_spills_) {
     /*
@@ -450,13 +457,26 @@
   if (num_fp_spills_) {
     NewLIR1(kThumb2VPopCS, num_fp_spills_);
   }
-  if (core_spill_mask_ & (1 << rs_rARM_LR.GetRegNum())) {
+  if ((core_spill_mask_ & (1 << rs_rARM_LR.GetRegNum())) != 0) {
     /* Unspill rARM_LR to rARM_PC */
     core_spill_mask_ &= ~(1 << rs_rARM_LR.GetRegNum());
     core_spill_mask_ |= (1 << rs_rARM_PC.GetRegNum());
   }
-  NewLIR1(kThumb2Pop, core_spill_mask_);
-  if (!(core_spill_mask_ & (1 << rs_rARM_PC.GetRegNum()))) {
+  if (core_spill_mask_ == 0u) {
+    // Nothing to unspill.
+  } else if ((core_spill_mask_ & ~(0xffu | (1u << rs_rARM_PC.GetRegNum()))) == 0u) {
+    // Unspilling only low regs and/or PC, use 16-bit POP with the PC bit moved down to bit 8.
+    constexpr int pc_bit_shift = rs_rARM_PC.GetRegNum() - 8;
+    NewLIR1(kThumbPop,
+            (core_spill_mask_ & ~(1u << rs_rARM_PC.GetRegNum())) |
+            ((core_spill_mask_ & (1u << rs_rARM_PC.GetRegNum())) >> pc_bit_shift));
+  } else if (IsPowerOfTwo(core_spill_mask_)) {
+    // kThumb2Pop cannot be used to unspill a single register.
+    NewLIR1(kThumb2Pop1, CTZ(core_spill_mask_));
+  } else {
+    NewLIR1(kThumb2Pop, core_spill_mask_);
+  }
+  if ((core_spill_mask_ & (1 << rs_rARM_PC.GetRegNum())) == 0) {
     /* We didn't pop to rARM_PC, so must do a bv rARM_LR */
     NewLIR1(kThumbBx, rs_rARM_LR.GetReg());
   }
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index d235199..e8d0c32 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -106,7 +106,9 @@
                        OpSize size, VolatileKind is_volatile) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
                           OpSize size) OVERRIDE;
-    void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg);
+
+    /// @copydoc Mir2Lir::UnconditionallyMarkGCCard(RegStorage)
+    void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) OVERRIDE;
 
     // Required for target - register utilities.
     RegStorage TargetReg(SpecialTargetRegister reg) OVERRIDE;
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 57544b5..4aedbaf 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -16,6 +16,7 @@
 
 /* This file contains codegen for the Thumb2 ISA. */
 
+#include "arch/instruction_set_features.h"
 #include "arm_lir.h"
 #include "codegen_arm.h"
 #include "dex/quick/mir_to_lir-inl.h"
@@ -871,7 +872,7 @@
 
   if (is_object && !mir_graph_->IsConstantNullRef(rl_new_value)) {
     // Mark card for object assuming new value is stored.
-    MarkGCCard(rl_new_value.reg, rl_object.reg);
+    MarkGCCard(0, rl_new_value.reg, rl_object.reg);
   }
 
   RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
@@ -1119,7 +1120,9 @@
 }
 
 bool ArmMir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) {
-#if ANDROID_SMP != 0
+  if (!cu_->GetInstructionSetFeatures()->IsSmp()) {
+    return false;
+  }
   // Start off with using the last LIR as the barrier. If it is not enough, then we will generate one.
   LIR* barrier = last_lir_insn_;
 
@@ -1149,9 +1152,6 @@
   DCHECK(!barrier->flags.use_def_invalid);
   barrier->u.m.def_mask = &kEncodeAll;
   return ret;
-#else
-  return false;
-#endif
 }
 
 void ArmMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
@@ -1162,6 +1162,7 @@
   // Check for destructive overlap
   if (rl_result.reg.GetLowReg() == rl_src.reg.GetHighReg()) {
     RegStorage t_reg = AllocTemp();
+    OpRegCopy(t_reg, rl_result.reg.GetLow());
     OpRegRegReg(kOpSub, rl_result.reg.GetLow(), z_reg, rl_src.reg.GetLow());
     OpRegRegReg(kOpSbc, rl_result.reg.GetHigh(), z_reg, t_reg);
     FreeTemp(t_reg);
@@ -1471,7 +1472,7 @@
     FreeTemp(reg_ptr);
   }
   if (card_mark) {
-    MarkGCCard(rl_src.reg, rl_array.reg);
+    MarkGCCard(opt_flags, rl_src.reg, rl_array.reg);
   }
 }
 
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 0c7812b..7168b9f 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -213,10 +213,7 @@
 }
 
 LIR* ArmMir2Lir::OpCondBranch(ConditionCode cc, LIR* target) {
-  // This is kThumb2BCond instead of kThumbBCond for performance reasons. The assembly
-  // time required for a new pass after kThumbBCond is fixed up to kThumb2BCond is
-  // substantial.
-  LIR* branch = NewLIR2(kThumb2BCond, 0 /* offset to be patched */,
+  LIR* branch = NewLIR2(kThumbBCond, 0 /* offset to be patched */,
                         ArmConditionEncoding(cc));
   branch->target = target;
   return branch;
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index 973279e..f8a7310 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -320,6 +320,7 @@
   kA64Mul3rrr,       // mul [00011011000] rm[20-16] [011111] rn[9-5] rd[4-0].
   kA64Msub4rrrr,     // msub[s0011011000] rm[20-16] [1] ra[14-10] rn[9-5] rd[4-0].
   kA64Neg3rro,       // neg alias of "sub arg0, rzr, arg1, arg2".
+  kA64Nop0,          // nop alias of "hint #0" [11010101000000110010000000011111].
   kA64Orr3Rrl,       // orr [s01100100] N[22] imm_r[21-16] imm_s[15-10] rn[9-5] rd[4-0].
   kA64Orr4rrro,      // orr [s0101010] shift[23-22] [0] rm[20-16] imm_6[15-10] rn[9-5] rd[4-0].
   kA64Ret,           // ret [11010110010111110000001111000000].
@@ -332,7 +333,7 @@
   kA64Scvtf2fw,      // scvtf  [000111100s100010000000] rn[9-5] rd[4-0].
   kA64Scvtf2fx,      // scvtf  [100111100s100010000000] rn[9-5] rd[4-0].
   kA64Sdiv3rrr,      // sdiv[s0011010110] rm[20-16] [000011] rn[9-5] rd[4-0].
-  kA64Smaddl4xwwx,   // smaddl [10011011001] rm[20-16] [0] ra[14-10] rn[9-5] rd[4-0].
+  kA64Smull3xww,     // smull [10011011001] rm[20-16] [011111] rn[9-5] rd[4-0].
   kA64Smulh3xxx,     // smulh [10011011010] rm[20-16] [011111] rn[9-5] rd[4-0].
   kA64Stp4ffXD,      // stp [0s10110100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
   kA64Stp4rrXD,      // stp [s010100100] imm_7[21-15] rt2[14-10] rn[9-5] rt[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 9cdabf1..cab11cc 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 
-#include "arm64_lir.h"
 #include "codegen_arm64.h"
+
+#include "arch/arm64/instruction_set_features_arm64.h"
+#include "arm64_lir.h"
 #include "dex/quick/mir_to_lir-inl.h"
 
 namespace art {
@@ -468,13 +470,17 @@
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
                  "mul", "!0r, !1r, !2r", kFixupNone),
     ENCODING_MAP(WIDE(kA64Msub4rrrr), SF_VARIANTS(0x1b008000),
-                 kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 14, 10,
-                 kFmtRegR, 20, 16, IS_QUAD_OP | REG_DEF0_USE123,
-                 "msub", "!0r, !1r, !3r, !2r", kFixupNone),
+                 kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 20, 16,
+                 kFmtRegR, 14, 10, IS_QUAD_OP | REG_DEF0_USE123 | NEEDS_FIXUP,
+                 "msub", "!0r, !1r, !2r, !3r", kFixupA53Erratum835769),
     ENCODING_MAP(WIDE(kA64Neg3rro), SF_VARIANTS(0x4b0003e0),
                  kFmtRegR, 4, 0, kFmtRegR, 20, 16, kFmtShift, -1, -1,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1,
                  "neg", "!0r, !1r!2o", kFixupNone),
+    ENCODING_MAP(kA64Nop0, NO_VARIANTS(0xd503201f),
+                 kFmtUnused, -1, -1, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
+                 kFmtUnused, -1, -1, NO_OPERAND,
+                 "nop", "", kFixupNone),
     ENCODING_MAP(WIDE(kA64Orr3Rrl), SF_VARIANTS(0x32000000),
                  kFmtRegROrSp, 4, 0, kFmtRegR, 9, 5, kFmtBitBlt, 22, 10,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1,
@@ -523,10 +529,10 @@
                  kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 20, 16,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
                  "sdiv", "!0r, !1r, !2r", kFixupNone),
-    ENCODING_MAP(WIDE(kA64Smaddl4xwwx), NO_VARIANTS(0x9b200000),
+    ENCODING_MAP(kA64Smull3xww, NO_VARIANTS(0x9b207c00),
                  kFmtRegX, 4, 0, kFmtRegW, 9, 5, kFmtRegW, 20, 16,
-                 kFmtRegX, 14, 10, IS_QUAD_OP | REG_DEF0_USE123,
-                 "smaddl", "!0x, !1w, !2w, !3x", kFixupNone),
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
+                 "smull", "!0x, !1w, !2w", kFixupNone),
     ENCODING_MAP(kA64Smulh3xxx, NO_VARIANTS(0x9b407c00),
                  kFmtRegX, 4, 0, kFmtRegX, 9, 5, kFmtRegX, 20, 16,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
@@ -988,6 +994,39 @@
           lir->operands[1] = delta;
           break;
         }
+        case kFixupA53Erratum835769:
+          // Avoid emitting code that could trigger Cortex A53's erratum 835769.
+          // This fixup should be carried out for all multiply-accumulate instructions: madd, msub,
+          // smaddl, smsubl, umaddl and umsubl.
+          if (cu_->GetInstructionSetFeatures()->AsArm64InstructionSetFeatures()
+              ->NeedFixCortexA53_835769()) {
+            // Check that this is a 64-bit multiply-accumulate.
+            if (IS_WIDE(lir->opcode)) {
+              // Find the closest preceding LIR that actually emits an instruction. Pseudo ops
+              // and nop'ed LIRs emit nothing, and a pseudo opcode must not index EncodingMap.
+              LIR* prev_insn = lir->prev;
+              while (prev_insn != nullptr &&
+                     (prev_insn->flags.is_nop || IsPseudoLirOp(prev_insn->opcode))) {
+                prev_insn = prev_insn->prev;
+              }
+              // With no preceding instruction there is nothing that could trigger the erratum.
+              uint64_t prev_insn_flags =
+                  (prev_insn != nullptr) ? EncodingMap[UNWIDE(prev_insn->opcode)].flags : 0u;
+              // Check that the instruction preceding the multiply-accumulate is a load or store.
+              if ((prev_insn_flags & IS_LOAD) != 0 || (prev_insn_flags & IS_STORE) != 0) {
+                // Insert a NOP between the load/store and the multiply-accumulate.
+                LIR* new_lir = RawLIR(lir->dalvik_offset, kA64Nop0, 0, 0, 0, 0, 0, NULL);
+                new_lir->offset = lir->offset;
+                new_lir->flags.fixup = kFixupNone;
+                new_lir->flags.size = EncodingMap[kA64Nop0].size;
+                InsertLIRBefore(lir, new_lir);
+                lir->offset += new_lir->flags.size;
+                offset_adjustment += new_lir->flags.size;
+                res = kRetryAll;
+              }
+            }
+          }
+          break;
         default:
           LOG(FATAL) << "Unexpected case " << lir->flags.fixup;
       }
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index 3e5b7bf..089e4b6 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -251,20 +251,14 @@
   StoreValue(rl_dest, rl_result);
 }
 
-/*
- * Mark garbage collection card. Skip if the value we're storing is null.
- */
-void Arm64Mir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) {
+void Arm64Mir2Lir::UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) {
   RegStorage reg_card_base = AllocTempWide();
   RegStorage reg_card_no = AllocTempWide();  // Needs to be wide as addr is ref=64b
-  LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
   LoadWordDisp(rs_xSELF, Thread::CardTableOffset<8>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   // TODO(Arm64): generate "strb wB, [xB, wC, uxtw]" rather than "strb wB, [xB, xC]"?
   StoreBaseIndexed(reg_card_base, reg_card_no, As32BitReg(reg_card_base),
                    0, kUnsignedByte);
-  LIR* target = NewLIR0(kPseudoTargetLabel);
-  branch_over->target = target;
   FreeTemp(reg_card_base);
   FreeTemp(reg_card_no);
 }
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 5182a89..5e10f80 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -94,7 +94,10 @@
   LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
                         OpSize size) OVERRIDE;
   LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale) OVERRIDE;
-  void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) OVERRIDE;
+
+  /// @copydoc Mir2Lir::UnconditionallyMarkGCCard(RegStorage)
+  void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) OVERRIDE;
+
   LIR* OpCmpMemImmBranch(ConditionCode cond, RegStorage temp_reg, RegStorage base_reg,
                          int offset, int check_value, LIR* target, LIR** compare) OVERRIDE;
 
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 8a5a58c..57e67d5 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -16,6 +16,7 @@
 
 /* This file contains codegen for the Thumb2 ISA. */
 
+#include "arch/instruction_set_features.h"
 #include "arm64_lir.h"
 #include "codegen_arm64.h"
 #include "dex/quick/mir_to_lir-inl.h"
@@ -427,8 +428,7 @@
   rl_src = LoadValue(rl_src, kCoreReg);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage r_long_mul = AllocTemp();
-  NewLIR4(kA64Smaddl4xwwx, As64BitReg(r_long_mul).GetReg(),
-          r_magic.GetReg(), rl_src.reg.GetReg(), rxzr);
+  NewLIR3(kA64Smull3xww, As64BitReg(r_long_mul).GetReg(), r_magic.GetReg(), rl_src.reg.GetReg());
   switch (pattern) {
     case Divide3:
       OpRegRegImm(kOpLsr, As64BitReg(r_long_mul), As64BitReg(r_long_mul), 32);
@@ -483,6 +483,7 @@
       } else {
         reconstructed_imm = base + 1;
       }
+      DCHECK_EQ(reconstructed_imm, magic_table[lit].magic64) << " for literal " << lit;
     }
 
     // Load the magic constant in two instructions.
@@ -648,7 +649,7 @@
     }
     OpRegRegReg(kOpDiv, temp, r_src1, r_src2);
     NewLIR4(kA64Msub4rrrr | wide, rl_result.reg.GetReg(), temp.GetReg(),
-            r_src1.GetReg(), r_src2.GetReg());
+            r_src2.GetReg(), r_src1.GetReg());
     FreeTemp(temp);
   }
   return rl_result;
@@ -758,7 +759,7 @@
 
   if (is_object && !mir_graph_->IsConstantNullRef(rl_new_value)) {
     // Mark card for object assuming new value is stored.
-    MarkGCCard(rl_new_value.reg, rl_object.reg);
+    MarkGCCard(0, rl_new_value.reg, rl_object.reg);
   }
 
   RegStorage r_ptr = AllocTempRef();
@@ -979,7 +980,9 @@
 }
 
 bool Arm64Mir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) {
-#if ANDROID_SMP != 0
+  if (!cu_->GetInstructionSetFeatures()->IsSmp()) {
+    return false;
+  }
   // Start off with using the last LIR as the barrier. If it is not enough, then we will generate one.
   LIR* barrier = last_lir_insn_;
 
@@ -1015,9 +1018,6 @@
   DCHECK(!barrier->flags.use_def_invalid);
   barrier->u.m.def_mask = &kEncodeAll;
   return ret;
-#else
-  return false;
-#endif
 }
 
 void Arm64Mir2Lir::GenIntToLong(RegLocation rl_dest, RegLocation rl_src) {
@@ -1282,7 +1282,7 @@
     FreeTemp(reg_ptr);
   }
   if (card_mark) {
-    MarkGCCard(rl_src.reg, rl_array.reg);
+    MarkGCCard(opt_flags, rl_src.reg, rl_array.reg);
   }
 }
 
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 9403516..58bcee2 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -314,6 +314,19 @@
   }
 }
 
+void Mir2Lir::MarkGCCard(int opt_flags, RegStorage val_reg, RegStorage tgt_addr_reg) {
+  DCHECK(val_reg.Valid());
+  DCHECK_EQ(val_reg.Is64Bit(), cu_->target64);
+  // Unless the store is flagged as non-null, test the value and skip the mark for null.
+  const bool needs_null_test = (opt_flags & MIR_STORE_NON_NULL_VALUE) == 0;
+  LIR* skip_mark = needs_null_test ? OpCmpImmBranch(kCondEq, val_reg, 0, nullptr) : nullptr;
+  UnconditionallyMarkGCCard(tgt_addr_reg);
+  if (needs_null_test) {
+    // The null-value branch lands on a label placed right after the card mark.
+    skip_mark->target = NewLIR0(kPseudoTargetLabel);
+  }
+}
+
 /* Dump instructions and constant pool contents */
 void Mir2Lir::CodegenDump() {
   LOG(INFO) << "Dumping LIR insns for "
@@ -909,28 +922,6 @@
   NewLIR1(kPseudoDalvikByteCodeBoundary, WrapPointer(ArenaStrdup(inst_str)));
 }
 
-bool Mir2Lir::EvaluateBranch(Instruction::Code opcode, int32_t src1, int32_t src2) {
-  bool is_taken;
-  switch (opcode) {
-    case Instruction::IF_EQ: is_taken = (src1 == src2); break;
-    case Instruction::IF_NE: is_taken = (src1 != src2); break;
-    case Instruction::IF_LT: is_taken = (src1 < src2); break;
-    case Instruction::IF_GE: is_taken = (src1 >= src2); break;
-    case Instruction::IF_GT: is_taken = (src1 > src2); break;
-    case Instruction::IF_LE: is_taken = (src1 <= src2); break;
-    case Instruction::IF_EQZ: is_taken = (src1 == 0); break;
-    case Instruction::IF_NEZ: is_taken = (src1 != 0); break;
-    case Instruction::IF_LTZ: is_taken = (src1 < 0); break;
-    case Instruction::IF_GEZ: is_taken = (src1 >= 0); break;
-    case Instruction::IF_GTZ: is_taken = (src1 > 0); break;
-    case Instruction::IF_LEZ: is_taken = (src1 <= 0); break;
-    default:
-      LOG(FATAL) << "Unexpected opcode " << opcode;
-      UNREACHABLE();
-  }
-  return is_taken;
-}
-
 // Convert relation of src1/src2 to src2/src1
 ConditionCode Mir2Lir::FlipComparisonOrder(ConditionCode before) {
   ConditionCode res;
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index e12d305..3039852 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -112,18 +112,18 @@
 uint32_t GetInvokeReg(MIR* invoke, uint32_t arg) {
   DCHECK_LT(arg, invoke->dalvikInsn.vA);
   DCHECK(!MIR::DecodedInstruction::IsPseudoMirOp(invoke->dalvikInsn.opcode));
-  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
-    return invoke->dalvikInsn.vC + arg;  // Non-range invoke.
+  if (IsInvokeInstructionRange(invoke->dalvikInsn.opcode)) {
+    return invoke->dalvikInsn.vC + arg;  // Range invoke.
   } else {
     DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k35c);
-    return invoke->dalvikInsn.arg[arg];  // Range invoke.
+    return invoke->dalvikInsn.arg[arg];  // Non-range invoke.
   }
 }
 
 bool WideArgIsInConsecutiveDalvikRegs(MIR* invoke, uint32_t arg) {
   DCHECK_LT(arg + 1, invoke->dalvikInsn.vA);
   DCHECK(!MIR::DecodedInstruction::IsPseudoMirOp(invoke->dalvikInsn.opcode));
-  return Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc ||
+  return IsInvokeInstructionRange(invoke->dalvikInsn.opcode) ||
       invoke->dalvikInsn.arg[arg + 1u] == invoke->dalvikInsn.arg[arg] + 1u;
 }
 
@@ -573,8 +573,7 @@
     // If the invoke has not been eliminated yet, check now whether we should do it.
     // This is done so that dataflow analysis does not get tripped up seeing nop invoke.
     if (static_cast<int>(invoke->dalvikInsn.opcode) != kMirOpNop) {
-      bool is_static = invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-          invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE;
+      bool is_static = IsInstructionInvokeStatic(invoke->dalvikInsn.opcode);
       if (is_static || (invoke->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0) {
         // No null object register involved here so we can eliminate the invoke.
         invoke->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
@@ -804,9 +803,7 @@
     return !data.is_volatile;
   }
 
-  DCHECK_EQ(data.method_is_static != 0u,
-            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE);
+  DCHECK_EQ(data.method_is_static != 0u, IsInstructionInvokeStatic(invoke->dalvikInsn.opcode));
   bool object_is_this = (data.method_is_static == 0u && data.object_arg == 0u);
   if (!object_is_this) {
     // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
@@ -865,9 +862,7 @@
     return false;
   }
 
-  DCHECK_EQ(data.method_is_static != 0u,
-            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
-            invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE);
+  DCHECK_EQ(data.method_is_static != 0u, IsInstructionInvokeStatic(invoke->dalvikInsn.opcode));
   bool object_is_this = (data.method_is_static == 0u && data.object_arg == 0u);
   if (!object_is_this) {
     // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 98ddc36..774176e 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -322,6 +322,12 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+void Mir2Lir::GenLongToInt(RegLocation rl_dest, RegLocation rl_src) {
+  RegLocation rl_wide = UpdateLocWide(rl_src);  // Refresh the wide source location.
+  RegLocation rl_narrow = NarrowRegLoc(rl_wide);  // Narrow view of the wide location.
+  StoreValue(rl_dest, rl_narrow);
+}
+
 void Mir2Lir::GenIntNarrowing(Instruction::Code opcode, RegLocation rl_dest,
                               RegLocation rl_src) {
   rl_src = LoadValue(rl_src, kCoreReg);
@@ -416,8 +422,8 @@
   // share array alignment with ints (see comment at head of function)
   size_t component_size = sizeof(int32_t);
 
-  // Having a range of 0 is legal
-  if (info->is_range && (elems > 0)) {
+  if (elems > 5) {
+    DCHECK(info->is_range);  // Non-range insn can't encode more than 5 elems.
     /*
      * Bit of ugliness here.  We're going generate a mem copy loop
      * on the register range, but it is possible that some regs
@@ -487,7 +493,11 @@
       OpRegRegImm(kOpAdd, ref_reg, r_dst,
                   -mirror::Array::DataOffset(component_size).Int32Value());
     }
-  } else if (!info->is_range) {
+    FreeTemp(r_idx);
+    FreeTemp(r_dst);
+    FreeTemp(r_src);
+  } else {
+    DCHECK_LE(elems, 5);  // Usually but not necessarily non-range.
     // TUNING: interleave
     for (int i = 0; i < elems; i++) {
       RegLocation rl_arg;
@@ -507,6 +517,15 @@
       }
     }
   }
+  if (elems != 0 && info->args[0].ref) {
+    // If there is at least one potentially non-null value, unconditionally mark the GC card.
+    for (int i = 0; i < elems; i++) {
+      if (!mir_graph_->IsConstantNullRef(info->args[i])) {
+        UnconditionallyMarkGCCard(ref_reg);
+        break;
+      }
+    }
+  }
   if (info->result.location != kLocInvalid) {
     StoreValue(info->result, GetReturn(kRefReg));
   }
@@ -570,6 +589,7 @@
 
 void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, OpSize size) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
+  DCHECK_EQ(SPutMemAccessType(mir->dalvikInsn.opcode), field_info.MemAccessType());
   cu_->compiler_driver->ProcessedStaticField(field_info.FastPut(), field_info.IsReferrersClass());
   if (!SLOW_FIELD_PATH && field_info.FastPut()) {
     DCHECK_GE(field_info.FieldOffset().Int32Value(), 0);
@@ -651,7 +671,7 @@
                     field_info.IsVolatile() ? kVolatile : kNotVolatile);
     }
     if (IsRef(size) && !mir_graph_->IsConstantNullRef(rl_src)) {
-      MarkGCCard(rl_src.reg, r_base);
+      MarkGCCard(mir->optimization_flags, rl_src.reg, r_base);
     }
     FreeTemp(r_base);
   } else {
@@ -688,6 +708,7 @@
 
 void Mir2Lir::GenSget(MIR* mir, RegLocation rl_dest, OpSize size, Primitive::Type type) {
   const MirSFieldLoweringInfo& field_info = mir_graph_->GetSFieldLoweringInfo(mir);
+  DCHECK_EQ(SGetMemAccessType(mir->dalvikInsn.opcode), field_info.MemAccessType());
   cu_->compiler_driver->ProcessedStaticField(field_info.FastGet(), field_info.IsReferrersClass());
 
   if (!SLOW_FIELD_PATH && field_info.FastGet()) {
@@ -826,6 +847,7 @@
 void Mir2Lir::GenIGet(MIR* mir, int opt_flags, OpSize size, Primitive::Type type,
                       RegLocation rl_dest, RegLocation rl_obj) {
   const MirIFieldLoweringInfo& field_info = mir_graph_->GetIFieldLoweringInfo(mir);
+  DCHECK_EQ(IGetMemAccessType(mir->dalvikInsn.opcode), field_info.MemAccessType());
   cu_->compiler_driver->ProcessedInstanceField(field_info.FastGet());
   if (!SLOW_FIELD_PATH && field_info.FastGet()) {
     RegisterClass reg_class = RegClassForFieldLoadStore(size, field_info.IsVolatile());
@@ -899,6 +921,7 @@
 void Mir2Lir::GenIPut(MIR* mir, int opt_flags, OpSize size,
                       RegLocation rl_src, RegLocation rl_obj) {
   const MirIFieldLoweringInfo& field_info = mir_graph_->GetIFieldLoweringInfo(mir);
+  DCHECK_EQ(IPutMemAccessType(mir->dalvikInsn.opcode), field_info.MemAccessType());
   cu_->compiler_driver->ProcessedInstanceField(field_info.FastPut());
   if (!SLOW_FIELD_PATH && field_info.FastPut()) {
     RegisterClass reg_class = RegClassForFieldLoadStore(size, field_info.IsVolatile());
@@ -923,7 +946,7 @@
     }
     MarkPossibleNullPointerExceptionAfter(opt_flags, store);
     if (IsRef(size) && !mir_graph_->IsConstantNullRef(rl_src)) {
-      MarkGCCard(rl_src.reg, rl_obj.reg);
+      MarkGCCard(opt_flags, rl_src.reg, rl_obj.reg);
     }
   } else {
     QuickEntrypointEnum target;
@@ -1051,7 +1074,11 @@
       r_method = TargetReg(kArg2, kRef);
       LoadCurrMethodDirect(r_method);
     }
-    LoadRefDisp(r_method, mirror::ArtMethod::DexCacheStringsOffset().Int32Value(),
+    // Method to declaring class.
+    LoadRefDisp(r_method, mirror::ArtMethod::DeclaringClassOffset().Int32Value(),
+                TargetReg(kArg0, kRef), kNotVolatile);
+    // Declaring class to dex cache strings.
+    LoadRefDisp(TargetReg(kArg0, kRef), mirror::Class::DexCacheStringsOffset().Int32Value(),
                 TargetReg(kArg0, kRef), kNotVolatile);
 
     // Might call out to helper, which will return resolved string in kRet0
@@ -1089,7 +1116,9 @@
     RegLocation rl_method = LoadCurrMethod();
     RegStorage res_reg = AllocTempRef();
     RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
-    LoadRefDisp(rl_method.reg, mirror::ArtMethod::DexCacheStringsOffset().Int32Value(), res_reg,
+    LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), res_reg,
+                kNotVolatile);
+    LoadRefDisp(res_reg, mirror::Class::DexCacheStringsOffset().Int32Value(), res_reg,
                 kNotVolatile);
     LoadRefDisp(res_reg, offset_of_string, rl_result.reg, kNotVolatile);
     StoreValue(rl_dest, rl_result);
@@ -2199,43 +2228,53 @@
 }
 
 void Mir2Lir::GenSmallPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+  BasicBlock* bb = mir_graph_->GetBasicBlock(mir->bb);
+  DCHECK(bb != nullptr);
+  ArenaVector<SuccessorBlockInfo*>::const_iterator succ_bb_iter = bb->successor_blocks.cbegin();
   const uint16_t* table = mir_graph_->GetTable(mir, table_offset);
   const uint16_t entries = table[1];
   // Chained cmp-and-branch.
   const int32_t* as_int32 = reinterpret_cast<const int32_t*>(&table[2]);
   int32_t starting_key = as_int32[0];
-  const int32_t* targets = &as_int32[1];
   rl_src = LoadValue(rl_src, kCoreReg);
   int i = 0;
-  for (; i < entries; i++) {
+  for (; i < entries; ++i, ++succ_bb_iter) {
     if (!InexpensiveConstantInt(starting_key + i, Instruction::Code::IF_EQ)) {
       // Switch to using a temp and add.
       break;
     }
-    BasicBlock* case_block =
-        mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
-    OpCmpImmBranch(kCondEq, rl_src.reg, starting_key + i, &block_label_list_[case_block->id]);
+    SuccessorBlockInfo* successor_block_info = *succ_bb_iter;
+    DCHECK(successor_block_info != nullptr);
+    int case_block_id = successor_block_info->block;
+    DCHECK_EQ(starting_key + i, successor_block_info->key);
+    OpCmpImmBranch(kCondEq, rl_src.reg, starting_key + i, &block_label_list_[case_block_id]);
   }
   if (i < entries) {
     // The rest do not seem to be inexpensive. Try to allocate a temp and use add.
     RegStorage key_temp = AllocTypedTemp(false, kCoreReg, false);
     if (key_temp.Valid()) {
       LoadConstantNoClobber(key_temp, starting_key + i);
-      for (; i < entries - 1; i++) {
-        BasicBlock* case_block =
-            mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
-        OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]);
+      for (; i < entries - 1; ++i, ++succ_bb_iter) {
+        SuccessorBlockInfo* successor_block_info = *succ_bb_iter;
+        DCHECK(successor_block_info != nullptr);
+        int case_block_id = successor_block_info->block;
+        DCHECK_EQ(starting_key + i, successor_block_info->key);
+        OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block_id]);
         OpRegImm(kOpAdd, key_temp, 1);  // Increment key.
       }
-      BasicBlock* case_block =
-          mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
-      OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block->id]);
+      SuccessorBlockInfo* successor_block_info = *succ_bb_iter;
+      DCHECK(successor_block_info != nullptr);
+      int case_block_id = successor_block_info->block;
+      DCHECK_EQ(starting_key + i, successor_block_info->key);
+      OpCmpBranch(kCondEq, rl_src.reg, key_temp, &block_label_list_[case_block_id]);
     } else {
       // No free temp, just finish the old loop.
-      for (; i < entries; i++) {
-        BasicBlock* case_block =
-            mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
-        OpCmpImmBranch(kCondEq, rl_src.reg, starting_key + i, &block_label_list_[case_block->id]);
+      for (; i < entries; ++i, ++succ_bb_iter) {
+        SuccessorBlockInfo* successor_block_info = *succ_bb_iter;
+        DCHECK(successor_block_info != nullptr);
+        int case_block_id = successor_block_info->block;
+        DCHECK_EQ(starting_key + i, successor_block_info->key);
+        OpCmpImmBranch(kCondEq, rl_src.reg, starting_key + i, &block_label_list_[case_block_id]);
       }
     }
   }
@@ -2244,7 +2283,7 @@
 void Mir2Lir::GenPackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
   const uint16_t* table = mir_graph_->GetTable(mir, table_offset);
   if (cu_->verbose) {
-    DumpSparseSwitchTable(table);
+    DumpPackedSwitchTable(table);
   }
 
   const uint16_t entries = table[1];
@@ -2257,18 +2296,20 @@
 }
 
 void Mir2Lir::GenSmallSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
+  BasicBlock* bb = mir_graph_->GetBasicBlock(mir->bb);
+  DCHECK(bb != nullptr);
   const uint16_t* table = mir_graph_->GetTable(mir, table_offset);
   const uint16_t entries = table[1];
   // Chained cmp-and-branch.
-  const int32_t* keys = reinterpret_cast<const int32_t*>(&table[2]);
-  const int32_t* targets = &keys[entries];
   rl_src = LoadValue(rl_src, kCoreReg);
-  for (int i = 0; i < entries; i++) {
-    int key = keys[i];
-    BasicBlock* case_block =
-        mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
-    OpCmpImmBranch(kCondEq, rl_src.reg, key, &block_label_list_[case_block->id]);
+  int i = 0;
+  for (SuccessorBlockInfo* successor_block_info : bb->successor_blocks) {
+    int case_block_id = successor_block_info->block;
+    int key = successor_block_info->key;
+    OpCmpImmBranch(kCondEq, rl_src.reg, key, &block_label_list_[case_block_id]);
+    i++;
   }
+  DCHECK_EQ(i, entries);
 }
 
 void Mir2Lir::GenSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index a7900ae..31b81bf 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1668,7 +1668,7 @@
     GenMemBarrier(kAnyAny);
   }
   if (is_object) {
-    MarkGCCard(rl_value.reg, rl_object.reg);
+    MarkGCCard(0, rl_value.reg, rl_object.reg);
   }
   return true;
 }
diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc
index ed73ef0..3bb81bf 100644
--- a/compiler/dex/quick/mips/call_mips.cc
+++ b/compiler/dex/quick/mips/call_mips.cc
@@ -222,19 +222,13 @@
   StoreValue(rl_dest, rl_result);
 }
 
-/*
- * Mark garbage collection card. Skip if the value we're storing is null.
- */
-void MipsMir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) {
+void MipsMir2Lir::UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) {
   RegStorage reg_card_base = AllocTemp();
   RegStorage reg_card_no = AllocTemp();
-  LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
   // NOTE: native pointer.
   LoadWordDisp(rs_rMIPS_SELF, Thread::CardTableOffset<4>().Int32Value(), reg_card_base);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
-  LIR* target = NewLIR0(kPseudoTargetLabel);
-  branch_over->target = target;
   FreeTemp(reg_card_base);
   FreeTemp(reg_card_no);
 }
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 7e9d80d..e08846c 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -49,7 +49,9 @@
                           OpSize size) OVERRIDE;
     LIR* GenAtomic64Load(RegStorage r_base, int displacement, RegStorage r_dest);
     LIR* GenAtomic64Store(RegStorage r_base, int displacement, RegStorage r_src);
-    void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg);
+
+    /// @copydoc Mir2Lir::UnconditionallyMarkGCCard(RegStorage)
+    void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) OVERRIDE;
 
     // Required for target - register utilities.
     RegStorage Solo64ToPair64(RegStorage reg);
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index fb47238..0778c3b 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -641,7 +641,7 @@
     FreeTemp(reg_ptr);
   }
   if (card_mark) {
-    MarkGCCard(rl_src.reg, rl_array.reg);
+    MarkGCCard(opt_flags, rl_src.reg, rl_array.reg);
   }
 }
 
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index c945f7f..320c0f4 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -201,6 +201,16 @@
     RegStorage reg_arg_low = GetArgMappingToPhysicalReg(in_position);
     RegStorage reg_arg_high = GetArgMappingToPhysicalReg(in_position + 1);
 
+    if (cu_->instruction_set == kX86) {
+      // Can't handle double split between reg & memory.  Flush reg half to memory.
+      if (rl_dest.reg.IsDouble() && (reg_arg_low.Valid() != reg_arg_high.Valid())) {
+        DCHECK(reg_arg_low.Valid());
+        DCHECK(!reg_arg_high.Valid());
+        Store32Disp(TargetPtrReg(kSp), offset, reg_arg_low);
+        reg_arg_low = RegStorage::InvalidReg();
+      }
+    }
+
     if (reg_arg_low.Valid() && reg_arg_high.Valid()) {
       OpRegCopyWide(rl_dest.reg, RegStorage::MakeRegPair(reg_arg_low, reg_arg_high));
     } else if (reg_arg_low.Valid() && !reg_arg_high.Valid()) {
@@ -322,7 +332,7 @@
         kNotVolatile);
   }
   if (IsRef(size)) {
-    MarkGCCard(reg_src, reg_obj);
+    MarkGCCard(0, reg_src, reg_obj);
   }
   return true;
 }
@@ -647,24 +657,12 @@
     case Instruction::IF_GT:
     case Instruction::IF_LE: {
       LIR* taken = &label_list[bb->taken];
-      // Result known at compile time?
-      if (rl_src[0].is_const && rl_src[1].is_const) {
-        bool is_taken = EvaluateBranch(opcode, mir_graph_->ConstantValue(rl_src[0].orig_sreg),
-                                       mir_graph_->ConstantValue(rl_src[1].orig_sreg));
-        BasicBlockId target_id = is_taken ? bb->taken : bb->fall_through;
-        if (mir_graph_->IsBackedge(bb, target_id) &&
-            (kLeafOptimization || !mir_graph_->HasSuspendTestBetween(bb, target_id))) {
-          GenSuspendTest(opt_flags);
-        }
-        OpUnconditionalBranch(&label_list[target_id]);
-      } else {
-        if (mir_graph_->IsBackwardsBranch(bb) &&
-            (kLeafOptimization || !mir_graph_->HasSuspendTestBetween(bb, bb->taken) ||
-             !mir_graph_->HasSuspendTestBetween(bb, bb->fall_through))) {
-          GenSuspendTest(opt_flags);
-        }
-        GenCompareAndBranch(opcode, rl_src[0], rl_src[1], taken);
+      if (mir_graph_->IsBackwardsBranch(bb) &&
+          (kLeafOptimization || !mir_graph_->HasSuspendTestBetween(bb, bb->taken) ||
+           !mir_graph_->HasSuspendTestBetween(bb, bb->fall_through))) {
+        GenSuspendTest(opt_flags);
       }
+      GenCompareAndBranch(opcode, rl_src[0], rl_src[1], taken);
       break;
     }
     case Instruction::IF_EQZ:
@@ -674,23 +672,12 @@
     case Instruction::IF_GTZ:
     case Instruction::IF_LEZ: {
       LIR* taken = &label_list[bb->taken];
-      // Result known at compile time?
-      if (rl_src[0].is_const) {
-        bool is_taken = EvaluateBranch(opcode, mir_graph_->ConstantValue(rl_src[0].orig_sreg), 0);
-        BasicBlockId target_id = is_taken ? bb->taken : bb->fall_through;
-        if (mir_graph_->IsBackedge(bb, target_id) &&
-            (kLeafOptimization || !mir_graph_->HasSuspendTestBetween(bb, target_id))) {
-          GenSuspendTest(opt_flags);
-        }
-        OpUnconditionalBranch(&label_list[target_id]);
-      } else {
-        if (mir_graph_->IsBackwardsBranch(bb) &&
-            (kLeafOptimization || !mir_graph_->HasSuspendTestBetween(bb, bb->taken) ||
-             !mir_graph_->HasSuspendTestBetween(bb, bb->fall_through))) {
-          GenSuspendTest(opt_flags);
-        }
-        GenCompareZeroAndBranch(opcode, rl_src[0], taken);
+      if (mir_graph_->IsBackwardsBranch(bb) &&
+          (kLeafOptimization || !mir_graph_->HasSuspendTestBetween(bb, bb->taken) ||
+           !mir_graph_->HasSuspendTestBetween(bb, bb->fall_through))) {
+        GenSuspendTest(opt_flags);
       }
+      GenCompareZeroAndBranch(opcode, rl_src[0], taken);
       break;
     }
 
@@ -946,9 +933,7 @@
       break;
 
     case Instruction::LONG_TO_INT:
-      rl_src[0] = UpdateLocWide(rl_src[0]);
-      rl_src[0] = NarrowRegLoc(rl_src[0]);
-      StoreValue(rl_dest, rl_src[0]);
+      GenLongToInt(rl_dest, rl_src[0]);
       break;
 
     case Instruction::INT_TO_BYTE:
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 13ebc1e..5d78a6e 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -666,7 +666,6 @@
     void MarkBoundary(DexOffset offset, const char* inst_str);
     void NopLIR(LIR* lir);
     void UnlinkLIR(LIR* lir);
-    bool EvaluateBranch(Instruction::Code opcode, int src1, int src2);
     bool IsInexpensiveConstant(RegLocation rl_src);
     ConditionCode FlipComparisonOrder(ConditionCode before);
     ConditionCode NegateComparison(ConditionCode before);
@@ -680,7 +679,7 @@
     int AssignSwitchTablesOffset(CodeOffset offset);
     int AssignFillArrayDataOffset(CodeOffset offset);
     virtual LIR* InsertCaseLabel(DexOffset vaddr, int keyVal);
-    void MarkPackedCaseLabels(Mir2Lir::SwitchTable* tab_rec);
+    virtual void MarkPackedCaseLabels(Mir2Lir::SwitchTable* tab_rec);
     void MarkSparseCaseLabels(Mir2Lir::SwitchTable* tab_rec);
 
     // Handle bookkeeping to convert a wide RegLocation to a narrow RegLocation.  No code generated.
@@ -811,6 +810,7 @@
                              LIR* taken);
     void GenCompareZeroAndBranch(Instruction::Code opcode, RegLocation rl_src, LIR* taken);
     virtual void GenIntToLong(RegLocation rl_dest, RegLocation rl_src);
+    virtual void GenLongToInt(RegLocation rl_dest, RegLocation rl_src);
     void GenIntNarrowing(Instruction::Code opcode, RegLocation rl_dest,
                          RegLocation rl_src);
     void GenNewArray(uint32_t type_idx, RegLocation rl_dest,
@@ -1071,6 +1071,14 @@
     // Update LIR for verbose listings.
     void UpdateLIROffsets();
 
+    /**
+     * @brief Mark a garbage collection card. Skip if the stored value is null.
+     * @param opt_flags the optimization flags which may indicate that the value is non-null.
+     * @param val_reg the register holding the stored value to check against null.
+     * @param tgt_addr_reg the address of the object or array where the value was stored.
+     */
+    void MarkGCCard(int opt_flags, RegStorage val_reg, RegStorage tgt_addr_reg);
+
     /*
      * @brief Load the address of the dex method into the register.
      * @param target_method The MethodReference of the method to be invoked.
@@ -1139,7 +1147,12 @@
                                OpSize size, VolatileKind is_volatile) = 0;
     virtual LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src,
                                   int scale, OpSize size) = 0;
-    virtual void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) = 0;
+
+    /**
+     * @brief Unconditionally mark a garbage collection card.
+     * @param tgt_addr_reg the address of the object or array where the value was stored.
+     */
+    virtual void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) = 0;
 
     // Required for target - register utilities.
 
diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc
index a54c55f..8d4cb3c 100644
--- a/compiler/dex/quick/quick_compiler.cc
+++ b/compiler/dex/quick/quick_compiler.cc
@@ -613,7 +613,8 @@
 }
 
 uintptr_t QuickCompiler::GetEntryPointOf(mirror::ArtMethod* method) const {
-  return reinterpret_cast<uintptr_t>(method->GetEntryPointFromQuickCompiledCode());
+  return reinterpret_cast<uintptr_t>(method->GetEntryPointFromQuickCompiledCodePtrSize(
+      InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet())));
 }
 
 bool QuickCompiler::WriteElf(art::File* file,
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 3933b21..84d68d2 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -424,15 +424,15 @@
   { kX86PextrbRRI, kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" },
   { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" },
   { kX86PextrdRRI, kRegRegImmStore, IS_TERTIARY_OP | REG_DEF0  | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" },
-  { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrbMRI", "[!0r+!1d],!2r,!3d" },
-  { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrwMRI", "[!0r+!1d],!2r,!3d" },
-  { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrdMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextrbMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x15, 0, 0, 1, false }, "PextrwMRI", "[!0r+!1d],!2r,!3d" },
+  { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP     | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextrdMRI", "[!0r+!1d],!2r,!3d" },
 
   { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuflwRRI", "!0r,!1r,!2d" },
   { kX86PshufdRRI,  kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuffRRI", "!0r,!1r,!2d" },
 
-  { kX86ShufpsRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x00, 0, 0x0F, 0xC6, 0, 0, 0, 1, false }, "kX86ShufpsRRI", "!0r,!1r,!2d" },
-  { kX86ShufpdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC6, 0, 0, 0, 1, false }, "kX86ShufpdRRI", "!0r,!1r,!2d" },
+  { kX86ShufpsRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0_USE0 | REG_USE1, { 0x00, 0, 0x0F, 0xC6, 0, 0, 0, 1, false }, "ShufpsRRI", "!0r,!1r,!2d" },
+  { kX86ShufpdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0_USE0 | REG_USE1, { 0x66, 0, 0x0F, 0xC6, 0, 0, 0, 1, false }, "ShufpdRRI", "!0r,!1r,!2d" },
 
   { kX86PsrawRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 4, 0, 1, false }, "PsrawRI", "!0r,!1d" },
   { kX86PsradRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 4, 0, 1, false }, "PsradRI", "!0r,!1d" },
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 61dcc28..be10d93 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -30,23 +30,88 @@
  * pairs.
  */
 void X86Mir2Lir::GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
-  const uint16_t* table = mir_graph_->GetTable(mir, table_offset);
+  GenSmallSparseSwitch(mir, table_offset, rl_src);
+}
+
+/*
+ * We override InsertCaseLabel, because the first parameter represents
+ * a basic block id, instead of a dex offset.
+ */
+LIR* X86Mir2Lir::InsertCaseLabel(DexOffset bbid, int keyVal) {
+  LIR* boundary_lir = &block_label_list_[bbid];  // bbid indexes the block label list.
+  LIR* res = boundary_lir;  // Returned as-is unless a verbose case label is inserted.
   if (cu_->verbose) {
-    DumpSparseSwitchTable(table);
+    // Only pay the expense if we're pretty-printing.
+    LIR* new_label = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocLIR));
+    BasicBlock* bb = mir_graph_->GetBasicBlock(bbid);
+    DCHECK(bb != nullptr);
+    new_label->dalvik_offset = bb->start_offset;
+    new_label->opcode = kPseudoCaseLabel;
+    new_label->operands[0] = keyVal;
+    new_label->flags.fixup = kFixupLabel;
+    DCHECK(!new_label->flags.use_def_invalid);
+    new_label->u.m.def_mask = &kEncodeAll;
+    InsertLIRAfter(boundary_lir, new_label);
+    res = new_label;
   }
+  return res;
+}
+
+void X86Mir2Lir::MarkPackedCaseLabels(Mir2Lir::SwitchTable* tab_rec) {
+  const uint16_t* table = tab_rec->table;
+  const int32_t *targets = reinterpret_cast<const int32_t*>(&table[4]);
   int entries = table[1];
-  const int32_t* keys = reinterpret_cast<const int32_t*>(&table[2]);
-  const int32_t* targets = &keys[entries];
-  rl_src = LoadValue(rl_src, kCoreReg);
+  int low_key = s4FromSwitchData(&table[2]);
   for (int i = 0; i < entries; i++) {
-    int key = keys[i];
-    BasicBlock* case_block =
-        mir_graph_->FindBlock(current_dalvik_offset_ + targets[i]);
-    OpCmpImmBranch(kCondEq, rl_src.reg, key, &block_label_list_[case_block->id]);
+    // The value at targets[i] is a basic block id, instead of a dex offset.
+    tab_rec->targets[i] = InsertCaseLabel(targets[i], i + low_key);
   }
 }
 
 /*
+ * We convert and create a new packed switch table that stores
+ * basic block ids to targets[] by examining successor blocks.
+ * Note that the original packed switch table stores dex offsets to targets[].
+ */
+const uint16_t* X86Mir2Lir::ConvertPackedSwitchTable(MIR* mir, const uint16_t* table) {
+  /*
+   * The original packed switch data format:
+   *  ushort ident = 0x0100  magic value
+   *  ushort size            number of entries in the table
+   *  int first_key          first (and lowest) switch case value
+   *  int targets[size]      branch targets, relative to switch opcode
+   *
+   * Total size is (4+size*2) 16-bit code units.
+   *
+   * Note that the new packed switch data format is the same as the original
+   * format, except that targets[] are basic block ids.
+   *
+   */
+  BasicBlock* bb = mir_graph_->GetBasicBlock(mir->bb);
+  DCHECK(bb != nullptr);
+  // Get the number of entries.
+  int entries = table[1];
+  const int32_t* as_int32 = reinterpret_cast<const int32_t*>(&table[2]);
+  int32_t starting_key = as_int32[0];
+  // Create a new table.
+  int size = sizeof(uint16_t) * (4 + entries * 2);
+  uint16_t* new_table = reinterpret_cast<uint16_t*>(arena_->Alloc(size, kArenaAllocMisc));
+  // Copy ident, size, and first_key to the new table.
+  memcpy(new_table, table, sizeof(uint16_t) * 4);
+  // Get the new targets.
+  int32_t* new_targets = reinterpret_cast<int32_t*>(&new_table[4]);
+  // Find out targets for each entry.
+  int i = 0;
+  for (SuccessorBlockInfo* successor_block_info : bb->successor_blocks) {
+    DCHECK_EQ(starting_key + i, successor_block_info->key);
+    // Save target basic block id.
+    new_targets[i++] = successor_block_info->block;
+  }
+  DCHECK_EQ(i, entries);
+  return new_table;
+}
+
+/*
  * Code pattern will look something like:
  *
  * mov  r_val, ..
@@ -63,10 +128,8 @@
  * done:
  */
 void X86Mir2Lir::GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) {
-  const uint16_t* table = mir_graph_->GetTable(mir, table_offset);
-  if (cu_->verbose) {
-    DumpPackedSwitchTable(table);
-  }
+  const uint16_t* old_table = mir_graph_->GetTable(mir, table_offset);
+  const uint16_t* table = ConvertPackedSwitchTable(mir, old_table);
   // Add the table to the list - we'll process it later
   SwitchTable* tab_rec =
       static_cast<SwitchTable*>(arena_->Alloc(sizeof(SwitchTable), kArenaAllocData));
@@ -136,23 +199,16 @@
   StoreValue(rl_dest, rl_result);
 }
 
-/*
- * Mark garbage collection card. Skip if the value we're storing is null.
- */
-void X86Mir2Lir::MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) {
+void X86Mir2Lir::UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) {
   DCHECK_EQ(tgt_addr_reg.Is64Bit(), cu_->target64);
-  DCHECK_EQ(val_reg.Is64Bit(), cu_->target64);
   RegStorage reg_card_base = AllocTempRef();
   RegStorage reg_card_no = AllocTempRef();
-  LIR* branch_over = OpCmpImmBranch(kCondEq, val_reg, 0, NULL);
   int ct_offset = cu_->target64 ?
       Thread::CardTableOffset<8>().Int32Value() :
       Thread::CardTableOffset<4>().Int32Value();
   NewLIR2(cu_->target64 ? kX86Mov64RT : kX86Mov32RT, reg_card_base.GetReg(), ct_offset);
   OpRegRegImm(kOpLsr, reg_card_no, tgt_addr_reg, gc::accounting::CardTable::kCardShift);
   StoreBaseIndexed(reg_card_base, reg_card_no, reg_card_base, 0, kUnsignedByte);
-  LIR* target = NewLIR0(kPseudoTargetLabel);
-  branch_over->target = target;
   FreeTemp(reg_card_base);
   FreeTemp(reg_card_no);
 }
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index d57dffb..9cb0bf5 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -90,11 +90,15 @@
                        OpSize size) OVERRIDE;
   LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
   LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
+  void GenLongToInt(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
   LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                      OpSize size, VolatileKind is_volatile) OVERRIDE;
   LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
                         OpSize size) OVERRIDE;
-  void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) OVERRIDE;
+
+  /// @copydoc Mir2Lir::UnconditionallyMarkGCCard(RegStorage)
+  void UnconditionallyMarkGCCard(RegStorage tgt_addr_reg) OVERRIDE;
+
   void GenImplicitNullCheck(RegStorage reg, int opt_flags) OVERRIDE;
 
   // Required for target - register utilities.
@@ -261,8 +265,11 @@
                                      int first_bit, int second_bit) OVERRIDE;
   void GenNegDouble(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
   void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
+  const uint16_t* ConvertPackedSwitchTable(MIR* mir, const uint16_t* table);
   void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
   void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+  LIR* InsertCaseLabel(DexOffset vaddr, int keyVal) OVERRIDE;
+  void MarkPackedCaseLabels(Mir2Lir::SwitchTable* tab_rec) OVERRIDE;
 
   /**
    * @brief Implement instanceof a final class with x86 specific code.
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 3f501b4..80cdc83 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -1170,7 +1170,7 @@
     if (is_object && !mir_graph_->IsConstantNullRef(rl_new_value)) {
       // Mark card for object assuming new value is stored.
       FreeTemp(rs_r0);  // Temporarily release EAX for MarkGCCard().
-      MarkGCCard(rl_new_value.reg, rl_object.reg);
+      MarkGCCard(0, rl_new_value.reg, rl_object.reg);
       LockTemp(rs_r0);
     }
 
@@ -1898,6 +1898,16 @@
     AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
                             false /* is_load */, true /* is64bit */);
   }
+
+  int v_src_reg = mir_graph_->SRegToVReg(rl_src.s_reg_low);
+  int v_dst_reg = mir_graph_->SRegToVReg(rl_dest.s_reg_low);
+
+  // If the left operand is in memory and the right operand is in a register
+  // and both belong to the same dalvik register then we should clobber the
+  // right one because it doesn't hold valid data anymore.
+  if (v_src_reg == v_dst_reg) {
+    Clobber(rl_src.reg);
+  }
 }
 
 void X86Mir2Lir::GenLongArith(RegLocation rl_dest, RegLocation rl_src1,
@@ -2398,7 +2408,7 @@
     if (!constant_index) {
       FreeTemp(rl_index.reg);
     }
-    MarkGCCard(rl_src.reg, rl_array.reg);
+    MarkGCCard(opt_flags, rl_src.reg, rl_array.reg);
   }
 }
 
@@ -3203,6 +3213,26 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+void X86Mir2Lir::GenLongToInt(RegLocation rl_dest, RegLocation rl_src) {
+  rl_src = UpdateLocWide(rl_src);
+  rl_src = NarrowRegLoc(rl_src);
+  StoreValue(rl_dest, rl_src);
+
+  if (cu_->target64) {
+    // If src and dest are in the same physical register, StoreValue generates
+    // no instruction, but we still need an explicit 32-bit "mov R, R" to clear
+    // the upper 32 bits.
+    rl_dest = UpdateLoc(rl_dest);
+    if (rl_src.location == kLocPhysReg && rl_dest.location == kLocPhysReg
+           && IsSameReg(rl_src.reg, rl_dest.reg)) {
+        LIR* copy_lir = OpRegCopyNoInsert(rl_dest.reg, rl_dest.reg);
+        // Clear the nop flag that OpRegCopyNoInsert sets when src == dest.
+        copy_lir->flags.is_nop = false;
+        AppendLIR(copy_lir);
+    }
+  }
+}
+
 void X86Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
                         RegLocation rl_src1, RegLocation rl_shift) {
   if (!cu_->target64) {
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index f5f7113..998aeff 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -18,6 +18,7 @@
 #include <inttypes.h>
 #include <string>
 
+#include "arch/instruction_set_features.h"
 #include "backend_x86.h"
 #include "codegen_x86.h"
 #include "dex/compiler_internals.h"
@@ -594,7 +595,9 @@
 }
 
 bool X86Mir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) {
-#if ANDROID_SMP != 0
+  if (!cu_->GetInstructionSetFeatures()->IsSmp()) {
+    return false;
+  }
   // Start off with using the last LIR as the barrier. If it is not enough, then we will update it.
   LIR* mem_barrier = last_lir_insn_;
 
@@ -630,9 +633,6 @@
     mem_barrier->u.m.def_mask = &kEncodeAll;
   }
   return ret;
-#else
-  return false;
-#endif
 }
 
 void X86Mir2Lir::CompilerInitializeRegAlloc() {
@@ -2263,7 +2263,8 @@
       StoreFinalValue(rl_dest, rl_result);
     } else {
       int displacement = SRegOffset(rl_result.s_reg_low);
-      LIR *l = NewLIR3(extr_opcode, rs_rX86_SP_32.GetReg(), displacement, vector_src.GetReg());
+      LIR *l = NewLIR4(extr_opcode, rs_rX86_SP_32.GetReg(), displacement, vector_src.GetReg(),
+                       extract_index);
       AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
       AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
     }
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index c1c79ca..ad3222c 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -488,6 +488,7 @@
       case kOpAdc:
       case kOpAnd:
       case kOpXor:
+      case kOpMul:
         break;
       default:
         LOG(FATAL) << "Bad case in OpRegRegReg " << op;
diff --git a/compiler/dex/ssa_transformation.cc b/compiler/dex/ssa_transformation.cc
index d3d76ba..ed33882 100644
--- a/compiler/dex/ssa_transformation.cc
+++ b/compiler/dex/ssa_transformation.cc
@@ -126,7 +126,7 @@
 
   for (uint32_t idx : bb->data_flow_info->def_v->Indexes()) {
     /* Block bb defines register idx */
-    temp_bit_matrix_[idx]->SetBit(bb->id);
+    temp_.ssa.def_block_matrix[idx]->SetBit(bb->id);
   }
   return true;
 }
@@ -135,16 +135,16 @@
   int num_registers = GetNumOfCodeAndTempVRs();
   /* Allocate num_registers bit vector pointers */
   DCHECK(temp_scoped_alloc_ != nullptr);
-  DCHECK(temp_bit_matrix_ == nullptr);
-  temp_bit_matrix_ = static_cast<ArenaBitVector**>(
+  DCHECK(temp_.ssa.def_block_matrix == nullptr);
+  temp_.ssa.def_block_matrix = static_cast<ArenaBitVector**>(
       temp_scoped_alloc_->Alloc(sizeof(ArenaBitVector*) * num_registers, kArenaAllocDFInfo));
   int i;
 
   /* Initialize num_register vectors with num_blocks bits each */
   for (i = 0; i < num_registers; i++) {
-    temp_bit_matrix_[i] = new (temp_scoped_alloc_.get()) ArenaBitVector(arena_, GetNumBlocks(),
-                                                                        false, kBitMapBMatrix);
-    temp_bit_matrix_[i]->ClearAllBits();
+    temp_.ssa.def_block_matrix[i] = new (temp_scoped_alloc_.get()) ArenaBitVector(
+        arena_, GetNumBlocks(), false, kBitMapBMatrix);
+    temp_.ssa.def_block_matrix[i]->ClearAllBits();
   }
 
   AllNodesIterator iter(this);
@@ -163,7 +163,7 @@
   int num_regs = GetNumOfCodeVRs();
   int in_reg = GetFirstInVR();
   for (; in_reg < num_regs; in_reg++) {
-    temp_bit_matrix_[in_reg]->SetBit(GetEntryBlock()->id);
+    temp_.ssa.def_block_matrix[in_reg]->SetBit(GetEntryBlock()->id);
   }
 }
 
@@ -435,32 +435,32 @@
  * insert a phi node if the variable is live-in to the block.
  */
 bool MIRGraph::ComputeBlockLiveIns(BasicBlock* bb) {
-  DCHECK_EQ(temp_bit_vector_size_, cu_->mir_graph.get()->GetNumOfCodeAndTempVRs());
-  ArenaBitVector* temp_dalvik_register_v = temp_bit_vector_;
+  DCHECK_EQ(temp_.ssa.num_vregs, cu_->mir_graph.get()->GetNumOfCodeAndTempVRs());
+  ArenaBitVector* temp_live_vregs = temp_.ssa.work_live_vregs;
 
   if (bb->data_flow_info == NULL) {
     return false;
   }
-  temp_dalvik_register_v->Copy(bb->data_flow_info->live_in_v);
+  temp_live_vregs->Copy(bb->data_flow_info->live_in_v);
   BasicBlock* bb_taken = GetBasicBlock(bb->taken);
   BasicBlock* bb_fall_through = GetBasicBlock(bb->fall_through);
   if (bb_taken && bb_taken->data_flow_info)
-    ComputeSuccLineIn(temp_dalvik_register_v, bb_taken->data_flow_info->live_in_v,
+    ComputeSuccLineIn(temp_live_vregs, bb_taken->data_flow_info->live_in_v,
                       bb->data_flow_info->def_v);
   if (bb_fall_through && bb_fall_through->data_flow_info)
-    ComputeSuccLineIn(temp_dalvik_register_v, bb_fall_through->data_flow_info->live_in_v,
+    ComputeSuccLineIn(temp_live_vregs, bb_fall_through->data_flow_info->live_in_v,
                       bb->data_flow_info->def_v);
   if (bb->successor_block_list_type != kNotUsed) {
     for (SuccessorBlockInfo* successor_block_info : bb->successor_blocks) {
       BasicBlock* succ_bb = GetBasicBlock(successor_block_info->block);
       if (succ_bb->data_flow_info) {
-        ComputeSuccLineIn(temp_dalvik_register_v, succ_bb->data_flow_info->live_in_v,
+        ComputeSuccLineIn(temp_live_vregs, succ_bb->data_flow_info->live_in_v,
                           bb->data_flow_info->def_v);
       }
     }
   }
-  if (!temp_dalvik_register_v->Equal(bb->data_flow_info->live_in_v)) {
-    bb->data_flow_info->live_in_v->Copy(temp_dalvik_register_v);
+  if (!temp_live_vregs->Equal(bb->data_flow_info->live_in_v)) {
+    bb->data_flow_info->live_in_v->Copy(temp_live_vregs);
     return true;
   }
   return false;
@@ -482,7 +482,7 @@
 
   /* Iterate through each Dalvik register */
   for (dalvik_reg = GetNumOfCodeAndTempVRs() - 1; dalvik_reg >= 0; dalvik_reg--) {
-    input_blocks->Copy(temp_bit_matrix_[dalvik_reg]);
+    input_blocks->Copy(temp_.ssa.def_block_matrix[dalvik_reg]);
     phi_blocks->ClearAllBits();
     do {
       // TUNING: When we repeat this, we could skip indexes from the previous pass.
diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc
index 9f0a696..17328c4 100644
--- a/compiler/dex/verified_method.cc
+++ b/compiler/dex/verified_method.cc
@@ -282,6 +282,10 @@
     Instruction::Code code = inst->Opcode();
     if ((code == Instruction::CHECK_CAST) || (code == Instruction::APUT_OBJECT)) {
       uint32_t dex_pc = inst->GetDexPc(code_item->insns_);
+      if (!method_verifier->GetInstructionFlags(dex_pc).IsVisited()) {
+        // Do not attempt to quicken this instruction, it's unreachable anyway.
+        continue;
+      }
       const verifier::RegisterLine* line = method_verifier->GetRegLine(dex_pc);
       bool is_safe_cast = false;
       if (code == Instruction::CHECK_CAST) {
diff --git a/compiler/dex/vreg_analysis.cc b/compiler/dex/vreg_analysis.cc
index f6c7d52..a541c7d 100644
--- a/compiler/dex/vreg_analysis.cc
+++ b/compiler/dex/vreg_analysis.cc
@@ -276,8 +276,7 @@
       }
       int num_uses = mir->dalvikInsn.vA;
       // If this is a non-static invoke, mark implicit "this"
-      if (((mir->dalvikInsn.opcode != Instruction::INVOKE_STATIC) &&
-          (mir->dalvikInsn.opcode != Instruction::INVOKE_STATIC_RANGE))) {
+      if (!IsInstructionInvokeStatic(mir->dalvikInsn.opcode)) {
         reg_location_[uses[next]].defined = true;
         reg_location_[uses[next]].ref = true;
         type_mismatch |= reg_location_[uses[next]].wide;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 08041e8..2e9f835 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1986,6 +1986,7 @@
     case kArm:
     case kArm64:
     case kThumb2:
+    case kMips:
     case kX86:
     case kX86_64: return true;
     default: return false;
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index 7e2be3e..dac1ef4 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -105,13 +105,16 @@
     ASSERT_TRUE(success_image);
     bool success_fixup = ElfWriter::Fixup(dup_oat.get(), writer->GetOatDataBegin());
     ASSERT_TRUE(success_fixup);
+
+    ASSERT_EQ(dup_oat->FlushCloseOrErase(), 0) << "Could not flush and close oat file "
+                                               << oat_file.GetFilename();
   }
 
   {
     std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
     ASSERT_TRUE(file.get() != NULL);
     ImageHeader image_header;
-    file->ReadFully(&image_header, sizeof(image_header));
+    ASSERT_EQ(file->ReadFully(&image_header, sizeof(image_header)), true);
     ASSERT_TRUE(image_header.IsValid());
     ASSERT_GE(image_header.GetImageBitmapOffset(), sizeof(image_header));
     ASSERT_NE(0U, image_header.GetImageBitmapSize());
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 64d2de1..3b1d914 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -54,7 +54,8 @@
 #include "runtime.h"
 #include "scoped_thread_state_change.h"
 #include "handle_scope-inl.h"
-#include "utils.h"
+
+#include <numeric>
 
 using ::art::mirror::ArtField;
 using ::art::mirror::ArtMethod;
@@ -67,6 +68,9 @@
 
 namespace art {
 
+// Separate objects into multiple bins to optimize dirty memory use.
+static constexpr bool kBinObjects = true;
+
 bool ImageWriter::PrepareImageAddressSpace() {
   target_ptr_size_ = InstructionSetPointerSize(compiler_driver_.GetInstructionSet());
   {
@@ -149,6 +153,11 @@
 
   SetOatChecksumFromElfFile(oat_file.get());
 
+  if (oat_file->FlushCloseOrErase() != 0) {
+    LOG(ERROR) << "Failed to flush and close oat file " << oat_filename << " for " << oat_location;
+    return false;
+  }
+
   std::unique_ptr<File> image_file(OS::CreateEmptyFile(image_filename.c_str()));
   ImageHeader* image_header = reinterpret_cast<ImageHeader*>(image_->Begin());
   if (image_file.get() == NULL) {
@@ -157,6 +166,7 @@
   }
   if (fchmod(image_file->Fd(), 0644) != 0) {
     PLOG(ERROR) << "Failed to make image file world readable: " << image_filename;
+    image_file->Erase();
     return EXIT_FAILURE;
   }
 
@@ -164,6 +174,7 @@
   CHECK_EQ(image_end_, image_header->GetImageSize());
   if (!image_file->WriteFully(image_->Begin(), image_end_)) {
     PLOG(ERROR) << "Failed to write image file " << image_filename;
+    image_file->Erase();
     return false;
   }
 
@@ -173,60 +184,54 @@
                          image_header->GetImageBitmapSize(),
                          image_header->GetImageBitmapOffset())) {
     PLOG(ERROR) << "Failed to write image file " << image_filename;
+    image_file->Erase();
     return false;
   }
 
+  if (image_file->FlushCloseOrErase() != 0) {
+    PLOG(ERROR) << "Failed to flush and close image file " << image_filename;
+    return false;
+  }
   return true;
 }
 
-void ImageWriter::SetImageOffset(mirror::Object* object, size_t offset) {
+void ImageWriter::SetImageOffset(mirror::Object* object,
+                                 ImageWriter::BinSlot bin_slot,
+                                 size_t offset) {
   DCHECK(object != nullptr);
   DCHECK_NE(offset, 0U);
-  DCHECK(!IsImageOffsetAssigned(object));
   mirror::Object* obj = reinterpret_cast<mirror::Object*>(image_->Begin() + offset);
   DCHECK_ALIGNED(obj, kObjectAlignment);
-  image_bitmap_->Set(obj);
-  // Before we stomp over the lock word, save the hash code for later.
-  Monitor::Deflate(Thread::Current(), object);;
-  LockWord lw(object->GetLockWord(false));
-  switch (lw.GetState()) {
-    case LockWord::kFatLocked: {
-      LOG(FATAL) << "Fat locked object " << obj << " found during object copy";
-      break;
+
+  image_bitmap_->Set(obj);  // Mark the obj as mutated, since we will end up changing it.
+  {
+    // Remember the object-inside-of-the-image's hash code so we can restore it after the copy.
+    auto hash_it = saved_hashes_map_.find(bin_slot);
+    if (hash_it != saved_hashes_map_.end()) {
+      std::pair<BinSlot, uint32_t> slot_hash = *hash_it;
+      saved_hashes_.push_back(std::make_pair(obj, slot_hash.second));
+      saved_hashes_map_.erase(hash_it);
     }
-    case LockWord::kThinLocked: {
-      LOG(FATAL) << "Thin locked object " << obj << " found during object copy";
-      break;
-    }
-    case LockWord::kUnlocked:
-      // No hash, don't need to save it.
-      break;
-    case LockWord::kHashCode:
-      saved_hashes_.push_back(std::make_pair(obj, lw.GetHashCode()));
-      break;
-    default:
-      LOG(FATAL) << "Unreachable.";
-      UNREACHABLE();
   }
+  // The object is already deflated from when we set the bin slot. Just overwrite the lock word.
   object->SetLockWord(LockWord::FromForwardingAddress(offset), false);
   DCHECK(IsImageOffsetAssigned(object));
 }
 
-void ImageWriter::AssignImageOffset(mirror::Object* object) {
+void ImageWriter::AssignImageOffset(mirror::Object* object, ImageWriter::BinSlot bin_slot) {
   DCHECK(object != nullptr);
-  SetImageOffset(object, image_end_);
-  size_t object_size;
-  if (object->IsArtMethod()) {
-    // Methods are sized based on the target pointer size.
-    object_size = mirror::ArtMethod::InstanceSize(target_ptr_size_);
-  } else {
-    object_size = object->SizeOf();
-  }
-  image_end_ += RoundUp(object_size, 8);  // 64-bit alignment
-  DCHECK_LT(image_end_, image_->Size());
+  DCHECK_NE(image_objects_offset_begin_, 0u);
+
+  size_t previous_bin_sizes = GetBinSizeSum(bin_slot.GetBin());  // sum sizes in [0..bin#)
+  size_t new_offset = image_objects_offset_begin_ + previous_bin_sizes + bin_slot.GetIndex();
+  DCHECK_ALIGNED(new_offset, kObjectAlignment);
+
+  SetImageOffset(object, bin_slot, new_offset);
+  DCHECK_LT(new_offset, image_end_);
 }
 
 bool ImageWriter::IsImageOffsetAssigned(mirror::Object* object) const {
+  // Will also return true if the bin slot was assigned since we are reusing the lock word.
   DCHECK(object != nullptr);
   return object->GetLockWord(false).GetState() == LockWord::kForwardingAddress;
 }
@@ -240,6 +245,178 @@
   return offset;
 }
 
+void ImageWriter::SetImageBinSlot(mirror::Object* object, BinSlot bin_slot) {
+  DCHECK(object != nullptr);
+  DCHECK(!IsImageOffsetAssigned(object));
+  DCHECK(!IsImageBinSlotAssigned(object));
+
+  // The lock word is about to be overwritten; save any hash code, keyed by bin slot, for later.
+  Monitor::Deflate(Thread::Current(), object);;
+  LockWord lw(object->GetLockWord(false));
+  switch (lw.GetState()) {
+    case LockWord::kFatLocked: {
+      LOG(FATAL) << "Fat locked object " << object << " found during object copy";
+      break;
+    }
+    case LockWord::kThinLocked: {
+      LOG(FATAL) << "Thin locked object " << object << " found during object copy";
+      break;
+    }
+    case LockWord::kUnlocked:
+      // No hash code is stored, so there is nothing to save.
+      break;
+    case LockWord::kHashCode:
+      saved_hashes_map_[bin_slot] = lw.GetHashCode();
+      break;
+    default:
+      LOG(FATAL) << "Unreachable.";
+      UNREACHABLE();
+  }
+  object->SetLockWord(LockWord::FromForwardingAddress(static_cast<uint32_t>(bin_slot)),
+                      false);  // The bin slot is stored as a forwarding address.
+  DCHECK(IsImageBinSlotAssigned(object));
+}
+
+void ImageWriter::AssignImageBinSlot(mirror::Object* object) {
+  DCHECK(object != nullptr);
+  size_t object_size;
+  if (object->IsArtMethod()) {
+    // Methods are sized based on the target pointer size.
+    object_size = mirror::ArtMethod::InstanceSize(target_ptr_size_);
+  } else {
+    object_size = object->SizeOf();
+  }
+
+  // The magic happens here. We segregate objects into different bins based
+  // on how likely they are to get dirty at runtime.
+  //
+  // Likely-to-dirty objects get packed together into the same bin so that
+  // at runtime their page dirtiness ratio (how many dirty objects a page has) is
+  // maximized.
+  //
+  // This means more pages will stay either clean or shared dirty (with zygote) and
+  // the app will use less of its own (private) memory.
+  Bin bin = kBinRegular;
+
+  if (kBinObjects) {
+    //
+    // Changing the bin of an object is purely a memory-use tuning.
+    // It has no effect on runtime correctness.
+    //
+    // Memory analysis has determined that the following types of objects get dirtied
+    // the most:
+    //
+    // * Classes which are verified [their clinit runs only at runtime]
+    //   - classes in general [because their static fields get overwritten]
+    //   - initialized classes with all-final statics are unlikely to be ever dirty,
+    //     so bin them separately
+    // * Art Methods that are:
+    //   - native [their native entry point is not looked up until runtime]
+    //   - have declaring classes that aren't initialized
+    //            [their interpreter/quick entry points are trampolines until the class
+    //             becomes initialized]
+    //
+    // We also assume the following objects get dirtied either never or extremely rarely:
+    //  * Strings (they are immutable)
+    //  * Art methods that aren't native and have initialized declaring classes
+    //
+    // We assume that "regular" bin objects are highly unlikely to become dirtied,
+    // so packing them together will not result in a noticeably tighter dirty-to-clean ratio.
+    //
+    if (object->IsClass()) {
+      bin = kBinClassVerified;
+      mirror::Class* klass = object->AsClass();
+
+      if (klass->GetStatus() == Class::kStatusInitialized) {
+        bin = kBinClassInitialized;
+
+        // If the class's static fields are all final, put it into a separate bin
+        // since it's very likely it will stay clean.
+        uint32_t num_static_fields = klass->NumStaticFields();
+        if (num_static_fields == 0) {
+          bin = kBinClassInitializedFinalStatics;
+        } else {
+          // Maybe all the statics are final?
+          bool all_final = true;
+          for (uint32_t i = 0; i < num_static_fields; ++i) {
+            ArtField* field = klass->GetStaticField(i);
+            if (!field->IsFinal()) {
+              all_final = false;
+              break;
+            }
+          }
+
+          if (all_final) {
+            bin = kBinClassInitializedFinalStatics;
+          }
+        }
+      }
+    } else if (object->IsArtMethod<kVerifyNone>()) {
+      mirror::ArtMethod* art_method = down_cast<ArtMethod*>(object);
+      if (art_method->IsNative()) {
+        bin = kBinArtMethodNative;
+      } else {
+        mirror::Class* declaring_class = art_method->GetDeclaringClass();
+        if (declaring_class->GetStatus() != Class::kStatusInitialized) {
+          bin = kBinArtMethodNotInitialized;
+        } else {
+          // This is highly unlikely to get dirty since there are no entry points to mutate.
+          bin = kBinArtMethodsManagedInitialized;
+        }
+      }
+    } else if (object->GetClass<kVerifyNone>()->IsStringClass()) {
+      bin = kBinString;  // Strings are almost always immutable (except for object header).
+    }  // else bin = kBinRegular
+  }
+
+  size_t current_offset = bin_slot_sizes_[bin];  // How many bytes the current bin is at (aligned).
+  // Move the current bin size up to accommodate the object we just assigned a bin slot.
+  size_t offset_delta = RoundUp(object_size, kObjectAlignment);  // 64-bit alignment
+  bin_slot_sizes_[bin] += offset_delta;
+
+  BinSlot new_bin_slot(bin, current_offset);
+  SetImageBinSlot(object, new_bin_slot);
+
+  ++bin_slot_count_[bin];
+
+  DCHECK_LT(GetBinSizeSum(), image_->Size());
+
+  // Grow the image closer to the end by the object we just assigned.
+  image_end_ += offset_delta;
+  DCHECK_LT(image_end_, image_->Size());
+}
+
+bool ImageWriter::IsImageBinSlotAssigned(mirror::Object* object) const {
+  DCHECK(object != nullptr);
+
+  // We always stash the bin slot into a lockword, in the 'forwarding address' state.
+  // If it's in some other state, then we haven't yet assigned an image bin slot.
+  if (object->GetLockWord(false).GetState() != LockWord::kForwardingAddress) {
+    return false;
+  } else if (kIsDebugBuild) {  // Extra sanity check of the decoded slot.
+    LockWord lock_word = object->GetLockWord(false);
+    size_t offset = lock_word.ForwardingAddress();
+    BinSlot bin_slot(offset);
+    DCHECK_LT(bin_slot.GetIndex(), bin_slot_sizes_[bin_slot.GetBin()])
+      << "bin slot offset should not exceed the size of that bin";
+  }
+  return true;
+}
+
+ImageWriter::BinSlot ImageWriter::GetImageBinSlot(mirror::Object* object) const {
+  DCHECK(object != nullptr);
+  DCHECK(IsImageBinSlotAssigned(object));
+  // Decode the bin slot that was stashed in the lock word as a forwarding address.
+  LockWord lock_word = object->GetLockWord(false);
+  size_t offset = lock_word.ForwardingAddress();  // TODO: ForwardingAddress should be uint32_t
+  DCHECK_LE(offset, std::numeric_limits<uint32_t>::max());
+  // The index within the bin must lie inside the bytes handed out for that bin so far.
+  BinSlot bin_slot(static_cast<uint32_t>(offset));
+  DCHECK_LT(bin_slot.GetIndex(), bin_slot_sizes_[bin_slot.GetBin()]);
+
+  return bin_slot;
+}
+
 bool ImageWriter::AllocMemory() {
   size_t length = RoundUp(Runtime::Current()->GetHeap()->GetTotalMemory(), kPageSize);
   std::string error_msg;
@@ -315,23 +492,16 @@
   Handle<mirror::ObjectArray<mirror::String>> strings_;
 };
 
-// If string a is a prefix of b or b is a prefix of a then they are considered equal. This
-// enables us to find prefixes instead of exact matches. Otherwise we do a normal string
-// comparison. The strings compared of the form <position, length> inside of the chars_ array.
+// Normal string < comparison through the chars_ array.
 class SubstringComparator {
  public:
   explicit SubstringComparator(const std::vector<uint16_t>* const chars) : chars_(chars) {
   }
   bool operator()(const std::pair<size_t, size_t>& a, const std::pair<size_t, size_t>& b) {
-    size_t compare_length = std::min(a.second, b.second);
-    const uint16_t* ptr_a = &chars_->at(a.first);
-    const uint16_t* ptr_b = &chars_->at(b.first);
-    for (size_t i = 0; i < compare_length; ++i) {
-      if (ptr_a[i] != ptr_b[i]) {
-        return ptr_a[i] < ptr_b[i];
-      }
-    }
-    return false;
+    return std::lexicographical_compare(chars_->begin() + a.first,
+                                        chars_->begin() + a.first + a.second,
+                                        chars_->begin() + b.first,
+                                        chars_->begin() + b.first + b.second);
   }
 
  private:
@@ -387,11 +557,15 @@
     // Try to see if the string exists as a prefix of an existing string.
     size_t new_offset = 0;
     std::pair<size_t, size_t> new_string(num_chars - length, length);
-    auto it = existing_strings.find(new_string);
+    auto it = existing_strings.lower_bound(new_string);
+    bool is_prefix = false;
     if (it != existing_strings.end()) {
-      for (size_t j = 0; j < length; ++j) {
-        DCHECK_EQ(combined_chars[it->first + j], s->CharAt(j));
-      }
+      CHECK_LE(length, it->second);
+      is_prefix = std::equal(combined_chars.begin() + it->first,
+                             combined_chars.begin() + it->first + it->second,
+                             combined_chars.begin() + new_string.first);
+    }
+    if (is_prefix) {
       // Shares a prefix, set the offset to where the new offset will be.
       new_offset = it->first;
       // Remove the added chars.
@@ -413,7 +587,7 @@
   for (size_t i = 0; i < total_strings; ++i) {
     strings->GetWithoutChecks(i)->SetArray(array);
   }
-  VLOG(compiler) << "Total # image strings=" << total_strings << " combined length="
+  LOG(INFO) << "Total # image strings=" << total_strings << " combined length="
       << total_length << " prefix saved chars=" << prefix_saved_chars;
   ComputeEagerResolvedStrings();
 }
@@ -546,29 +720,29 @@
   }
 }
 
-void ImageWriter::CalculateObjectOffsets(Object* obj) {
+void ImageWriter::CalculateObjectBinSlots(Object* obj) {
   DCHECK(obj != NULL);
   // if it is a string, we want to intern it if its not interned.
   if (obj->GetClass()->IsStringClass()) {
     // we must be an interned string that was forward referenced and already assigned
-    if (IsImageOffsetAssigned(obj)) {
+    if (IsImageBinSlotAssigned(obj)) {
       DCHECK_EQ(obj, obj->AsString()->Intern());
       return;
     }
     mirror::String* const interned = obj->AsString()->Intern();
     if (obj != interned) {
-      if (!IsImageOffsetAssigned(interned)) {
+      if (!IsImageBinSlotAssigned(interned)) {
         // interned obj is after us, allocate its location early
-        AssignImageOffset(interned);
+        AssignImageBinSlot(interned);
       }
       // point those looking for this object to the interned version.
-      SetImageOffset(obj, GetImageOffset(interned));
+      SetImageBinSlot(obj, GetImageBinSlot(interned));
       return;
     }
     // else (obj == interned), nothing to do but fall through to the normal case
   }
 
-  AssignImageOffset(obj);
+  AssignImageBinSlot(obj);
 }
 
 ObjectArray<Object>* ImageWriter::CreateImageRoots() const {
@@ -649,13 +823,15 @@
 
 // For an unvisited object, visit it then all its children found via fields.
 void ImageWriter::WalkFieldsInOrder(mirror::Object* obj) {
-  if (!IsImageOffsetAssigned(obj)) {
+  // Use our own visitor routine (instead of GC visitor) to get better locality between
+  // an object and its fields
+  if (!IsImageBinSlotAssigned(obj)) {
     // Walk instance fields of all objects
     StackHandleScope<2> hs(Thread::Current());
     Handle<mirror::Object> h_obj(hs.NewHandle(obj));
     Handle<mirror::Class> klass(hs.NewHandle(obj->GetClass()));
     // visit the object itself.
-    CalculateObjectOffsets(h_obj.Get());
+    CalculateObjectBinSlots(h_obj.Get());
     WalkInstanceFields(h_obj.Get(), klass.Get());
     // Walk static fields of a Class.
     if (h_obj->IsClass()) {
@@ -689,6 +865,24 @@
   writer->WalkFieldsInOrder(obj);
 }
 
+void ImageWriter::UnbinObjectsIntoOffsetCallback(mirror::Object* obj, void* arg) {
+  ImageWriter* writer = reinterpret_cast<ImageWriter*>(arg);  // arg is the ImageWriter.
+  DCHECK(writer != nullptr);
+  writer->UnbinObjectsIntoOffset(obj);
+}
+
+void ImageWriter::UnbinObjectsIntoOffset(mirror::Object* obj) {
+  CHECK(obj != nullptr);
+
+  // By now we know each object's bin slot and the total size of every bin,
+  // so the object's final image offset can be calculated.
+
+  DCHECK(IsImageBinSlotAssigned(obj));
+  BinSlot bin_slot = GetImageBinSlot(obj);
+  // Change the lock word from a bin slot into an image offset.
+  AssignImageOffset(obj, bin_slot);
+}
+
 void ImageWriter::CalculateNewObjectOffsets() {
   Thread* self = Thread::Current();
   StackHandleScope<1> hs(self);
@@ -699,16 +893,22 @@
 
   // Leave space for the header, but do not write it yet, we need to
   // know where image_roots is going to end up
-  image_end_ += RoundUp(sizeof(ImageHeader), 8);  // 64-bit-alignment
+  image_end_ += RoundUp(sizeof(ImageHeader), kObjectAlignment);  // 64-bit-alignment
 
   {
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
     // TODO: Image spaces only?
     DCHECK_LT(image_end_, image_->Size());
-    // Clear any pre-existing monitors which may have been in the monitor words.
+    image_objects_offset_begin_ = image_end_;
+    // Clear any pre-existing monitors which may have been in the monitor words, assign bin slots.
     heap->VisitObjects(WalkFieldsCallback, this);
+    // Transform each object's bin slot into an offset which will be used to do the final copy.
+    heap->VisitObjects(UnbinObjectsIntoOffsetCallback, this);
+    DCHECK(saved_hashes_map_.empty());  // All binslot hashes should've been put into vector by now.
   }
 
+  DCHECK_GT(image_end_, GetBinSizeSum());
+
   image_roots_address_ = PointerToLowMemUInt32(GetImageAddress(image_roots.Get()));
 
   // Note that image_end_ is left at end of used space
@@ -718,6 +918,7 @@
   CHECK_NE(0U, oat_loaded_size);
   const uint8_t* oat_file_begin = GetOatFileBegin();
   const uint8_t* oat_file_end = oat_file_begin + oat_loaded_size;
+
   oat_data_begin_ = oat_file_begin + oat_data_offset;
   const uint8_t* oat_data_end = oat_data_begin_ + oat_file_->Size();
 
@@ -766,7 +967,7 @@
   if (obj->IsArtMethod()) {
     // Size without pointer fields since we don't want to overrun the buffer if target art method
     // is 32 bits but source is 64 bits.
-    n = mirror::ArtMethod::SizeWithoutPointerFields();
+    n = mirror::ArtMethod::SizeWithoutPointerFields(sizeof(void*));
   } else {
     n = obj->SizeOf();
   }
@@ -779,6 +980,7 @@
   image_writer->FixupObject(obj, copy);
 }
 
+// Rewrite all the references in the copied object to point to their image address equivalent
 class FixupVisitor {
  public:
   FixupVisitor(ImageWriter* image_writer, Object* copy) : image_writer_(image_writer), copy_(copy) {
@@ -814,8 +1016,9 @@
   void operator()(Object* obj, MemberOffset offset, bool /*is_static*/) const
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
     DCHECK(obj->IsClass());
-    FixupVisitor::operator()(obj, offset, false);
+    FixupVisitor::operator()(obj, offset, /*is_static*/false);
 
+    // TODO: Remove dead code
     if (offset.Uint32Value() < mirror::Class::EmbeddedVTableOffset().Uint32Value()) {
       return;
     }
@@ -922,7 +1125,6 @@
   copy->SetEntryPointFromJniPtrSize<kVerifyNone>(orig->GetEntryPointFromJni(), target_ptr_size_);
   copy->SetEntryPointFromQuickCompiledCodePtrSize<kVerifyNone>(
       orig->GetEntryPointFromQuickCompiledCode(), target_ptr_size_);
-  copy->SetNativeGcMapPtrSize<kVerifyNone>(orig->GetNativeGcMap(), target_ptr_size_);
 
   // The resolution method has a special trampoline to call.
   Runtime* runtime = Runtime::Current();
@@ -983,11 +1185,6 @@
         // Note this is not the code_ pointer, that is handled above.
         copy->SetEntryPointFromJniPtrSize<kVerifyNone>(GetOatAddress(jni_dlsym_lookup_offset_),
                                                        target_ptr_size_);
-      } else {
-        // Normal (non-abstract non-native) methods have various tables to relocate.
-        uint32_t native_gc_map_offset = orig->GetOatNativeGcMapOffset();
-        const uint8_t* native_gc_map = GetOatAddress(native_gc_map_offset);
-        copy->SetNativeGcMapPtrSize<kVerifyNone>(native_gc_map, target_ptr_size_);
       }
 
       // Interpreter entrypoint:
@@ -1029,4 +1226,32 @@
   image_header->SetOatChecksum(oat_header->GetChecksum());
 }
 
+size_t ImageWriter::GetBinSizeSum(ImageWriter::Bin up_to) const {  // Bytes in bins [0, up_to).
+  DCHECK_LE(up_to, kBinSize);  // up_to == kBinSize sums every bin.
+  return std::accumulate(&bin_slot_sizes_[0], &bin_slot_sizes_[up_to], /*size_t sum*/size_t{0});
+}
+
+ImageWriter::BinSlot::BinSlot(uint32_t lockword) : lockword_(lockword) {
+  // These values may need to get updated if more bins are added to the enum Bin
+  static_assert(kBinBits == 3, "wrong number of bin bits");
+  static_assert(kBinShift == 29, "wrong number of shift");
+  static_assert(sizeof(BinSlot) == sizeof(LockWord), "BinSlot/LockWord must have equal sizes");
+
+  DCHECK_LT(GetBin(), kBinSize);
+  DCHECK_ALIGNED(GetIndex(), kObjectAlignment);
+}
+
+ImageWriter::BinSlot::BinSlot(Bin bin, uint32_t index)
+    : BinSlot(index | (static_cast<uint32_t>(bin) << kBinShift)) {
+  DCHECK_EQ(index, GetIndex());
+}
+
+ImageWriter::Bin ImageWriter::BinSlot::GetBin() const {
+  return static_cast<Bin>((lockword_ & kBinMask) >> kBinShift);
+}
+
+uint32_t ImageWriter::BinSlot::GetIndex() const {
+  return lockword_ & ~kBinMask;
+}
+
 }  // namespace art
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 2fec0aa..8c84b68 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -23,6 +23,7 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <ostream>
 
 #include "base/macros.h"
 #include "driver/compiler_driver.h"
@@ -32,6 +33,8 @@
 #include "mirror/dex_cache.h"
 #include "os.h"
 #include "safe_map.h"
+#include "gc/space/space.h"
+#include "utils.h"
 
 namespace art {
 
@@ -41,14 +44,15 @@
   ImageWriter(const CompilerDriver& compiler_driver, uintptr_t image_begin,
               bool compile_pic)
       : compiler_driver_(compiler_driver), image_begin_(reinterpret_cast<uint8_t*>(image_begin)),
-        image_end_(0), image_roots_address_(0), oat_file_(nullptr),
+        image_end_(0), image_objects_offset_begin_(0), image_roots_address_(0), oat_file_(nullptr),
         oat_data_begin_(nullptr), interpreter_to_interpreter_bridge_offset_(0),
         interpreter_to_compiled_code_bridge_offset_(0), jni_dlsym_lookup_offset_(0),
         portable_imt_conflict_trampoline_offset_(0), portable_resolution_trampoline_offset_(0),
         portable_to_interpreter_bridge_offset_(0), quick_generic_jni_trampoline_offset_(0),
         quick_imt_conflict_trampoline_offset_(0), quick_resolution_trampoline_offset_(0),
         quick_to_interpreter_bridge_offset_(0), compile_pic_(compile_pic),
-        target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())) {
+        target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())),
+        bin_slot_sizes_(), bin_slot_count_() {
     CHECK_NE(image_begin, 0U);
   }
 
@@ -87,14 +91,71 @@
   // Mark the objects defined in this space in the given live bitmap.
   void RecordImageAllocations() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Classify different kinds of bins that objects end up getting packed into during image writing.
+  enum Bin {
+    // Likely-clean:
+    kBinString,                        // [String] Almost always immutable (except for obj header).
+    kBinArtMethodsManagedInitialized,  // [ArtMethod] Not-native, and initialized. Unlikely to dirty
+    // Unknown mix of clean/dirty:
+    kBinRegular,
+    // Likely-dirty:
+    // All classes get their own bins since their fields often dirty
+    kBinClassInitializedFinalStatics,  // Class initializers have been run, no non-final statics
+    kBinClassInitialized,         // Class initializers have been run
+    kBinClassVerified,            // Class verified, but initializers haven't been run
+    kBinArtMethodNative,          // Art method that is actually native
+    kBinArtMethodNotInitialized,  // Art method with a declaring class that wasn't initialized
+    // Don't care about other art methods since they don't dirty
+    // Add more bins here if we add more segregation code.
+    kBinSize,
+  };
+
+  friend std::ostream& operator<<(std::ostream& stream, const Bin& bin);
+
+  static constexpr size_t kBinBits = MinimumBitsToStore(kBinSize - 1);
+  // uint32 = typeof(lockword_)
+  static constexpr size_t kBinShift = BitSizeOf<uint32_t>() - kBinBits;
+  // 111000.....0
+  static constexpr size_t kBinMask = ((static_cast<size_t>(1) << kBinBits) - 1) << kBinShift;
+
+  // We use the lock word to store the bin # and bin index of the object in the image.
+  //
+  // The struct size must be exactly sizeof(LockWord), currently 32-bits, since this will end up
+  // stored in the lock word bit-for-bit when object forwarding addresses are being calculated.
+  struct BinSlot {
+    explicit BinSlot(uint32_t lockword);
+    BinSlot(Bin bin, uint32_t index);
+
+    // The bin an object belongs to, i.e. regular, class/verified, class/initialized, etc.
+    Bin GetBin() const;
+    // The offset in bytes from the beginning of the bin. Aligned to kObjectAlignment.
+    uint32_t GetIndex() const;
+    // Pack into a single uint32_t, for storing into a lock word.
+    explicit operator uint32_t() const { return lockword_; }
+    // Comparison operator for map support
+    bool operator<(const BinSlot& other) const  { return lockword_ < other.lockword_; }
+
+  private:
+    // Must be the same size as LockWord, any larger and we would truncate the data.
+    const uint32_t lockword_;
+  };
+
   // We use the lock word to store the offset of the object in the image.
-  void AssignImageOffset(mirror::Object* object) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void SetImageOffset(mirror::Object* object, size_t offset)
+  void AssignImageOffset(mirror::Object* object, BinSlot bin_slot)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SetImageOffset(mirror::Object* object, BinSlot bin_slot, size_t offset)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsImageOffsetAssigned(mirror::Object* object) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   size_t GetImageOffset(mirror::Object* object) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void AssignImageBinSlot(mirror::Object* object) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SetImageBinSlot(mirror::Object* object, BinSlot bin_slot)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool IsImageBinSlotAssigned(mirror::Object* object) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  BinSlot GetImageBinSlot(mirror::Object* object) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   static void* GetImageAddressCallback(void* writer, mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return reinterpret_cast<ImageWriter*>(writer)->GetImageAddress(obj);
@@ -157,7 +218,9 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::ObjectArray<mirror::Object>* CreateImageRoots() const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void CalculateObjectOffsets(mirror::Object* obj)
+  void CalculateObjectBinSlots(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void UnbinObjectsIntoOffset(mirror::Object* obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void WalkInstanceFields(mirror::Object* obj, mirror::Class* klass)
@@ -166,6 +229,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void WalkFieldsCallback(mirror::Object* obj, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static void UnbinObjectsIntoOffsetCallback(mirror::Object* obj, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Creates the contiguous image in memory and adjusts pointers.
   void CopyAndFixupObjects() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -186,6 +251,9 @@
   // Patches references in OatFile to expect runtime addresses.
   void SetOatChecksumFromElfFile(File* elf_file);
 
+  // Calculate the sum total of the bin slot sizes in [0, up_to). Defaults to all bins.
+  size_t GetBinSizeSum(Bin up_to = kBinSize) const;
+
   const CompilerDriver& compiler_driver_;
 
   // Beginning target image address for the output image.
@@ -194,6 +262,9 @@
   // Offset to the free space in image_.
   size_t image_end_;
 
+  // Offset from image_begin_ to where the first object is in image_.
+  size_t image_objects_offset_begin_;
+
   // The image roots address in the image.
   uint32_t image_roots_address_;
 
@@ -206,6 +277,9 @@
   // Saved hashes (objects are inside of the image so that they don't move).
   std::vector<std::pair<mirror::Object*, uint32_t>> saved_hashes_;
 
+  // Saved hashes, keyed by bin slot (objects inside the image not yet allocated an address).
+  std::map<BinSlot, uint32_t> saved_hashes_map_;
+
   // Beginning target oat address for the pointers from the output image to its oat file.
   const uint8_t* oat_data_begin_;
 
@@ -228,6 +302,10 @@
   // Size of pointers on the target architecture.
   size_t target_ptr_size_;
 
+  // Bin slot tracking for dirty object packing
+  size_t bin_slot_sizes_[kBinSize];  // Number of bytes in a bin
+  size_t bin_slot_count_[kBinSize];  // Number of objects in a bin
+
   friend class FixupVisitor;
   friend class FixupClassVisitor;
   DISALLOW_COPY_AND_ASSIGN(ImageWriter);
diff --git a/compiler/llvm/ir_builder.h b/compiler/llvm/ir_builder.h
index 03498ef..990ba02 100644
--- a/compiler/llvm/ir_builder.h
+++ b/compiler/llvm/ir_builder.h
@@ -101,10 +101,8 @@
   // Extend memory barrier
   //--------------------------------------------------------------------------
   void CreateMemoryBarrier(MemBarrierKind barrier_kind) {
-#if ANDROID_SMP
     // TODO: select atomic ordering according to given barrier kind.
     CreateFence(::llvm::SequentiallyConsistent);
-#endif
   }
 
   //--------------------------------------------------------------------------
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index ce4ed6d..9fe98e3 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -190,8 +190,8 @@
   // If this test is failing and you have to update these constants,
   // it is time to update OatHeader::kOatVersion
   EXPECT_EQ(84U, sizeof(OatHeader));
-  EXPECT_EQ(8U, sizeof(OatMethodOffsets));
-  EXPECT_EQ(24U, sizeof(OatQuickMethodHeader));
+  EXPECT_EQ(4U, sizeof(OatMethodOffsets));
+  EXPECT_EQ(28U, sizeof(OatQuickMethodHeader));
   EXPECT_EQ(91 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index c6beb36..8a7abb4 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -366,6 +366,8 @@
     Offset offset(mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
         kArm64PointerSize).Int32Value());
     assembler.JumpTo(ManagedRegister(arm64::X0), offset, ManagedRegister(arm64::IP0));
+    // Ensure we emit the literal pool.
+    assembler.EmitSlowPaths();
     std::vector<uint8_t> thunk_code(assembler.CodeSize());
     MemoryRegion code(thunk_code.data(), thunk_code.size());
     assembler.FinalizeInstructions(code);
@@ -510,15 +512,18 @@
   }
 
   static uint32_t GetOffset(OatClass* oat_class, size_t method_offsets_index) ALWAYS_INLINE {
-    return oat_class->method_offsets_[method_offsets_index].gc_map_offset_;
+    uint32_t offset = oat_class->method_headers_[method_offsets_index].gc_map_offset_;
+    return offset == 0u ? 0u :
+        (oat_class->method_offsets_[method_offsets_index].code_offset_ & ~1) - offset;
   }
 
   static void SetOffset(OatClass* oat_class, size_t method_offsets_index, uint32_t offset)
       ALWAYS_INLINE {
-    oat_class->method_offsets_[method_offsets_index].gc_map_offset_ = offset;
+    oat_class->method_headers_[method_offsets_index].gc_map_offset_ =
+        (oat_class->method_offsets_[method_offsets_index].code_offset_ & ~1) - offset;
   }
 
-  static const char* Name() ALWAYS_INLINE {
+  static const char* Name() {
     return "GC map";
   }
 };
@@ -540,7 +545,7 @@
         (oat_class->method_offsets_[method_offsets_index].code_offset_ & ~1) - offset;
   }
 
-  static const char* Name() ALWAYS_INLINE {
+  static const char* Name() {
     return "mapping table";
   }
 };
@@ -562,7 +567,7 @@
         (oat_class->method_offsets_[method_offsets_index].code_offset_ & ~1) - offset;
   }
 
-  static const char* Name() ALWAYS_INLINE {
+  static const char* Name() {
     return "vmap table";
   }
 };
@@ -764,6 +769,7 @@
         OatQuickMethodHeader* method_header = &oat_class->method_headers_[method_offsets_index_];
         uint32_t mapping_table_offset = method_header->mapping_table_offset_;
         uint32_t vmap_table_offset = method_header->vmap_table_offset_;
+        uint32_t gc_map_offset = method_header->gc_map_offset_;
         // The code offset was 0 when the mapping/vmap table offset was set, so it's set
         // to 0-offset and we need to adjust it by code_offset.
         uint32_t code_offset = quick_code_offset - thumb_offset;
@@ -775,12 +781,16 @@
           vmap_table_offset += code_offset;
           DCHECK_LT(vmap_table_offset, code_offset);
         }
+        if (gc_map_offset != 0u) {
+          gc_map_offset += code_offset;
+          DCHECK_LT(gc_map_offset, code_offset);
+        }
         uint32_t frame_size_in_bytes = compiled_method->GetFrameSizeInBytes();
         uint32_t core_spill_mask = compiled_method->GetCoreSpillMask();
         uint32_t fp_spill_mask = compiled_method->GetFpSpillMask();
         *method_header = OatQuickMethodHeader(mapping_table_offset, vmap_table_offset,
-                                              frame_size_in_bytes, core_spill_mask, fp_spill_mask,
-                                              code_size);
+                                              gc_map_offset, frame_size_in_bytes, core_spill_mask,
+                                              fp_spill_mask, code_size);
 
         if (!deduped) {
           // Update offsets. (Checksum is updated when writing.)
@@ -909,7 +919,7 @@
     OatClass* oat_class = writer_->oat_classes_[oat_class_index_];
     CompiledMethod* compiled_method = oat_class->GetCompiledMethod(class_def_method_index);
 
-    OatMethodOffsets offsets(0u, 0u);
+    OatMethodOffsets offsets(0u);
     if (compiled_method != nullptr) {
       DCHECK_LT(method_offsets_index_, oat_class->method_offsets_.size());
       offsets = oat_class->method_offsets_[method_offsets_index_];
@@ -920,7 +930,7 @@
     InvokeType invoke_type = it.GetMethodInvokeType(dex_file_->GetClassDef(class_def_index_));
     // Unchecked as we hold mutator_lock_ on entry.
     ScopedObjectAccessUnchecked soa(Thread::Current());
-    StackHandleScope<2> hs(soa.Self());
+    StackHandleScope<1> hs(soa.Self());
     Handle<mirror::DexCache> dex_cache(hs.NewHandle(linker->FindDexCache(*dex_file_)));
     mirror::ArtMethod* method = linker->ResolveMethod(*dex_file_, it.GetMemberIndex(), dex_cache,
                                                       NullHandle<mirror::ClassLoader>(),
@@ -936,7 +946,6 @@
     }
     // Portable code offsets are set by ElfWriterMclinker::FixupCompiledCodeOffset after linking.
     method->SetQuickOatCodeOffset(offsets.code_offset_);
-    method->SetOatNativeGcMapOffset(offsets.gc_map_offset_);
 
     return true;
   }
@@ -1157,7 +1166,7 @@
 class OatWriter::WriteMapMethodVisitor : public OatDexMethodVisitor {
  public:
   WriteMapMethodVisitor(OatWriter* writer, OutputStream* out, const size_t file_offset,
-                          size_t relative_offset)
+                        size_t relative_offset)
     : OatDexMethodVisitor(writer, relative_offset),
       out_(out),
       file_offset_(file_offset) {
@@ -1179,7 +1188,8 @@
       size_t map_size = map == nullptr ? 0 : map->size() * sizeof((*map)[0]);
       DCHECK((map_size == 0u && map_offset == 0u) ||
             (map_size != 0u && map_offset != 0u && map_offset <= offset_))
-          << PrettyMethod(it.GetMemberIndex(), *dex_file_);
+          << map_size << " " << map_offset << " " << offset_ << " "
+          << PrettyMethod(it.GetMemberIndex(), *dex_file_) << " for " << DataAccess::Name();
       if (map_size != 0u && map_offset == offset_) {
         if (UNLIKELY(!out->WriteFully(&(*map)[0], map_size))) {
           ReportWriteFailure(it);
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 5b61f21..b3ac7ff 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -353,6 +353,9 @@
       if (UNLIKELY(&lhs->GetVmapTable() != &rhs->GetVmapTable())) {
         return &lhs->GetVmapTable() < &rhs->GetVmapTable();
       }
+      if (UNLIKELY(lhs->GetGcMap() != rhs->GetGcMap())) {
+        return lhs->GetGcMap() < rhs->GetGcMap();
+      }
       const auto& lhs_patches = lhs->GetPatches();
       const auto& rhs_patches = rhs->GetPatches();
       if (UNLIKELY(lhs_patches.size() != rhs_patches.size())) {
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index be8631a..777a117 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -305,6 +305,24 @@
 }
 
 template<typename T>
+void HGraphBuilder::Binop_23x_shift(const Instruction& instruction,
+                                    Primitive::Type type) {
+  HInstruction* first = LoadLocal(instruction.VRegB(), type);
+  HInstruction* second = LoadLocal(instruction.VRegC(), Primitive::kPrimInt);
+  current_block_->AddInstruction(new (arena_) T(type, first, second));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
+void HGraphBuilder::Binop_23x_cmp(const Instruction& instruction,
+                                  Primitive::Type type,
+                                  HCompare::Bias bias) {
+  HInstruction* first = LoadLocal(instruction.VRegB(), type);
+  HInstruction* second = LoadLocal(instruction.VRegC(), type);
+  current_block_->AddInstruction(new (arena_) HCompare(type, first, second, bias));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
+template<typename T>
 void HGraphBuilder::Binop_12x(const Instruction& instruction, Primitive::Type type) {
   HInstruction* first = LoadLocal(instruction.VRegA(), type);
   HInstruction* second = LoadLocal(instruction.VRegB(), type);
@@ -313,6 +331,14 @@
 }
 
 template<typename T>
+void HGraphBuilder::Binop_12x_shift(const Instruction& instruction, Primitive::Type type) {
+  HInstruction* first = LoadLocal(instruction.VRegA(), type);
+  HInstruction* second = LoadLocal(instruction.VRegB(), Primitive::kPrimInt);
+  current_block_->AddInstruction(new (arena_) T(type, first, second));
+  UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
+}
+
+template<typename T>
 void HGraphBuilder::Binop_12x(const Instruction& instruction,
                               Primitive::Type type,
                               uint32_t dex_pc) {
@@ -1017,6 +1043,16 @@
       break;
     }
 
+    case Instruction::LONG_TO_FLOAT: {
+      Conversion_12x(instruction, Primitive::kPrimLong, Primitive::kPrimFloat);
+      break;
+    }
+
+    case Instruction::LONG_TO_DOUBLE: {
+      Conversion_12x(instruction, Primitive::kPrimLong, Primitive::kPrimDouble);
+      break;
+    }
+
     case Instruction::INT_TO_BYTE: {
       Conversion_12x(instruction, Primitive::kPrimInt, Primitive::kPrimByte);
       break;
@@ -1141,6 +1177,36 @@
       break;
     }
 
+    case Instruction::SHL_INT: {
+      Binop_23x_shift<HShl>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::SHL_LONG: {
+      Binop_23x_shift<HShl>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::SHR_INT: {
+      Binop_23x_shift<HShr>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::SHR_LONG: {
+      Binop_23x_shift<HShr>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::USHR_INT: {
+      Binop_23x_shift<HUShr>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::USHR_LONG: {
+      Binop_23x_shift<HUShr>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
     case Instruction::OR_INT: {
       Binop_23x<HOr>(instruction, Primitive::kPrimInt);
       break;
@@ -1240,6 +1306,36 @@
       break;
     }
 
+    case Instruction::SHL_INT_2ADDR: {
+      Binop_12x_shift<HShl>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::SHL_LONG_2ADDR: {
+      Binop_12x_shift<HShl>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::SHR_INT_2ADDR: {
+      Binop_12x_shift<HShr>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::SHR_LONG_2ADDR: {
+      Binop_12x_shift<HShr>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
+    case Instruction::USHR_INT_2ADDR: {
+      Binop_12x_shift<HUShr>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::USHR_LONG_2ADDR: {
+      Binop_12x_shift<HUShr>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
     case Instruction::DIV_FLOAT_2ADDR: {
       Binop_12x<HDiv>(instruction, Primitive::kPrimFloat, dex_pc);
       break;
@@ -1354,6 +1450,21 @@
       break;
     }
 
+    case Instruction::SHL_INT_LIT8: {
+      Binop_22b<HShl>(instruction, false);
+      break;
+    }
+
+    case Instruction::SHR_INT_LIT8: {
+      Binop_22b<HShr>(instruction, false);
+      break;
+    }
+
+    case Instruction::USHR_INT_LIT8: {
+      Binop_22b<HUShr>(instruction, false);
+      break;
+    }
+
     case Instruction::NEW_INSTANCE: {
       current_block_->AddInstruction(
           new (arena_) HNewInstance(dex_pc, instruction.VRegB_21c()));
@@ -1400,7 +1511,27 @@
       break;
 
     case Instruction::CMP_LONG: {
-      Binop_23x<HCompare>(instruction, Primitive::kPrimLong);
+      Binop_23x_cmp(instruction, Primitive::kPrimLong, HCompare::kNoBias);
+      break;
+    }
+
+    case Instruction::CMPG_FLOAT: {
+      Binop_23x_cmp(instruction, Primitive::kPrimFloat, HCompare::kGtBias);
+      break;
+    }
+
+    case Instruction::CMPG_DOUBLE: {
+      Binop_23x_cmp(instruction, Primitive::kPrimDouble, HCompare::kGtBias);
+      break;
+    }
+
+    case Instruction::CMPL_FLOAT: {
+      Binop_23x_cmp(instruction, Primitive::kPrimFloat, HCompare::kLtBias);
+      break;
+    }
+
+    case Instruction::CMPL_DOUBLE: {
+      Binop_23x_cmp(instruction, Primitive::kPrimDouble, HCompare::kLtBias);
       break;
     }
 
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 897bcec..25781b0 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -105,12 +105,20 @@
   void Binop_23x(const Instruction& instruction, Primitive::Type type, uint32_t dex_pc);
 
   template<typename T>
+  void Binop_23x_shift(const Instruction& instruction, Primitive::Type type);
+
+  void Binop_23x_cmp(const Instruction& instruction, Primitive::Type type, HCompare::Bias bias);
+
+  template<typename T>
   void Binop_12x(const Instruction& instruction, Primitive::Type type);
 
   template<typename T>
   void Binop_12x(const Instruction& instruction, Primitive::Type type, uint32_t dex_pc);
 
   template<typename T>
+  void Binop_12x_shift(const Instruction& instruction, Primitive::Type type);
+
+  template<typename T>
   void Binop_22b(const Instruction& instruction, bool reverse);
 
   template<typename T>
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 4d71cb7..e581af2 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -71,11 +71,7 @@
     }
   }
   GenerateSlowPaths();
-
-  size_t code_size = GetAssembler()->CodeSize();
-  uint8_t* buffer = allocator->Allocate(code_size);
-  MemoryRegion code(buffer, code_size);
-  GetAssembler()->FinalizeInstructions(code);
+  Finalize(allocator);
 }
 
 void CodeGenerator::CompileOptimized(CodeAllocator* allocator) {
@@ -97,9 +93,13 @@
     }
   }
   GenerateSlowPaths();
+  Finalize(allocator);
+}
 
+void CodeGenerator::Finalize(CodeAllocator* allocator) {
   size_t code_size = GetAssembler()->CodeSize();
   uint8_t* buffer = allocator->Allocate(code_size);
+
   MemoryRegion code(buffer, code_size);
   GetAssembler()->FinalizeInstructions(code);
 }
@@ -228,7 +228,8 @@
       DCHECK(!blocked_fpu_registers_[loc.reg()]);
       blocked_fpu_registers_[loc.reg()] = true;
     } else {
-      DCHECK_EQ(loc.GetPolicy(), Location::kRequiresRegister);
+      DCHECK(loc.GetPolicy() == Location::kRequiresRegister
+             || loc.GetPolicy() == Location::kRequiresFpuRegister);
     }
   }
 
@@ -259,10 +260,21 @@
   for (size_t i = 0, e = locations->GetTempCount(); i < e; ++i) {
     Location loc = locations->GetTemp(i);
     if (loc.IsUnallocated()) {
-      DCHECK_EQ(loc.GetPolicy(), Location::kRequiresRegister);
-      // TODO: Adjust handling of temps. We currently consider temps to use
-      // core registers. They may also use floating point registers at some point.
-      loc = AllocateFreeRegister(Primitive::kPrimInt);
+      switch (loc.GetPolicy()) {
+        case Location::kRequiresRegister:
+          // Allocate a core register (large enough to fit a 32-bit integer).
+          loc = AllocateFreeRegister(Primitive::kPrimInt);
+          break;
+
+        case Location::kRequiresFpuRegister:
+          // Allocate an FPU register (large enough to fit a 64-bit double).
+          loc = AllocateFreeRegister(Primitive::kPrimDouble);
+          break;
+
+        default:
+          LOG(FATAL) << "Unexpected policy for temporary location "
+                     << loc.GetPolicy();
+      }
       locations->SetTempAt(i, loc);
     }
   }
@@ -589,12 +601,14 @@
       if (locations->RegisterContainsObject(i)) {
         locations->SetStackBit(stack_offset / kVRegSize);
       }
+      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
       stack_offset += SaveCoreRegister(stack_offset, i);
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
     if (register_set->ContainsFloatingPointRegister(i)) {
+      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
       stack_offset += SaveFloatingPointRegister(stack_offset, i);
     }
   }
@@ -605,12 +619,14 @@
   size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
     if (register_set->ContainsCoreRegister(i)) {
+      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
       stack_offset += RestoreCoreRegister(stack_offset, i);
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
     if (register_set->ContainsFloatingPointRegister(i)) {
+      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
       stack_offset += RestoreFloatingPointRegister(stack_offset, i);
     }
   }
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index f906eb8..4c0d3ea 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -30,6 +30,11 @@
 static size_t constexpr kVRegSize = 4;
 static size_t constexpr kUninitializedFrameSize = 0;
 
+// Binary encoding of 2^32 for type double.
+static int64_t constexpr k2Pow32EncodingForDouble = INT64_C(0x41F0000000000000);
+// Binary encoding of 2^31 for type double.
+static int64_t constexpr k2Pow31EncodingForDouble = INT64_C(0x41E0000000000000);
+
 class Assembler;
 class CodeGenerator;
 class DexCompilationUnit;
@@ -85,6 +90,7 @@
   }
 
   virtual void Initialize() = 0;
+  virtual void Finalize(CodeAllocator* allocator);
   virtual void GenerateFrameEntry() = 0;
   virtual void GenerateFrameExit() = 0;
   virtual void Bind(HBasicBlock* block) = 0;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index dc0a829..5b2be2e 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -495,7 +495,8 @@
         codegen_(codegen) {}
 
 void CodeGeneratorARM::GenerateFrameEntry() {
-  bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
+  bool skip_overflow_check =
+      IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
   if (!skip_overflow_check) {
     if (kExplicitStackOverflowCheck) {
       SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathARM();
@@ -655,26 +656,26 @@
   }
   if (destination.IsRegister()) {
     if (source.IsRegister()) {
-      __ Mov(destination.As<Register>(), source.As<Register>());
+      __ Mov(destination.AsRegister<Register>(), source.AsRegister<Register>());
     } else if (source.IsFpuRegister()) {
-      __ vmovrs(destination.As<Register>(), source.As<SRegister>());
+      __ vmovrs(destination.AsRegister<Register>(), source.AsFpuRegister<SRegister>());
     } else {
-      __ LoadFromOffset(kLoadWord, destination.As<Register>(), SP, source.GetStackIndex());
+      __ LoadFromOffset(kLoadWord, destination.AsRegister<Register>(), SP, source.GetStackIndex());
     }
   } else if (destination.IsFpuRegister()) {
     if (source.IsRegister()) {
-      __ vmovsr(destination.As<SRegister>(), source.As<Register>());
+      __ vmovsr(destination.AsFpuRegister<SRegister>(), source.AsRegister<Register>());
     } else if (source.IsFpuRegister()) {
-      __ vmovs(destination.As<SRegister>(), source.As<SRegister>());
+      __ vmovs(destination.AsFpuRegister<SRegister>(), source.AsFpuRegister<SRegister>());
     } else {
-      __ LoadSFromOffset(destination.As<SRegister>(), SP, source.GetStackIndex());
+      __ LoadSFromOffset(destination.AsFpuRegister<SRegister>(), SP, source.GetStackIndex());
     }
   } else {
     DCHECK(destination.IsStackSlot()) << destination;
     if (source.IsRegister()) {
-      __ StoreToOffset(kStoreWord, source.As<Register>(), SP, destination.GetStackIndex());
+      __ StoreToOffset(kStoreWord, source.AsRegister<Register>(), SP, destination.GetStackIndex());
     } else if (source.IsFpuRegister()) {
-      __ StoreSToOffset(source.As<SRegister>(), SP, destination.GetStackIndex());
+      __ StoreSToOffset(source.AsFpuRegister<SRegister>(), SP, destination.GetStackIndex());
     } else {
       DCHECK(source.IsStackSlot()) << source;
       __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
@@ -689,19 +690,25 @@
   }
   if (destination.IsRegisterPair()) {
     if (source.IsRegisterPair()) {
-      __ Mov(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
-      __ Mov(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
+      EmitParallelMoves(
+          Location::RegisterLocation(source.AsRegisterPairHigh<Register>()),
+          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()),
+          Location::RegisterLocation(source.AsRegisterPairLow<Register>()),
+          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()));
     } else if (source.IsFpuRegister()) {
       UNIMPLEMENTED(FATAL);
     } else if (source.IsQuickParameter()) {
       uint16_t register_index = source.GetQuickParameterRegisterIndex();
       uint16_t stack_index = source.GetQuickParameterStackIndex();
       InvokeDexCallingConvention calling_convention;
-      __ Mov(destination.AsRegisterPairLow<Register>(),
-             calling_convention.GetRegisterAt(register_index));
-      __ LoadFromOffset(kLoadWord, destination.AsRegisterPairHigh<Register>(),
-             SP, calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize());
+      EmitParallelMoves(
+          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
+          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()),
+          Location::StackSlot(
+              calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize()),
+          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()));
     } else {
+      // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
       if (destination.AsRegisterPairLow<Register>() == R1) {
         DCHECK_EQ(destination.AsRegisterPairHigh<Register>(), R2);
@@ -725,22 +732,21 @@
     uint16_t register_index = destination.GetQuickParameterRegisterIndex();
     uint16_t stack_index = destination.GetQuickParameterStackIndex();
     if (source.IsRegisterPair()) {
-      __ Mov(calling_convention.GetRegisterAt(register_index),
-             source.AsRegisterPairLow<Register>());
-      __ StoreToOffset(kStoreWord, source.AsRegisterPairHigh<Register>(),
-             SP, calling_convention.GetStackOffsetOf(stack_index + 1));
+      UNIMPLEMENTED(FATAL);
     } else if (source.IsFpuRegister()) {
       UNIMPLEMENTED(FATAL);
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ LoadFromOffset(
-          kLoadWord, calling_convention.GetRegisterAt(register_index), SP, source.GetStackIndex());
-      __ LoadFromOffset(kLoadWord, R0, SP, source.GetHighStackIndex(kArmWordSize));
-      __ StoreToOffset(kStoreWord, R0, SP, calling_convention.GetStackOffsetOf(stack_index + 1));
+      EmitParallelMoves(
+          Location::StackSlot(source.GetStackIndex()),
+          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
+          Location::StackSlot(source.GetHighStackIndex(kArmWordSize)),
+          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index + 1)));
     }
   } else {
     DCHECK(destination.IsDoubleStackSlot());
     if (source.IsRegisterPair()) {
+      // No conflict possible, so just do the moves.
       if (source.AsRegisterPairLow<Register>() == R1) {
         DCHECK_EQ(source.AsRegisterPairHigh<Register>(), R2);
         __ StoreToOffset(kStoreWord, R1, SP, destination.GetStackIndex());
@@ -753,21 +759,24 @@
       InvokeDexCallingConvention calling_convention;
       uint16_t register_index = source.GetQuickParameterRegisterIndex();
       uint16_t stack_index = source.GetQuickParameterStackIndex();
+      // Just move the low part. The only time a source is a quick parameter is
+      // when moving the parameter to its stack locations. And the (Java) caller
+      // of this method has already done that.
       __ StoreToOffset(kStoreWord, calling_convention.GetRegisterAt(register_index),
-             SP, destination.GetStackIndex());
-      __ LoadFromOffset(kLoadWord, R0,
-             SP, calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize());
-      __ StoreToOffset(kStoreWord, R0, SP, destination.GetHighStackIndex(kArmWordSize));
+                       SP, destination.GetStackIndex());
+      DCHECK_EQ(calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize(),
+                static_cast<size_t>(destination.GetHighStackIndex(kArmWordSize)));
     } else if (source.IsFpuRegisterPair()) {
       __ StoreDToOffset(FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()),
                         SP,
                         destination.GetStackIndex());
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
-      __ LoadFromOffset(kLoadWord, IP, SP, source.GetHighStackIndex(kArmWordSize));
-      __ StoreToOffset(kStoreWord, IP, SP, destination.GetHighStackIndex(kArmWordSize));
+      EmitParallelMoves(
+          Location::StackSlot(source.GetStackIndex()),
+          Location::StackSlot(destination.GetStackIndex()),
+          Location::StackSlot(source.GetHighStackIndex(kArmWordSize)),
+          Location::StackSlot(destination.GetHighStackIndex(kArmWordSize)));
     }
   }
 }
@@ -783,7 +792,7 @@
     if (const_to_move->IsIntConstant()) {
       int32_t value = const_to_move->AsIntConstant()->GetValue();
       if (location.IsRegister()) {
-        __ LoadImmediate(location.As<Register>(), value);
+        __ LoadImmediate(location.AsRegister<Register>(), value);
       } else {
         DCHECK(location.IsStackSlot());
         __ LoadImmediate(IP, value);
@@ -933,27 +942,27 @@
     if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
       // Condition has been materialized, compare the output to 0
       DCHECK(if_instr->GetLocations()->InAt(0).IsRegister());
-      __ cmp(if_instr->GetLocations()->InAt(0).As<Register>(),
+      __ cmp(if_instr->GetLocations()->InAt(0).AsRegister<Register>(),
              ShifterOperand(0));
       __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()), NE);
     } else {
       // Condition has not been materialized, use its inputs as the
       // comparison and its condition as the branch condition.
       LocationSummary* locations = cond->GetLocations();
+      Register left = locations->InAt(0).AsRegister<Register>();
       if (locations->InAt(1).IsRegister()) {
-        __ cmp(locations->InAt(0).As<Register>(),
-               ShifterOperand(locations->InAt(1).As<Register>()));
+        __ cmp(left, ShifterOperand(locations->InAt(1).AsRegister<Register>()));
       } else {
         DCHECK(locations->InAt(1).IsConstant());
         int32_t value =
             locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
         ShifterOperand operand;
-        if (ShifterOperand::CanHoldArm(value, &operand)) {
-          __ cmp(locations->InAt(0).As<Register>(), ShifterOperand(value));
+        if (GetAssembler()->ShifterOperandCanHold(R0, left, CMP, value, &operand)) {
+          __ cmp(left, operand);
         } else {
           Register temp = IP;
           __ LoadImmediate(temp, value);
-          __ cmp(locations->InAt(0).As<Register>(), ShifterOperand(temp));
+          __ cmp(left, ShifterOperand(temp));
         }
       }
       __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()),
@@ -979,27 +988,27 @@
 
 void InstructionCodeGeneratorARM::VisitCondition(HCondition* comp) {
   if (!comp->NeedsMaterialization()) return;
-
   LocationSummary* locations = comp->GetLocations();
+  Register left = locations->InAt(0).AsRegister<Register>();
+  // Compare input 0 with input 1, then write 1 or 0 to the output register accordingly.
   if (locations->InAt(1).IsRegister()) {
-    __ cmp(locations->InAt(0).As<Register>(),
-           ShifterOperand(locations->InAt(1).As<Register>()));
+    __ cmp(left, ShifterOperand(locations->InAt(1).AsRegister<Register>()));
   } else {
     DCHECK(locations->InAt(1).IsConstant());
     int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
     ShifterOperand operand;
-    if (ShifterOperand::CanHoldArm(value, &operand)) {
-      __ cmp(locations->InAt(0).As<Register>(), ShifterOperand(value));
+    if (GetAssembler()->ShifterOperandCanHold(R0, left, CMP, value, &operand)) {
+      __ cmp(left, operand);
     } else {
       Register temp = IP;
       __ LoadImmediate(temp, value);
-      __ cmp(locations->InAt(0).As<Register>(), ShifterOperand(temp));
+      __ cmp(left, ShifterOperand(temp));
     }
   }
   __ it(ARMCondition(comp->GetCondition()), kItElse);
-  __ mov(locations->Out().As<Register>(), ShifterOperand(1),
+  __ mov(locations->Out().AsRegister<Register>(), ShifterOperand(1),
          ARMCondition(comp->GetCondition()));
-  __ mov(locations->Out().As<Register>(), ShifterOperand(0),
+  __ mov(locations->Out().AsRegister<Register>(), ShifterOperand(0),
          ARMOppositeCondition(comp->GetCondition()));
 }
 
@@ -1169,7 +1178,7 @@
 }
 
 void InstructionCodeGeneratorARM::VisitInvokeStatic(HInvokeStatic* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).As<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -1189,7 +1198,7 @@
   // LR = temp[offset_of_quick_compiled_code]
   __ LoadFromOffset(kLoadWord, LR, temp,
                      mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-                         kArmPointerSize).Int32Value());
+                         kArmWordSize).Int32Value());
   // LR()
   __ blx(LR);
 
@@ -1216,7 +1225,7 @@
 }
 
 void InstructionCodeGeneratorARM::VisitInvokeVirtual(HInvokeVirtual* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).As<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
   uint32_t method_offset = mirror::Class::EmbeddedVTableOffset().Uint32Value() +
           invoke->GetVTableIndex() * sizeof(mirror::Class::VTableEntry);
   LocationSummary* locations = invoke->GetLocations();
@@ -1227,11 +1236,11 @@
     __ LoadFromOffset(kLoadWord, temp, SP, receiver.GetStackIndex());
     __ LoadFromOffset(kLoadWord, temp, temp, class_offset);
   } else {
-    __ LoadFromOffset(kLoadWord, temp, receiver.As<Register>(), class_offset);
+    __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   }
   // temp = temp->GetMethodAt(method_offset);
   uint32_t entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kArmPointerSize).Int32Value();
+      kArmWordSize).Int32Value();
   __ LoadFromOffset(kLoadWord, temp, temp, method_offset);
   // LR = temp->GetEntryPoint();
   __ LoadFromOffset(kLoadWord, LR, temp, entry_point);
@@ -1249,7 +1258,7 @@
 
 void InstructionCodeGeneratorARM::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  Register temp = invoke->GetLocations()->GetTemp(0).As<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableOffset().Uint32Value() +
           (invoke->GetImtIndex() % mirror::Class::kImtSize) * sizeof(mirror::Class::ImTableEntry);
   LocationSummary* locations = invoke->GetLocations();
@@ -1257,18 +1266,19 @@
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
 
   // Set the hidden argument.
-  __ LoadImmediate(invoke->GetLocations()->GetTemp(1).As<Register>(), invoke->GetDexMethodIndex());
+  __ LoadImmediate(invoke->GetLocations()->GetTemp(1).AsRegister<Register>(),
+                   invoke->GetDexMethodIndex());
 
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ LoadFromOffset(kLoadWord, temp, SP, receiver.GetStackIndex());
     __ LoadFromOffset(kLoadWord, temp, temp, class_offset);
   } else {
-    __ LoadFromOffset(kLoadWord, temp, receiver.As<Register>(), class_offset);
+    __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset);
   }
   // temp = temp->GetImtEntryAt(method_offset);
   uint32_t entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kArmPointerSize).Int32Value();
+      kArmWordSize).Int32Value();
   __ LoadFromOffset(kLoadWord, temp, temp, method_offset);
   // LR = temp->GetEntryPoint();
   __ LoadFromOffset(kLoadWord, LR, temp, entry_point);
@@ -1308,7 +1318,7 @@
   switch (neg->GetResultType()) {
     case Primitive::kPrimInt:
       DCHECK(in.IsRegister());
-      __ rsb(out.As<Register>(), in.As<Register>(), ShifterOperand(0));
+      __ rsb(out.AsRegister<Register>(), in.AsRegister<Register>(), ShifterOperand(0));
       break;
 
     case Primitive::kPrimLong:
@@ -1334,7 +1344,7 @@
 
     case Primitive::kPrimFloat:
       DCHECK(in.IsFpuRegister());
-      __ vnegs(out.As<SRegister>(), in.As<SRegister>());
+      __ vnegs(out.AsFpuRegister<SRegister>(), in.AsFpuRegister<SRegister>());
       break;
 
     case Primitive::kPrimDouble:
@@ -1353,6 +1363,7 @@
       new (GetGraph()->GetArena()) LocationSummary(conversion, LocationSummary::kNoCall);
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
+  DCHECK_NE(result_type, input_type);
   switch (result_type) {
     case Primitive::kPrimByte:
       switch (input_type) {
@@ -1434,7 +1445,6 @@
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
-        case Primitive::kPrimChar:
           // Processing a Dex `int-to-char' instruction.
           locations->SetInAt(0, Location::RequiresRegister());
           locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -1458,6 +1468,15 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-float' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1481,6 +1500,14 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1504,6 +1531,7 @@
   Location in = locations->InAt(0);
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
+  DCHECK_NE(result_type, input_type);
   switch (result_type) {
     case Primitive::kPrimByte:
       switch (input_type) {
@@ -1511,7 +1539,7 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-byte' instruction.
-          __ sbfx(out.As<Register>(), in.As<Register>(), 0, 8);
+          __ sbfx(out.AsRegister<Register>(), in.AsRegister<Register>(), 0, 8);
           break;
 
         default:
@@ -1526,7 +1554,7 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-short' instruction.
-          __ sbfx(out.As<Register>(), in.As<Register>(), 0, 16);
+          __ sbfx(out.AsRegister<Register>(), in.AsRegister<Register>(), 0, 16);
           break;
 
         default:
@@ -1541,14 +1569,14 @@
           // Processing a Dex `long-to-int' instruction.
           DCHECK(out.IsRegister());
           if (in.IsRegisterPair()) {
-            __ Mov(out.As<Register>(), in.AsRegisterPairLow<Register>());
+            __ Mov(out.AsRegister<Register>(), in.AsRegisterPairLow<Register>());
           } else if (in.IsDoubleStackSlot()) {
-            __ LoadFromOffset(kLoadWord, out.As<Register>(), SP, in.GetStackIndex());
+            __ LoadFromOffset(kLoadWord, out.AsRegister<Register>(), SP, in.GetStackIndex());
           } else {
             DCHECK(in.IsConstant());
             DCHECK(in.GetConstant()->IsLongConstant());
             int64_t value = in.GetConstant()->AsLongConstant()->GetValue();
-            __ LoadImmediate(out.As<Register>(), static_cast<int32_t>(value));
+            __ LoadImmediate(out.AsRegister<Register>(), static_cast<int32_t>(value));
           }
           break;
 
@@ -1573,7 +1601,7 @@
           // Processing a Dex `int-to-long' instruction.
           DCHECK(out.IsRegisterPair());
           DCHECK(in.IsRegister());
-          __ Mov(out.AsRegisterPairLow<Register>(), in.As<Register>());
+          __ Mov(out.AsRegisterPairLow<Register>(), in.AsRegister<Register>());
           // Sign extension.
           __ Asr(out.AsRegisterPairHigh<Register>(),
                  out.AsRegisterPairLow<Register>(),
@@ -1597,9 +1625,8 @@
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
-        case Primitive::kPrimChar:
           // Processing a Dex `int-to-char' instruction.
-          __ ubfx(out.As<Register>(), in.As<Register>(), 0, 16);
+          __ ubfx(out.AsRegister<Register>(), in.AsRegister<Register>(), 0, 16);
           break;
 
         default:
@@ -1615,12 +1642,53 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar: {
           // Processing a Dex `int-to-float' instruction.
-          __ vmovsr(out.As<SRegister>(), in.As<Register>());
-          __ vcvtsi(out.As<SRegister>(), out.As<SRegister>());
+          __ vmovsr(out.AsFpuRegister<SRegister>(), in.AsRegister<Register>());
+          __ vcvtsi(out.AsFpuRegister<SRegister>(), out.AsFpuRegister<SRegister>());
           break;
         }
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-float' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          SRegister output = out.AsFpuRegister<SRegister>();
+          Register constant_low = locations->GetTemp(0).AsRegister<Register>();
+          Register constant_high = locations->GetTemp(1).AsRegister<Register>();
+          SRegister temp1_s = locations->GetTemp(2).AsFpuRegisterPairLow<SRegister>();
+          DRegister temp1_d = FromLowSToD(temp1_s);
+          SRegister temp2_s = locations->GetTemp(3).AsFpuRegisterPairLow<SRegister>();
+          DRegister temp2_d = FromLowSToD(temp2_s);
+
+          // Operations use doubles for precision reasons (each 32-bit
+          // half of a long fits in the 53-bit mantissa of a double,
+          // but not in the 24-bit mantissa of a float).  This is
+          // especially important for the low bits.  The result is
+          // eventually converted to float.
+
+          // temp1_d = int-to-double(high)
+          __ vmovsr(temp1_s, high);
+          __ vcvtdi(temp1_d, temp1_s);
+          // Using vmovd to load the `k2Pow32EncodingForDouble` constant
+          // as an immediate value into `temp2_d` does not work, as
+          // this instruction only transfers 8 significant bits of its
+          // immediate operand.  Instead, use two 32-bit core
+          // registers to load `k2Pow32EncodingForDouble` into
+          // `temp2_d`.
+          __ LoadImmediate(constant_low, Low32Bits(k2Pow32EncodingForDouble));
+          __ LoadImmediate(constant_high, High32Bits(k2Pow32EncodingForDouble));
+          __ vmovdrr(temp2_d, constant_low, constant_high);
+          // temp1_d = temp1_d * 2^32
+          __ vmuld(temp1_d, temp1_d, temp2_d);
+          // temp2_d = unsigned-to-double(low)
+          __ vmovsr(temp2_s, low);
+          __ vcvtdu(temp2_d, temp2_s);
+          // temp1_d = temp1_d + temp2_d
+          __ vaddd(temp1_d, temp1_d, temp2_d);
+          // output = double-to-float(temp1_d);
+          __ vcvtsd(output, temp1_d);
+          break;
+        }
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1639,13 +1707,44 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar: {
           // Processing a Dex `int-to-double' instruction.
-          __ vmovsr(out.AsFpuRegisterPairLow<SRegister>(), in.As<Register>());
+          __ vmovsr(out.AsFpuRegisterPairLow<SRegister>(), in.AsRegister<Register>());
           __ vcvtdi(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()),
                     out.AsFpuRegisterPairLow<SRegister>());
           break;
         }
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-double' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          SRegister out_s = out.AsFpuRegisterPairLow<SRegister>();
+          DRegister out_d = FromLowSToD(out_s);
+          Register constant_low = locations->GetTemp(0).AsRegister<Register>();
+          Register constant_high = locations->GetTemp(1).AsRegister<Register>();
+          SRegister temp_s = locations->GetTemp(2).AsFpuRegisterPairLow<SRegister>();
+          DRegister temp_d = FromLowSToD(temp_s);
+
+          // out_d = int-to-double(high)
+          __ vmovsr(out_s, high);
+          __ vcvtdi(out_d, out_s);
+          // Using vmovd to load the `k2Pow32EncodingForDouble` constant
+          // as an immediate value into `temp_d` does not work, as
+          // this instruction only transfers 8 significant bits of its
+          // immediate operand.  Instead, use two 32-bit core
+          // registers to load `k2Pow32EncodingForDouble` into `temp_d`.
+          __ LoadImmediate(constant_low, Low32Bits(k2Pow32EncodingForDouble));
+          __ LoadImmediate(constant_high, High32Bits(k2Pow32EncodingForDouble));
+          __ vmovdrr(temp_d, constant_low, constant_high);
+          // out_d = out_d * 2^32
+          __ vmuld(out_d, out_d, temp_d);
+          // temp_d = unsigned-to-double(low)
+          __ vmovsr(temp_s, low);
+          __ vcvtdu(temp_d, temp_s);
+          // out_d = out_d + temp_d
+          __ vaddd(out_d, out_d, temp_d);
+          break;
+        }
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1697,10 +1796,12 @@
   switch (add->GetResultType()) {
     case Primitive::kPrimInt:
       if (second.IsRegister()) {
-        __ add(out.As<Register>(), first.As<Register>(), ShifterOperand(second.As<Register>()));
+        __ add(out.AsRegister<Register>(),
+               first.AsRegister<Register>(),
+               ShifterOperand(second.AsRegister<Register>()));
       } else {
-        __ AddConstant(out.As<Register>(),
-                       first.As<Register>(),
+        __ AddConstant(out.AsRegister<Register>(),
+                       first.AsRegister<Register>(),
                        second.GetConstant()->AsIntConstant()->GetValue());
       }
       break;
@@ -1715,7 +1816,9 @@
       break;
 
     case Primitive::kPrimFloat:
-      __ vadds(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
+      __ vadds(out.AsFpuRegister<SRegister>(),
+               first.AsFpuRegister<SRegister>(),
+               second.AsFpuRegister<SRegister>());
       break;
 
     case Primitive::kPrimDouble:
@@ -1761,10 +1864,12 @@
   switch (sub->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ sub(out.As<Register>(), first.As<Register>(), ShifterOperand(second.As<Register>()));
+        __ sub(out.AsRegister<Register>(),
+               first.AsRegister<Register>(),
+               ShifterOperand(second.AsRegister<Register>()));
       } else {
-        __ AddConstant(out.As<Register>(),
-                       first.As<Register>(),
+        __ AddConstant(out.AsRegister<Register>(),
+                       first.AsRegister<Register>(),
                        -second.GetConstant()->AsIntConstant()->GetValue());
       }
       break;
@@ -1781,7 +1886,9 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ vsubs(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
+      __ vsubs(out.AsFpuRegister<SRegister>(),
+               first.AsFpuRegister<SRegister>(),
+               second.AsFpuRegister<SRegister>());
       break;
     }
 
@@ -1830,7 +1937,9 @@
   Location second = locations->InAt(1);
   switch (mul->GetResultType()) {
     case Primitive::kPrimInt: {
-      __ mul(out.As<Register>(), first.As<Register>(), second.As<Register>());
+      __ mul(out.AsRegister<Register>(),
+             first.AsRegister<Register>(),
+             second.AsRegister<Register>());
       break;
     }
     case Primitive::kPrimLong: {
@@ -1865,7 +1974,9 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ vmuls(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
+      __ vmuls(out.AsFpuRegister<SRegister>(),
+               first.AsFpuRegister<SRegister>(),
+               second.AsFpuRegister<SRegister>());
       break;
     }
 
@@ -1925,7 +2036,9 @@
 
   switch (div->GetResultType()) {
     case Primitive::kPrimInt: {
-      __ sdiv(out.As<Register>(), first.As<Register>(), second.As<Register>());
+      __ sdiv(out.AsRegister<Register>(),
+              first.AsRegister<Register>(),
+              second.AsRegister<Register>());
       break;
     }
 
@@ -1943,7 +2056,9 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ vdivs(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
+      __ vdivs(out.AsFpuRegister<SRegister>(),
+               first.AsFpuRegister<SRegister>(),
+               second.AsFpuRegister<SRegister>());
       break;
     }
 
@@ -2002,16 +2117,16 @@
 
   switch (rem->GetResultType()) {
     case Primitive::kPrimInt: {
-      Register reg1 = first.As<Register>();
-      Register reg2 = second.As<Register>();
-      Register temp = locations->GetTemp(0).As<Register>();
+      Register reg1 = first.AsRegister<Register>();
+      Register reg2 = second.AsRegister<Register>();
+      Register temp = locations->GetTemp(0).AsRegister<Register>();
 
       // temp = reg1 / reg2  (integer division)
       // temp = temp * reg2
       // dest = reg1 - temp
       __ sdiv(temp, reg1, reg2);
       __ mul(temp, temp, reg2);
-      __ sub(out.As<Register>(), reg1, ShifterOperand(temp));
+      __ sub(out.AsRegister<Register>(), reg1, ShifterOperand(temp));
       break;
     }
 
@@ -2058,7 +2173,7 @@
   switch (instruction->GetType()) {
     case Primitive::kPrimInt: {
       if (value.IsRegister()) {
-        __ cmp(value.As<Register>(), ShifterOperand(0));
+        __ cmp(value.AsRegister<Register>(), ShifterOperand(0));
         __ b(slow_path->GetEntryLabel(), EQ);
       } else {
         DCHECK(value.IsConstant()) << value;
@@ -2087,6 +2202,124 @@
   }
 }
 
+void LocationsBuilderARM::HandleShift(HBinaryOperation* op) {
+  DCHECK(op->IsShl() || op->IsShr() || op->IsUShr());
+  // Only long shifts are marked as calls; their operands use runtime-convention registers.
+  LocationSummary::CallKind call_kind = op->GetResultType() == Primitive::kPrimLong
+      ? LocationSummary::kCall
+      : LocationSummary::kNoCall;
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(op, call_kind);
+
+  switch (op->GetResultType()) {
+    case Primitive::kPrimInt: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(op->InputAt(1)));  // Shift count.
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    }
+    case Primitive::kPrimLong: {
+      InvokeRuntimeCallingConvention calling_convention;
+      locations->SetInAt(0, Location::RegisterPairLocation(
+          calling_convention.GetRegisterAt(0), calling_convention.GetRegisterAt(1)));
+      locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
+      // The runtime helper puts the output in R0,R2. NOTE(review): confirm the high half is R2.
+      locations->SetOut(Location::RegisterPairLocation(R0, R2));
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected operation type " << op->GetResultType();
+  }
+}
+
+void InstructionCodeGeneratorARM::HandleShift(HBinaryOperation* op) {
+  DCHECK(op->IsShl() || op->IsShr() || op->IsUShr());
+  // Emits code for Shl/Shr/UShr: int shifts inline, long shifts via a quick entry point.
+  LocationSummary* locations = op->GetLocations();
+  Location out = locations->Out();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+
+  Primitive::Type type = op->GetResultType();
+  switch (type) {
+    case Primitive::kPrimInt: {
+      Register out_reg = out.AsRegister<Register>();
+      Register first_reg = first.AsRegister<Register>();
+      // ARM doesn't mask the shift count, so mask it into scratch IP (inputs must stay intact).
+      if (second.IsRegister()) {
+        Register second_reg = second.AsRegister<Register>();
+        __ and_(IP, second_reg, ShifterOperand(kMaxIntShiftValue));
+        if (op->IsShl()) {
+          __ Lsl(out_reg, first_reg, IP);
+        } else if (op->IsShr()) {
+          __ Asr(out_reg, first_reg, IP);
+        } else {
+          __ Lsr(out_reg, first_reg, IP);
+        }
+      } else {
+        int32_t cst = second.GetConstant()->AsIntConstant()->GetValue();
+        uint32_t shift_value = static_cast<uint32_t>(cst & kMaxIntShiftValue);
+        if (shift_value == 0) {  // ARM does not support shifting with 0 immediate.
+          __ Mov(out_reg, first_reg);
+        } else if (op->IsShl()) {
+          __ Lsl(out_reg, first_reg, shift_value);
+        } else if (op->IsShr()) {
+          __ Asr(out_reg, first_reg, shift_value);
+        } else {
+          __ Lsr(out_reg, first_reg, shift_value);
+        }
+      }
+      break;
+    }
+    case Primitive::kPrimLong: {
+      // TODO: Inline the assembly instead of calling the runtime.
+      InvokeRuntimeCallingConvention calling_convention;
+      DCHECK_EQ(calling_convention.GetRegisterAt(0), first.AsRegisterPairLow<Register>());
+      DCHECK_EQ(calling_convention.GetRegisterAt(1), first.AsRegisterPairHigh<Register>());
+      DCHECK_EQ(calling_convention.GetRegisterAt(2), second.AsRegister<Register>());
+      DCHECK_EQ(R0, out.AsRegisterPairLow<Register>());
+      DCHECK_EQ(R2, out.AsRegisterPairHigh<Register>());
+      // Pick the quick entry point matching the shift kind, then load it from TR and call it.
+      int32_t entry_point_offset;
+      if (op->IsShl()) {
+        entry_point_offset = QUICK_ENTRY_POINT(pShlLong);
+      } else if (op->IsShr()) {
+        entry_point_offset = QUICK_ENTRY_POINT(pShrLong);
+      } else {
+        entry_point_offset = QUICK_ENTRY_POINT(pUshrLong);
+      }
+      __ LoadFromOffset(kLoadWord, LR, TR, entry_point_offset);
+      __ blx(LR);
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected operation type " << type;
+  }
+}
+
+void LocationsBuilderARM::VisitShl(HShl* shl) {
+  HandleShift(shl);  // Shl, Shr and UShr share one location-building routine.
+}
+
+void InstructionCodeGeneratorARM::VisitShl(HShl* shl) {
+  HandleShift(shl);  // Shl, Shr and UShr share one code-emitting routine.
+}
+
+void LocationsBuilderARM::VisitShr(HShr* shr) {
+  HandleShift(shr);  // Same shared routine as VisitShl.
+}
+
+void InstructionCodeGeneratorARM::VisitShr(HShr* shr) {
+  HandleShift(shr);  // Same shared routine as VisitShl.
+}
+
+void LocationsBuilderARM::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);  // Same shared routine as VisitShl.
+}
+
+void InstructionCodeGeneratorARM::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);  // Same shared routine as VisitShl.
+}
+
 void LocationsBuilderARM::VisitNewInstance(HNewInstance* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
@@ -2152,11 +2385,11 @@
   Location in = locations->InAt(0);
   switch (not_->InputAt(0)->GetType()) {
     case Primitive::kPrimBoolean:
-      __ eor(out.As<Register>(), in.As<Register>(), ShifterOperand(1));
+      __ eor(out.AsRegister<Register>(), in.AsRegister<Register>(), ShifterOperand(1));
       break;
 
     case Primitive::kPrimInt:
-      __ mvn(out.As<Register>(), ShifterOperand(in.As<Register>()));
+      __ mvn(out.AsRegister<Register>(), ShifterOperand(in.AsRegister<Register>()));
       break;
 
     case Primitive::kPrimLong:
@@ -2174,44 +2407,72 @@
 void LocationsBuilderARM::VisitCompare(HCompare* compare) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(compare, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  switch (compare->InputAt(0)->GetType()) {  // Locations depend on the operand type.
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister());  // Overlaps: set before the low-word cmp.
+      break;
+    }
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());  // The -1/0/1 result is a core register.
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected type for compare operation " << compare->InputAt(0)->GetType();
+  }
 }
 
 void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) {
   LocationSummary* locations = compare->GetLocations();
-  switch (compare->InputAt(0)->GetType()) {
+  Register out = locations->Out().AsRegister<Register>();  // Receives -1, 0 or 1.
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+
+  Label less, greater, done;
+  Primitive::Type type = compare->InputAt(0)->GetType();  // Operand type, not the result type.
+  switch (type) {
     case Primitive::kPrimLong: {
-      Register output = locations->Out().As<Register>();
-      Location left = locations->InAt(0);
-      Location right = locations->InAt(1);
-      Label less, greater, done;
       __ cmp(left.AsRegisterPairHigh<Register>(),
              ShifterOperand(right.AsRegisterPairHigh<Register>()));  // Signed compare.
       __ b(&less, LT);
       __ b(&greater, GT);
-      // Do LoadImmediate before any `cmp`, as LoadImmediate might affect
-      // the status flags.
-      __ LoadImmediate(output, 0);
+      // Do LoadImmediate before the low `cmp`, as LoadImmediate might affect the status flags.
+      __ LoadImmediate(out, 0);  // `out` must not alias an input: the low words are read next.
       __ cmp(left.AsRegisterPairLow<Register>(),
              ShifterOperand(right.AsRegisterPairLow<Register>()));  // Unsigned compare.
-      __ b(&done, EQ);
-      __ b(&less, CC);
-
-      __ Bind(&greater);
-      __ LoadImmediate(output, 1);
-      __ b(&done);
-
-      __ Bind(&less);
-      __ LoadImmediate(output, -1);
-
-      __ Bind(&done);
+      break;  // The flags of the low-word compare are consumed after the switch.
+    }
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      __ LoadImmediate(out, 0);  // Preset the 'equal' result before the flags are computed.
+      if (type == Primitive::kPrimFloat) {
+        __ vcmps(left.AsFpuRegister<SRegister>(), right.AsFpuRegister<SRegister>());
+      } else {
+        __ vcmpd(FromLowSToD(left.AsFpuRegisterPairLow<SRegister>()),
+                 FromLowSToD(right.AsFpuRegisterPairLow<SRegister>()));
+      }
+      __ vmstat();  // transfer FP status register to ARM APSR.
+      __ b(compare->IsGtBias() ? &greater : &less, VS);  // VS for unordered.
       break;
     }
     default:
-      LOG(FATAL) << "Unimplemented compare type " << compare->InputAt(0)->GetType();
+      LOG(FATAL) << "Unexpected compare type " << type;
   }
+  __ b(&done, EQ);  // Equal: `out` already holds 0.
+  __ b(&less, CC);  // CC is for both: unsigned compare for longs and 'less than' for floats.
+
+  __ Bind(&greater);
+  __ LoadImmediate(out, 1);
+  __ b(&done);
+
+  __ Bind(&less);
+  __ LoadImmediate(out, -1);
+
+  __ Bind(&done);
 }
 
 void LocationsBuilderARM::VisitPhi(HPhi* instruction) {
@@ -2244,32 +2505,32 @@
 
 void InstructionCodeGeneratorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
   Primitive::Type field_type = instruction->GetFieldType();
 
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ StoreToOffset(kStoreByte, value, obj, offset);
       break;
     }
 
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ StoreToOffset(kStoreHalfword, value, obj, offset);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ StoreToOffset(kStoreWord, value, obj, offset);
       if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) {
-        Register temp = locations->GetTemp(0).As<Register>();
-        Register card = locations->GetTemp(1).As<Register>();
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+        Register card = locations->GetTemp(1).AsRegister<Register>();
         codegen_->MarkGCCard(temp, card, obj, value);
       }
       break;
@@ -2282,7 +2543,7 @@
     }
 
     case Primitive::kPrimFloat: {
-      SRegister value = locations->InAt(1).As<SRegister>();
+      SRegister value = locations->InAt(1).AsFpuRegister<SRegister>();
       __ StoreSToOffset(value, obj, offset);
       break;
     }
@@ -2308,37 +2569,37 @@
 
 void InstructionCodeGeneratorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset);
       break;
     }
 
     case Primitive::kPrimByte: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadSignedByte, out, obj, offset);
       break;
     }
 
     case Primitive::kPrimShort: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset);
       break;
     }
 
     case Primitive::kPrimChar: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadWord, out, obj, offset);
       break;
     }
@@ -2351,7 +2612,7 @@
     }
 
     case Primitive::kPrimFloat: {
-      SRegister out = locations->Out().As<SRegister>();
+      SRegister out = locations->Out().AsFpuRegister<SRegister>();
       __ LoadSFromOffset(out, obj, offset);
       break;
     }
@@ -2385,7 +2646,7 @@
   Location obj = locations->InAt(0);
 
   if (obj.IsRegister()) {
-    __ cmp(obj.As<Register>(), ShifterOperand(0));
+    __ cmp(obj.AsRegister<Register>(), ShifterOperand(0));
     __ b(slow_path->GetEntryLabel(), EQ);
   } else {
     DCHECK(obj.IsConstant()) << obj;
@@ -2404,18 +2665,19 @@
 
 void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
         __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>()));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>()));
         __ LoadFromOffset(kLoadUnsignedByte, out, IP, data_offset);
       }
       break;
@@ -2423,12 +2685,13 @@
 
     case Primitive::kPrimByte: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int8_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
         __ LoadFromOffset(kLoadSignedByte, out, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>()));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>()));
         __ LoadFromOffset(kLoadSignedByte, out, IP, data_offset);
       }
       break;
@@ -2436,12 +2699,13 @@
 
     case Primitive::kPrimShort: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int16_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
         __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_2));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2));
         __ LoadFromOffset(kLoadSignedHalfword, out, IP, data_offset);
       }
       break;
@@ -2449,12 +2713,13 @@
 
     case Primitive::kPrimChar: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
         __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_2));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2));
         __ LoadFromOffset(kLoadUnsignedHalfword, out, IP, data_offset);
       }
       break;
@@ -2464,12 +2729,13 @@
     case Primitive::kPrimNot: {
       DCHECK_EQ(sizeof(mirror::HeapReference<mirror::Object>), sizeof(int32_t));
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
         __ LoadFromOffset(kLoadWord, out, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_4));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
         __ LoadFromOffset(kLoadWord, out, IP, data_offset);
       }
       break;
@@ -2479,10 +2745,11 @@
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value();
       Location out = locations->Out();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_8));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8));
         __ LoadFromOffset(kLoadWordPair, out.AsRegisterPairLow<Register>(), IP, data_offset);
       }
       break;
@@ -2527,7 +2794,7 @@
 
 void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
   Primitive::Type value_type = instruction->GetComponentType();
   bool needs_runtime_call = locations->WillCall();
@@ -2538,12 +2805,13 @@
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
-      Register value = locations->InAt(2).As<Register>();
+      Register value = locations->InAt(2).AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
         __ StoreToOffset(kStoreByte, value, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>()));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>()));
         __ StoreToOffset(kStoreByte, value, IP, data_offset);
       }
       break;
@@ -2552,12 +2820,13 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value();
-      Register value = locations->InAt(2).As<Register>();
+      Register value = locations->InAt(2).AsRegister<Register>();
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
         __ StoreToOffset(kStoreHalfword, value, obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_2));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_2));
         __ StoreToOffset(kStoreHalfword, value, IP, data_offset);
       }
       break;
@@ -2567,24 +2836,27 @@
     case Primitive::kPrimNot: {
       if (!needs_runtime_call) {
         uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-        Register value = locations->InAt(2).As<Register>();
+        Register value = locations->InAt(2).AsRegister<Register>();
         if (index.IsConstant()) {
-          size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+          size_t offset =
+              (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
           __ StoreToOffset(kStoreWord, value, obj, offset);
         } else {
           DCHECK(index.IsRegister()) << index;
-          __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_4));
+          __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_4));
           __ StoreToOffset(kStoreWord, value, IP, data_offset);
         }
         if (needs_write_barrier) {
           DCHECK_EQ(value_type, Primitive::kPrimNot);
-          Register temp = locations->GetTemp(0).As<Register>();
-          Register card = locations->GetTemp(1).As<Register>();
+          Register temp = locations->GetTemp(0).AsRegister<Register>();
+          Register card = locations->GetTemp(1).AsRegister<Register>();
           codegen_->MarkGCCard(temp, card, obj, value);
         }
       } else {
         DCHECK_EQ(value_type, Primitive::kPrimNot);
-        codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pAputObject), instruction, instruction->GetDexPc());
+        codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pAputObject),
+                                instruction,
+                                instruction->GetDexPc());
       }
       break;
     }
@@ -2593,10 +2865,11 @@
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value();
       Location value = locations->InAt(2);
       if (index.IsConstant()) {
-        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
+        size_t offset =
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         __ StoreToOffset(kStoreWordPair, value.AsRegisterPairLow<Register>(), obj, offset);
       } else {
-        __ add(IP, obj, ShifterOperand(index.As<Register>(), LSL, TIMES_8));
+        __ add(IP, obj, ShifterOperand(index.AsRegister<Register>(), LSL, TIMES_8));
         __ StoreToOffset(kStoreWordPair, value.AsRegisterPairLow<Register>(), IP, data_offset);
       }
       break;
@@ -2622,8 +2895,8 @@
 void InstructionCodeGeneratorARM::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
-  Register obj = locations->InAt(0).As<Register>();
-  Register out = locations->Out().As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();  // Input 0: base of the load.
+  Register out = locations->Out().AsRegister<Register>();  // Receives the word at LengthOffset().
   __ LoadFromOffset(kLoadWord, out, obj, offset);
 }
 
@@ -2643,8 +2916,8 @@
       instruction, locations->InAt(0), locations->InAt(1));
   codegen_->AddSlowPath(slow_path);
 
-  Register index = locations->InAt(0).As<Register>();
-  Register length = locations->InAt(1).As<Register>();
+  Register index = locations->InAt(0).AsRegister<Register>();
+  Register length = locations->InAt(1).AsRegister<Register>();
 
   __ cmp(index, ShifterOperand(length));
   __ b(slow_path->GetEntryLabel(), CS);
@@ -2725,15 +2998,15 @@
 
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
-      __ Mov(destination.As<Register>(), source.As<Register>());
+      __ Mov(destination.AsRegister<Register>(), source.AsRegister<Register>());
     } else {
       DCHECK(destination.IsStackSlot());
-      __ StoreToOffset(kStoreWord, source.As<Register>(),
+      __ StoreToOffset(kStoreWord, source.AsRegister<Register>(),
                        SP, destination.GetStackIndex());
     }
   } else if (source.IsStackSlot()) {
     if (destination.IsRegister()) {
-      __ LoadFromOffset(kLoadWord, destination.As<Register>(),
+      __ LoadFromOffset(kLoadWord, destination.AsRegister<Register>(),
                         SP, source.GetStackIndex());
     } else {
       DCHECK(destination.IsStackSlot());
@@ -2745,7 +3018,7 @@
     DCHECK(source.GetConstant()->IsIntConstant());
     int32_t value = source.GetConstant()->AsIntConstant()->GetValue();
     if (destination.IsRegister()) {
-      __ LoadImmediate(destination.As<Register>(), value);
+      __ LoadImmediate(destination.AsRegister<Register>(), value);
     } else {
       DCHECK(destination.IsStackSlot());
       __ LoadImmediate(IP, value);
@@ -2777,15 +3050,15 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    DCHECK_NE(source.As<Register>(), IP);
-    DCHECK_NE(destination.As<Register>(), IP);
-    __ Mov(IP, source.As<Register>());
-    __ Mov(source.As<Register>(), destination.As<Register>());
-    __ Mov(destination.As<Register>(), IP);
+    DCHECK_NE(source.AsRegister<Register>(), IP);
+    DCHECK_NE(destination.AsRegister<Register>(), IP);
+    __ Mov(IP, source.AsRegister<Register>());
+    __ Mov(source.AsRegister<Register>(), destination.AsRegister<Register>());
+    __ Mov(destination.AsRegister<Register>(), IP);
   } else if (source.IsRegister() && destination.IsStackSlot()) {
-    Exchange(source.As<Register>(), destination.GetStackIndex());
+    Exchange(source.AsRegister<Register>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
-    Exchange(destination.As<Register>(), source.GetStackIndex());
+    Exchange(destination.AsRegister<Register>(), source.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
     Exchange(source.GetStackIndex(), destination.GetStackIndex());
   } else {
@@ -2811,7 +3084,7 @@
 }
 
 void InstructionCodeGeneratorARM::VisitLoadClass(HLoadClass* cls) {
-  Register out = cls->GetLocations()->Out().As<Register>();
+  Register out = cls->GetLocations()->Out().AsRegister<Register>();
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
@@ -2851,7 +3124,8 @@
   SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM(
       check->GetLoadClass(), check, check->GetDexPc(), true);
   codegen_->AddSlowPath(slow_path);
-  GenerateClassInitializationCheck(slow_path, check->GetLocations()->InAt(0).As<Register>());
+  GenerateClassInitializationCheck(slow_path,
+                                   check->GetLocations()->InAt(0).AsRegister<Register>());
 }
 
 void InstructionCodeGeneratorARM::GenerateClassInitializationCheck(
@@ -2874,37 +3148,37 @@
 
 void InstructionCodeGeneratorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register cls = locations->InAt(0).As<Register>();
+  Register cls = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadUnsignedByte, out, cls, offset);
       break;
     }
 
     case Primitive::kPrimByte: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadSignedByte, out, cls, offset);
       break;
     }
 
     case Primitive::kPrimShort: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadSignedHalfword, out, cls, offset);
       break;
     }
 
     case Primitive::kPrimChar: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadUnsignedHalfword, out, cls, offset);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ LoadFromOffset(kLoadWord, out, cls, offset);
       break;
     }
@@ -2917,7 +3191,7 @@
     }
 
     case Primitive::kPrimFloat: {
-      SRegister out = locations->Out().As<SRegister>();
+      SRegister out = locations->Out().AsFpuRegister<SRegister>();
       __ LoadSFromOffset(out, cls, offset);
       break;
     }
@@ -2950,32 +3224,32 @@
 
 void InstructionCodeGeneratorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register cls = locations->InAt(0).As<Register>();
+  Register cls = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
   Primitive::Type field_type = instruction->GetFieldType();
 
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ StoreToOffset(kStoreByte, value, cls, offset);
       break;
     }
 
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ StoreToOffset(kStoreHalfword, value, cls, offset);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ StoreToOffset(kStoreWord, value, cls, offset);
       if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) {
-        Register temp = locations->GetTemp(0).As<Register>();
-        Register card = locations->GetTemp(1).As<Register>();
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+        Register card = locations->GetTemp(1).AsRegister<Register>();
         codegen_->MarkGCCard(temp, card, cls, value);
       }
       break;
@@ -2988,7 +3262,7 @@
     }
 
     case Primitive::kPrimFloat: {
-      SRegister value = locations->InAt(1).As<SRegister>();
+      SRegister value = locations->InAt(1).AsFpuRegister<SRegister>();
       __ StoreSToOffset(value, cls, offset);
       break;
     }
@@ -3015,10 +3289,10 @@
   SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load);
   codegen_->AddSlowPath(slow_path);
 
-  Register out = load->GetLocations()->Out().As<Register>();
+  Register out = load->GetLocations()->Out().AsRegister<Register>();
   codegen_->LoadCurrentMethod(out);
-  __ LoadFromOffset(
-      kLoadWord, out, out, mirror::ArtMethod::DexCacheStringsOffset().Int32Value());
+  __ LoadFromOffset(kLoadWord, out, out, mirror::ArtMethod::DeclaringClassOffset().Int32Value());
+  __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
   __ LoadFromOffset(kLoadWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
   __ cmp(out, ShifterOperand(0));
   __ b(slow_path->GetEntryLabel(), EQ);
@@ -3032,7 +3306,7 @@
 }
 
 void InstructionCodeGeneratorARM::VisitLoadException(HLoadException* load) {
-  Register out = load->GetLocations()->Out().As<Register>();
+  Register out = load->GetLocations()->Out().AsRegister<Register>();
   int32_t offset = Thread::ExceptionOffset<kArmWordSize>().Int32Value();
   __ LoadFromOffset(kLoadWord, out, TR, offset);
   __ LoadImmediate(IP, 0);
@@ -3063,9 +3337,9 @@
 
 void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
-  Register cls = locations->InAt(1).As<Register>();
-  Register out = locations->Out().As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
+  Register cls = locations->InAt(1).AsRegister<Register>();
+  Register out = locations->Out().AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   Label done, zero;
   SlowPathCodeARM* slow_path = nullptr;
@@ -3110,9 +3384,9 @@
 
 void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
-  Register cls = locations->InAt(1).As<Register>();
-  Register temp = locations->GetTemp(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
+  Register cls = locations->InAt(1).AsRegister<Register>();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
 
   SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathARM(
@@ -3174,9 +3448,9 @@
   LocationSummary* locations = instruction->GetLocations();
 
   if (instruction->GetResultType() == Primitive::kPrimInt) {
-    Register first = locations->InAt(0).As<Register>();
-    Register second = locations->InAt(1).As<Register>();
-    Register out = locations->Out().As<Register>();
+    Register first = locations->InAt(0).AsRegister<Register>();
+    Register second = locations->InAt(1).AsRegister<Register>();
+    Register out = locations->Out().AsRegister<Register>();
     if (instruction->IsAnd()) {
       __ and_(out, first, ShifterOperand(second));
     } else if (instruction->IsOr()) {
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index acc3fd6..226e635 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -28,7 +28,8 @@
 class CodeGeneratorARM;
 class SlowPathCodeARM;
 
-static constexpr size_t kArmWordSize = 4;
+// Use a local definition to prevent copying mistakes.
+static constexpr size_t kArmWordSize = kArmPointerSize;
 
 static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 };
 static constexpr RegisterPair kParameterCorePairRegisters[] = { R1_R2, R2_R3 };
@@ -108,6 +109,7 @@
  private:
   void HandleInvoke(HInvoke* invoke);
   void HandleBitwiseOperation(HBinaryOperation* operation);
+  void HandleShift(HBinaryOperation* operation);
 
   CodeGeneratorARM* const codegen_;
   InvokeDexCallingConventionVisitor parameter_visitor_;
@@ -135,6 +137,7 @@
   void GenerateSuspendCheck(HSuspendCheck* check, HBasicBlock* successor);
   void GenerateClassInitializationCheck(SlowPathCodeARM* slow_path, Register class_reg);
   void HandleBitwiseOperation(HBinaryOperation* operation);
+  void HandleShift(HBinaryOperation* operation);
 
   ArmAssembler* const assembler_;
   CodeGeneratorARM* const codegen_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 2c586a1..a61ef2d 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -38,7 +38,7 @@
 
 namespace arm64 {
 
-// TODO: clean-up some of the constant definitions.
+static constexpr bool kExplicitStackOverflowCheck = false;
 static constexpr size_t kHeapRefSize = sizeof(mirror::HeapReference<mirror::Object>);
 static constexpr int kCurrentMethodStackOffset = 0;
 
@@ -167,7 +167,7 @@
   return MemOperand(sp, location.GetStackIndex());
 }
 
-MemOperand HeapOperand(const Register& base, size_t offset) {
+MemOperand HeapOperand(const Register& base, size_t offset = 0) {
   // A heap reference must be 32bit, so fit in a W register.
   DCHECK(base.IsW());
   return MemOperand(base.X(), offset);
@@ -393,6 +393,20 @@
   DISALLOW_COPY_AND_ASSIGN(NullCheckSlowPathARM64);
 };
 
+// Slow path for the explicit stack overflow check: calls the pThrowStackOverflow entrypoint.
+class StackOverflowCheckSlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  StackOverflowCheckSlowPathARM64() {}
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    __ Bind(GetEntryLabel());
+    down_cast<CodeGeneratorARM64*>(codegen)->InvokeRuntime(
+        QUICK_ENTRY_POINT(pThrowStackOverflow), nullptr, 0);
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(StackOverflowCheckSlowPathARM64);
+};
+
 class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 {
  public:
   explicit SuspendCheckSlowPathARM64(HSuspendCheck* instruction,
@@ -418,7 +432,6 @@
     return &return_label_;
   }
 
-
  private:
   HSuspendCheck* const instruction_;
   // If not null, the block to branch to after the suspend check.
@@ -437,7 +450,7 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     __ Bind(GetEntryLabel());
     __ Brk(__LINE__);  // TODO: Unimplemented TypeCheckSlowPathARM64.
-    __ b(GetExitLabel());
+    __ B(GetExitLabel());
   }
 
  private:
@@ -479,13 +492,30 @@
 #undef __
 #define __ GetVIXLAssembler()->
 
+void CodeGeneratorARM64::Finalize(CodeAllocator* allocator) {
+  // Finalize the VIXL assembler first so that its literal pool gets emitted.
+  GetVIXLAssembler()->FinalizeCode();
+  CodeGenerator::Finalize(allocator);
+}
+
 void CodeGeneratorARM64::GenerateFrameEntry() {
-  // TODO: Add proper support for the stack overflow check.
-  UseScratchRegisterScope temps(GetVIXLAssembler());
-  Register temp = temps.AcquireX();
-  __ Add(temp, sp, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
-  __ Ldr(temp, MemOperand(temp, 0));
-  RecordPcInfo(nullptr, 0);
+  bool do_overflow_check = FrameNeedsStackCheck(GetFrameSize(), kArm64) || !IsLeafMethod();
+  if (do_overflow_check) {
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    Register temp = temps.AcquireX();
+    if (kExplicitStackOverflowCheck) {
+      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathARM64();
+      AddSlowPath(slow_path);
+
+      __ Ldr(temp, MemOperand(tr, Thread::StackEndOffset<kArm64WordSize>().Int32Value()));
+      __ Cmp(sp, temp);
+      __ B(lo, slow_path->GetEntryLabel());
+    } else {
+      __ Add(temp, sp, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
+      __ Ldr(wzr, MemOperand(temp, 0));
+      RecordPcInfo(nullptr, 0);
+    }
+  }
 
   CPURegList preserved_regs = GetFramePreservedRegisters();
   int frame_size = GetFrameSize();
@@ -588,12 +618,12 @@
 void CodeGeneratorARM64::MarkGCCard(Register object, Register value) {
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register card = temps.AcquireX();
-  Register temp = temps.AcquireX();
+  Register temp = temps.AcquireW();   // Index within the CardTable - 32bit.
   vixl::Label done;
   __ Cbz(value, &done);
   __ Ldr(card, MemOperand(tr, Thread::CardTableOffset<kArm64WordSize>().Int32Value()));
   __ Lsr(temp, object, gc::accounting::CardTable::kCardShift);
-  __ Strb(card, MemOperand(card, temp));
+  __ Strb(card, MemOperand(card, temp.X()));
   __ Bind(&done);
 }
 
@@ -601,7 +631,7 @@
   // Block reserved registers:
   //   ip0 (VIXL temporary)
   //   ip1 (VIXL temporary)
-  //   xSuspend (Suspend counter)
+  //   tr
   //   lr
   // sp is not part of the allocatable registers, so we don't need to block it.
   // TODO: Avoid blocking callee-saved registers, and instead preserve them
@@ -772,12 +802,14 @@
                                        uint32_t dex_pc) {
   __ Ldr(lr, MemOperand(tr, entry_point_offset));
   __ Blr(lr);
-  RecordPcInfo(instruction, dex_pc);
-  DCHECK(instruction->IsSuspendCheck()
-      || instruction->IsBoundsCheck()
-      || instruction->IsNullCheck()
-      || instruction->IsDivZeroCheck()
-      || !IsLeafMethod());
+  if (instruction != nullptr) {  // No PC info is recorded when no instruction is given.
+    RecordPcInfo(instruction, dex_pc);
+    DCHECK(instruction->IsSuspendCheck()
+        || instruction->IsBoundsCheck()
+        || instruction->IsNullCheck()
+        || instruction->IsDivZeroCheck()
+        || !IsLeafMethod());
+  }
 }
 
 void InstructionCodeGeneratorARM64::GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path,
@@ -787,12 +819,30 @@
   __ Ldr(temp, HeapOperand(class_reg, mirror::Class::StatusOffset()));
   __ Cmp(temp, mirror::Class::kStatusInitialized);
   __ B(lt, slow_path->GetEntryLabel());
-  // Even if the initialized flag is set, we may be in a situation where caches are not synced
-  // properly. Therefore, we do a memory fence.
-  __ Dmb(InnerShareable, BarrierAll);
+  // Even if the initialized flag is set, we need to ensure consistent memory ordering.
+  __ Dmb(InnerShareable, BarrierReads);
   __ Bind(slow_path->GetExitLabel());
 }
 
+void InstructionCodeGeneratorARM64::GenerateSuspendCheck(HSuspendCheck* instruction,
+                                                         HBasicBlock* successor) {
+  // Registers a suspend slow path and branches to it when the thread flags are non-zero.
+  SuspendCheckSlowPathARM64* const suspend_path =
+      new (GetGraph()->GetArena()) SuspendCheckSlowPathARM64(instruction, successor);
+  codegen_->AddSlowPath(suspend_path);
+  UseScratchRegisterScope scratch(codegen_->GetVIXLAssembler());
+  Register flags = scratch.AcquireW();
+  __ Ldrh(flags, MemOperand(tr, Thread::ThreadFlagsOffset<kArm64WordSize>().SizeValue()));
+  if (successor != nullptr) {
+    // Flags clear: jump to `successor`. Otherwise the slow path runs and returns there.
+    __ Cbz(flags, codegen_->GetLabelOf(successor));
+    __ B(suspend_path->GetEntryLabel());
+  } else {
+    __ Cbnz(flags, suspend_path->GetEntryLabel());
+    __ Bind(suspend_path->GetReturnLabel());
+  }
+}
+
 InstructionCodeGeneratorARM64::InstructionCodeGeneratorARM64(HGraph* graph,
                                                              CodeGeneratorARM64* codegen)
       : HGraphVisitor(graph),
@@ -801,7 +851,6 @@
 
 #define FOR_EACH_UNIMPLEMENTED_INSTRUCTION(M)              \
   M(ParallelMove)                                          \
-  M(Rem)
 
 #define UNIMPLEMENTED_INSTRUCTION_BREAK_CODE(name) name##UnimplementedInstructionBreakCode
 
@@ -894,6 +943,63 @@
   }
 }
 
+void LocationsBuilderARM64::HandleShift(HBinaryOperation* instr) {
+  // Shared register constraints for Shl, Shr and UShr; only int and long are accepted.
+  DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr());
+
+  LocationSummary* summary = new (GetGraph()->GetArena()) LocationSummary(instr);
+  const Primitive::Type result_type = instr->GetResultType();
+  const bool is_int_or_long =
+      (result_type == Primitive::kPrimInt) || (result_type == Primitive::kPrimLong);
+  if (is_int_or_long) {
+    // Input 0 must be in a register; input 1 may be a register or a constant.
+    summary->SetInAt(0, Location::RequiresRegister());
+    summary->SetInAt(1, Location::RegisterOrConstant(instr->InputAt(1)));
+    summary->SetOut(Location::RequiresRegister());
+  } else {
+    LOG(FATAL) << "Unexpected shift type " << result_type;
+  }
+}
+
+void InstructionCodeGeneratorARM64::HandleShift(HBinaryOperation* instr) {
+  // Emits Lsl for Shl, Asr for Shr and Lsr for UShr on int/long operands. The shift
+  // amount (input 1) is either an immediate or a register.
+  DCHECK(instr->IsShl() || instr->IsShr() || instr->IsUShr());
+  const Primitive::Type op_type = instr->GetType();
+  if ((op_type != Primitive::kPrimInt) && (op_type != Primitive::kPrimLong)) {
+    LOG(FATAL) << "Unexpected shift operation type " << op_type;
+    return;
+  }
+
+  Register out = OutputRegister(instr);
+  Register value = InputRegisterAt(instr, 0);
+  Operand amount = InputOperandAt(instr, 1);
+  if (!amount.IsImmediate()) {
+    // Register amount: use its X or W view so that it matches the output register.
+    Register amount_reg = out.IsX() ? amount.reg().X() : amount.reg().W();
+    if (instr->IsShl()) {
+      __ Lsl(out, value, amount_reg);
+    } else if (instr->IsShr()) {
+      __ Asr(out, value, amount_reg);
+    } else {
+      __ Lsr(out, value, amount_reg);
+    }
+    return;
+  }
+
+  // Immediate amount: mask it with the int or long shift limit before emitting.
+  uint32_t shift_amount = (op_type == Primitive::kPrimInt)
+      ? static_cast<uint32_t>(amount.immediate() & kMaxIntShiftValue)
+      : static_cast<uint32_t>(amount.immediate() & kMaxLongShiftValue);
+  if (instr->IsShl()) {
+    __ Lsl(out, value, shift_amount);
+  } else if (instr->IsShr()) {
+    __ Asr(out, value, shift_amount);
+  } else {
+    __ Lsr(out, value, shift_amount);
+  }
+}
+
 void LocationsBuilderARM64::VisitAdd(HAdd* instruction) {
   HandleBinaryOp(instruction);
 }
@@ -924,17 +1030,17 @@
   Register obj = InputRegisterAt(instruction, 0);
   Location index = locations->InAt(1);
   size_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value();
-  MemOperand source(obj);
+  MemOperand source = HeapOperand(obj);
   UseScratchRegisterScope temps(GetVIXLAssembler());
 
   if (index.IsConstant()) {
     offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type);
-    source = MemOperand(obj, offset);
+    source = HeapOperand(obj, offset);
   } else {
     Register temp = temps.AcquireSameSizeAs(obj);
     Register index_reg = RegisterFrom(index, Primitive::kPrimInt);
     __ Add(temp, obj, Operand(index_reg, LSL, Primitive::ComponentSizeShift(type)));
-    source = MemOperand(temp, offset);
+    source = HeapOperand(temp, offset);
   }
 
   codegen_->Load(type, OutputCPURegister(instruction), source);
@@ -979,17 +1085,17 @@
     CPURegister value = InputCPURegisterAt(instruction, 2);
     Location index = locations->InAt(1);
     size_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value();
-    MemOperand destination(obj);
+    MemOperand destination = HeapOperand(obj);
     UseScratchRegisterScope temps(GetVIXLAssembler());
 
     if (index.IsConstant()) {
       offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(value_type);
-      destination = MemOperand(obj, offset);
+      destination = HeapOperand(obj, offset);
     } else {
       Register temp = temps.AcquireSameSizeAs(obj);
       Register index_reg = InputRegisterAt(instruction, 1);
       __ Add(temp, obj, Operand(index_reg, LSL, Primitive::ComponentSizeShift(value_type)));
-      destination = MemOperand(temp, offset);
+      destination = HeapOperand(temp, offset);
     }
 
     codegen_->Store(value_type, value, destination);
@@ -1056,29 +1162,59 @@
   GenerateClassInitializationCheck(slow_path, InputRegisterAt(check, 0));
 }
 
-void LocationsBuilderARM64::VisitCompare(HCompare* instruction) {
+void LocationsBuilderARM64::VisitCompare(HCompare* compare) {
   LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
-}
-
-void InstructionCodeGeneratorARM64::VisitCompare(HCompare* instruction) {
-  Primitive::Type in_type = instruction->InputAt(0)->GetType();
-
-  DCHECK_EQ(in_type, Primitive::kPrimLong);
+      new (GetGraph()->GetArena()) LocationSummary(compare, LocationSummary::kNoCall);
+  Primitive::Type in_type = compare->InputAt(0)->GetType();
   switch (in_type) {
     case Primitive::kPrimLong: {
-      vixl::Label done;
-      Register result = OutputRegister(instruction);
-      Register left = InputRegisterAt(instruction, 0);
-      Operand right = InputOperandAt(instruction, 1);
-      __ Subs(result.X(), left, right);
-      __ B(eq, &done);
-      __ Mov(result, 1);
-      __ Cneg(result, result, le);
-      __ Bind(&done);
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(compare->InputAt(1)));
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected type for compare operation " << in_type;
+  }
+}
+
+void InstructionCodeGeneratorARM64::VisitCompare(HCompare* compare) {
+  Primitive::Type in_type = compare->InputAt(0)->GetType();
+
+  //  0 if: left == right
+  //  1 if: left  > right
+  // -1 if: left  < right
+  switch (in_type) {
+    case Primitive::kPrimLong: {
+      Register result = OutputRegister(compare);
+      Register left = InputRegisterAt(compare, 0);
+      Operand right = InputOperandAt(compare, 1);
+
+      __ Cmp(left, right);
+      __ Cset(result, ne);
+      __ Cneg(result, result, lt);
+      break;
+    }
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      Register result = OutputRegister(compare);
+      FPRegister left = InputFPRegisterAt(compare, 0);
+      FPRegister right = InputFPRegisterAt(compare, 1);
+
+      __ Fcmp(left, right);
+      if (compare->IsGtBias()) {
+        __ Cset(result, ne);
+      } else {
+        __ Csetm(result, ne);
+      }
+      __ Cneg(result, result, compare->IsGtBias() ? mi : gt);
       break;
     }
     default:
@@ -1107,7 +1243,7 @@
   Condition cond = ARM64Condition(instruction->GetCondition());
 
   __ Cmp(lhs, rhs);
-  __ Csel(res, vixl::Assembler::AppropriateZeroRegFor(res), Operand(1), InvertCondition(cond));
+  __ Cset(res, cond);
 }
 
 #define FOR_EACH_CONDITION_INSTRUCTION(M)                                                \
@@ -1232,8 +1368,20 @@
 
 void InstructionCodeGeneratorARM64::VisitGoto(HGoto* got) {
   HBasicBlock* successor = got->GetSuccessor();
-  // TODO: Support for suspend checks emission.
-  if (!codegen_->GoesToNextBlock(got->GetBlock(), successor)) {
+  DCHECK(!successor->IsExitBlock());
+  HBasicBlock* block = got->GetBlock();
+  HInstruction* previous = got->GetPrevious();
+  HLoopInformation* info = block->GetLoopInformation();
+
+  if (info != nullptr && info->IsBackEdge(block) && info->HasSuspendCheck()) {
+    codegen_->ClearSpillSlotsFromLoopPhisInStackMap(info->GetSuspendCheck());
+    GenerateSuspendCheck(info->GetSuspendCheck(), successor);
+    return;
+  }
+  if (block->IsEntryBlock() && (previous != nullptr) && previous->IsSuspendCheck()) {
+    GenerateSuspendCheck(previous->AsSuspendCheck(), nullptr);
+  }
+  if (!codegen_->GoesToNextBlock(block, successor)) {
     __ B(codegen_->GetLabelOf(successor));
   }
 }
@@ -1241,27 +1389,32 @@
 void LocationsBuilderARM64::VisitIf(HIf* if_instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
-  if (cond->AsCondition()->NeedsMaterialization()) {
+  if (!cond->IsCondition() || cond->AsCondition()->NeedsMaterialization()) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
 }
 
 void InstructionCodeGeneratorARM64::VisitIf(HIf* if_instr) {
   HInstruction* cond = if_instr->InputAt(0);
-  DCHECK(cond->IsCondition());
   HCondition* condition = cond->AsCondition();
   vixl::Label* true_target = codegen_->GetLabelOf(if_instr->IfTrueSuccessor());
   vixl::Label* false_target = codegen_->GetLabelOf(if_instr->IfFalseSuccessor());
 
-  // TODO: Support constant condition input in VisitIf.
-
-  if (condition->NeedsMaterialization()) {
+  if (cond->IsIntConstant()) {
+    int32_t cond_value = cond->AsIntConstant()->GetValue();
+    if (cond_value == 1) {
+      if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) {
+        __ B(true_target);
+      }
+      return;
+    } else {
+      DCHECK_EQ(cond_value, 0);
+    }
+  } else if (!cond->IsCondition() || condition->NeedsMaterialization()) {
     // The condition instruction has been materialized, compare the output to 0.
     Location cond_val = if_instr->GetLocations()->InAt(0);
     DCHECK(cond_val.IsRegister());
     __ Cbnz(InputRegisterAt(if_instr, 0), true_target);
-
   } else {
     // The condition instruction has not been materialized, use its inputs as
     // the comparison and its condition as the branch condition.
@@ -1279,7 +1432,6 @@
       __ B(arm64_cond, true_target);
     }
   }
-
   if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) {
     __ B(false_target);
   }
@@ -1292,8 +1444,7 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
-  MemOperand field = MemOperand(InputRegisterAt(instruction, 0),
-                                instruction->GetFieldOffset().Uint32Value());
+  MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), instruction->GetFieldOffset());
   codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
 }
 
@@ -1307,7 +1458,7 @@
   Primitive::Type field_type = instruction->GetFieldType();
   CPURegister value = InputCPURegisterAt(instruction, 1);
   Register obj = InputRegisterAt(instruction, 0);
-  codegen_->Store(field_type, value, MemOperand(obj, instruction->GetFieldOffset().Uint32Value()));
+  codegen_->Store(field_type, value, HeapOperand(obj, instruction->GetFieldOffset()));
   if (field_type == Primitive::kPrimNot) {
     codegen_->MarkGCCard(obj, Register(value));
   }
@@ -1336,7 +1487,7 @@
   __ Cbz(obj, &done);
 
   // Compare the class of `obj` with `cls`.
-  __ Ldr(out, MemOperand(obj, mirror::Object::ClassOffset().Int32Value()));
+  __ Ldr(out, HeapOperand(obj, mirror::Object::ClassOffset()));
   __ Cmp(out, cls);
   if (instruction->IsClassFinal()) {
     // Classes must be equal for the instanceof to succeed.
@@ -1393,7 +1544,7 @@
           (invoke->GetImtIndex() % mirror::Class::kImtSize) * sizeof(mirror::Class::ImTableEntry);
   Location receiver = invoke->GetLocations()->InAt(0);
   Offset class_offset = mirror::Object::ClassOffset();
-  Offset entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize);
+  Offset entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize);
 
   // The register ip1 is required to be used for the hidden argument in
   // art_quick_imt_conflict_trampoline, so prevent VIXL from using it.
@@ -1443,14 +1594,12 @@
   // temp = method;
   codegen_->LoadCurrentMethod(temp);
   // temp = temp->dex_cache_resolved_methods_;
-  __ Ldr(temp, MemOperand(temp.X(),
-                          mirror::ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
+  __ Ldr(temp, HeapOperand(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset()));
   // temp = temp[index_in_cache];
-  __ Ldr(temp, MemOperand(temp.X(), index_in_cache));
+  __ Ldr(temp, HeapOperand(temp, index_in_cache));
   // lr = temp->entry_point_from_quick_compiled_code_;
-  __ Ldr(lr, MemOperand(temp.X(),
-                        mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-                            kArm64PointerSize).SizeValue()));
+  __ Ldr(lr, HeapOperand(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+                          kArm64WordSize)));
   // lr();
   __ Blr(lr);
 
@@ -1461,24 +1610,24 @@
 void InstructionCodeGeneratorARM64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
   LocationSummary* locations = invoke->GetLocations();
   Location receiver = locations->InAt(0);
-  Register temp = XRegisterFrom(invoke->GetLocations()->GetTemp(0));
+  Register temp = WRegisterFrom(invoke->GetLocations()->GetTemp(0));
   size_t method_offset = mirror::Class::EmbeddedVTableOffset().SizeValue() +
     invoke->GetVTableIndex() * sizeof(mirror::Class::VTableEntry);
   Offset class_offset = mirror::Object::ClassOffset();
-  Offset entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize);
+  Offset entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize);
 
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
-    __ Ldr(temp.W(), MemOperand(sp, receiver.GetStackIndex()));
-    __ Ldr(temp.W(), MemOperand(temp, class_offset.SizeValue()));
+    __ Ldr(temp, MemOperand(sp, receiver.GetStackIndex()));
+    __ Ldr(temp, HeapOperand(temp, class_offset));
   } else {
     DCHECK(receiver.IsRegister());
-    __ Ldr(temp.W(), HeapOperandFrom(receiver, class_offset));
+    __ Ldr(temp, HeapOperandFrom(receiver, class_offset));
   }
   // temp = temp->GetMethodAt(method_offset);
-  __ Ldr(temp.W(), MemOperand(temp, method_offset));
+  __ Ldr(temp, HeapOperand(temp, method_offset));
   // lr = temp->GetEntryPoint();
-  __ Ldr(lr, MemOperand(temp, entry_point.SizeValue()));
+  __ Ldr(lr, HeapOperand(temp, entry_point.SizeValue()));
   // lr();
   __ Blr(lr);
   DCHECK(!codegen_->IsLeafMethod());
@@ -1503,7 +1652,7 @@
     DCHECK(cls->CanCallRuntime());
     codegen_->LoadCurrentMethod(out);
     __ Ldr(out, HeapOperand(out, mirror::ArtMethod::DexCacheResolvedTypesOffset()));
-    __ Ldr(out, MemOperand(out.X(), CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
+    __ Ldr(out, HeapOperand(out, CodeGenerator::GetCacheOffset(cls->GetTypeIndex())));
 
     SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM64(
         cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
@@ -1550,8 +1699,9 @@
 
   Register out = OutputRegister(load);
   codegen_->LoadCurrentMethod(out);
-  __ Ldr(out, HeapOperand(out, mirror::ArtMethod::DexCacheStringsOffset()));
-  __ Ldr(out, MemOperand(out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex())));
+  __ Ldr(out, HeapOperand(out, mirror::ArtMethod::DeclaringClassOffset()));
+  __ Ldr(out, HeapOperand(out, mirror::Class::DexCacheStringsOffset()));
+  __ Ldr(out, HeapOperand(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
   __ Cbz(out, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
@@ -1793,6 +1943,43 @@
   LOG(FATAL) << "Unreachable";
 }
 
+void LocationsBuilderARM64::VisitRem(HRem* rem) {
+  // Only int and long remainders are handled: both inputs and the output are given
+  // register locations, and the summary is created with LocationSummary::kNoCall.
+  LocationSummary* summary =
+      new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
+  const Primitive::Type result_type = rem->GetResultType();
+
+  if ((result_type == Primitive::kPrimInt) || (result_type == Primitive::kPrimLong)) {
+    summary->SetInAt(0, Location::RequiresRegister());
+    summary->SetInAt(1, Location::RequiresRegister());
+    summary->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  } else {
+    LOG(FATAL) << "Unexpected rem type " << result_type;
+  }
+}
+
+void InstructionCodeGeneratorARM64::VisitRem(HRem* rem) {
+  // Integer remainder for int/long: a signed divide into a scratch register followed
+  // by a multiply-subtract that writes the output register.
+  const Primitive::Type result_type = rem->GetResultType();
+  if ((result_type != Primitive::kPrimInt) && (result_type != Primitive::kPrimLong)) {
+    LOG(FATAL) << "Unexpected rem type " << result_type;
+    return;
+  }
+
+  UseScratchRegisterScope scratch(GetVIXLAssembler());
+  Register lhs = InputRegisterAt(rem, 0);
+  Register rhs = InputRegisterAt(rem, 1);
+  Register out = OutputRegister(rem);
+  // The scratch register is acquired at the same size as the output register.
+  Register quotient = scratch.AcquireSameSizeAs(out);
+
+  __ Sdiv(quotient, lhs, rhs);
+  // out = lhs - quotient * rhs.
+  __ Msub(out, quotient, rhs, lhs);
+}
+
 void LocationsBuilderARM64::VisitReturn(HReturn* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
   Primitive::Type return_type = instruction->InputAt(0)->GetType();
@@ -1815,6 +2002,22 @@
   __ Br(lr);
 }
 
+void LocationsBuilderARM64::VisitShl(HShl* shl) {
+  HandleShift(shl);  // Shl, Shr and UShr all go through the HandleShift helper.
+}
+
+void InstructionCodeGeneratorARM64::VisitShl(HShl* shl) {
+  HandleShift(shl);  // Shl, Shr and UShr all go through the HandleShift helper.
+}
+
+void LocationsBuilderARM64::VisitShr(HShr* shr) {
+  HandleShift(shr);  // Shl, Shr and UShr all go through the HandleShift helper.
+}
+
+void InstructionCodeGeneratorARM64::VisitShr(HShr* shr) {
+  HandleShift(shr);  // Shl, Shr and UShr all go through the HandleShift helper.
+}
+
 void LocationsBuilderARM64::VisitStoreLocal(HStoreLocal* store) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(store);
   Primitive::Type field_type = store->InputAt(1)->GetType();
@@ -1859,9 +2062,8 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitStaticFieldGet(HStaticFieldGet* instruction) {
-  Register cls = InputRegisterAt(instruction, 0);
-  uint32_t offset = instruction->GetFieldOffset().Uint32Value();
-  codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), MemOperand(cls, offset));
+  MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), instruction->GetFieldOffset());
+  codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
 }
 
 void LocationsBuilderARM64::VisitStaticFieldSet(HStaticFieldSet* instruction) {
@@ -1874,10 +2076,10 @@
 void InstructionCodeGeneratorARM64::VisitStaticFieldSet(HStaticFieldSet* instruction) {
   CPURegister value = InputCPURegisterAt(instruction, 1);
   Register cls = InputRegisterAt(instruction, 0);
-  uint32_t offset = instruction->GetFieldOffset().Uint32Value();
+  Offset offset = instruction->GetFieldOffset();
   Primitive::Type field_type = instruction->GetFieldType();
 
-  codegen_->Store(field_type, value, MemOperand(cls, offset));
+  codegen_->Store(field_type, value, HeapOperand(cls, offset));
   if (field_type == Primitive::kPrimNot) {
     codegen_->MarkGCCard(cls, Register(value));
   }
@@ -1888,14 +2090,17 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitSuspendCheck(HSuspendCheck* instruction) {
-  // TODO: Improve support for suspend checks.
-  SuspendCheckSlowPathARM64* slow_path =
-      new (GetGraph()->GetArena()) SuspendCheckSlowPathARM64(instruction, nullptr);
-  codegen_->AddSlowPath(slow_path);
-
-  __ Subs(wSuspend, wSuspend, 1);
-  __ B(slow_path->GetEntryLabel(), le);
-  __ Bind(slow_path->GetReturnLabel());
+  HBasicBlock* block = instruction->GetBlock();
+  if (block->GetLoopInformation() != nullptr) {
+    DCHECK(block->GetLoopInformation()->GetSuspendCheck() == instruction);
+    // The back edge will generate the suspend check.
+    return;
+  }
+  if (block->IsEntryBlock() && instruction->GetNext()->IsGoto()) {
+    // The goto will generate the suspend check.
+    return;
+  }
+  GenerateSuspendCheck(instruction, nullptr);
 }
 
 void LocationsBuilderARM64::VisitTemporary(HTemporary* temp) {
@@ -1924,6 +2129,7 @@
       new (GetGraph()->GetArena()) LocationSummary(conversion, LocationSummary::kNoCall);
   Primitive::Type input_type = conversion->GetInputType();
   Primitive::Type result_type = conversion->GetResultType();
+  DCHECK_NE(input_type, result_type);
   if ((input_type == Primitive::kPrimNot) || (input_type == Primitive::kPrimVoid) ||
       (result_type == Primitive::kPrimNot) || (result_type == Primitive::kPrimVoid)) {
     LOG(FATAL) << "Unexpected type conversion from " << input_type << " to " << result_type;
@@ -1952,17 +2158,34 @@
     int result_size = Primitive::ComponentSize(result_type);
     int input_size = Primitive::ComponentSize(input_type);
     int min_size = kBitsPerByte * std::min(result_size, input_size);
+    Register output = OutputRegister(conversion);
+    Register source = InputRegisterAt(conversion, 0);
     if ((result_type == Primitive::kPrimChar) ||
         ((input_type == Primitive::kPrimChar) && (result_size > input_size))) {
-      __ Ubfx(OutputRegister(conversion), InputRegisterAt(conversion, 0), 0, min_size);
+      __ Ubfx(output, output.IsX() ? source.X() : source.W(), 0, min_size);
     } else {
-      __ Sbfx(OutputRegister(conversion), InputRegisterAt(conversion, 0), 0, min_size);
+      __ Sbfx(output, output.IsX() ? source.X() : source.W(), 0, min_size);
     }
-    return;
+  } else if (IsFPType(result_type) && IsIntegralType(input_type)) {
+    CHECK(input_type == Primitive::kPrimInt || input_type == Primitive::kPrimLong);
+    __ Scvtf(OutputFPRegister(conversion), InputRegisterAt(conversion, 0));
+  } else if (IsIntegralType(result_type) && IsFPType(input_type)) {
+    CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong);
+    __ Fcvtzs(OutputRegister(conversion), InputFPRegisterAt(conversion, 0));
+  } else if (IsFPType(result_type) && IsFPType(input_type)) {
+    __ Fcvt(OutputFPRegister(conversion), InputFPRegisterAt(conversion, 0));
+  } else {
+    LOG(FATAL) << "Unexpected or unimplemented type conversion from " << input_type
+                << " to " << result_type;
   }
+}
 
-  LOG(FATAL) << "Unexpected or unimplemented type conversion from " << input_type
-             << " to " << result_type;
+void LocationsBuilderARM64::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);  // Shl, Shr and UShr all go through the HandleShift helper.
+}
+
+void InstructionCodeGeneratorARM64::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);  // Shl, Shr and UShr all go through the HandleShift helper.
 }
 
 void LocationsBuilderARM64::VisitXor(HXor* instruction) {
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 6b71b94..0e3d25f 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -31,7 +31,9 @@
 class CodeGeneratorARM64;
 class SlowPathCodeARM64;
 
-static constexpr size_t kArm64WordSize = 8;
+// Use a local definition to prevent copying mistakes.
+static constexpr size_t kArm64WordSize = kArm64PointerSize;
+
 static const vixl::Register kParameterCoreRegisters[] = {
   vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7
 };
@@ -42,12 +44,10 @@
 static constexpr size_t kParameterFPRegistersLength = arraysize(kParameterFPRegisters);
 
 const vixl::Register tr = vixl::x18;        // Thread Register
-const vixl::Register wSuspend = vixl::w19;  // Suspend Register
-const vixl::Register xSuspend = vixl::x19;
 
 const vixl::CPURegList vixl_reserved_core_registers(vixl::ip0, vixl::ip1);
 const vixl::CPURegList vixl_reserved_fp_registers(vixl::d31);
-const vixl::CPURegList runtime_reserved_core_registers(tr, xSuspend, vixl::lr);
+const vixl::CPURegList runtime_reserved_core_registers(tr, vixl::lr);
 const vixl::CPURegList quick_callee_saved_registers(vixl::CPURegister::kRegister,
                                                     vixl::kXRegSize,
                                                     kArm64CalleeSaveRefSpills);
@@ -108,7 +108,9 @@
 
  private:
   void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, vixl::Register class_reg);
+  void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* instr);
+  void HandleShift(HBinaryOperation* instr);
 
   Arm64Assembler* const assembler_;
   CodeGeneratorARM64* const codegen_;
@@ -128,6 +130,7 @@
 
  private:
   void HandleBinaryOp(HBinaryOperation* instr);
+  void HandleShift(HBinaryOperation* instr);
   void HandleInvoke(HInvoke* instr);
 
   CodeGeneratorARM64* const codegen_;
@@ -230,6 +233,8 @@
     }
   }
 
+  void Finalize(CodeAllocator* allocator) OVERRIDE;
+
   // Code generation helpers.
   void MoveConstant(vixl::CPURegister destination, HConstant* constant);
   void MoveHelper(Location destination, Location source, Primitive::Type type);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index b0f36ce..fd794f9 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -143,7 +143,9 @@
   BoundsCheckSlowPathX86(HBoundsCheck* instruction,
                          Location index_location,
                          Location length_location)
-      : instruction_(instruction), index_location_(index_location), length_location_(length_location) {}
+      : instruction_(instruction),
+        index_location_(index_location),
+        length_location_(length_location) {}
 
   virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
@@ -311,7 +313,8 @@
         Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
 
     if (instruction_->IsInstanceOf()) {
-      __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pInstanceofNonTrivial)));
+      __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize,
+                                                              pInstanceofNonTrivial)));
     } else {
       DCHECK(instruction_->IsCheckCast());
       __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pCheckCast)));
@@ -464,7 +467,8 @@
   static const int kFakeReturnRegister = 8;
   core_spill_mask_ |= (1 << kFakeReturnRegister);
 
-  bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86);
+  bool skip_overflow_check =
+      IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86);
   if (!skip_overflow_check && !kExplicitStackOverflowCheck) {
     __ testl(EAX, Address(ESP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86))));
     RecordPcInfo(nullptr, 0);
@@ -567,28 +571,28 @@
   }
   if (destination.IsRegister()) {
     if (source.IsRegister()) {
-      __ movl(destination.As<Register>(), source.As<Register>());
+      __ movl(destination.AsRegister<Register>(), source.AsRegister<Register>());
     } else if (source.IsFpuRegister()) {
-      __ movd(destination.As<Register>(), source.As<XmmRegister>());
+      __ movd(destination.AsRegister<Register>(), source.AsFpuRegister<XmmRegister>());
     } else {
       DCHECK(source.IsStackSlot());
-      __ movl(destination.As<Register>(), Address(ESP, source.GetStackIndex()));
+      __ movl(destination.AsRegister<Register>(), Address(ESP, source.GetStackIndex()));
     }
   } else if (destination.IsFpuRegister()) {
     if (source.IsRegister()) {
-      __ movd(destination.As<XmmRegister>(), source.As<Register>());
+      __ movd(destination.AsFpuRegister<XmmRegister>(), source.AsRegister<Register>());
     } else if (source.IsFpuRegister()) {
-      __ movaps(destination.As<XmmRegister>(), source.As<XmmRegister>());
+      __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
     } else {
       DCHECK(source.IsStackSlot());
-      __ movss(destination.As<XmmRegister>(), Address(ESP, source.GetStackIndex()));
+      __ movss(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
     }
   } else {
     DCHECK(destination.IsStackSlot()) << destination;
     if (source.IsRegister()) {
-      __ movl(Address(ESP, destination.GetStackIndex()), source.As<Register>());
+      __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegister<Register>());
     } else if (source.IsFpuRegister()) {
-      __ movss(Address(ESP, destination.GetStackIndex()), source.As<XmmRegister>());
+      __ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     } else {
       DCHECK(source.IsStackSlot());
       __ pushl(Address(ESP, source.GetStackIndex()));
@@ -603,19 +607,25 @@
   }
   if (destination.IsRegisterPair()) {
     if (source.IsRegisterPair()) {
-      __ movl(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
-      __ movl(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
+      EmitParallelMoves(
+          Location::RegisterLocation(source.AsRegisterPairHigh<Register>()),
+          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()),
+          Location::RegisterLocation(source.AsRegisterPairLow<Register>()),
+          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()));
     } else if (source.IsFpuRegister()) {
       LOG(FATAL) << "Unimplemented";
     } else if (source.IsQuickParameter()) {
       uint16_t register_index = source.GetQuickParameterRegisterIndex();
       uint16_t stack_index = source.GetQuickParameterStackIndex();
       InvokeDexCallingConvention calling_convention;
-      __ movl(destination.AsRegisterPairLow<Register>(),
-              calling_convention.GetRegisterAt(register_index));
-      __ movl(destination.AsRegisterPairHigh<Register>(), Address(ESP,
-          calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize()));
+      EmitParallelMoves(
+          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
+          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()),
+          Location::StackSlot(
+              calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize()),
+          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()));
     } else {
+      // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
       __ movl(destination.AsRegisterPairLow<Register>(), Address(ESP, source.GetStackIndex()));
       __ movl(destination.AsRegisterPairHigh<Register>(),
@@ -625,47 +635,52 @@
     InvokeDexCallingConvention calling_convention;
     uint16_t register_index = destination.GetQuickParameterRegisterIndex();
     uint16_t stack_index = destination.GetQuickParameterStackIndex();
-    if (source.IsRegister()) {
-      __ movl(calling_convention.GetRegisterAt(register_index), source.AsRegisterPairLow<Register>());
-      __ movl(Address(ESP, calling_convention.GetStackOffsetOf(stack_index + 1)),
-              source.AsRegisterPairHigh<Register>());
+    if (source.IsRegisterPair()) {
+      LOG(FATAL) << "Unimplemented";
     } else if (source.IsFpuRegister()) {
       LOG(FATAL) << "Unimplemented";
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ movl(calling_convention.GetRegisterAt(register_index),
-              Address(ESP, source.GetStackIndex()));
-      __ pushl(Address(ESP, source.GetHighStackIndex(kX86WordSize)));
-      __ popl(Address(ESP, calling_convention.GetStackOffsetOf(stack_index + 1)));
+      EmitParallelMoves(
+          Location::StackSlot(source.GetStackIndex()),
+          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
+          Location::StackSlot(source.GetHighStackIndex(kX86WordSize)),
+          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index + 1)));
     }
   } else if (destination.IsFpuRegister()) {
     if (source.IsDoubleStackSlot()) {
-      __ movsd(destination.As<XmmRegister>(), Address(ESP, source.GetStackIndex()));
+      __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
     } else {
       LOG(FATAL) << "Unimplemented";
     }
   } else {
     DCHECK(destination.IsDoubleStackSlot()) << destination;
     if (source.IsRegisterPair()) {
+      // No conflict possible, so just do the moves.
       __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegisterPairLow<Register>());
       __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
               source.AsRegisterPairHigh<Register>());
     } else if (source.IsQuickParameter()) {
+      // No conflict possible, so just do the move.
       InvokeDexCallingConvention calling_convention;
       uint16_t register_index = source.GetQuickParameterRegisterIndex();
       uint16_t stack_index = source.GetQuickParameterStackIndex();
+      // Just move the low part. The only time a source is a quick parameter is
+      // when moving the parameter to its stack locations, and the (Java) caller
+      // of this method has already stored the high part in its stack slot.
       __ movl(Address(ESP, destination.GetStackIndex()),
               calling_convention.GetRegisterAt(register_index));
       DCHECK_EQ(calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize(),
                 static_cast<size_t>(destination.GetHighStackIndex(kX86WordSize)));
     } else if (source.IsFpuRegister()) {
-      __ movsd(Address(ESP, destination.GetStackIndex()), source.As<XmmRegister>());
+      __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ pushl(Address(ESP, source.GetStackIndex()));
-      __ popl(Address(ESP, destination.GetStackIndex()));
-      __ pushl(Address(ESP, source.GetHighStackIndex(kX86WordSize)));
-      __ popl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)));
+      EmitParallelMoves(
+          Location::StackSlot(source.GetStackIndex()),
+          Location::StackSlot(destination.GetStackIndex()),
+          Location::StackSlot(source.GetHighStackIndex(kX86WordSize)),
+          Location::StackSlot(destination.GetHighStackIndex(kX86WordSize)));
     }
   }
 }
@@ -681,7 +696,7 @@
     if (const_to_move->IsIntConstant()) {
       Immediate imm(const_to_move->AsIntConstant()->GetValue());
       if (location.IsRegister()) {
-        __ movl(location.As<Register>(), imm);
+        __ movl(location.AsRegister<Register>(), imm);
       } else if (location.IsStackSlot()) {
         __ movl(Address(ESP, location.GetStackIndex()), imm);
       } else {
@@ -695,7 +710,8 @@
         __ movl(location.AsRegisterPairHigh<Register>(), Immediate(High32Bits(value)));
       } else if (location.IsDoubleStackSlot()) {
         __ movl(Address(ESP, location.GetStackIndex()), Immediate(Low32Bits(value)));
-        __ movl(Address(ESP, location.GetHighStackIndex(kX86WordSize)), Immediate(High32Bits(value)));
+        __ movl(Address(ESP, location.GetHighStackIndex(kX86WordSize)),
+                Immediate(High32Bits(value)));
       } else {
         DCHECK(location.IsConstant());
         DCHECK_EQ(location.GetConstant(), instruction);
@@ -828,7 +844,7 @@
         // Materialized condition, compare against 0.
         Location lhs = if_instr->GetLocations()->InAt(0);
         if (lhs.IsRegister()) {
-          __ cmpl(lhs.As<Register>(), Immediate(0));
+          __ cmpl(lhs.AsRegister<Register>(), Immediate(0));
         } else {
           __ cmpl(Address(ESP, lhs.GetStackIndex()), Immediate(0));
         }
@@ -843,13 +859,13 @@
       // LHS is guaranteed to be in a register (see
       // LocationsBuilderX86::VisitCondition).
       if (rhs.IsRegister()) {
-        __ cmpl(lhs.As<Register>(), rhs.As<Register>());
+        __ cmpl(lhs.AsRegister<Register>(), rhs.AsRegister<Register>());
       } else if (rhs.IsConstant()) {
         HIntConstant* instruction = rhs.GetConstant()->AsIntConstant();
         Immediate imm(instruction->AsIntConstant()->GetValue());
-        __ cmpl(lhs.As<Register>(), imm);
+        __ cmpl(lhs.AsRegister<Register>(), imm);
       } else {
-        __ cmpl(lhs.As<Register>(), Address(ESP, rhs.GetStackIndex()));
+        __ cmpl(lhs.AsRegister<Register>(), Address(ESP, rhs.GetStackIndex()));
       }
       __ j(X86Condition(cond->AsCondition()->GetCondition()),
            codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
@@ -920,18 +936,18 @@
 void InstructionCodeGeneratorX86::VisitCondition(HCondition* comp) {
   if (comp->NeedsMaterialization()) {
     LocationSummary* locations = comp->GetLocations();
-    Register reg = locations->Out().As<Register>();
+    Register reg = locations->Out().AsRegister<Register>();
     // Clear register: setcc only sets the low byte.
     __ xorl(reg, reg);
     if (locations->InAt(1).IsRegister()) {
-      __ cmpl(locations->InAt(0).As<Register>(),
-              locations->InAt(1).As<Register>());
+      __ cmpl(locations->InAt(0).AsRegister<Register>(),
+              locations->InAt(1).AsRegister<Register>());
     } else if (locations->InAt(1).IsConstant()) {
       HConstant* instruction = locations->InAt(1).GetConstant();
       Immediate imm(instruction->AsIntConstant()->GetValue());
-      __ cmpl(locations->InAt(0).As<Register>(), imm);
+      __ cmpl(locations->InAt(0).AsRegister<Register>(), imm);
     } else {
-      __ cmpl(locations->InAt(0).As<Register>(),
+      __ cmpl(locations->InAt(0).AsRegister<Register>(),
               Address(ESP, locations->InAt(1).GetStackIndex()));
     }
     __ setb(X86Condition(comp->GetCondition()), reg);
@@ -1078,7 +1094,7 @@
       case Primitive::kPrimShort:
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
-        DCHECK_EQ(ret->GetLocations()->InAt(0).As<Register>(), EAX);
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsRegister<Register>(), EAX);
         break;
 
       case Primitive::kPrimLong:
@@ -1088,7 +1104,7 @@
 
       case Primitive::kPrimFloat:
       case Primitive::kPrimDouble:
-        DCHECK_EQ(ret->GetLocations()->InAt(0).As<XmmRegister>(), XMM0);
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsFpuRegister<XmmRegister>(), XMM0);
         break;
 
       default:
@@ -1104,7 +1120,7 @@
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeStatic(HInvokeStatic* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).As<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
 
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
@@ -1120,7 +1136,8 @@
   // temp = temp[index_in_cache]
   __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetIndexInDexCache())));
   // (temp + offset_of_quick_compiled_code)()
-  __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(4).Int32Value()));
+  __ call(Address(
+      temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -1168,7 +1185,7 @@
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeVirtual(HInvokeVirtual* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).As<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
   uint32_t method_offset = mirror::Class::EmbeddedVTableOffset().Uint32Value() +
           invoke->GetVTableIndex() * sizeof(mirror::Class::VTableEntry);
   LocationSummary* locations = invoke->GetLocations();
@@ -1179,12 +1196,13 @@
     __ movl(temp, Address(ESP, receiver.GetStackIndex()));
     __ movl(temp, Address(temp, class_offset));
   } else {
-    __ movl(temp, Address(receiver.As<Register>(), class_offset));
+    __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   }
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
-  __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(4).Int32Value()));
+  __ call(Address(
+      temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -1198,7 +1216,7 @@
 
 void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  Register temp = invoke->GetLocations()->GetTemp(0).As<Register>();
+  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableOffset().Uint32Value() +
           (invoke->GetImtIndex() % mirror::Class::kImtSize) * sizeof(mirror::Class::ImTableEntry);
   LocationSummary* locations = invoke->GetLocations();
@@ -1207,20 +1225,20 @@
 
   // Set the hidden argument.
   __ movl(temp, Immediate(invoke->GetDexMethodIndex()));
-  __ movd(invoke->GetLocations()->GetTemp(1).As<XmmRegister>(), temp);
+  __ movd(invoke->GetLocations()->GetTemp(1).AsFpuRegister<XmmRegister>(), temp);
 
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ movl(temp, Address(ESP, receiver.GetStackIndex()));
     __ movl(temp, Address(temp, class_offset));
   } else {
-    __ movl(temp, Address(receiver.As<Register>(), class_offset));
+    __ movl(temp, Address(receiver.AsRegister<Register>(), class_offset));
   }
   // temp = temp->GetImtEntryAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
   __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86PointerSize).Int32Value()));
+      kX86WordSize).Int32Value()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -1237,11 +1255,16 @@
       break;
 
     case Primitive::kPrimFloat:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      locations->AddTemp(Location::RequiresRegister());
+      locations->AddTemp(Location::RequiresFpuRegister());
+      break;
+
     case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      // Output overlaps as we need a fresh (zero-initialized)
-      // register to perform subtraction from zero.
-      locations->SetOut(Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      locations->AddTemp(Location::RequiresFpuRegister());
       break;
 
     default:
@@ -1257,7 +1280,7 @@
     case Primitive::kPrimInt:
       DCHECK(in.IsRegister());
       DCHECK(in.Equals(out));
-      __ negl(out.As<Register>());
+      __ negl(out.AsRegister<Register>());
       break;
 
     case Primitive::kPrimLong:
@@ -1273,21 +1296,29 @@
       __ negl(out.AsRegisterPairHigh<Register>());
       break;
 
-    case Primitive::kPrimFloat:
-      DCHECK(!in.Equals(out));
-      // out = 0
-      __ xorps(out.As<XmmRegister>(), out.As<XmmRegister>());
-      // out = out - in
-      __ subss(out.As<XmmRegister>(), in.As<XmmRegister>());
+    case Primitive::kPrimFloat: {
+      DCHECK(in.Equals(out));
+      Register constant = locations->GetTemp(0).AsRegister<Register>();
+      XmmRegister mask = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+      // Implement float negation with an exclusive or with value
+      // 0x80000000 (mask for bit 31, representing the sign of a
+      // single-precision floating-point number).
+      __ movl(constant, Immediate(INT32_C(0x80000000)));
+      __ movd(mask, constant);
+      __ xorps(out.AsFpuRegister<XmmRegister>(), mask);
       break;
+    }
 
-    case Primitive::kPrimDouble:
-      DCHECK(!in.Equals(out));
-      // out = 0
-      __ xorpd(out.As<XmmRegister>(), out.As<XmmRegister>());
-      // out = out - in
-      __ subsd(out.As<XmmRegister>(), in.As<XmmRegister>());
+    case Primitive::kPrimDouble: {
+      DCHECK(in.Equals(out));
+      XmmRegister mask = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      // Implement double negation with an exclusive or with value
+      // 0x8000000000000000 (mask for bit 63, representing the sign of
+      // a double-precision floating-point number).
+      __ LoadLongConstant(mask, INT64_C(0x8000000000000000));
+      __ xorpd(out.AsFpuRegister<XmmRegister>(), mask);
       break;
+    }
 
     default:
       LOG(FATAL) << "Unexpected neg type " << neg->GetResultType();
@@ -1299,6 +1330,7 @@
       new (GetGraph()->GetArena()) LocationSummary(conversion, LocationSummary::kNoCall);
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
+  DCHECK_NE(result_type, input_type);
   switch (result_type) {
     case Primitive::kPrimByte:
       switch (input_type) {
@@ -1380,7 +1412,6 @@
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
-        case Primitive::kPrimChar:
           // Processing a Dex `int-to-char' instruction.
           locations->SetInAt(0, Location::Any());
           locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -1404,6 +1435,13 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-float' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1427,6 +1465,13 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1450,6 +1495,7 @@
   Location in = locations->InAt(0);
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
+  DCHECK_NE(result_type, input_type);
   switch (result_type) {
     case Primitive::kPrimByte:
       switch (input_type) {
@@ -1458,13 +1504,13 @@
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-byte' instruction.
           if (in.IsRegister()) {
-            __ movsxb(out.As<Register>(), in.As<ByteRegister>());
+            __ movsxb(out.AsRegister<Register>(), in.AsRegister<ByteRegister>());
           } else if (in.IsStackSlot()) {
-            __ movsxb(out.As<Register>(), Address(ESP, in.GetStackIndex()));
+            __ movsxb(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
             int32_t value = in.GetConstant()->AsIntConstant()->GetValue();
-            __ movl(out.As<Register>(), Immediate(static_cast<int8_t>(value)));
+            __ movl(out.AsRegister<Register>(), Immediate(static_cast<int8_t>(value)));
           }
           break;
 
@@ -1481,13 +1527,13 @@
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-short' instruction.
           if (in.IsRegister()) {
-            __ movsxw(out.As<Register>(), in.As<Register>());
+            __ movsxw(out.AsRegister<Register>(), in.AsRegister<Register>());
           } else if (in.IsStackSlot()) {
-            __ movsxw(out.As<Register>(), Address(ESP, in.GetStackIndex()));
+            __ movsxw(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
             int32_t value = in.GetConstant()->AsIntConstant()->GetValue();
-            __ movl(out.As<Register>(), Immediate(static_cast<int16_t>(value)));
+            __ movl(out.AsRegister<Register>(), Immediate(static_cast<int16_t>(value)));
           }
           break;
 
@@ -1502,14 +1548,14 @@
         case Primitive::kPrimLong:
           // Processing a Dex `long-to-int' instruction.
           if (in.IsRegisterPair()) {
-            __ movl(out.As<Register>(), in.AsRegisterPairLow<Register>());
+            __ movl(out.AsRegister<Register>(), in.AsRegisterPairLow<Register>());
           } else if (in.IsDoubleStackSlot()) {
-            __ movl(out.As<Register>(), Address(ESP, in.GetStackIndex()));
+            __ movl(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
           } else {
             DCHECK(in.IsConstant());
             DCHECK(in.GetConstant()->IsLongConstant());
             int64_t value = in.GetConstant()->AsLongConstant()->GetValue();
-            __ movl(out.As<Register>(), Immediate(static_cast<int32_t>(value)));
+            __ movl(out.AsRegister<Register>(), Immediate(static_cast<int32_t>(value)));
           }
           break;
 
@@ -1534,7 +1580,7 @@
           // Processing a Dex `int-to-long' instruction.
           DCHECK_EQ(out.AsRegisterPairLow<Register>(), EAX);
           DCHECK_EQ(out.AsRegisterPairHigh<Register>(), EDX);
-          DCHECK_EQ(in.As<Register>(), EAX);
+          DCHECK_EQ(in.AsRegister<Register>(), EAX);
           __ cdq();
           break;
 
@@ -1555,16 +1601,15 @@
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
-        case Primitive::kPrimChar:
           // Processing a Dex `Process a Dex `int-to-char'' instruction.
           if (in.IsRegister()) {
-            __ movzxw(out.As<Register>(), in.As<Register>());
+            __ movzxw(out.AsRegister<Register>(), in.AsRegister<Register>());
           } else if (in.IsStackSlot()) {
-            __ movzxw(out.As<Register>(), Address(ESP, in.GetStackIndex()));
+            __ movzxw(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
             int32_t value = in.GetConstant()->AsIntConstant()->GetValue();
-            __ movl(out.As<Register>(), Immediate(static_cast<uint16_t>(value)));
+            __ movl(out.AsRegister<Register>(), Immediate(static_cast<uint16_t>(value)));
           }
           break;
 
@@ -1576,15 +1621,48 @@
 
     case Primitive::kPrimFloat:
       switch (input_type) {
-          // Processing a Dex `int-to-float' instruction.
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
-          __ cvtsi2ss(out.As<XmmRegister>(), in.As<Register>());
+          // Processing a Dex `int-to-float' instruction.
+          __ cvtsi2ss(out.AsFpuRegister<XmmRegister>(), in.AsRegister<Register>());
           break;
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-float' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          XmmRegister result = out.AsFpuRegister<XmmRegister>();
+          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          XmmRegister constant = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+
+          // Operations use doubles for precision reasons (each 32-bit
+          // half of a long fits in the 53-bit mantissa of a double,
+          // but not in the 24-bit mantissa of a float).  This is
+          // especially important for the low bits.  The result is
+          // eventually converted to float.
+
+          // low = low - 2^31 (to prevent bit 31 of `low` from being
+          // interpreted as a sign bit)
+          __ subl(low, Immediate(0x80000000));
+          // temp = int-to-double(high)
+          __ cvtsi2sd(temp, high);
+          // temp = temp * 2^32
+          __ LoadLongConstant(constant, k2Pow32EncodingForDouble);
+          __ mulsd(temp, constant);
+          // result = int-to-double(low)
+          __ cvtsi2sd(result, low);
+          // result = result + 2^31 (restore the original value of `low`)
+          __ LoadLongConstant(constant, k2Pow31EncodingForDouble);
+          __ addsd(result, constant);
+          // result = result + temp
+          __ addsd(result, temp);
+          // result = double-to-float(result)
+          __ cvtsd2ss(result, result);
+          break;
+        }
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1598,15 +1676,40 @@
 
     case Primitive::kPrimDouble:
       switch (input_type) {
-          // Processing a Dex `int-to-double' instruction.
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
-          __ cvtsi2sd(out.As<XmmRegister>(), in.As<Register>());
+          // Processing a Dex `int-to-double' instruction.
+          __ cvtsi2sd(out.AsFpuRegister<XmmRegister>(), in.AsRegister<Register>());
           break;
 
-        case Primitive::kPrimLong:
+        case Primitive::kPrimLong: {
+          // Processing a Dex `long-to-double' instruction.
+          Register low = in.AsRegisterPairLow<Register>();
+          Register high = in.AsRegisterPairHigh<Register>();
+          XmmRegister result = out.AsFpuRegister<XmmRegister>();
+          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          XmmRegister constant = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+
+          // low = low - 2^31 (to prevent bit 31 of `low` from being
+          // interpreted as a sign bit)
+          __ subl(low, Immediate(0x80000000));
+          // temp = int-to-double(high)
+          __ cvtsi2sd(temp, high);
+          // temp = temp * 2^32
+          __ LoadLongConstant(constant, k2Pow32EncodingForDouble);
+          __ mulsd(temp, constant);
+          // result = int-to-double(low)
+          __ cvtsi2sd(result, low);
+          // result = result + 2^31 (restore the original value of `low`)
+          __ LoadLongConstant(constant, k2Pow31EncodingForDouble);
+          __ addsd(result, constant);
+          // result = result + temp
+          __ addsd(result, temp);
+          break;
+        }
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1658,11 +1761,12 @@
   switch (add->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ addl(first.As<Register>(), second.As<Register>());
+        __ addl(first.AsRegister<Register>(), second.AsRegister<Register>());
       } else if (second.IsConstant()) {
-        __ addl(first.As<Register>(), Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
+        __ addl(first.AsRegister<Register>(),
+                Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
       } else {
-        __ addl(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+        __ addl(first.AsRegister<Register>(), Address(ESP, second.GetStackIndex()));
       }
       break;
     }
@@ -1681,18 +1785,18 @@
 
     case Primitive::kPrimFloat: {
       if (second.IsFpuRegister()) {
-        __ addss(first.As<XmmRegister>(), second.As<XmmRegister>());
+        __ addss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       } else {
-        __ addss(first.As<XmmRegister>(), Address(ESP, second.GetStackIndex()));
+        __ addss(first.AsFpuRegister<XmmRegister>(), Address(ESP, second.GetStackIndex()));
       }
       break;
     }
 
     case Primitive::kPrimDouble: {
       if (second.IsFpuRegister()) {
-        __ addsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+        __ addsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       } else {
-        __ addsd(first.As<XmmRegister>(), Address(ESP, second.GetStackIndex()));
+        __ addsd(first.AsFpuRegister<XmmRegister>(), Address(ESP, second.GetStackIndex()));
       }
       break;
     }
@@ -1734,11 +1838,12 @@
   switch (sub->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ subl(first.As<Register>(), second.As<Register>());
+        __ subl(first.AsRegister<Register>(), second.AsRegister<Register>());
       } else if (second.IsConstant()) {
-        __ subl(first.As<Register>(), Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
+        __ subl(first.AsRegister<Register>(),
+                Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
       } else {
-        __ subl(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+        __ subl(first.AsRegister<Register>(), Address(ESP, second.GetStackIndex()));
       }
       break;
     }
@@ -1756,12 +1861,12 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ subss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ subss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ subsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ subsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -1816,13 +1921,13 @@
   switch (mul->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ imull(first.As<Register>(), second.As<Register>());
+        __ imull(first.AsRegister<Register>(), second.AsRegister<Register>());
       } else if (second.IsConstant()) {
         Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
-        __ imull(first.As<Register>(), imm);
+        __ imull(first.AsRegister<Register>(), imm);
       } else {
         DCHECK(second.IsStackSlot());
-        __ imull(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+        __ imull(first.AsRegister<Register>(), Address(ESP, second.GetStackIndex()));
       }
       break;
     }
@@ -1834,8 +1939,8 @@
       Register in1_lo = first.AsRegisterPairLow<Register>();
       Address in2_hi(ESP, second.GetHighStackIndex(kX86WordSize));
       Address in2_lo(ESP, second.GetStackIndex());
-      Register eax = locations->GetTemp(0).As<Register>();
-      Register edx = locations->GetTemp(1).As<Register>();
+      Register eax = locations->GetTemp(0).AsRegister<Register>();
+      Register edx = locations->GetTemp(1).AsRegister<Register>();
 
       DCHECK_EQ(EAX, eax);
       DCHECK_EQ(EDX, edx);
@@ -1866,12 +1971,12 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ mulss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ mulss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ mulsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ mulsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -1891,12 +1996,13 @@
 
   switch (instruction->GetResultType()) {
     case Primitive::kPrimInt: {
-      Register second_reg = second.As<Register>();
-      DCHECK_EQ(EAX, first.As<Register>());
-      DCHECK_EQ(is_div ? EAX : EDX, out.As<Register>());
+      Register second_reg = second.AsRegister<Register>();
+      DCHECK_EQ(EAX, first.AsRegister<Register>());
+      DCHECK_EQ(is_div ? EAX : EDX, out.AsRegister<Register>());
 
       SlowPathCodeX86* slow_path =
-          new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.As<Register>(), is_div);
+          new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.AsRegister<Register>(),
+                                                                 is_div);
       codegen_->AddSlowPath(slow_path);
 
       // 0x80000000/-1 triggers an arithmetic exception!
@@ -1995,13 +2101,13 @@
 
     case Primitive::kPrimFloat: {
       DCHECK(first.Equals(out));
-      __ divss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ divss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
       DCHECK(first.Equals(out));
-      __ divsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ divsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -2095,7 +2201,7 @@
   switch (instruction->GetType()) {
     case Primitive::kPrimInt: {
       if (value.IsRegister()) {
-        __ testl(value.As<Register>(), value.As<Register>());
+        __ testl(value.AsRegister<Register>(), value.AsRegister<Register>());
         __ j(kEqual, slow_path->GetEntryLabel());
       } else if (value.IsStackSlot()) {
         __ cmpl(Address(ESP, value.GetStackIndex()), Immediate(0));
@@ -2110,7 +2216,7 @@
     }
     case Primitive::kPrimLong: {
       if (value.IsRegisterPair()) {
-        Register temp = locations->GetTemp(0).As<Register>();
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
         __ movl(temp, value.AsRegisterPairLow<Register>());
         __ orl(temp, value.AsRegisterPairHigh<Register>());
         __ j(kEqual, slow_path->GetEntryLabel());
@@ -2127,6 +2233,139 @@
   }
 }
 
+void LocationsBuilderX86::HandleShift(HBinaryOperation* op) {
+  DCHECK(op->IsShl() || op->IsShr() || op->IsUShr());
+  // Shared location setup for Shl, Shr and UShr; only int and long results are accepted.
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(op, LocationSummary::kNoCall);
+
+  switch (op->GetResultType()) {
+    case Primitive::kPrimInt: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      // The shift count is either pinned to ECX (CL is its low byte) or kept as a constant.
+      locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1)));
+      locations->SetOut(Location::SameAsFirstInput());  // Shifted in place.
+      break;
+    }
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      // The shift count needs to be in CL; no constant form is requested for long shifts.
+      locations->SetInAt(1, Location::RegisterLocation(ECX));
+      locations->SetOut(Location::SameAsFirstInput());  // Shifted in place.
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected op type " << op->GetResultType();
+  }
+}
+
+void InstructionCodeGeneratorX86::HandleShift(HBinaryOperation* op) {
+  DCHECK(op->IsShl() || op->IsShr() || op->IsUShr());
+  // Emits the shift itself: Shl -> shll, Shr -> sarl, UShr -> shrl (or the long helpers).
+  LocationSummary* locations = op->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  DCHECK(first.Equals(locations->Out()));
+  // The value is shifted in place: the output location is the first input's location.
+  switch (op->GetResultType()) {
+    case Primitive::kPrimInt: {
+      Register first_reg = first.AsRegister<Register>();
+      if (second.IsRegister()) {
+        Register second_reg = second.AsRegister<Register>();
+        DCHECK_EQ(ECX, second_reg);  // A variable count must have been placed in ECX.
+        if (op->IsShl()) {
+          __ shll(first_reg, second_reg);
+        } else if (op->IsShr()) {
+          __ sarl(first_reg, second_reg);
+        } else {
+          __ shrl(first_reg, second_reg);
+        }
+      } else {  // Constant count: an int shift only uses the low five bits of the distance.
+        Immediate imm(second.GetConstant()->AsIntConstant()->GetValue() & 0x1F);
+        if (op->IsShl()) {
+          __ shll(first_reg, imm);
+        } else if (op->IsShr()) {
+          __ sarl(first_reg, imm);
+        } else {
+          __ shrl(first_reg, imm);
+        }
+      }
+      break;
+    }
+    case Primitive::kPrimLong: {
+      Register second_reg = second.AsRegister<Register>();
+      DCHECK_EQ(ECX, second_reg);  // Long shifts always take the count from ECX.
+      if (op->IsShl()) {
+        GenerateShlLong(first, second_reg);
+      } else if (op->IsShr()) {
+        GenerateShrLong(first, second_reg);
+      } else {
+        GenerateUShrLong(first, second_reg);
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected op type " << op->GetResultType();
+  }
+}
+
+void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register shifter) {
+  Label done;  // Shifts the register pair `loc` left in place: shld/shll, then a fix-up.
+  __ shld(loc.AsRegisterPairHigh<Register>(), loc.AsRegisterPairLow<Register>(), shifter);
+  __ shll(loc.AsRegisterPairLow<Register>(), shifter);
+  __ testl(shifter, Immediate(32));  // Is bit 5 of the count set (count & 32)?
+  __ j(kEqual, &done);  // Bit clear: the two shifts above already gave the result.
+  __ movl(loc.AsRegisterPairHigh<Register>(), loc.AsRegisterPairLow<Register>());
+  __ movl(loc.AsRegisterPairLow<Register>(), Immediate(0));  // Low word becomes zero.
+  __ Bind(&done);
+}
+
+void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register shifter) {
+  Label done;  // Arithmetic right shift of the pair `loc` in place: shrd/sarl, then a fix-up.
+  __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter);
+  __ sarl(loc.AsRegisterPairHigh<Register>(), shifter);
+  __ testl(shifter, Immediate(32));  // Is bit 5 of the count set (count & 32)?
+  __ j(kEqual, &done);  // Bit clear: the two shifts above already gave the result.
+  __ movl(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>());
+  __ sarl(loc.AsRegisterPairHigh<Register>(), Immediate(31));  // High word = sign fill.
+  __ Bind(&done);
+}
+
+void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, Register shifter) {
+  Label done;  // Logical right shift of the pair `loc` in place: shrd/shrl, then a fix-up.
+  __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter);
+  __ shrl(loc.AsRegisterPairHigh<Register>(), shifter);
+  __ testl(shifter, Immediate(32));  // Is bit 5 of the count set (count & 32)?
+  __ j(kEqual, &done);  // Bit clear: the two shifts above already gave the result.
+  __ movl(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>());
+  __ movl(loc.AsRegisterPairHigh<Register>(), Immediate(0));  // High word becomes zero.
+  __ Bind(&done);
+}
+
+void LocationsBuilderX86::VisitShl(HShl* shl) {
+  HandleShift(shl);  // Location setup is shared with Shr and UShr.
+}
+
+void InstructionCodeGeneratorX86::VisitShl(HShl* shl) {
+  HandleShift(shl);  // Code emission is shared with Shr and UShr.
+}
+
+void LocationsBuilderX86::VisitShr(HShr* shr) {
+  HandleShift(shr);  // Location setup is shared with Shl and UShr.
+}
+
+void InstructionCodeGeneratorX86::VisitShr(HShr* shr) {
+  HandleShift(shr);  // Code emission is shared with Shl and UShr.
+}
+
+void LocationsBuilderX86::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);  // Location setup is shared with Shl and Shr.
+}
+
+void InstructionCodeGeneratorX86::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);  // Code emission is shared with Shl and Shr.
+}
+
 void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
@@ -2200,11 +2439,11 @@
   DCHECK(in.Equals(out));
   switch (not_->InputAt(0)->GetType()) {
     case Primitive::kPrimBoolean:
-      __ xorl(out.As<Register>(), Immediate(1));
+      __ xorl(out.AsRegister<Register>(), Immediate(1));
       break;
 
     case Primitive::kPrimInt:
-      __ notl(out.As<Register>());
+      __ notl(out.AsRegister<Register>());
       break;
 
     case Primitive::kPrimLong:
@@ -2220,20 +2459,36 @@
 void LocationsBuilderX86::VisitCompare(HCompare* compare) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(compare, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::Any());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  switch (compare->InputAt(0)->GetType()) {  // Constraints depend on the compared type.
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      // TODO: Location::Any() also admits a constant, which the code generator does not handle.
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {
+      locations->SetInAt(0, Location::RequiresFpuRegister());  // Both operands in FPU registers.
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());  // The result goes to a core register.
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected type for compare operation " << compare->InputAt(0)->GetType();
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitCompare(HCompare* compare) {
   LocationSummary* locations = compare->GetLocations();
+  Register out = locations->Out().AsRegister<Register>();
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+
+  Label less, greater, done;
   switch (compare->InputAt(0)->GetType()) {
     case Primitive::kPrimLong: {
-      Label less, greater, done;
-      Register output = locations->Out().As<Register>();
-      Location left = locations->InAt(0);
-      Location right = locations->InAt(1);
-      if (right.IsRegister()) {
+      if (right.IsRegisterPair()) {
         __ cmpl(left.AsRegisterPairHigh<Register>(), right.AsRegisterPairHigh<Register>());
       } else {
         DCHECK(right.IsDoubleStackSlot());
@@ -2248,23 +2503,33 @@
         DCHECK(right.IsDoubleStackSlot());
         __ cmpl(left.AsRegisterPairLow<Register>(), Address(ESP, right.GetStackIndex()));
       }
-      __ movl(output, Immediate(0));
-      __ j(kEqual, &done);
-      __ j(kBelow, &less);  // Unsigned compare.
-
-      __ Bind(&greater);
-      __ movl(output, Immediate(1));
-      __ jmp(&done);
-
-      __ Bind(&less);
-      __ movl(output, Immediate(-1));
-
-      __ Bind(&done);
+      break;
+    }
+    case Primitive::kPrimFloat: {
+      __ ucomiss(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      __ j(kUnordered, compare->IsGtBias() ? &greater : &less);
+      break;
+    }
+    case Primitive::kPrimDouble: {
+      __ ucomisd(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      __ j(kUnordered, compare->IsGtBias() ? &greater : &less);
       break;
     }
     default:
-      LOG(FATAL) << "Unimplemented compare type " << compare->InputAt(0)->GetType();
+      LOG(FATAL) << "Unexpected type for compare operation " << compare->InputAt(0)->GetType();
   }
+  __ movl(out, Immediate(0));
+  __ j(kEqual, &done);
+  __ j(kBelow, &less);  // kBelow is for CF (unsigned & floats).
+
+  __ Bind(&greater);
+  __ movl(out, Immediate(1));
+  __ jmp(&done);
+
+  __ Bind(&less);
+  __ movl(out, Immediate(-1));
+
+  __ Bind(&done);
 }
 
 void LocationsBuilderX86::VisitPhi(HPhi* instruction) {
@@ -2309,33 +2574,33 @@
 
 void InstructionCodeGeneratorX86::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
   Primitive::Type field_type = instruction->GetFieldType();
 
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      ByteRegister value = locations->InAt(1).As<ByteRegister>();
+      ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>();
       __ movb(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ movw(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ movl(Address(obj, offset), value);
 
       if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-        Register temp = locations->GetTemp(0).As<Register>();
-        Register card = locations->GetTemp(1).As<Register>();
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+        Register card = locations->GetTemp(1).AsRegister<Register>();
         codegen_->MarkGCCard(temp, card, obj, value);
       }
       break;
@@ -2349,13 +2614,13 @@
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movss(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movsd(Address(obj, offset), value);
       break;
     }
@@ -2387,37 +2652,37 @@
 
 void InstructionCodeGeneratorX86::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movzxb(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimByte: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movsxb(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimShort: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movsxw(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimChar: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movzxw(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movl(out, Address(obj, offset));
       break;
     }
@@ -2430,13 +2695,13 @@
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movss(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movsd(out, Address(obj, offset));
       break;
     }
@@ -2464,7 +2729,7 @@
   Location obj = locations->InAt(0);
 
   if (obj.IsRegister()) {
-    __ cmpl(obj.As<Register>(), Immediate(0));
+    __ cmpl(obj.AsRegister<Register>(), Immediate(0));
   } else if (obj.IsStackSlot()) {
     __ cmpl(Address(ESP, obj.GetStackIndex()), Immediate(0));
   } else {
@@ -2486,54 +2751,54 @@
 
 void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
         __ movzxb(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset));
       } else {
-        __ movzxb(out, Address(obj, index.As<Register>(), TIMES_1, data_offset));
+        __ movzxb(out, Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimByte: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int8_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
         __ movsxb(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset));
       } else {
-        __ movsxb(out, Address(obj, index.As<Register>(), TIMES_1, data_offset));
+        __ movsxb(out, Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimShort: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int16_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
         __ movsxw(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset));
       } else {
-        __ movsxw(out, Address(obj, index.As<Register>(), TIMES_2, data_offset));
+        __ movsxw(out, Address(obj, index.AsRegister<Register>(), TIMES_2, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimChar: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
         __ movzxw(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset));
       } else {
-        __ movzxw(out, Address(obj, index.As<Register>(), TIMES_2, data_offset));
+        __ movzxw(out, Address(obj, index.AsRegister<Register>(), TIMES_2, data_offset));
       }
       break;
     }
@@ -2541,12 +2806,12 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       if (index.IsConstant()) {
         __ movl(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset));
       } else {
-        __ movl(out, Address(obj, index.As<Register>(), TIMES_4, data_offset));
+        __ movl(out, Address(obj, index.AsRegister<Register>(), TIMES_4, data_offset));
       }
       break;
     }
@@ -2560,9 +2825,9 @@
         __ movl(out.AsRegisterPairHigh<Register>(), Address(obj, offset + kX86WordSize));
       } else {
         __ movl(out.AsRegisterPairLow<Register>(),
-                Address(obj, index.As<Register>(), TIMES_8, data_offset));
+                Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset));
         __ movl(out.AsRegisterPairHigh<Register>(),
-                Address(obj, index.As<Register>(), TIMES_8, data_offset + kX86WordSize));
+                Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset + kX86WordSize));
       }
       break;
     }
@@ -2622,7 +2887,7 @@
 
 void InstructionCodeGeneratorX86::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
   Location value = locations->InAt(2);
   Primitive::Type value_type = instruction->GetComponentType();
@@ -2637,17 +2902,17 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
         if (value.IsRegister()) {
-          __ movb(Address(obj, offset), value.As<ByteRegister>());
+          __ movb(Address(obj, offset), value.AsRegister<ByteRegister>());
         } else {
           __ movb(Address(obj, offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       } else {
         if (value.IsRegister()) {
-          __ movb(Address(obj, index.As<Register>(), TIMES_1, data_offset),
-                  value.As<ByteRegister>());
+          __ movb(Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset),
+                  value.AsRegister<ByteRegister>());
         } else {
-          __ movb(Address(obj, index.As<Register>(), TIMES_1, data_offset),
+          __ movb(Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
@@ -2660,17 +2925,17 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
         if (value.IsRegister()) {
-          __ movw(Address(obj, offset), value.As<Register>());
+          __ movw(Address(obj, offset), value.AsRegister<Register>());
         } else {
           __ movw(Address(obj, offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       } else {
         if (value.IsRegister()) {
-          __ movw(Address(obj, index.As<Register>(), TIMES_2, data_offset),
-                  value.As<Register>());
+          __ movw(Address(obj, index.AsRegister<Register>(), TIMES_2, data_offset),
+                  value.AsRegister<Register>());
         } else {
-          __ movw(Address(obj, index.As<Register>(), TIMES_2, data_offset),
+          __ movw(Address(obj, index.AsRegister<Register>(), TIMES_2, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
@@ -2682,9 +2947,10 @@
       if (!needs_runtime_call) {
         uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
         if (index.IsConstant()) {
-          size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+          size_t offset =
+              (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
           if (value.IsRegister()) {
-            __ movl(Address(obj, offset), value.As<Register>());
+            __ movl(Address(obj, offset), value.AsRegister<Register>());
           } else {
             DCHECK(value.IsConstant()) << value;
             __ movl(Address(obj, offset),
@@ -2693,19 +2959,19 @@
         } else {
           DCHECK(index.IsRegister()) << index;
           if (value.IsRegister()) {
-            __ movl(Address(obj, index.As<Register>(), TIMES_4, data_offset),
-                    value.As<Register>());
+            __ movl(Address(obj, index.AsRegister<Register>(), TIMES_4, data_offset),
+                    value.AsRegister<Register>());
           } else {
             DCHECK(value.IsConstant()) << value;
-            __ movl(Address(obj, index.As<Register>(), TIMES_4, data_offset),
+            __ movl(Address(obj, index.AsRegister<Register>(), TIMES_4, data_offset),
                     Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
           }
         }
 
         if (needs_write_barrier) {
-          Register temp = locations->GetTemp(0).As<Register>();
-          Register card = locations->GetTemp(1).As<Register>();
-          codegen_->MarkGCCard(temp, card, obj, value.As<Register>());
+          Register temp = locations->GetTemp(0).AsRegister<Register>();
+          Register card = locations->GetTemp(1).AsRegister<Register>();
+          codegen_->MarkGCCard(temp, card, obj, value.AsRegister<Register>());
         }
       } else {
         DCHECK_EQ(value_type, Primitive::kPrimNot);
@@ -2731,16 +2997,16 @@
         }
       } else {
         if (value.IsRegisterPair()) {
-          __ movl(Address(obj, index.As<Register>(), TIMES_8, data_offset),
+          __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset),
                   value.AsRegisterPairLow<Register>());
-          __ movl(Address(obj, index.As<Register>(), TIMES_8, data_offset + kX86WordSize),
+          __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset + kX86WordSize),
                   value.AsRegisterPairHigh<Register>());
         } else {
           DCHECK(value.IsConstant());
           int64_t val = value.GetConstant()->AsLongConstant()->GetValue();
-          __ movl(Address(obj, index.As<Register>(), TIMES_8, data_offset),
+          __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset),
                   Immediate(Low32Bits(val)));
-          __ movl(Address(obj, index.As<Register>(), TIMES_8, data_offset + kX86WordSize),
+          __ movl(Address(obj, index.AsRegister<Register>(), TIMES_8, data_offset + kX86WordSize),
                   Immediate(High32Bits(val)));
         }
       }
@@ -2767,8 +3033,8 @@
 void InstructionCodeGeneratorX86::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
-  Register obj = locations->InAt(0).As<Register>();
-  Register out = locations->Out().As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
+  Register out = locations->Out().AsRegister<Register>();
   __ movl(out, Address(obj, offset));
 }
 
@@ -2788,8 +3054,8 @@
       instruction, locations->InAt(0), locations->InAt(1));
   codegen_->AddSlowPath(slow_path);
 
-  Register index = locations->InAt(0).As<Register>();
-  Register length = locations->InAt(1).As<Register>();
+  Register index = locations->InAt(0).AsRegister<Register>();
+  Register length = locations->InAt(1).AsRegister<Register>();
 
   __ cmpl(index, length);
   __ j(kAboveEqual, slow_path->GetEntryLabel());
@@ -2866,14 +3132,14 @@
 
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
-      __ movl(destination.As<Register>(), source.As<Register>());
+      __ movl(destination.AsRegister<Register>(), source.AsRegister<Register>());
     } else {
       DCHECK(destination.IsStackSlot());
-      __ movl(Address(ESP, destination.GetStackIndex()), source.As<Register>());
+      __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegister<Register>());
     }
   } else if (source.IsStackSlot()) {
     if (destination.IsRegister()) {
-      __ movl(destination.As<Register>(), Address(ESP, source.GetStackIndex()));
+      __ movl(destination.AsRegister<Register>(), Address(ESP, source.GetStackIndex()));
     } else {
       DCHECK(destination.IsStackSlot());
       MoveMemoryToMemory(destination.GetStackIndex(),
@@ -2883,7 +3149,7 @@
     HIntConstant* instruction = source.GetConstant()->AsIntConstant();
     Immediate imm(instruction->AsIntConstant()->GetValue());
     if (destination.IsRegister()) {
-      __ movl(destination.As<Register>(), imm);
+      __ movl(destination.AsRegister<Register>(), imm);
     } else {
       __ movl(Address(ESP, destination.GetStackIndex()), imm);
     }
@@ -2925,11 +3191,11 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgl(destination.As<Register>(), source.As<Register>());
+    __ xchgl(destination.AsRegister<Register>(), source.AsRegister<Register>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
-    Exchange(source.As<Register>(), destination.GetStackIndex());
+    Exchange(source.AsRegister<Register>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
-    Exchange(destination.As<Register>(), source.GetStackIndex());
+    Exchange(destination.AsRegister<Register>(), source.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
     Exchange(destination.GetStackIndex(), source.GetStackIndex());
   } else {
@@ -2955,7 +3221,7 @@
 }
 
 void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) {
-  Register out = cls->GetLocations()->Out().As<Register>();
+  Register out = cls->GetLocations()->Out().AsRegister<Register>();
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
@@ -2994,7 +3260,8 @@
   SlowPathCodeX86* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86(
       check->GetLoadClass(), check, check->GetDexPc(), true);
   codegen_->AddSlowPath(slow_path);
-  GenerateClassInitializationCheck(slow_path, check->GetLocations()->InAt(0).As<Register>());
+  GenerateClassInitializationCheck(slow_path,
+                                   check->GetLocations()->InAt(0).AsRegister<Register>());
 }
 
 void InstructionCodeGeneratorX86::GenerateClassInitializationCheck(
@@ -3015,37 +3282,37 @@
 
 void InstructionCodeGeneratorX86::VisitStaticFieldGet(HStaticFieldGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register cls = locations->InAt(0).As<Register>();
+  Register cls = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movzxb(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimByte: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movsxb(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimShort: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movsxw(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimChar: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movzxw(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register out = locations->Out().As<Register>();
+      Register out = locations->Out().AsRegister<Register>();
       __ movl(out, Address(cls, offset));
       break;
     }
@@ -3058,13 +3325,13 @@
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movss(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movsd(out, Address(cls, offset));
       break;
     }
@@ -3102,33 +3369,33 @@
 
 void InstructionCodeGeneratorX86::VisitStaticFieldSet(HStaticFieldSet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register cls = locations->InAt(0).As<Register>();
+  Register cls = locations->InAt(0).AsRegister<Register>();
   uint32_t offset = instruction->GetFieldOffset().Uint32Value();
   Primitive::Type field_type = instruction->GetFieldType();
 
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      ByteRegister value = locations->InAt(1).As<ByteRegister>();
+      ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>();
       __ movb(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ movw(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      Register value = locations->InAt(1).As<Register>();
+      Register value = locations->InAt(1).AsRegister<Register>();
       __ movl(Address(cls, offset), value);
 
       if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-        Register temp = locations->GetTemp(0).As<Register>();
-        Register card = locations->GetTemp(1).As<Register>();
+        Register temp = locations->GetTemp(0).AsRegister<Register>();
+        Register card = locations->GetTemp(1).AsRegister<Register>();
         codegen_->MarkGCCard(temp, card, cls, value);
       }
       break;
@@ -3142,13 +3409,13 @@
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movss(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movsd(Address(cls, offset), value);
       break;
     }
@@ -3169,9 +3436,10 @@
   SlowPathCodeX86* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load);
   codegen_->AddSlowPath(slow_path);
 
-  Register out = load->GetLocations()->Out().As<Register>();
+  Register out = load->GetLocations()->Out().AsRegister<Register>();
   codegen_->LoadCurrentMethod(out);
-  __ movl(out, Address(out, mirror::ArtMethod::DexCacheStringsOffset().Int32Value()));
+  __ movl(out, Address(out, mirror::ArtMethod::DeclaringClassOffset().Int32Value()));
+  __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
   __ movl(out, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
   __ testl(out, out);
   __ j(kEqual, slow_path->GetEntryLabel());
@@ -3186,7 +3454,7 @@
 
 void InstructionCodeGeneratorX86::VisitLoadException(HLoadException* load) {
   Address address = Address::Absolute(Thread::ExceptionOffset<kX86WordSize>().Int32Value());
-  __ fs()->movl(load->GetLocations()->Out().As<Register>(), address);
+  __ fs()->movl(load->GetLocations()->Out().AsRegister<Register>(), address);  // Read, then clear.
   __ fs()->movl(address, Immediate(0));
 }
 
@@ -3214,9 +3482,9 @@
 
 void InstructionCodeGeneratorX86::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   Location cls = locations->InAt(1);
-  Register out = locations->Out().As<Register>();
+  Register out = locations->Out().AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   Label done, zero;
   SlowPathCodeX86* slow_path = nullptr;
@@ -3228,7 +3496,7 @@
   __ movl(out, Address(obj, class_offset));
   // Compare the class of `obj` with `cls`.
   if (cls.IsRegister()) {
-    __ cmpl(out, cls.As<Register>());
+    __ cmpl(out, cls.AsRegister<Register>());
   } else {
     DCHECK(cls.IsStackSlot()) << cls;
     __ cmpl(out, Address(ESP, cls.GetStackIndex()));
@@ -3267,9 +3535,9 @@
 
 void InstructionCodeGeneratorX86::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  Register obj = locations->InAt(0).As<Register>();
+  Register obj = locations->InAt(0).AsRegister<Register>();
   Location cls = locations->InAt(1);
-  Register temp = locations->GetTemp(0).As<Register>();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   SlowPathCodeX86* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86(
       instruction, locations->InAt(1), locations->GetTemp(0), instruction->GetDexPc());
@@ -3282,7 +3550,7 @@
 
   // Compare the class of `obj` with `cls`.
   if (cls.IsRegister()) {
-    __ cmpl(temp, cls.As<Register>());
+    __ cmpl(temp, cls.AsRegister<Register>());
   } else {
     DCHECK(cls.IsStackSlot()) << cls;
     __ cmpl(temp, Address(ESP, cls.GetStackIndex()));
@@ -3341,30 +3609,33 @@
   if (instruction->GetResultType() == Primitive::kPrimInt) {
     if (second.IsRegister()) {
       if (instruction->IsAnd()) {
-        __ andl(first.As<Register>(), second.As<Register>());
+        __ andl(first.AsRegister<Register>(), second.AsRegister<Register>());
       } else if (instruction->IsOr()) {
-        __ orl(first.As<Register>(), second.As<Register>());
+        __ orl(first.AsRegister<Register>(), second.AsRegister<Register>());
       } else {
         DCHECK(instruction->IsXor());
-        __ xorl(first.As<Register>(), second.As<Register>());
+        __ xorl(first.AsRegister<Register>(), second.AsRegister<Register>());
       }
     } else if (second.IsConstant()) {
       if (instruction->IsAnd()) {
-        __ andl(first.As<Register>(), Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
+        __ andl(first.AsRegister<Register>(),
+                Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
       } else if (instruction->IsOr()) {
-        __ orl(first.As<Register>(), Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
+        __ orl(first.AsRegister<Register>(),
+               Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
       } else {
         DCHECK(instruction->IsXor());
-        __ xorl(first.As<Register>(), Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
+        __ xorl(first.AsRegister<Register>(),
+                Immediate(second.GetConstant()->AsIntConstant()->GetValue()));
       }
     } else {
       if (instruction->IsAnd()) {
-        __ andl(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+        __ andl(first.AsRegister<Register>(), Address(ESP, second.GetStackIndex()));
       } else if (instruction->IsOr()) {
-        __ orl(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+        __ orl(first.AsRegister<Register>(), Address(ESP, second.GetStackIndex()));
       } else {
         DCHECK(instruction->IsXor());
-        __ xorl(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+        __ xorl(first.AsRegister<Register>(), Address(ESP, second.GetStackIndex()));
       }
     }
   } else {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 8252f81..aed06c0 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -25,7 +25,8 @@
 namespace art {
 namespace x86 {
 
-static constexpr size_t kX86WordSize = 4;
+// Local alias of kX86PointerSize; an x86-specific name helps prevent copying mistakes.
+static constexpr size_t kX86WordSize = kX86PointerSize;
 
 class CodeGeneratorX86;
 class SlowPathCodeX86;
@@ -103,6 +104,7 @@
  private:
   void HandleBitwiseOperation(HBinaryOperation* instruction);
   void HandleInvoke(HInvoke* invoke);
+  void HandleShift(HBinaryOperation* instruction);
 
   CodeGeneratorX86* const codegen_;
   InvokeDexCallingConventionVisitor parameter_visitor_;
@@ -131,6 +133,10 @@
   void GenerateClassInitializationCheck(SlowPathCodeX86* slow_path, Register class_reg);
   void HandleBitwiseOperation(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
+  void HandleShift(HBinaryOperation* instruction);
+  void GenerateShlLong(const Location& loc, Register shifter);
+  void GenerateShrLong(const Location& loc, Register shifter);
+  void GenerateUShrLong(const Location& loc, Register shifter);
 
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index e9c67e3..4d70efc 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -539,37 +539,37 @@
   }
   if (destination.IsRegister()) {
     if (source.IsRegister()) {
-      __ movq(destination.As<CpuRegister>(), source.As<CpuRegister>());
+      __ movq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
     } else if (source.IsFpuRegister()) {
-      __ movd(destination.As<CpuRegister>(), source.As<XmmRegister>());
+      __ movd(destination.AsRegister<CpuRegister>(), source.AsFpuRegister<XmmRegister>());
     } else if (source.IsStackSlot()) {
-      __ movl(destination.As<CpuRegister>(),
+      __ movl(destination.AsRegister<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ movq(destination.As<CpuRegister>(),
+      __ movq(destination.AsRegister<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
     }
   } else if (destination.IsFpuRegister()) {
     if (source.IsRegister()) {
-      __ movd(destination.As<XmmRegister>(), source.As<CpuRegister>());
+      __ movd(destination.AsFpuRegister<XmmRegister>(), source.AsRegister<CpuRegister>());
     } else if (source.IsFpuRegister()) {
-      __ movaps(destination.As<XmmRegister>(), source.As<XmmRegister>());
+      __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
     } else if (source.IsStackSlot()) {
-      __ movss(destination.As<XmmRegister>(),
+      __ movss(destination.AsFpuRegister<XmmRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ movsd(destination.As<XmmRegister>(),
+      __ movsd(destination.AsFpuRegister<XmmRegister>(),
                Address(CpuRegister(RSP), source.GetStackIndex()));
     }
   } else if (destination.IsStackSlot()) {
     if (source.IsRegister()) {
       __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()),
-              source.As<CpuRegister>());
+              source.AsRegister<CpuRegister>());
     } else if (source.IsFpuRegister()) {
       __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
-               source.As<XmmRegister>());
+               source.AsFpuRegister<XmmRegister>());
     } else {
       DCHECK(source.IsStackSlot());
       __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -579,10 +579,10 @@
     DCHECK(destination.IsDoubleStackSlot());
     if (source.IsRegister()) {
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()),
-              source.As<CpuRegister>());
+              source.AsRegister<CpuRegister>());
     } else if (source.IsFpuRegister()) {
       __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
-               source.As<XmmRegister>());
+               source.AsFpuRegister<XmmRegister>());
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -604,7 +604,7 @@
     if (const_to_move->IsIntConstant()) {
       Immediate imm(const_to_move->AsIntConstant()->GetValue());
       if (location.IsRegister()) {
-        __ movl(location.As<CpuRegister>(), imm);
+        __ movl(location.AsRegister<CpuRegister>(), imm);
       } else if (location.IsStackSlot()) {
         __ movl(Address(CpuRegister(RSP), location.GetStackIndex()), imm);
       } else {
@@ -614,7 +614,7 @@
     } else if (const_to_move->IsLongConstant()) {
       int64_t value = const_to_move->AsLongConstant()->GetValue();
       if (location.IsRegister()) {
-        __ movq(location.As<CpuRegister>(), Immediate(value));
+        __ movq(location.AsRegister<CpuRegister>(), Immediate(value));
       } else if (location.IsDoubleStackSlot()) {
         __ movq(CpuRegister(TMP), Immediate(value));
         __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
@@ -637,7 +637,8 @@
 
       case Primitive::kPrimLong:
       case Primitive::kPrimDouble:
-        Move(location, Location::DoubleStackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+        Move(location,
+             Location::DoubleStackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
         break;
 
       default:
@@ -741,7 +742,7 @@
         // Materialized condition, compare against 0.
         Location lhs = if_instr->GetLocations()->InAt(0);
         if (lhs.IsRegister()) {
-          __ cmpl(lhs.As<CpuRegister>(), Immediate(0));
+          __ cmpl(lhs.AsRegister<CpuRegister>(), Immediate(0));
         } else {
           __ cmpl(Address(CpuRegister(RSP), lhs.GetStackIndex()),
                   Immediate(0));
@@ -755,12 +756,12 @@
       Location lhs = cond->GetLocations()->InAt(0);
       Location rhs = cond->GetLocations()->InAt(1);
       if (rhs.IsRegister()) {
-        __ cmpl(lhs.As<CpuRegister>(), rhs.As<CpuRegister>());
+        __ cmpl(lhs.AsRegister<CpuRegister>(), rhs.AsRegister<CpuRegister>());
       } else if (rhs.IsConstant()) {
-        __ cmpl(lhs.As<CpuRegister>(),
+        __ cmpl(lhs.AsRegister<CpuRegister>(),
                 Immediate(rhs.GetConstant()->AsIntConstant()->GetValue()));
       } else {
-        __ cmpl(lhs.As<CpuRegister>(),
+        __ cmpl(lhs.AsRegister<CpuRegister>(),
                 Address(CpuRegister(RSP), rhs.GetStackIndex()));
       }
       __ j(X86_64Condition(cond->AsCondition()->GetCondition()),
@@ -831,17 +832,17 @@
 void InstructionCodeGeneratorX86_64::VisitCondition(HCondition* comp) {
   if (comp->NeedsMaterialization()) {
     LocationSummary* locations = comp->GetLocations();
-    CpuRegister reg = locations->Out().As<CpuRegister>();
+    CpuRegister reg = locations->Out().AsRegister<CpuRegister>();
     // Clear register: setcc only sets the low byte.
     __ xorq(reg, reg);
     if (locations->InAt(1).IsRegister()) {
-      __ cmpl(locations->InAt(0).As<CpuRegister>(),
-              locations->InAt(1).As<CpuRegister>());
+      __ cmpl(locations->InAt(0).AsRegister<CpuRegister>(),
+              locations->InAt(1).AsRegister<CpuRegister>());
     } else if (locations->InAt(1).IsConstant()) {
-      __ cmpl(locations->InAt(0).As<CpuRegister>(),
+      __ cmpl(locations->InAt(0).AsRegister<CpuRegister>(),
               Immediate(locations->InAt(1).GetConstant()->AsIntConstant()->GetValue()));
     } else {
-      __ cmpl(locations->InAt(0).As<CpuRegister>(),
+      __ cmpl(locations->InAt(0).AsRegister<CpuRegister>(),
               Address(CpuRegister(RSP), locations->InAt(1).GetStackIndex()));
     }
     __ setcc(X86_64Condition(comp->GetCondition()), reg);
@@ -899,33 +900,61 @@
 void LocationsBuilderX86_64::VisitCompare(HCompare* compare) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(compare, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  switch (compare->InputAt(0)->GetType()) {  // Constraints depend on the operand type.
+    case Primitive::kPrimLong: {  // Both operands and the result live in core registers.
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+      break;
+    }
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble: {  // FPU-register operands, core-register result.
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    }
+    default:  // Any other operand type is a fatal error.
+      LOG(FATAL) << "Unexpected type for compare operation " << compare->InputAt(0)->GetType();
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitCompare(HCompare* compare) {
-  Label greater, done;
   LocationSummary* locations = compare->GetLocations();
-  switch (compare->InputAt(0)->GetType()) {
-    case Primitive::kPrimLong:
-      __ cmpq(locations->InAt(0).As<CpuRegister>(),
-              locations->InAt(1).As<CpuRegister>());
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();  // Receives -1, 0 or 1.
+  Location left = locations->InAt(0);
+  Location right = locations->InAt(1);
+
+  Label less, greater, done;
+  Primitive::Type type = compare->InputAt(0)->GetType();  // Operand type picks the compare.
+  switch (type) {
+    case Primitive::kPrimLong: {  // Integer compare; signed kLess is tested below.
+      __ cmpq(left.AsRegister<CpuRegister>(), right.AsRegister<CpuRegister>());
       break;
+    }
+    case Primitive::kPrimFloat: {
+      __ ucomiss(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      __ j(kUnordered, compare->IsGtBias() ? &greater : &less);  // Unordered: bias decides.
+      break;
+    }
+    case Primitive::kPrimDouble: {
+      __ ucomisd(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      __ j(kUnordered, compare->IsGtBias() ? &greater : &less);  // Unordered: bias decides.
+      break;
+    }
     default:
-      LOG(FATAL) << "Unimplemented compare type " << compare->InputAt(0)->GetType();
+      LOG(FATAL) << "Unexpected compare type " << type;
   }
-
-  CpuRegister output = locations->Out().As<CpuRegister>();
-  __ movl(output, Immediate(0));
+  __ movl(out, Immediate(0));  // Preset "equal"; mov leaves EFLAGS untouched.
   __ j(kEqual, &done);
-  __ j(kGreater, &greater);
-
-  __ movl(output, Immediate(-1));
-  __ jmp(&done);
+  __ j(type == Primitive::kPrimLong ? kLess : kBelow, &less);  // ucomis{s,d}: less sets CF.
 
   __ Bind(&greater);
-  __ movl(output, Immediate(1));
+  __ movl(out, Immediate(1));  // left > right, or unordered with gt bias.
+  __ jmp(&done);
+
+  __ Bind(&less);
+  __ movl(out, Immediate(-1));  // left < right, or unordered without gt bias.
 
   __ Bind(&done);
 }
@@ -1019,12 +1048,12 @@
       case Primitive::kPrimInt:
       case Primitive::kPrimNot:
       case Primitive::kPrimLong:
-        DCHECK_EQ(ret->GetLocations()->InAt(0).As<CpuRegister>().AsRegister(), RAX);
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsRegister<CpuRegister>().AsRegister(), RAX);
         break;
 
       case Primitive::kPrimFloat:
       case Primitive::kPrimDouble:
-        DCHECK_EQ(ret->GetLocations()->InAt(0).As<XmmRegister>().AsFloatRegister(),
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsFpuRegister<XmmRegister>().AsFloatRegister(),
                   XMM0);
         break;
 
@@ -1097,7 +1126,7 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitInvokeStatic(HInvokeStatic* invoke) {
-  CpuRegister temp = invoke->GetLocations()->GetTemp(0).As<CpuRegister>();
+  CpuRegister temp = invoke->GetLocations()->GetTemp(0).AsRegister<CpuRegister>();
   // TODO: Implement all kinds of calls:
   // 1) boot -> boot
   // 2) app -> boot
@@ -1113,7 +1142,7 @@
   __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetIndexInDexCache())));
   // (temp + offset_of_quick_compiled_code)()
   __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86_64PointerSize).SizeValue()));
+      kX86_64WordSize).SizeValue()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -1156,7 +1185,7 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
-  CpuRegister temp = invoke->GetLocations()->GetTemp(0).As<CpuRegister>();
+  CpuRegister temp = invoke->GetLocations()->GetTemp(0).AsRegister<CpuRegister>();
   size_t method_offset = mirror::Class::EmbeddedVTableOffset().SizeValue() +
           invoke->GetVTableIndex() * sizeof(mirror::Class::VTableEntry);
   LocationSummary* locations = invoke->GetLocations();
@@ -1167,13 +1196,13 @@
     __ movl(temp, Address(CpuRegister(RSP), receiver.GetStackIndex()));
     __ movl(temp, Address(temp, class_offset));
   } else {
-    __ movl(temp, Address(receiver.As<CpuRegister>(), class_offset));
+    __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   }
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
   __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86_64PointerSize).SizeValue()));
+      kX86_64WordSize).SizeValue()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -1187,7 +1216,7 @@
 
 void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
-  CpuRegister temp = invoke->GetLocations()->GetTemp(0).As<CpuRegister>();
+  CpuRegister temp = invoke->GetLocations()->GetTemp(0).AsRegister<CpuRegister>();
   uint32_t method_offset = mirror::Class::EmbeddedImTableOffset().Uint32Value() +
           (invoke->GetImtIndex() % mirror::Class::kImtSize) * sizeof(mirror::Class::ImTableEntry);
   LocationSummary* locations = invoke->GetLocations();
@@ -1195,7 +1224,7 @@
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
 
   // Set the hidden argument.
-  __ movq(invoke->GetLocations()->GetTemp(1).As<CpuRegister>(),
+  __ movq(invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>(),
           Immediate(invoke->GetDexMethodIndex()));
 
   // temp = object->GetClass();
@@ -1203,13 +1232,13 @@
     __ movl(temp, Address(CpuRegister(RSP), receiver.GetStackIndex()));
     __ movl(temp, Address(temp, class_offset));
   } else {
-    __ movl(temp, Address(receiver.As<CpuRegister>(), class_offset));
+    __ movl(temp, Address(receiver.AsRegister<CpuRegister>(), class_offset));
   }
   // temp = temp->GetImtEntryAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
   __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86_64PointerSize).SizeValue()));
+      kX86_64WordSize).SizeValue()));
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -1228,9 +1257,9 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      // Output overlaps as we need a fresh (zero-initialized)
-      // register to perform subtraction from zero.
-      locations->SetOut(Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      locations->AddTemp(Location::RequiresRegister());
+      locations->AddTemp(Location::RequiresFpuRegister());
       break;
 
     default:
@@ -1246,49 +1275,40 @@
     case Primitive::kPrimInt:
       DCHECK(in.IsRegister());
       DCHECK(in.Equals(out));
-      __ negl(out.As<CpuRegister>());
+      __ negl(out.AsRegister<CpuRegister>());
       break;
 
     case Primitive::kPrimLong:
       DCHECK(in.IsRegister());
       DCHECK(in.Equals(out));
-      __ negq(out.As<CpuRegister>());
+      __ negq(out.AsRegister<CpuRegister>());
       break;
 
-    case Primitive::kPrimFloat:
-      DCHECK(in.IsFpuRegister());
-      DCHECK(out.IsFpuRegister());
-      DCHECK(!in.Equals(out));
-      // TODO: Instead of computing negation as a subtraction from
-      // zero, implement it with an exclusive or with value 0x80000000
-      // (mask for bit 31, representing the sign of a single-precision
-      // floating-point number), fetched from a constant pool:
-      //
-      //   xorps out, [RIP:...] // value at RIP is 0x80 00 00 00
-
-      // out = 0
-      __ xorps(out.As<XmmRegister>(), out.As<XmmRegister>());
-      // out = out - in
-      __ subss(out.As<XmmRegister>(), in.As<XmmRegister>());
+    case Primitive::kPrimFloat: {
+      DCHECK(in.Equals(out));
+      CpuRegister constant = locations->GetTemp(0).AsRegister<CpuRegister>();
+      XmmRegister mask = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+      // Implement float negation with an exclusive or with value
+      // 0x80000000 (mask for bit 31, representing the sign of a
+      // single-precision floating-point number).
+      __ movq(constant, Immediate(INT64_C(0x80000000)));
+      __ movd(mask, constant);
+      __ xorps(out.AsFpuRegister<XmmRegister>(), mask);
       break;
+    }
 
-    case Primitive::kPrimDouble:
-      DCHECK(in.IsFpuRegister());
-      DCHECK(out.IsFpuRegister());
-      DCHECK(!in.Equals(out));
-      // TODO: Instead of computing negation as a subtraction from
-      // zero, implement it with an exclusive or with value
+    case Primitive::kPrimDouble: {
+      DCHECK(in.Equals(out));
+      CpuRegister constant = locations->GetTemp(0).AsRegister<CpuRegister>();
+      XmmRegister mask = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+      // Implement double negation with an exclusive or with value
       // 0x8000000000000000 (mask for bit 63, representing the sign of
-      // a double-precision floating-point number), fetched from a
-      // constant pool:
-      //
-      //   xorpd out, [RIP:...] // value at RIP is 0x80 00 00 00 00 00 00 00
-
-      // out = 0
-      __ xorpd(out.As<XmmRegister>(), out.As<XmmRegister>());
-      // out = out - in
-      __ subsd(out.As<XmmRegister>(), in.As<XmmRegister>());
+      // a double-precision floating-point number).
+      __ movq(constant, Immediate(INT64_C(0x8000000000000000)));
+      __ movd(mask, constant);
+      __ xorpd(out.AsFpuRegister<XmmRegister>(), mask);
       break;
+    }
 
     default:
       LOG(FATAL) << "Unexpected neg type " << neg->GetResultType();
@@ -1300,6 +1320,7 @@
       new (GetGraph()->GetArena()) LocationSummary(conversion, LocationSummary::kNoCall);
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
+  DCHECK_NE(result_type, input_type);
   switch (result_type) {
     case Primitive::kPrimByte:
       switch (input_type) {
@@ -1383,7 +1404,6 @@
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
-        case Primitive::kPrimChar:
           // Processing a Dex `int-to-char' instruction.
           locations->SetInAt(0, Location::Any());
           locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -1407,6 +1427,11 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-float' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1430,6 +1455,11 @@
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          locations->SetInAt(0, Location::RequiresRegister());
+          locations->SetOut(Location::RequiresFpuRegister());
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1453,6 +1483,7 @@
   Location in = locations->InAt(0);
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
+  DCHECK_NE(result_type, input_type);
   switch (result_type) {
     case Primitive::kPrimByte:
       switch (input_type) {
@@ -1461,13 +1492,13 @@
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-byte' instruction.
           if (in.IsRegister()) {
-            __ movsxb(out.As<CpuRegister>(), in.As<CpuRegister>());
+            __ movsxb(out.AsRegister<CpuRegister>(), in.AsRegister<CpuRegister>());
           } else if (in.IsStackSlot()) {
-            __ movsxb(out.As<CpuRegister>(),
+            __ movsxb(out.AsRegister<CpuRegister>(),
                       Address(CpuRegister(RSP), in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
-            __ movl(out.As<CpuRegister>(),
+            __ movl(out.AsRegister<CpuRegister>(),
                     Immediate(static_cast<int8_t>(in.GetConstant()->AsIntConstant()->GetValue())));
           }
           break;
@@ -1485,13 +1516,13 @@
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-short' instruction.
           if (in.IsRegister()) {
-            __ movsxw(out.As<CpuRegister>(), in.As<CpuRegister>());
+            __ movsxw(out.AsRegister<CpuRegister>(), in.AsRegister<CpuRegister>());
           } else if (in.IsStackSlot()) {
-            __ movsxw(out.As<CpuRegister>(),
+            __ movsxw(out.AsRegister<CpuRegister>(),
                       Address(CpuRegister(RSP), in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
-            __ movl(out.As<CpuRegister>(),
+            __ movl(out.AsRegister<CpuRegister>(),
                     Immediate(static_cast<int16_t>(in.GetConstant()->AsIntConstant()->GetValue())));
           }
           break;
@@ -1507,15 +1538,15 @@
         case Primitive::kPrimLong:
           // Processing a Dex `long-to-int' instruction.
           if (in.IsRegister()) {
-            __ movl(out.As<CpuRegister>(), in.As<CpuRegister>());
+            __ movl(out.AsRegister<CpuRegister>(), in.AsRegister<CpuRegister>());
           } else if (in.IsDoubleStackSlot()) {
-            __ movl(out.As<CpuRegister>(),
+            __ movl(out.AsRegister<CpuRegister>(),
                     Address(CpuRegister(RSP), in.GetStackIndex()));
           } else {
             DCHECK(in.IsConstant());
             DCHECK(in.GetConstant()->IsLongConstant());
             int64_t value = in.GetConstant()->AsLongConstant()->GetValue();
-            __ movl(out.As<CpuRegister>(), Immediate(static_cast<int32_t>(value)));
+            __ movl(out.AsRegister<CpuRegister>(), Immediate(static_cast<int32_t>(value)));
           }
           break;
 
@@ -1540,7 +1571,7 @@
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-long' instruction.
           DCHECK(in.IsRegister());
-          __ movsxd(out.As<CpuRegister>(), in.As<CpuRegister>());
+          __ movsxd(out.AsRegister<CpuRegister>(), in.AsRegister<CpuRegister>());
           break;
 
         case Primitive::kPrimFloat:
@@ -1560,16 +1591,15 @@
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
-        case Primitive::kPrimChar:
           // Processing a Dex `int-to-char' instruction.
           if (in.IsRegister()) {
-            __ movzxw(out.As<CpuRegister>(), in.As<CpuRegister>());
+            __ movzxw(out.AsRegister<CpuRegister>(), in.AsRegister<CpuRegister>());
           } else if (in.IsStackSlot()) {
-            __ movzxw(out.As<CpuRegister>(),
+            __ movzxw(out.AsRegister<CpuRegister>(),
                       Address(CpuRegister(RSP), in.GetStackIndex()));
           } else {
             DCHECK(in.GetConstant()->IsIntConstant());
-            __ movl(out.As<CpuRegister>(),
+            __ movl(out.AsRegister<CpuRegister>(),
                     Immediate(static_cast<uint16_t>(in.GetConstant()->AsIntConstant()->GetValue())));
           }
           break;
@@ -1582,15 +1612,19 @@
 
     case Primitive::kPrimFloat:
       switch (input_type) {
-          // Processing a Dex `int-to-float' instruction.
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
-          __ cvtsi2ss(out.As<XmmRegister>(), in.As<CpuRegister>());
+          // Processing a Dex `int-to-float' instruction.
+          __ cvtsi2ss(out.AsFpuRegister<XmmRegister>(), in.AsRegister<CpuRegister>(), false);
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-float' instruction.
+          __ cvtsi2ss(out.AsFpuRegister<XmmRegister>(), in.AsRegister<CpuRegister>(), true);
+          break;
+
         case Primitive::kPrimDouble:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1604,15 +1638,19 @@
 
     case Primitive::kPrimDouble:
       switch (input_type) {
-          // Processing a Dex `int-to-double' instruction.
         case Primitive::kPrimByte:
         case Primitive::kPrimShort:
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
-          __ cvtsi2sd(out.As<XmmRegister>(), in.As<CpuRegister>());
+          // Processing a Dex `int-to-double' instruction.
+          __ cvtsi2sd(out.AsFpuRegister<XmmRegister>(), in.AsRegister<CpuRegister>(), false);
           break;
 
         case Primitive::kPrimLong:
+          // Processing a Dex `long-to-double' instruction.
+          __ cvtsi2sd(out.AsFpuRegister<XmmRegister>(), in.AsRegister<CpuRegister>(), true);
+          break;
+
         case Primitive::kPrimFloat:
           LOG(FATAL) << "Type conversion from " << input_type
                      << " to " << result_type << " not yet implemented";
@@ -1670,28 +1708,28 @@
   switch (add->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ addl(first.As<CpuRegister>(), second.As<CpuRegister>());
+        __ addl(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       } else if (second.IsConstant()) {
         Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
-        __ addl(first.As<CpuRegister>(), imm);
+        __ addl(first.AsRegister<CpuRegister>(), imm);
       } else {
-        __ addl(first.As<CpuRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
+        __ addl(first.AsRegister<CpuRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
       }
       break;
     }
 
     case Primitive::kPrimLong: {
-      __ addq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      __ addq(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       break;
     }
 
     case Primitive::kPrimFloat: {
-      __ addss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ addss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ addsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ addsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -1736,27 +1774,27 @@
   switch (sub->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ subl(first.As<CpuRegister>(), second.As<CpuRegister>());
+        __ subl(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       } else if (second.IsConstant()) {
         Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
-        __ subl(first.As<CpuRegister>(), imm);
+        __ subl(first.AsRegister<CpuRegister>(), imm);
       } else {
-        __ subl(first.As<CpuRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
+        __ subl(first.AsRegister<CpuRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
       }
       break;
     }
     case Primitive::kPrimLong: {
-      __ subq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      __ subq(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       break;
     }
 
     case Primitive::kPrimFloat: {
-      __ subss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ subss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ subsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ subsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -1802,28 +1840,29 @@
   switch (mul->GetResultType()) {
     case Primitive::kPrimInt: {
       if (second.IsRegister()) {
-        __ imull(first.As<CpuRegister>(), second.As<CpuRegister>());
+        __ imull(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       } else if (second.IsConstant()) {
         Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
-        __ imull(first.As<CpuRegister>(), imm);
+        __ imull(first.AsRegister<CpuRegister>(), imm);
       } else {
         DCHECK(second.IsStackSlot());
-        __ imull(first.As<CpuRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
+        __ imull(first.AsRegister<CpuRegister>(),
+                 Address(CpuRegister(RSP), second.GetStackIndex()));
       }
       break;
     }
     case Primitive::kPrimLong: {
-      __ imulq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      __ imulq(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       break;
     }
 
     case Primitive::kPrimFloat: {
-      __ mulss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ mulss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ mulsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ mulsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -1840,10 +1879,10 @@
   bool is_div = instruction->IsDiv();
   LocationSummary* locations = instruction->GetLocations();
 
-  CpuRegister out_reg = locations->Out().As<CpuRegister>();
-  CpuRegister second_reg = locations->InAt(1).As<CpuRegister>();
+  CpuRegister out_reg = locations->Out().AsRegister<CpuRegister>();
+  CpuRegister second_reg = locations->InAt(1).AsRegister<CpuRegister>();
 
-  DCHECK_EQ(RAX, locations->InAt(0).As<CpuRegister>().AsRegister());
+  DCHECK_EQ(RAX, locations->InAt(0).AsRegister<CpuRegister>().AsRegister());
   DCHECK_EQ(is_div ? RAX : RDX, out_reg.AsRegister());
 
   SlowPathCodeX86_64* slow_path =
@@ -1915,12 +1954,12 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ divss(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ divss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ divsd(first.As<XmmRegister>(), second.As<XmmRegister>());
+      __ divsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>());
       break;
     }
 
@@ -1993,7 +2032,7 @@
   switch (instruction->GetType()) {
     case Primitive::kPrimInt: {
       if (value.IsRegister()) {
-        __ testl(value.As<CpuRegister>(), value.As<CpuRegister>());
+        __ testl(value.AsRegister<CpuRegister>(), value.AsRegister<CpuRegister>());
         __ j(kEqual, slow_path->GetEntryLabel());
       } else if (value.IsStackSlot()) {
         __ cmpl(Address(CpuRegister(RSP), value.GetStackIndex()), Immediate(0));
@@ -2008,7 +2047,7 @@
     }
     case Primitive::kPrimLong: {
       if (value.IsRegister()) {
-        __ testq(value.As<CpuRegister>(), value.As<CpuRegister>());
+        __ testq(value.AsRegister<CpuRegister>(), value.AsRegister<CpuRegister>());
         __ j(kEqual, slow_path->GetEntryLabel());
       } else if (value.IsDoubleStackSlot()) {
         __ cmpq(Address(CpuRegister(RSP), value.GetStackIndex()), Immediate(0));
@@ -2026,6 +2065,121 @@
   }
 }
 
+// Builds the location summary shared by Shl, Shr and UShr.
+// Only int and long results are handled; any other type is a fatal error.
+void LocationsBuilderX86_64::HandleShift(HBinaryOperation* op) {
+  DCHECK(op->IsShl() || op->IsShr() || op->IsUShr());
+
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(op, LocationSummary::kNoCall);
+
+  switch (op->GetResultType()) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      // The shift count needs to be in CL unless it is a constant.
+      locations->SetInAt(1, Location::ByteRegisterOrConstant(RCX, op->InputAt(1)));
+      // The shift is performed in place on the first input's register.
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected operation type " << op->GetResultType();
+  }
+}
+
+// Emits the shift instruction for Shl (shl), Shr (sar) or UShr (shr), using
+// the 32-bit forms for int results and the 64-bit forms for long results.
+// The count is taken from a register when it has one, else from its constant.
+void InstructionCodeGeneratorX86_64::HandleShift(HBinaryOperation* op) {
+  DCHECK(op->IsShl() || op->IsShr() || op->IsUShr());
+
+  LocationSummary* locations = op->GetLocations();
+  CpuRegister first_reg = locations->InAt(0).AsRegister<CpuRegister>();
+  Location second = locations->InAt(1);
+
+  switch (op->GetResultType()) {
+    case Primitive::kPrimInt: {
+      if (second.IsRegister()) {
+        CpuRegister second_reg = second.AsRegister<CpuRegister>();
+        if (op->IsShl()) {
+          __ shll(first_reg, second_reg);
+        } else if (op->IsShr()) {
+          __ sarl(first_reg, second_reg);
+        } else {
+          __ shrl(first_reg, second_reg);
+        }
+      } else {
+        DCHECK(second.IsConstant()) << second;
+        // Only the low 5 bits of an int shift distance are significant; mask
+        // so that any constant fits the one-byte immediate of the instruction.
+        Immediate imm(second.GetConstant()->AsIntConstant()->GetValue() & 0x1f);
+        if (op->IsShl()) {
+          __ shll(first_reg, imm);
+        } else if (op->IsShr()) {
+          __ sarl(first_reg, imm);
+        } else {
+          __ shrl(first_reg, imm);
+        }
+      }
+      break;
+    }
+    case Primitive::kPrimLong: {
+      if (second.IsRegister()) {
+        CpuRegister second_reg = second.AsRegister<CpuRegister>();
+        if (op->IsShl()) {
+          __ shlq(first_reg, second_reg);
+        } else if (op->IsShr()) {
+          __ sarq(first_reg, second_reg);
+        } else {
+          __ shrq(first_reg, second_reg);
+        }
+      } else {
+        DCHECK(second.IsConstant()) << second;
+        // The count of a long shift is still an int constant; its low 6 bits
+        // are the significant ones, so mask to keep the immediate encodable.
+        Immediate imm(second.GetConstant()->AsIntConstant()->GetValue() & 0x3f);
+        if (op->IsShl()) {
+          __ shlq(first_reg, imm);
+        } else if (op->IsShr()) {
+          __ sarq(first_reg, imm);
+        } else {
+          __ shrq(first_reg, imm);
+        }
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected operation type " << op->GetResultType();
+  }
+}
+
+// Shl, Shr and UShr all delegate to HandleShift, which tells the three
+// operations apart through IsShl()/IsShr()/IsUShr().
+void LocationsBuilderX86_64::VisitShl(HShl* shl) {
+  HandleShift(shl);
+}
+
+void InstructionCodeGeneratorX86_64::VisitShl(HShl* shl) {
+  HandleShift(shl);
+}
+
+void LocationsBuilderX86_64::VisitShr(HShr* shr) {
+  HandleShift(shr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitShr(HShr* shr) {
+  HandleShift(shr);
+}
+
+void LocationsBuilderX86_64::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitUShr(HUShr* ushr) {
+  HandleShift(ushr);
+}
+
 void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
@@ -2095,20 +2235,20 @@
 
 void InstructionCodeGeneratorX86_64::VisitNot(HNot* not_) {
   LocationSummary* locations = not_->GetLocations();
-  DCHECK_EQ(locations->InAt(0).As<CpuRegister>().AsRegister(),
-            locations->Out().As<CpuRegister>().AsRegister());
+  DCHECK_EQ(locations->InAt(0).AsRegister<CpuRegister>().AsRegister(),
+            locations->Out().AsRegister<CpuRegister>().AsRegister());
   Location out = locations->Out();
   switch (not_->InputAt(0)->GetType()) {
     case Primitive::kPrimBoolean:
-      __ xorq(out.As<CpuRegister>(), Immediate(1));
+      __ xorq(out.AsRegister<CpuRegister>(), Immediate(1));
       break;
 
     case Primitive::kPrimInt:
-      __ notl(out.As<CpuRegister>());
+      __ notl(out.AsRegister<CpuRegister>());
       break;
 
     case Primitive::kPrimLong:
-      __ notq(out.As<CpuRegister>());
+      __ notq(out.AsRegister<CpuRegister>());
       break;
 
     default:
@@ -2147,51 +2287,51 @@
 
 void InstructionCodeGeneratorX86_64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   size_t offset = instruction->GetFieldOffset().SizeValue();
   Primitive::Type field_type = instruction->GetFieldType();
 
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movb(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movw(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movl(Address(obj, offset), value);
       if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) {
-        CpuRegister temp = locations->GetTemp(0).As<CpuRegister>();
-        CpuRegister card = locations->GetTemp(1).As<CpuRegister>();
+        CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
+        CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>();
         codegen_->MarkGCCard(temp, card, obj, value);
       }
       break;
     }
 
     case Primitive::kPrimLong: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movq(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movss(Address(obj, offset), value);
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movsd(Address(obj, offset), value);
       break;
     }
@@ -2211,55 +2351,55 @@
 
 void InstructionCodeGeneratorX86_64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   size_t offset = instruction->GetFieldOffset().SizeValue();
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movzxb(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimByte: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movsxb(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimShort: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movsxw(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimChar: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movzxw(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movl(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimLong: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movq(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movss(out, Address(obj, offset));
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movsd(out, Address(obj, offset));
       break;
     }
@@ -2287,7 +2427,7 @@
   Location obj = locations->InAt(0);
 
   if (obj.IsRegister()) {
-    __ cmpl(obj.As<CpuRegister>(), Immediate(0));
+    __ cmpl(obj.AsRegister<CpuRegister>(), Immediate(0));
   } else if (obj.IsStackSlot()) {
     __ cmpl(Address(CpuRegister(RSP), obj.GetStackIndex()), Immediate(0));
   } else {
@@ -2310,54 +2450,54 @@
 
 void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   Location index = locations->InAt(1);
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value();
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
         __ movzxb(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset));
       } else {
-        __ movzxb(out, Address(obj, index.As<CpuRegister>(), TIMES_1, data_offset));
+        __ movzxb(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_1, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimByte: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int8_t)).Uint32Value();
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
         __ movsxb(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset));
       } else {
-        __ movsxb(out, Address(obj, index.As<CpuRegister>(), TIMES_1, data_offset));
+        __ movsxb(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_1, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimShort: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int16_t)).Uint32Value();
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
         __ movsxw(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset));
       } else {
-        __ movsxw(out, Address(obj, index.As<CpuRegister>(), TIMES_2, data_offset));
+        __ movsxw(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_2, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimChar: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value();
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
         __ movzxw(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset));
       } else {
-        __ movzxw(out, Address(obj, index.As<CpuRegister>(), TIMES_2, data_offset));
+        __ movzxw(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_2, data_offset));
       }
       break;
     }
@@ -2366,48 +2506,48 @@
     case Primitive::kPrimNot: {
       DCHECK_EQ(sizeof(mirror::HeapReference<mirror::Object>), sizeof(int32_t));
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value();
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
         __ movl(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset));
       } else {
-        __ movl(out, Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset));
+        __ movl(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimLong: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value();
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       if (index.IsConstant()) {
         __ movq(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset));
       } else {
-        __ movq(out, Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset));
+        __ movq(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_8, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimFloat: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       if (index.IsConstant()) {
         __ movss(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset));
       } else {
-        __ movss(out, Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset));
+        __ movss(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset));
       }
       break;
     }
 
     case Primitive::kPrimDouble: {
       uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       if (index.IsConstant()) {
         __ movsd(out, Address(obj,
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset));
       } else {
-        __ movsd(out, Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset));
+        __ movsd(out, Address(obj, index.AsRegister<CpuRegister>(), TIMES_8, data_offset));
       }
       break;
     }
@@ -2455,7 +2595,7 @@
 
 void InstructionCodeGeneratorX86_64::VisitArraySet(HArraySet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   Location index = locations->InAt(1);
   Location value = locations->InAt(2);
   Primitive::Type value_type = instruction->GetComponentType();
@@ -2470,16 +2610,17 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
         if (value.IsRegister()) {
-          __ movb(Address(obj, offset), value.As<CpuRegister>());
+          __ movb(Address(obj, offset), value.AsRegister<CpuRegister>());
         } else {
-          __ movb(Address(obj, offset), Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+          __ movb(Address(obj, offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       } else {
         if (value.IsRegister()) {
-          __ movb(Address(obj, index.As<CpuRegister>(), TIMES_1, data_offset),
-                  value.As<CpuRegister>());
+          __ movb(Address(obj, index.AsRegister<CpuRegister>(), TIMES_1, data_offset),
+                  value.AsRegister<CpuRegister>());
         } else {
-          __ movb(Address(obj, index.As<CpuRegister>(), TIMES_1, data_offset),
+          __ movb(Address(obj, index.AsRegister<CpuRegister>(), TIMES_1, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
@@ -2492,19 +2633,20 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
         if (value.IsRegister()) {
-          __ movw(Address(obj, offset), value.As<CpuRegister>());
+          __ movw(Address(obj, offset), value.AsRegister<CpuRegister>());
         } else {
           DCHECK(value.IsConstant()) << value;
-          __ movw(Address(obj, offset), Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
+          __ movw(Address(obj, offset),
+                  Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       } else {
         DCHECK(index.IsRegister()) << index;
         if (value.IsRegister()) {
-          __ movw(Address(obj, index.As<CpuRegister>(), TIMES_2, data_offset),
-                  value.As<CpuRegister>());
+          __ movw(Address(obj, index.AsRegister<CpuRegister>(), TIMES_2, data_offset),
+                  value.AsRegister<CpuRegister>());
         } else {
           DCHECK(value.IsConstant()) << value;
-          __ movw(Address(obj, index.As<CpuRegister>(), TIMES_2, data_offset),
+          __ movw(Address(obj, index.AsRegister<CpuRegister>(), TIMES_2, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
       }
@@ -2519,7 +2661,7 @@
           size_t offset =
               (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
           if (value.IsRegister()) {
-            __ movl(Address(obj, offset), value.As<CpuRegister>());
+            __ movl(Address(obj, offset), value.AsRegister<CpuRegister>());
           } else {
             DCHECK(value.IsConstant()) << value;
             __ movl(Address(obj, offset),
@@ -2528,24 +2670,25 @@
         } else {
           DCHECK(index.IsRegister()) << index;
           if (value.IsRegister()) {
-            __ movl(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
-                    value.As<CpuRegister>());
+            __ movl(Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset),
+                    value.AsRegister<CpuRegister>());
           } else {
             DCHECK(value.IsConstant()) << value;
-            __ movl(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
+            __ movl(Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset),
                     Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
           }
         }
 
         if (needs_write_barrier) {
           DCHECK_EQ(value_type, Primitive::kPrimNot);
-          CpuRegister temp = locations->GetTemp(0).As<CpuRegister>();
-          CpuRegister card = locations->GetTemp(1).As<CpuRegister>();
-          codegen_->MarkGCCard(temp, card, obj, value.As<CpuRegister>());
+          CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
+          CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>();
+          codegen_->MarkGCCard(temp, card, obj, value.AsRegister<CpuRegister>());
         }
       } else {
         DCHECK_EQ(value_type, Primitive::kPrimNot);
-        __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAputObject), true));
+        __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAputObject),
+                                        true));
         DCHECK(!codegen_->IsLeafMethod());
         codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
       }
@@ -2557,11 +2700,11 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         DCHECK(value.IsRegister());
-        __ movq(Address(obj, offset), value.As<CpuRegister>());
+        __ movq(Address(obj, offset), value.AsRegister<CpuRegister>());
       } else {
         DCHECK(value.IsRegister());
-        __ movq(Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset),
-                value.As<CpuRegister>());
+        __ movq(Address(obj, index.AsRegister<CpuRegister>(), TIMES_8, data_offset),
+                value.AsRegister<CpuRegister>());
       }
       break;
     }
@@ -2571,11 +2714,11 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
         DCHECK(value.IsFpuRegister());
-        __ movss(Address(obj, offset), value.As<XmmRegister>());
+        __ movss(Address(obj, offset), value.AsFpuRegister<XmmRegister>());
       } else {
         DCHECK(value.IsFpuRegister());
-        __ movss(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
-                value.As<XmmRegister>());
+        __ movss(Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset),
+                value.AsFpuRegister<XmmRegister>());
       }
       break;
     }
@@ -2585,11 +2728,11 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         DCHECK(value.IsFpuRegister());
-        __ movsd(Address(obj, offset), value.As<XmmRegister>());
+        __ movsd(Address(obj, offset), value.AsFpuRegister<XmmRegister>());
       } else {
         DCHECK(value.IsFpuRegister());
-        __ movsd(Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset),
-                value.As<XmmRegister>());
+        __ movsd(Address(obj, index.AsRegister<CpuRegister>(), TIMES_8, data_offset),
+                value.AsFpuRegister<XmmRegister>());
       }
       break;
     }
@@ -2610,8 +2753,8 @@
 void InstructionCodeGeneratorX86_64::VisitArrayLength(HArrayLength* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   uint32_t offset = mirror::Array::LengthOffset().Uint32Value();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
-  CpuRegister out = locations->Out().As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   __ movl(out, Address(obj, offset));
 }
 
@@ -2631,8 +2774,8 @@
       instruction, locations->InAt(0), locations->InAt(1));
   codegen_->AddSlowPath(slow_path);
 
-  CpuRegister index = locations->InAt(0).As<CpuRegister>();
-  CpuRegister length = locations->InAt(1).As<CpuRegister>();
+  CpuRegister index = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister length = locations->InAt(1).AsRegister<CpuRegister>();
 
   __ cmpl(index, length);
   __ j(kAboveEqual, slow_path->GetEntryLabel());
@@ -2716,21 +2859,21 @@
 
   if (source.IsRegister()) {
     if (destination.IsRegister()) {
-      __ movq(destination.As<CpuRegister>(), source.As<CpuRegister>());
+      __ movq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
     } else if (destination.IsStackSlot()) {
       __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()),
-              source.As<CpuRegister>());
+              source.AsRegister<CpuRegister>());
     } else {
       DCHECK(destination.IsDoubleStackSlot());
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()),
-              source.As<CpuRegister>());
+              source.AsRegister<CpuRegister>());
     }
   } else if (source.IsStackSlot()) {
     if (destination.IsRegister()) {
-      __ movl(destination.As<CpuRegister>(),
+      __ movl(destination.AsRegister<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
     } else if (destination.IsFpuRegister()) {
-      __ movss(destination.As<XmmRegister>(),
+      __ movss(destination.AsFpuRegister<XmmRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(destination.IsStackSlot());
@@ -2739,10 +2882,11 @@
     }
   } else if (source.IsDoubleStackSlot()) {
     if (destination.IsRegister()) {
-      __ movq(destination.As<CpuRegister>(),
+      __ movq(destination.AsRegister<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
     } else if (destination.IsFpuRegister()) {
-      __ movsd(destination.As<XmmRegister>(), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movsd(destination.AsFpuRegister<XmmRegister>(),
+               Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(destination.IsDoubleStackSlot()) << destination;
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -2753,7 +2897,7 @@
     if (constant->IsIntConstant()) {
       Immediate imm(constant->AsIntConstant()->GetValue());
       if (destination.IsRegister()) {
-        __ movl(destination.As<CpuRegister>(), imm);
+        __ movl(destination.AsRegister<CpuRegister>(), imm);
       } else {
         DCHECK(destination.IsStackSlot()) << destination;
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
@@ -2761,7 +2905,7 @@
     } else if (constant->IsLongConstant()) {
       int64_t value = constant->AsLongConstant()->GetValue();
       if (destination.IsRegister()) {
-        __ movq(destination.As<CpuRegister>(), Immediate(value));
+        __ movq(destination.AsRegister<CpuRegister>(), Immediate(value));
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
         __ movq(CpuRegister(TMP), Immediate(value));
@@ -2771,7 +2915,7 @@
       Immediate imm(bit_cast<float, int32_t>(constant->AsFloatConstant()->GetValue()));
       if (destination.IsFpuRegister()) {
         __ movl(CpuRegister(TMP), imm);
-        __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+        __ movd(destination.AsFpuRegister<XmmRegister>(), CpuRegister(TMP));
       } else {
         DCHECK(destination.IsStackSlot()) << destination;
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
@@ -2781,7 +2925,7 @@
       Immediate imm(bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue()));
       if (destination.IsFpuRegister()) {
         __ movq(CpuRegister(TMP), imm);
-        __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+        __ movd(destination.AsFpuRegister<XmmRegister>(), CpuRegister(TMP));
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
         __ movq(CpuRegister(TMP), imm);
@@ -2790,14 +2934,14 @@
     }
   } else if (source.IsFpuRegister()) {
     if (destination.IsFpuRegister()) {
-      __ movaps(destination.As<XmmRegister>(), source.As<XmmRegister>());
+      __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsStackSlot()) {
       __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
-               source.As<XmmRegister>());
+               source.AsFpuRegister<XmmRegister>());
     } else {
-      DCHECK(destination.IsDoubleStackSlot());
+      DCHECK(destination.IsDoubleStackSlot()) << destination;
       __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
-               source.As<XmmRegister>());
+               source.AsFpuRegister<XmmRegister>());
     }
   }
 }
@@ -2858,31 +3002,31 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgq(destination.As<CpuRegister>(), source.As<CpuRegister>());
+    __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
-    Exchange32(source.As<CpuRegister>(), destination.GetStackIndex());
+    Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
-    Exchange32(destination.As<CpuRegister>(), source.GetStackIndex());
+    Exchange32(destination.AsRegister<CpuRegister>(), source.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
     Exchange32(destination.GetStackIndex(), source.GetStackIndex());
   } else if (source.IsRegister() && destination.IsDoubleStackSlot()) {
-    Exchange64(source.As<CpuRegister>(), destination.GetStackIndex());
+    Exchange64(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsRegister()) {
-    Exchange64(destination.As<CpuRegister>(), source.GetStackIndex());
+    Exchange64(destination.AsRegister<CpuRegister>(), source.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
     Exchange64(destination.GetStackIndex(), source.GetStackIndex());
   } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
-    __ movd(CpuRegister(TMP), source.As<XmmRegister>());
-    __ movaps(source.As<XmmRegister>(), destination.As<XmmRegister>());
-    __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+    __ movd(CpuRegister(TMP), source.AsFpuRegister<XmmRegister>());
+    __ movaps(source.AsFpuRegister<XmmRegister>(), destination.AsFpuRegister<XmmRegister>());
+    __ movd(destination.AsFpuRegister<XmmRegister>(), CpuRegister(TMP));
   } else if (source.IsFpuRegister() && destination.IsStackSlot()) {
-    Exchange32(source.As<XmmRegister>(), destination.GetStackIndex());
+    Exchange32(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsFpuRegister()) {
-    Exchange32(destination.As<XmmRegister>(), source.GetStackIndex());
+    Exchange32(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
   } else if (source.IsFpuRegister() && destination.IsDoubleStackSlot()) {
-    Exchange64(source.As<XmmRegister>(), destination.GetStackIndex());
+    Exchange64(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsFpuRegister()) {
-    Exchange64(destination.As<XmmRegister>(), source.GetStackIndex());
+    Exchange64(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
   } else {
     LOG(FATAL) << "Unimplemented swap between " << source << " and " << destination;
   }
@@ -2917,7 +3061,7 @@
 }
 
 void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) {
-  CpuRegister out = cls->GetLocations()->Out().As<CpuRegister>();
+  CpuRegister out = cls->GetLocations()->Out().AsRegister<CpuRegister>();
   if (cls->IsReferrersClass()) {
     DCHECK(!cls->CanCallRuntime());
     DCHECK(!cls->MustGenerateClinitCheck());
@@ -2955,7 +3099,8 @@
   SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathX86_64(
       check->GetLoadClass(), check, check->GetDexPc(), true);
   codegen_->AddSlowPath(slow_path);
-  GenerateClassInitializationCheck(slow_path, check->GetLocations()->InAt(0).As<CpuRegister>());
+  GenerateClassInitializationCheck(slow_path,
+                                   check->GetLocations()->InAt(0).AsRegister<CpuRegister>());
 }
 
 void LocationsBuilderX86_64::VisitStaticFieldGet(HStaticFieldGet* instruction) {
@@ -2967,55 +3112,55 @@
 
 void InstructionCodeGeneratorX86_64::VisitStaticFieldGet(HStaticFieldGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister cls = locations->InAt(0).As<CpuRegister>();
+  CpuRegister cls = locations->InAt(0).AsRegister<CpuRegister>();
   size_t offset = instruction->GetFieldOffset().SizeValue();
 
   switch (instruction->GetType()) {
     case Primitive::kPrimBoolean: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movzxb(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimByte: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movsxb(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimShort: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movsxw(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimChar: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movzxw(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movl(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimLong: {
-      CpuRegister out = locations->Out().As<CpuRegister>();
+      CpuRegister out = locations->Out().AsRegister<CpuRegister>();
       __ movq(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movss(out, Address(cls, offset));
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister out = locations->Out().As<XmmRegister>();
+      XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
       __ movsd(out, Address(cls, offset));
       break;
     }
@@ -3043,51 +3188,51 @@
 
 void InstructionCodeGeneratorX86_64::VisitStaticFieldSet(HStaticFieldSet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister cls = locations->InAt(0).As<CpuRegister>();
+  CpuRegister cls = locations->InAt(0).AsRegister<CpuRegister>();
   size_t offset = instruction->GetFieldOffset().SizeValue();
   Primitive::Type field_type = instruction->GetFieldType();
 
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movb(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimShort:
     case Primitive::kPrimChar: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movw(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimInt:
     case Primitive::kPrimNot: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movl(Address(cls, offset), value);
       if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->GetValue())) {
-        CpuRegister temp = locations->GetTemp(0).As<CpuRegister>();
-        CpuRegister card = locations->GetTemp(1).As<CpuRegister>();
+        CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
+        CpuRegister card = locations->GetTemp(1).AsRegister<CpuRegister>();
         codegen_->MarkGCCard(temp, card, cls, value);
       }
       break;
     }
 
     case Primitive::kPrimLong: {
-      CpuRegister value = locations->InAt(1).As<CpuRegister>();
+      CpuRegister value = locations->InAt(1).AsRegister<CpuRegister>();
       __ movq(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimFloat: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movss(Address(cls, offset), value);
       break;
     }
 
     case Primitive::kPrimDouble: {
-      XmmRegister value = locations->InAt(1).As<XmmRegister>();
+      XmmRegister value = locations->InAt(1).AsFpuRegister<XmmRegister>();
       __ movsd(Address(cls, offset), value);
       break;
     }
@@ -3108,9 +3253,10 @@
   SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load);
   codegen_->AddSlowPath(slow_path);
 
-  CpuRegister out = load->GetLocations()->Out().As<CpuRegister>();
+  CpuRegister out = load->GetLocations()->Out().AsRegister<CpuRegister>();
   codegen_->LoadCurrentMethod(CpuRegister(out));
-  __ movl(out, Address(out, mirror::ArtMethod::DexCacheStringsOffset().Int32Value()));
+  __ movl(out, Address(out, mirror::ArtMethod::DeclaringClassOffset().Int32Value()));
+  __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
   __ movl(out, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
   __ testl(out, out);
   __ j(kEqual, slow_path->GetEntryLabel());
@@ -3126,7 +3272,7 @@
 void InstructionCodeGeneratorX86_64::VisitLoadException(HLoadException* load) {
   Address address = Address::Absolute(
       Thread::ExceptionOffset<kX86_64WordSize>().Int32Value(), true);
-  __ gs()->movl(load->GetLocations()->Out().As<CpuRegister>(), address);
+  __ gs()->movl(load->GetLocations()->Out().AsRegister<CpuRegister>(), address);
   __ gs()->movl(address, Immediate(0));
 }
 
@@ -3155,9 +3301,9 @@
 
 void InstructionCodeGeneratorX86_64::VisitInstanceOf(HInstanceOf* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   Location cls = locations->InAt(1);
-  CpuRegister out = locations->Out().As<CpuRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   Label done, zero;
   SlowPathCodeX86_64* slow_path = nullptr;
@@ -3169,7 +3315,7 @@
   // Compare the class of `obj` with `cls`.
   __ movl(out, Address(obj, class_offset));
   if (cls.IsRegister()) {
-    __ cmpl(out, cls.As<CpuRegister>());
+    __ cmpl(out, cls.AsRegister<CpuRegister>());
   } else {
     DCHECK(cls.IsStackSlot()) << cls;
     __ cmpl(out, Address(CpuRegister(RSP), cls.GetStackIndex()));
@@ -3207,9 +3353,9 @@
 
 void InstructionCodeGeneratorX86_64::VisitCheckCast(HCheckCast* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  CpuRegister obj = locations->InAt(0).As<CpuRegister>();
+  CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
   Location cls = locations->InAt(1);
-  CpuRegister temp = locations->GetTemp(0).As<CpuRegister>();
+  CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
   SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathX86_64(
       instruction, locations->InAt(1), locations->GetTemp(0), instruction->GetDexPc());
@@ -3221,7 +3367,7 @@
   // Compare the class of `obj` with `cls`.
   __ movl(temp, Address(obj, class_offset));
   if (cls.IsRegister()) {
-    __ cmpl(temp, cls.As<CpuRegister>());
+    __ cmpl(temp, cls.AsRegister<CpuRegister>());
   } else {
     DCHECK(cls.IsStackSlot()) << cls;
     __ cmpl(temp, Address(CpuRegister(RSP), cls.GetStackIndex()));
@@ -3286,43 +3432,43 @@
   if (instruction->GetResultType() == Primitive::kPrimInt) {
     if (second.IsRegister()) {
       if (instruction->IsAnd()) {
-        __ andl(first.As<CpuRegister>(), second.As<CpuRegister>());
+        __ andl(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       } else if (instruction->IsOr()) {
-        __ orl(first.As<CpuRegister>(), second.As<CpuRegister>());
+        __ orl(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       } else {
         DCHECK(instruction->IsXor());
-        __ xorl(first.As<CpuRegister>(), second.As<CpuRegister>());
+        __ xorl(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
       }
     } else if (second.IsConstant()) {
       Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
       if (instruction->IsAnd()) {
-        __ andl(first.As<CpuRegister>(), imm);
+        __ andl(first.AsRegister<CpuRegister>(), imm);
       } else if (instruction->IsOr()) {
-        __ orl(first.As<CpuRegister>(), imm);
+        __ orl(first.AsRegister<CpuRegister>(), imm);
       } else {
         DCHECK(instruction->IsXor());
-        __ xorl(first.As<CpuRegister>(), imm);
+        __ xorl(first.AsRegister<CpuRegister>(), imm);
       }
     } else {
       Address address(CpuRegister(RSP), second.GetStackIndex());
       if (instruction->IsAnd()) {
-        __ andl(first.As<CpuRegister>(), address);
+        __ andl(first.AsRegister<CpuRegister>(), address);
       } else if (instruction->IsOr()) {
-        __ orl(first.As<CpuRegister>(), address);
+        __ orl(first.AsRegister<CpuRegister>(), address);
       } else {
         DCHECK(instruction->IsXor());
-        __ xorl(first.As<CpuRegister>(), address);
+        __ xorl(first.AsRegister<CpuRegister>(), address);
       }
     }
   } else {
     DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong);
     if (instruction->IsAnd()) {
-      __ andq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      __ andq(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
     } else if (instruction->IsOr()) {
-      __ orq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      __ orq(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
     } else {
       DCHECK(instruction->IsXor());
-      __ xorq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      __ xorq(first.AsRegister<CpuRegister>(), second.AsRegister<CpuRegister>());
     }
   }
 }
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 86f3b4e..794b81f 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -25,7 +25,8 @@
 namespace art {
 namespace x86_64 {
 
-static constexpr size_t kX86_64WordSize = 8;
+// Use a local definition to prevent copying mistakes.
+static constexpr size_t kX86_64WordSize = kX86_64PointerSize;
 
 static constexpr Register kParameterCoreRegisters[] = { RSI, RDX, RCX, R8, R9 };
 static constexpr FloatRegister kParameterFloatRegisters[] =
@@ -107,6 +108,7 @@
  private:
   void HandleInvoke(HInvoke* invoke);
   void HandleBitwiseOperation(HBinaryOperation* operation);
+  void HandleShift(HBinaryOperation* operation);
 
   CodeGeneratorX86_64* const codegen_;
   InvokeDexCallingConventionVisitor parameter_visitor_;
@@ -135,6 +137,7 @@
   void GenerateClassInitializationCheck(SlowPathCodeX86_64* slow_path, CpuRegister class_reg);
   void HandleBitwiseOperation(HBinaryOperation* operation);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
+  void HandleShift(HBinaryOperation* operation);
 
   X86_64Assembler* const assembler_;
   CodeGeneratorX86_64* const codegen_;
diff --git a/compiler/optimizing/constant_folding.h b/compiler/optimizing/constant_folding.h
index d2acfa6..ac00824 100644
--- a/compiler/optimizing/constant_folding.h
+++ b/compiler/optimizing/constant_folding.h
@@ -32,10 +32,10 @@
  */
 class HConstantFolding : public HOptimization {
  public:
-  HConstantFolding(HGraph* graph, const HGraphVisualizer& visualizer)
-      : HOptimization(graph, true, kConstantFoldingPassName, visualizer) {}
+  explicit HConstantFolding(HGraph* graph)
+      : HOptimization(graph, true, kConstantFoldingPassName) {}
 
-  virtual void Run() OVERRIDE;
+  void Run() OVERRIDE;
 
   static constexpr const char* kConstantFoldingPassName = "constant_folding";
 
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index 856c516..a56b9d9 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -47,8 +47,7 @@
   ASSERT_EQ(expected_before, actual_before);
 
   x86::CodeGeneratorX86 codegen(graph);
-  HGraphVisualizer visualizer(nullptr, graph, codegen, "");
-  HConstantFolding(graph, visualizer).Run();
+  HConstantFolding(graph).Run();
   SSAChecker ssa_checker(&allocator, graph);
   ssa_checker.Run();
   ASSERT_TRUE(ssa_checker.IsValid());
@@ -60,7 +59,7 @@
 
   check_after_cf(graph);
 
-  HDeadCodeElimination(graph, visualizer).Run();
+  HDeadCodeElimination(graph).Run();
   ssa_checker.Run();
   ASSERT_TRUE(ssa_checker.IsValid());
 
diff --git a/compiler/optimizing/dead_code_elimination.h b/compiler/optimizing/dead_code_elimination.h
index a4446ae..3db2c3f 100644
--- a/compiler/optimizing/dead_code_elimination.h
+++ b/compiler/optimizing/dead_code_elimination.h
@@ -28,10 +28,10 @@
  */
 class HDeadCodeElimination : public HOptimization {
  public:
-  HDeadCodeElimination(HGraph* graph, const HGraphVisualizer& visualizer)
-      : HOptimization(graph, true, kDeadCodeEliminationPassName, visualizer) {}
+  explicit HDeadCodeElimination(HGraph* graph)
+      : HOptimization(graph, true, kDeadCodeEliminationPassName) {}
 
-  virtual void Run() OVERRIDE;
+  void Run() OVERRIDE;
 
   static constexpr const char* kDeadCodeEliminationPassName =
     "dead_code_elimination";
diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc
index 0c68074..5d4b9cb 100644
--- a/compiler/optimizing/dead_code_elimination_test.cc
+++ b/compiler/optimizing/dead_code_elimination_test.cc
@@ -41,8 +41,7 @@
   ASSERT_EQ(actual_before, expected_before);
 
   x86::CodeGeneratorX86 codegen(graph);
-  HGraphVisualizer visualizer(nullptr, graph, codegen, "");
-  HDeadCodeElimination(graph, visualizer).Run();
+  HDeadCodeElimination(graph).Run();
   SSAChecker ssa_checker(&allocator, graph);
   ssa_checker.Run();
   ASSERT_TRUE(ssa_checker.IsValid());
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index 1953241..5d712fe 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -342,4 +342,83 @@
   }
 }
 
+// Maps `type` to the kind used when comparing operand types: boolean, byte,
+// short, char and int all map to int; any other type is returned unchanged.
+static Primitive::Type PrimitiveKind(Primitive::Type type) {
+  switch (type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimInt:
+      return Primitive::kPrimInt;
+    default:
+      return type;
+  }
+}
+
+// Visits `op` as a plain instruction first, then records an error if its
+// result type is not boolean.
+void SSAChecker::VisitCondition(HCondition* op) {
+  VisitInstruction(op);
+  // TODO: check inputs types, and special case the `null` check.
+  if (op->GetType() != Primitive::kPrimBoolean) {
+    std::stringstream error;
+    error << "Condition " << op->DebugName() << " " << op->GetId()
+          << " has a non-boolean result type: "
+          << op->GetType() << ".";
+    errors_.push_back(error.str());
+  }
+}
+
+// Visits `op` as a plain instruction first, then checks its typing:
+//  - a shift must have a second input of int kind, any other operation must
+//    have two inputs of the same kind;
+//  - a compare must have an int result, any other operation must have a
+//    result of the same kind as its first input.
+// Every violation is appended to `errors_`.
+void SSAChecker::VisitBinaryOperation(HBinaryOperation* op) {
+  VisitInstruction(op);
+  if (op->IsUShr() || op->IsShr() || op->IsShl()) {
+    if (PrimitiveKind(op->InputAt(1)->GetType()) != Primitive::kPrimInt) {
+      std::stringstream error;
+      error << "Shift operation " << op->DebugName() << " " << op->GetId()
+            << " has a non-int kind second input: "
+            << op->InputAt(1)->DebugName() << " of type " << op->InputAt(1)->GetType()
+            << ".";
+      errors_.push_back(error.str());
+    }
+  } else {
+    if (PrimitiveKind(op->InputAt(1)->GetType()) != PrimitiveKind(op->InputAt(0)->GetType())) {
+      std::stringstream error;
+      error << "Binary operation " << op->DebugName() << " " << op->GetId()
+            << " has inputs of different type: "
+            << op->InputAt(0)->GetType() << ", and " << op->InputAt(1)->GetType()
+            << ".";
+      errors_.push_back(error.str());
+    }
+  }
+
+  if (op->IsCompare()) {
+    if (op->GetType() != Primitive::kPrimInt) {
+      std::stringstream error;
+      error << "Compare operation " << op->GetId()
+            << " has a non-int result type: "
+            << op->GetType() << ".";
+      errors_.push_back(error.str());
+    }
+  } else {
+    // Use the first input, so that we can also make this check for shift operations
+    // (their second input is only required to be of int kind, see above).
+    if (PrimitiveKind(op->GetType()) != PrimitiveKind(op->InputAt(0)->GetType())) {
+      std::stringstream error;
+      error << "Binary operation " << op->DebugName() << " " << op->GetId()
+            << " has a result type different than its input type: "
+            << op->GetType() << ", and " << op->InputAt(0)->GetType()
+            << ".";
+      errors_.push_back(error.str());
+    }
+  }
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/graph_checker.h b/compiler/optimizing/graph_checker.h
index 8ba8cb1..b6c9f17 100644
--- a/compiler/optimizing/graph_checker.h
+++ b/compiler/optimizing/graph_checker.h
@@ -24,11 +24,11 @@
 namespace art {
 
 // A control-flow graph visitor performing various checks.
-class GraphChecker : public HGraphVisitor {
+class GraphChecker : public HGraphDelegateVisitor {
  public:
   GraphChecker(ArenaAllocator* allocator, HGraph* graph,
                const char* dump_prefix = "art::GraphChecker: ")
-    : HGraphVisitor(graph),
+    : HGraphDelegateVisitor(graph),
       allocator_(allocator),
       dump_prefix_(dump_prefix) {}
 
@@ -36,10 +36,10 @@
   virtual void Run() { VisitInsertionOrder(); }
 
   // Check `block`.
-  virtual void VisitBasicBlock(HBasicBlock* block) OVERRIDE;
+  void VisitBasicBlock(HBasicBlock* block) OVERRIDE;
 
   // Check `instruction`.
-  virtual void VisitInstruction(HInstruction* instruction) OVERRIDE;
+  void VisitInstruction(HInstruction* instruction) OVERRIDE;
 
   // Was the last visit of the graph valid?
   bool IsValid() const {
@@ -82,7 +82,7 @@
     : GraphChecker(allocator, graph, "art::SSAChecker: ") {}
 
   // Check the whole graph (in reverse post-order).
-  virtual void Run() {
+  void Run() OVERRIDE {
     // VisitReversePostOrder is used instead of VisitInsertionOrder,
     // as the latter might visit dead blocks removed by the dominator
     // computation.
@@ -90,13 +90,15 @@
   }
 
   // Perform SSA form checks on `block`.
-  virtual void VisitBasicBlock(HBasicBlock* block) OVERRIDE;
+  void VisitBasicBlock(HBasicBlock* block) OVERRIDE;
   // Loop-related checks from block `loop_header`.
   void CheckLoop(HBasicBlock* loop_header);
 
   // Perform SSA form checks on instructions.
-  virtual void VisitInstruction(HInstruction* instruction) OVERRIDE;
-  virtual void VisitPhi(HPhi* phi) OVERRIDE;
+  void VisitInstruction(HInstruction* instruction) OVERRIDE;
+  void VisitPhi(HPhi* phi) OVERRIDE;
+  void VisitBinaryOperation(HBinaryOperation* op) OVERRIDE;
+  void VisitCondition(HCondition* op) OVERRIDE;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(SSAChecker);
diff --git a/compiler/optimizing/graph_visualizer.h b/compiler/optimizing/graph_visualizer.h
index 4d8bec2..60d996b 100644
--- a/compiler/optimizing/graph_visualizer.h
+++ b/compiler/optimizing/graph_visualizer.h
@@ -30,7 +30,6 @@
 // TODO: Create an analysis/optimization abstraction.
 static const char* kLivenessPassName = "liveness";
 static const char* kRegisterAllocatorPassName = "register";
-static const char* kGVNPassName = "gvn";
 
 /**
  * If enabled, emits compilation information suitable for the c1visualizer tool
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index 25168b5..6e5f1bd 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -91,29 +91,38 @@
   return block_effects_.Get(block->GetBlockId());
 }
 
-static bool IsLoopExit(HBasicBlock* block, HBasicBlock* successor) {
-  HLoopInformation* block_info = block->GetLoopInformation();
-  HLoopInformation* other_info = successor->GetLoopInformation();
-  return block_info != other_info && (other_info == nullptr || block_info->IsIn(*other_info));
-}
-
 void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) {
-  if (kIsDebugBuild) {
-    // Check that all non back-edge processors have been visited.
-    for (size_t i = 0, e = block->GetPredecessors().Size(); i < e; ++i) {
-      HBasicBlock* predecessor = block->GetPredecessors().Get(i);
-      DCHECK(visited_.Get(predecessor->GetBlockId())
-             || (block->GetLoopInformation() != nullptr
-                 && (block->GetLoopInformation()->GetBackEdges().Get(0) == predecessor)));
+  ValueSet* set = nullptr;
+  const GrowableArray<HBasicBlock*>& predecessors = block->GetPredecessors();
+  if (predecessors.Size() == 0 || predecessors.Get(0)->IsEntryBlock()) {
+    // The entry block should only accumulate constant instructions, and
+    // the builder puts constants only in the entry block.
+    // Therefore, there is no need to propagate the value set to the next block.
+    set = new (allocator_) ValueSet(allocator_);
+  } else {
+    HBasicBlock* dominator = block->GetDominator();
+    set = sets_.Get(dominator->GetBlockId());
+    if (dominator->GetSuccessors().Size() != 1 || dominator->GetSuccessors().Get(0) != block) {
+      // Reuse the dominator's set as is, unless the dominator has other successors or
+      // `block` is not a successor of the dominator: in those cases we have to copy.
+      set = set->Copy();
+    }
-    visited_.Put(block->GetBlockId(), true);
+    if (!set->IsEmpty()) {
+      if (block->IsLoopHeader()) {
+        DCHECK_EQ(block->GetDominator(), block->GetLoopInformation()->GetPreHeader());
+        set->Kill(GetLoopEffects(block));
+      } else if (predecessors.Size() > 1) {
+        for (size_t i = 0, e = predecessors.Size(); i < e; ++i) {
+          set->IntersectionWith(sets_.Get(predecessors.Get(i)->GetBlockId()));
+          if (set->IsEmpty()) {
+            break;
+          }
+        }
+      }
+    }
   }
 
-  ValueSet* set = sets_.Get(block->GetBlockId());
-
-  if (block->IsLoopHeader()) {
-    set->Kill(GetLoopEffects(block));
-  }
+  sets_.Put(block->GetBlockId(), set);
 
   HInstruction* current = block->GetFirstInstruction();
   while (current != nullptr) {
@@ -131,57 +140,6 @@
     }
     current = next;
   }
-
-  if (block == graph_->GetEntryBlock()) {
-    // The entry block should only accumulate constant instructions, and
-    // the builder puts constants only in the entry block.
-    // Therefore, there is no need to propagate the value set to the next block.
-    DCHECK_EQ(block->GetDominatedBlocks().Size(), 1u);
-    HBasicBlock* dominated = block->GetDominatedBlocks().Get(0);
-    sets_.Put(dominated->GetBlockId(), new (allocator_) ValueSet(allocator_));
-    return;
-  }
-
-  // Copy the value set to dominated blocks. We can re-use
-  // the current set for the last dominated block because we are done visiting
-  // this block.
-  for (size_t i = 0, e = block->GetDominatedBlocks().Size(); i < e; ++i) {
-    HBasicBlock* dominated = block->GetDominatedBlocks().Get(i);
-    sets_.Put(dominated->GetBlockId(), i == e - 1 ? set : set->Copy());
-  }
-
-  // Kill instructions in the value set of each successor. If the successor
-  // is a loop exit, then we use the side effects of the loop. If not, we use
-  // the side effects of this block.
-  for (size_t i = 0, e = block->GetSuccessors().Size(); i < e; ++i) {
-    HBasicBlock* successor = block->GetSuccessors().Get(i);
-    if (successor->IsLoopHeader()
-        && successor->GetLoopInformation()->GetBackEdges().Get(0) == block) {
-      // In case of a back edge, we already have visited the loop header.
-      // We should not update its value set, because the last dominated block
-      // of the loop header uses the same value set.
-      DCHECK(visited_.Get(successor->GetBlockId()));
-      continue;
-    }
-    DCHECK(!visited_.Get(successor->GetBlockId()));
-    ValueSet* successor_set = sets_.Get(successor->GetBlockId());
-    // The dominator sets the set, and we are guaranteed to have visited it already.
-    DCHECK(successor_set != nullptr);
-
-    // If this block dominates this successor there is nothing to do.
-    // Also if the set is empty, there is nothing to kill.
-    if (successor->GetDominator() != block && !successor_set->IsEmpty()) {
-      if (block->IsInLoop() && IsLoopExit(block, successor)) {
-        // All instructions killed in the loop must be killed for a loop exit.
-        SideEffects effects = GetLoopEffects(block->GetLoopInformation()->GetHeader());
-        sets_.Get(successor->GetBlockId())->Kill(effects);
-      } else {
-        // Following block (that might be in the same loop).
-        // Just kill instructions based on this block's side effects.
-        sets_.Get(successor->GetBlockId())->Kill(GetBlockEffects(block));
-      }
-    }
-  }
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/gvn.h b/compiler/optimizing/gvn.h
index 8d2c774..81f2c3f 100644
--- a/compiler/optimizing/gvn.h
+++ b/compiler/optimizing/gvn.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_GVN_H_
 
 #include "nodes.h"
+#include "optimization.h"
 
 namespace art {
 
@@ -95,6 +96,26 @@
     return nullptr;
   }
 
+  // Identity (pointer) lookup: returns `instruction` itself if this exact
+  // object is stored in the set, and nullptr otherwise.
+  HInstruction* IdentityLookup(HInstruction* instruction) const {
+    const size_t hash = instruction->ComputeHashCode();
+    // First try the table slot selected by the hash code.
+    if (table_[hash % kDefaultNumberOfEntries] == instruction) {
+      return instruction;
+    }
+
+    // Otherwise scan the collision list for a node holding this very object.
+    ValueSetNode* node = collisions_;
+    while (node != nullptr) {
+      if (node->GetHashCode() == hash && node->GetInstruction() == instruction) {
+        return instruction;
+      }
+      node = node->GetNext();
+    }
+    return nullptr;
+  }
+
   // Removes all instructions in the set that are affected by the given side effects.
   void Kill(SideEffects side_effects) {
     for (size_t i = 0; i < kDefaultNumberOfEntries; ++i) {
@@ -105,9 +126,9 @@
       }
     }
 
-    ValueSetNode* current = collisions_;
-    ValueSetNode* previous = nullptr;
-    while (current != nullptr) {
+    for (ValueSetNode* current = collisions_, *previous = nullptr;
+         current != nullptr;
+         current = current->GetNext()) {
       HInstruction* instruction = current->GetInstruction();
       if (instruction->GetSideEffects().DependsOn(side_effects)) {
         if (previous == nullptr) {
@@ -119,7 +140,6 @@
       } else {
         previous = current;
       }
-      current = current->GetNext();
     }
   }
 
@@ -142,6 +162,44 @@
     return copy;
   }
 
+  // Removes every instruction from the set: forgets the collision list, nulls
+  // out all table slots and resets the entry count.
+  void Clear() {
+    collisions_ = nullptr;
+    for (size_t i = 0; i < kDefaultNumberOfEntries; ++i) table_[i] = nullptr;
+    number_of_entries_ = 0;
+  }
+
+  // Removes from this `ValueSet` every instruction not present (by identity) in `other`.
+  void IntersectionWith(ValueSet* other) {
+    if (IsEmpty()) {
+      return;  // Nothing to remove.
+    }
+    if (other->IsEmpty()) {
+      Clear();  // Intersecting with an empty set leaves nothing.
+      return;
+    }
+    for (size_t i = 0; i < kDefaultNumberOfEntries; ++i) {
+      if (table_[i] != nullptr && other->IdentityLookup(table_[i]) == nullptr) {
+        table_[i] = nullptr;
+        --number_of_entries_;
+      }
+    }
+    // Unlink the collision nodes whose instruction `other` does not contain.
+    ValueSetNode* previous = nullptr;
+    for (ValueSetNode* node = collisions_; node != nullptr; node = node->GetNext()) {
+      if (other->IdentityLookup(node->GetInstruction()) != nullptr) {
+        previous = node;
+      } else if (previous == nullptr) {
+        collisions_ = node->GetNext();
+        --number_of_entries_;
+      } else {
+        previous->SetNext(node->GetNext());
+        --number_of_entries_;
+      }
+    }
+  }
+
   bool IsEmpty() const { return number_of_entries_ == 0; }
   size_t GetNumberOfEntries() const { return number_of_entries_; }
 
@@ -168,17 +226,15 @@
 class GlobalValueNumberer : public ValueObject {
  public:
   GlobalValueNumberer(ArenaAllocator* allocator, HGraph* graph)
-      : allocator_(allocator),
-        graph_(graph),
+      : graph_(graph),
+        allocator_(allocator),
         block_effects_(allocator, graph->GetBlocks().Size()),
         loop_effects_(allocator, graph->GetBlocks().Size()),
-        sets_(allocator, graph->GetBlocks().Size()),
-        visited_(allocator, graph->GetBlocks().Size()) {
+        sets_(allocator, graph->GetBlocks().Size()) {
     size_t number_of_blocks = graph->GetBlocks().Size();
     block_effects_.SetSize(number_of_blocks);
     loop_effects_.SetSize(number_of_blocks);
     sets_.SetSize(number_of_blocks);
-    visited_.SetSize(number_of_blocks);
 
     for (size_t i = 0; i < number_of_blocks; ++i) {
       block_effects_.Put(i, SideEffects::None());
@@ -201,8 +257,9 @@
   SideEffects GetLoopEffects(HBasicBlock* block) const;
   SideEffects GetBlockEffects(HBasicBlock* block) const;
 
+  HGraph* graph_;
+
   ArenaAllocator* const allocator_;
-  HGraph* const graph_;
 
   // Side effects of individual blocks, that is the union of the side effects
   // of the instructions in the block.
@@ -217,13 +274,23 @@
   // in the path from the dominator to the block.
   GrowableArray<ValueSet*> sets_;
 
-  // Mark visisted blocks. Only used for debugging.
-  GrowableArray<bool> visited_;
-
   ART_FRIEND_TEST(GVNTest, LoopSideEffects);
   DISALLOW_COPY_AND_ASSIGN(GlobalValueNumberer);
 };
 
+// Wraps `GlobalValueNumberer` as an `HOptimization` pass named "GVN".
+class GVNOptimization : public HOptimization {
+ public:
+  explicit GVNOptimization(HGraph* graph) : HOptimization(graph, true, "GVN") {}
+
+  void Run() OVERRIDE {
+    GlobalValueNumberer(graph_->GetArena(), graph_).Run();
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(GVNOptimization);
+};
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_GVN_H_
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 3e8361e..49ca443 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -18,11 +18,23 @@
 
 namespace art {
 
+// Graph visitor holding the per-instruction simplifications; it is created and
+// run over the graph in insertion order by `InstructionSimplifier::Run()`.
+class InstructionSimplifierVisitor : public HGraphVisitor {
+ public:
+  explicit InstructionSimplifierVisitor(HGraph* graph) : HGraphVisitor(graph) {}
+ private:
+  void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE;
+  void VisitEqual(HEqual* equal) OVERRIDE;
+  void VisitArraySet(HArraySet* instruction) OVERRIDE;
+  void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
+
 void InstructionSimplifier::Run() {
-  VisitInsertionOrder();
+  InstructionSimplifierVisitor visitor(graph_);
+  visitor.VisitInsertionOrder();
 }
 
-void InstructionSimplifier::VisitSuspendCheck(HSuspendCheck* check) {
+void InstructionSimplifierVisitor::VisitSuspendCheck(HSuspendCheck* check) {
   HBasicBlock* block = check->GetBlock();
   // Currently always keep the suspend check at entry.
   if (block->IsEntryBlock()) return;
@@ -38,7 +50,7 @@
   block->RemoveInstruction(check);
 }
 
-void InstructionSimplifier::VisitEqual(HEqual* equal) {
+void InstructionSimplifierVisitor::VisitEqual(HEqual* equal) {
   HInstruction* input1 = equal->InputAt(0);
   HInstruction* input2 = equal->InputAt(1);
   if (input1->GetType() == Primitive::kPrimBoolean && input2->IsIntConstant()) {
@@ -55,7 +67,7 @@
   }
 }
 
-void InstructionSimplifier::VisitArraySet(HArraySet* instruction) {
+void InstructionSimplifierVisitor::VisitArraySet(HArraySet* instruction) {
   HInstruction* value = instruction->GetValue();
   if (value->GetType() != Primitive::kPrimNot) return;
 
@@ -67,4 +79,12 @@
   }
 }
 
+void InstructionSimplifierVisitor::VisitTypeConversion(HTypeConversion* instruction) {
+  // Only a conversion whose result type equals its input type is handled:
+  // it is replaced with its input and removed from its block.
+  if (instruction->GetResultType() != instruction->GetInputType()) return;
+  instruction->ReplaceWith(instruction->GetInput());
+  instruction->GetBlock()->RemoveInstruction(instruction);
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h
index 3844d57..7068c7f 100644
--- a/compiler/optimizing/instruction_simplifier.h
+++ b/compiler/optimizing/instruction_simplifier.h
@@ -18,22 +18,19 @@
 #define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_H_
 
 #include "nodes.h"
+#include "optimization.h"
 
 namespace art {
 
 /**
  * Implements optimizations specific to each instruction.
  */
-class InstructionSimplifier : public HGraphVisitor {
+class InstructionSimplifier : public HOptimization {
  public:
-  explicit InstructionSimplifier(HGraph* graph) : HGraphVisitor(graph) {}
+  explicit InstructionSimplifier(HGraph* graph)
+    : HOptimization(graph, true, "instruction_simplifier") {}
 
-  void Run();
-
- private:
-  virtual void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE;
-  virtual void VisitEqual(HEqual* equal) OVERRIDE;
-  virtual void VisitArraySet(HArraySet* equal) OVERRIDE;
+  void Run() OVERRIDE;
 };
 
 }  // namespace art
diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc
index 6dd4207..c49cf7e 100644
--- a/compiler/optimizing/linearize_test.cc
+++ b/compiler/optimizing/linearize_test.cc
@@ -50,10 +50,9 @@
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
-  ASSERT_EQ(liveness.GetLinearPostOrder().Size(), number_of_blocks);
+  ASSERT_EQ(liveness.GetLinearOrder().Size(), number_of_blocks);
   for (size_t i = 0; i < number_of_blocks; ++i) {
-    ASSERT_EQ(liveness.GetLinearPostOrder().Get(number_of_blocks - i - 1)->GetBlockId(),
-              expected_order[i]);
+    ASSERT_EQ(liveness.GetLinearOrder().Get(i)->GetBlockId(), expected_order[i]);
   }
 }
 
@@ -194,4 +193,58 @@
   TestCode(data, blocks, 12);
 }
 
+TEST(LinearizeTest, CFG6) {
+  //            Block0
+  //              |
+  //            Block1
+  //              |
+  //            Block2 ++++++++++++++
+  //              |                 +
+  //            Block3              +
+  //            /     \             +
+  //       Block8     Block4        +
+  //         |         /   \        +
+  //       Block5 <- Block9 Block6  +
+  //         |
+  //       Block7
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::GOTO | 0x0100,
+    Instruction::IF_EQ, 0x0004,
+    Instruction::IF_EQ, 0x0003,
+    Instruction::RETURN_VOID,
+    Instruction::GOTO | 0xFA00);
+
+  const int blocks[] = {0, 1, 2, 3, 4, 6, 9, 8, 5, 7};
+  TestCode(data, blocks, arraysize(blocks));
+}
+
+TEST(LinearizeTest, CFG7) {
+  // Structure of this graph (+ are back edges)
+  //            Block0
+  //              |
+  //            Block1
+  //              |
+  //            Block2 ++++++++
+  //              |           +
+  //            Block3        +
+  //            /    \        +
+  //        Block4  Block8    +
+  //        /  \        |     +
+  //   Block5 Block9 - Block6 +
+  //     |
+  //   Block7
+  //
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::GOTO | 0x0100,
+    Instruction::IF_EQ, 0x0005,
+    Instruction::IF_EQ, 0x0003,
+    Instruction::RETURN_VOID,
+    Instruction::GOTO | 0xFA00);
+
+  const int blocks[] = {0, 1, 2, 3, 4, 9, 8, 6, 5, 7};
+  TestCode(data, blocks, arraysize(blocks));
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index 89c9495..e3c6fec 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -386,7 +386,7 @@
     Instruction::ADD_INT, 1 << 8,
     Instruction::GOTO | 0x300,
     Instruction::ADD_INT, 1 << 8,
-    Instruction::RETURN | 1 << 8);
+    Instruction::RETURN);
 
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
@@ -410,7 +410,10 @@
   interval = liveness.GetInstructionFromSsaIndex(1)->GetLiveInterval();
   range = interval->GetFirstRange();
   ASSERT_EQ(4u, range->GetStart());
-  ASSERT_EQ(28u, range->GetEnd());
+  ASSERT_EQ(17u, range->GetEnd());
+  range = range->GetNext();
+  ASSERT_EQ(20u, range->GetStart());
+  ASSERT_EQ(23u, range->GetEnd());
   ASSERT_TRUE(range->GetNext() == nullptr);
 
   // Test for the first add.
@@ -429,9 +432,8 @@
   ASSERT_EQ(26u, range->GetEnd());
   ASSERT_TRUE(range->GetNext() == nullptr);
 
-  // Test for the phi, which is unused.
   HPhi* phi = liveness.GetInstructionFromSsaIndex(4)->AsPhi();
-  ASSERT_EQ(phi->NumberOfUses(), 0u);
+  ASSERT_EQ(phi->NumberOfUses(), 1u);
   interval = phi->GetLiveInterval();
   range = interval->GetFirstRange();
   ASSERT_EQ(26u, range->GetStart());
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index d1555d4..1ff26d9 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -161,7 +161,14 @@
   }
 
   template <typename T>
-  T As() const {
+  T AsRegister() const {
+    DCHECK(IsRegister());
+    return static_cast<T>(reg());
+  }
+
+  template <typename T>
+  T AsFpuRegister() const {
+    DCHECK(IsFpuRegister());
     return static_cast<T>(reg());
   }
 
@@ -391,6 +398,10 @@
     return (register_set & (1 << reg)) != 0;
   }
 
+  size_t GetNumberOfRegisters() const {
+    return __builtin_popcount(core_registers_) + __builtin_popcount(floating_point_registers_);
+  }
+
  private:
   uint32_t core_registers_;
   uint32_t floating_point_registers_;
@@ -503,6 +514,10 @@
     return &live_registers_;
   }
 
+  size_t GetNumberOfLiveRegisters() const {
+    return live_registers_.GetNumberOfRegisters();
+  }
+
   bool InputOverlapsWithOutputOrTemp(uint32_t input_index, bool is_environment) const {
     if (is_environment) return true;
     if ((input_index == 0)
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 7d52d7d..28496e4 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -42,6 +42,9 @@
 static const int kDefaultNumberOfDominatedBlocks = 1;
 static const int kDefaultNumberOfBackEdges = 1;
 
+static constexpr uint32_t kMaxIntShiftValue = 0x1f;
+static constexpr uint64_t kMaxLongShiftValue = 0x3f;
+
 enum IfCondition {
   kCondEQ,
   kCondNE,
@@ -233,7 +236,7 @@
     return false;
   }
 
-  int NumberOfBackEdges() const {
+  size_t NumberOfBackEdges() const {
     return back_edges_.Size();
   }
 
@@ -521,9 +524,11 @@
   M(ParallelMove, Instruction)                                          \
   M(ParameterValue, Instruction)                                        \
   M(Phi, Instruction)                                                   \
-  M(Rem, BinaryOperation)                                             \
+  M(Rem, BinaryOperation)                                               \
   M(Return, Instruction)                                                \
   M(ReturnVoid, Instruction)                                            \
+  M(Shl, BinaryOperation)                                               \
+  M(Shr, BinaryOperation)                                               \
   M(StaticFieldGet, Instruction)                                        \
   M(StaticFieldSet, Instruction)                                        \
   M(StoreLocal, Instruction)                                            \
@@ -532,6 +537,7 @@
   M(Temporary, Instruction)                                             \
   M(Throw, Instruction)                                                 \
   M(TypeConversion, Instruction)                                        \
+  M(UShr, BinaryOperation)                                              \
   M(Xor, BinaryOperation)                                               \
 
 #define FOR_EACH_INSTRUCTION(M)                                         \
@@ -771,7 +777,7 @@
   }
 
   // Returns whether two instructions are equal, that is:
-  // 1) They have the same type and contain the same data,
+  // 1) They have the same type and contain the same data (InstructionDataEquals).
   // 2) Their inputs are identical.
   bool Equals(HInstruction* other) const;
 
@@ -1357,28 +1363,45 @@
 // Result is 0 if input0 == input1, 1 if input0 > input1, or -1 if input0 < input1.
 class HCompare : public HBinaryOperation {
  public:
-  HCompare(Primitive::Type type, HInstruction* first, HInstruction* second)
-      : HBinaryOperation(Primitive::kPrimInt, first, second) {
+  // The bias applies to floating-point comparisons and indicates the result
+  // produced when a NaN is involved:
+  enum Bias {
+    kNoBias,  // Bias is not applicable (i.e. for long operations).
+    kGtBias,  // Return 1 for NaN comparisons.
+    kLtBias,  // Return -1 for NaN comparisons.
+  };
+
+  HCompare(Primitive::Type type, HInstruction* first, HInstruction* second, Bias bias)
+      : HBinaryOperation(Primitive::kPrimInt, first, second), bias_(bias) {
     DCHECK_EQ(type, first->GetType());
     DCHECK_EQ(type, second->GetType());
   }
 
-  virtual int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+  int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
     return
       x == y ? 0 :
       x > y ? 1 :
       -1;
   }
-  virtual int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+
+  int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
     return
       x == y ? 0 :
       x > y ? 1 :
       -1;
   }
 
+  bool InstructionDataEquals(HInstruction* other) const OVERRIDE {
+    return bias_ == other->AsCompare()->bias_;
+  }
+
+  bool IsGtBias() const { return bias_ == kGtBias; }
+
   DECLARE_INSTRUCTION(Compare);
 
  private:
+  const Bias bias_;
+
   DISALLOW_COPY_AND_ASSIGN(HCompare);
 };
 
@@ -1831,6 +1854,73 @@
   DISALLOW_COPY_AND_ASSIGN(HDivZeroCheck);
 };
 
+// Left shift. Only the low bits of the shift distance `y` are used
+// (it is masked with `kMaxIntShiftValue` / `kMaxLongShiftValue`).
+class HShl : public HBinaryOperation {
+ public:
+  HShl(Primitive::Type result_type, HInstruction* left, HInstruction* right)
+      : HBinaryOperation(result_type, left, right) {}
+
+  // The shift is performed on the unsigned representation of `x`: left-shifting
+  // a negative signed value is undefined behavior in C++ (before C++20).
+  int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    return static_cast<int32_t>(static_cast<uint32_t>(x) << (y & kMaxIntShiftValue));
+  }
+
+  int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    return static_cast<int64_t>(static_cast<uint64_t>(x) << (y & kMaxLongShiftValue));
+  }
+
+  DECLARE_INSTRUCTION(Shl);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HShl);
+};
+
+// Signed right shift. The shift distance `y` is masked with
+// `kMaxIntShiftValue` / `kMaxLongShiftValue`.
+// NOTE(review): `>>` on a negative signed value is implementation-defined
+// before C++20; this presumably relies on it being an arithmetic shift.
+class HShr : public HBinaryOperation {
+ public:
+  HShr(Primitive::Type result_type, HInstruction* left, HInstruction* right)
+      : HBinaryOperation(result_type, left, right) {}
+
+  int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE { return x >> (y & kMaxIntShiftValue); }
+  int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE { return x >> (y & kMaxLongShiftValue); }
+
+  DECLARE_INSTRUCTION(Shr);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HShr);
+};
+
+// Unsigned right shift: both operands are converted to their unsigned
+// counterparts before shifting, and the shift distance is masked with
+// `kMaxIntShiftValue` / `kMaxLongShiftValue`.
+class HUShr : public HBinaryOperation {
+ public:
+  HUShr(Primitive::Type result_type, HInstruction* left, HInstruction* right)
+      : HBinaryOperation(result_type, left, right) {}
+
+  int32_t Evaluate(int32_t x, int32_t y) const OVERRIDE {
+    uint32_t ux = static_cast<uint32_t>(x);
+    uint32_t uy = static_cast<uint32_t>(y) & kMaxIntShiftValue;
+    return static_cast<int32_t>(ux >> uy);
+  }
+
+  int64_t Evaluate(int64_t x, int64_t y) const OVERRIDE {
+    uint64_t ux = static_cast<uint64_t>(x);
+    uint64_t uy = static_cast<uint64_t>(y) & kMaxLongShiftValue;
+    return static_cast<int64_t>(ux >> uy);
+  }
+
+  DECLARE_INSTRUCTION(UShr);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HUShr);
+};
+
 class HAnd : public HBinaryOperation {
  public:
   HAnd(Primitive::Type result_type, HInstruction* left, HInstruction* right)
diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc
index ea98186..b99f678 100644
--- a/compiler/optimizing/optimization.cc
+++ b/compiler/optimizing/optimization.cc
@@ -21,25 +21,21 @@
 
 namespace art {
 
-void HOptimization::Execute() {
-  Run();
-  visualizer_.DumpGraph(pass_name_);
-  Check();
-}
-
 void HOptimization::Check() {
   if (kIsDebugBuild) {
     if (is_in_ssa_form_) {
       SSAChecker checker(graph_->GetArena(), graph_);
       checker.Run();
       if (!checker.IsValid()) {
-        LOG(FATAL) << Dumpable<SSAChecker>(checker);
+        LOG(FATAL) << "Error after " << GetPassName() << ": "
+                   << Dumpable<SSAChecker>(checker);
       }
     } else {
       GraphChecker checker(graph_->GetArena(), graph_);
       checker.Run();
       if (!checker.IsValid()) {
-        LOG(FATAL) << Dumpable<GraphChecker>(checker);
+        LOG(FATAL) << "Error after " << GetPassName() << ": "
+                   << Dumpable<GraphChecker>(checker);
       }
     }
   }
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index 59683e2..e36ef19 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -17,7 +17,6 @@
 #ifndef ART_COMPILER_OPTIMIZING_OPTIMIZATION_H_
 #define ART_COMPILER_OPTIMIZING_OPTIMIZATION_H_
 
-#include "graph_visualizer.h"
 #include "nodes.h"
 
 namespace art {
@@ -29,25 +28,19 @@
  public:
   HOptimization(HGraph* graph,
                 bool is_in_ssa_form,
-                const char* pass_name,
-                const HGraphVisualizer& visualizer)
+                const char* pass_name)
       : graph_(graph),
         is_in_ssa_form_(is_in_ssa_form),
-        pass_name_(pass_name),
-        visualizer_(visualizer) {}
+        pass_name_(pass_name) {}
 
   virtual ~HOptimization() {}
 
-  // Execute the optimization pass.
-  void Execute();
-
   // Return the name of the pass.
   const char* GetPassName() const { return pass_name_; }
 
   // Peform the analysis itself.
   virtual void Run() = 0;
 
- private:
   // Verify the graph; abort if it is not valid.
   void Check();
 
@@ -59,9 +52,6 @@
   const bool is_in_ssa_form_;
   // Optimization pass name.
   const char* pass_name_;
-  // A graph visualiser invoked after the execution of the optimization
-  // pass if enabled.
-  const HGraphVisualizer& visualizer_;
 
   DISALLOW_COPY_AND_ASSIGN(HOptimization);
 };
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 0de0907..d8533eb 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -35,6 +35,7 @@
 #include "nodes.h"
 #include "prepare_for_register_allocation.h"
 #include "register_allocator.h"
+#include "ssa_builder.h"
 #include "ssa_phi_elimination.h"
 #include "ssa_liveness_analysis.h"
 #include "utils/arena_allocator.h"
@@ -167,7 +168,8 @@
 }
 
 uintptr_t OptimizingCompiler::GetEntryPointOf(mirror::ArtMethod* method) const {
-  return reinterpret_cast<uintptr_t>(method->GetEntryPointFromQuickCompiledCode());
+  return reinterpret_cast<uintptr_t>(method->GetEntryPointFromQuickCompiledCodePtrSize(
+      InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet())));
 }
 
 bool OptimizingCompiler::WriteElf(art::File* file, OatWriter* oat_writer,
@@ -189,6 +191,35 @@
   return code_item.tries_size_ == 0;
 }
 
+// Runs SSA construction and then each optimization pass, in the order listed
+// below. After every pass the graph is dumped to `visualizer` under the pass
+// name and then verified with `HOptimization::Check()`.
+static void RunOptimizations(HGraph* graph, const HGraphVisualizer& visualizer) {
+  TransformToSsa ssa(graph);
+  HDeadCodeElimination dead_code_elimination(graph);
+  HConstantFolding constant_folding(graph);
+  SsaRedundantPhiElimination redundant_phi_elimination(graph);
+  SsaDeadPhiElimination dead_phi_elimination(graph);
+  InstructionSimplifier simplifier_before_gvn(graph);
+  GVNOptimization gvn(graph);
+  InstructionSimplifier simplifier_after_gvn(graph);
+
+  // Execution order of the passes.
+  HOptimization* const passes[] = {
+    &ssa,
+    &dead_code_elimination, &constant_folding,
+    &redundant_phi_elimination, &dead_phi_elimination,
+    &simplifier_before_gvn, &gvn, &simplifier_after_gvn
+  };
+
+  for (HOptimization* pass : passes) {
+    pass->Run();
+    visualizer.DumpGraph(pass->GetPassName());
+    pass->Check();
+  }
+}
+}
+
 CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item,
                                             uint32_t access_flags,
                                             InvokeType invoke_type,
@@ -251,22 +282,9 @@
       && CanOptimize(*code_item)
       && RegisterAllocator::CanAllocateRegistersFor(*graph, instruction_set)) {
     optimized_compiled_methods_++;
-    graph->BuildDominatorTree();
-    graph->TransformToSSA();
-    visualizer.DumpGraph("ssa");
-    graph->FindNaturalLoops();
+    RunOptimizations(graph, visualizer);
 
-    HDeadCodeElimination(graph, visualizer).Execute();
-    HConstantFolding(graph, visualizer).Execute();
-
-    SsaRedundantPhiElimination(graph).Run();
-    SsaDeadPhiElimination(graph).Run();
-    InstructionSimplifier(graph).Run();
-    GlobalValueNumberer(graph->GetArena(), graph).Run();
-    visualizer.DumpGraph(kGVNPassName);
-    InstructionSimplifier(graph).Run();
     PrepareForRegisterAllocation(graph).Run();
-
     SsaLivenessAnalysis liveness(*graph, codegen);
     liveness.Analyze();
     visualizer.DumpGraph(kLivenessPassName);
@@ -309,7 +327,7 @@
       graph->FindNaturalLoops();
       SsaRedundantPhiElimination(graph).Run();
       SsaDeadPhiElimination(graph).Run();
-      GlobalValueNumberer(graph->GetArena(), graph).Run();
+      GVNOptimization(graph).Run();
       SsaLivenessAnalysis liveness(*graph, codegen);
       liveness.Analyze();
       visualizer.DumpGraph(kLivenessPassName);
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 4d6e664..a6c0635 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -70,7 +70,8 @@
          it.Advance()) {
       HInstruction* current = it.Current();
       if (current->GetType() == Primitive::kPrimLong && instruction_set != kX86_64) return false;
-      if ((current->GetType() == Primitive::kPrimFloat || current->GetType() == Primitive::kPrimDouble)
+      if ((current->GetType() == Primitive::kPrimFloat
+           || current->GetType() == Primitive::kPrimDouble)
           && instruction_set != kX86_64) {
         return false;
       }
@@ -95,6 +96,25 @@
     ValidateInternal(true);
     processing_core_registers_ = false;
     ValidateInternal(true);
+    // Check that the linear order is still correct with regard to lifetime positions.
+    // Since only parallel moves have been inserted during the register allocation,
+    // these checks are mostly for making sure these moves have been added correctly.
+    size_t current_liveness = 0;
+    for (HLinearOrderIterator it(liveness_); !it.Done(); it.Advance()) {
+      HBasicBlock* block = it.Current();
+      for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) {
+        HInstruction* instruction = inst_it.Current();
+        DCHECK_LE(current_liveness, instruction->GetLifetimePosition());
+        current_liveness = instruction->GetLifetimePosition();
+      }
+      for (HInstructionIterator inst_it(block->GetInstructions());
+           !inst_it.Done();
+           inst_it.Advance()) {
+        HInstruction* instruction = inst_it.Current();
+        DCHECK_LE(current_liveness, instruction->GetLifetimePosition()) << instruction->DebugName();
+        current_liveness = instruction->GetLifetimePosition();
+      }
+    }
   }
 }
 
@@ -189,11 +209,29 @@
       BlockRegister(temp, position, position + 1);
     } else {
       DCHECK(temp.IsUnallocated());
-      DCHECK(temp.GetPolicy() == Location::kRequiresRegister);
-      LiveInterval* interval = LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt);
-      temp_intervals_.Add(interval);
-      interval->AddRange(position, position + 1);
-      unhandled_core_intervals_.Add(interval);
+      switch (temp.GetPolicy()) {
+        case Location::kRequiresRegister: {
+          LiveInterval* interval =
+              LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt);
+          temp_intervals_.Add(interval);
+          interval->AddRange(position, position + 1);
+          unhandled_core_intervals_.Add(interval);
+          break;
+        }
+
+        case Location::kRequiresFpuRegister: {
+          LiveInterval* interval =
+              LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimDouble);
+          temp_intervals_.Add(interval);
+          interval->AddRange(position, position + 1);
+          unhandled_fp_intervals_.Add(interval);
+          break;
+        }
+
+        default:
+          LOG(FATAL) << "Unexpected policy for temporary location "
+                     << temp.GetPolicy();
+      }
     }
   }
 
@@ -216,8 +254,8 @@
       // maximum before updating locations.
       LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction);
       interval->AddRange(position, position + 1);
-      unhandled_core_intervals_.Add(interval);
-      unhandled_fp_intervals_.Add(interval);
+      AddSorted(&unhandled_core_intervals_, interval);
+      AddSorted(&unhandled_fp_intervals_, interval);
     }
   }
 
@@ -250,6 +288,7 @@
       : unhandled_fp_intervals_;
 
   DCHECK(unhandled.IsEmpty() || current->StartsBeforeOrAt(unhandled.Peek()));
+
   // Some instructions define their output in fixed register/stack slot. We need
   // to ensure we know these locations before doing register allocation. For a
   // given register, we create an interval that covers these locations. The register
@@ -475,6 +514,7 @@
     LiveInterval* current = unhandled_->Pop();
     DCHECK(!current->IsFixed() && !current->HasSpillSlot());
     DCHECK(unhandled_->IsEmpty() || unhandled_->Peek()->GetStart() >= current->GetStart());
+
     size_t position = current->GetStart();
 
     // Remember the inactive_ size here since the ones moved to inactive_ from
@@ -520,6 +560,7 @@
       // at safepoints. No need to allocate a register for it.
       maximum_number_of_live_registers_ =
           std::max(maximum_number_of_live_registers_, active_.Size());
+      DCHECK(unhandled_->IsEmpty() || unhandled_->Peek()->GetStart() > current->GetStart());
       continue;
     }
 
@@ -764,6 +805,12 @@
     if (current->StartsAfter(interval)) {
       insert_at = i;
       break;
+    } else if ((current->GetStart() == interval->GetStart()) && current->IsSlowPathSafepoint()) {
+      // Ensure the slow path interval is the last to be processed at its location: we want the
+      // interval to know all live registers at this location.
+      DCHECK(i == 1 || array->Get(i - 2)->StartsAfter(current));
+      insert_at = i;
+      break;
     }
   }
   array->InsertAt(insert_at, interval);
@@ -876,6 +923,14 @@
   move->AddMove(new (allocator_) MoveOperands(source, destination, nullptr));
 }
 
+static bool IsInstructionStart(size_t position) {
+  return (position & 1) == 0;
+}
+
+static bool IsInstructionEnd(size_t position) {
+  return (position & 1) == 1;
+}
+
 void RegisterAllocator::InsertParallelMoveAt(size_t position,
                                              HInstruction* instruction,
                                              Location source,
@@ -884,12 +939,29 @@
   if (source.Equals(destination)) return;
 
   HInstruction* at = liveness_.GetInstructionFromPosition(position / 2);
-  if (at == nullptr) {
-    // Block boundary, don't do anything the connection of split siblings will handle it.
-    return;
-  }
   HParallelMove* move;
-  if ((position & 1) == 1) {
+  if (at == nullptr) {
+    if (IsInstructionStart(position)) {
+      // Block boundary: nothing to do, the connection of split siblings will handle it.
+      return;
+    } else {
+      // Move must happen before the first instruction of the block.
+      at = liveness_.GetInstructionFromPosition((position + 1) / 2);
+      // Note that parallel moves may have already been inserted, so we explicitly
+      // ask for the first instruction of the block: `GetInstructionFromPosition` does
+      // not contain the moves.
+      at = at->GetBlock()->GetFirstInstruction();
+      if (at->GetLifetimePosition() != position) {
+        DCHECK_GT(at->GetLifetimePosition(), position);
+        move = new (allocator_) HParallelMove(allocator_);
+        move->SetLifetimePosition(position);
+        at->GetBlock()->InsertInstructionBefore(move, at);
+      } else {
+        DCHECK(at->IsParallelMove());
+        move = at->AsParallelMove();
+      }
+    }
+  } else if (IsInstructionEnd(position)) {
     // Move must happen after the instruction.
     DCHECK(!at->IsControlFlow());
     move = at->GetNext()->AsParallelMove();
@@ -941,10 +1013,11 @@
   HParallelMove* move;
   // This is a parallel move for connecting blocks. We need to differentiate
   // it with moves for connecting siblings in a same block, and output moves.
+  size_t position = last->GetLifetimePosition();
   if (previous == nullptr || !previous->IsParallelMove()
-      || previous->AsParallelMove()->GetLifetimePosition() != block->GetLifetimeEnd()) {
+      || previous->AsParallelMove()->GetLifetimePosition() != position) {
     move = new (allocator_) HParallelMove(allocator_);
-    move->SetLifetimePosition(block->GetLifetimeEnd());
+    move->SetLifetimePosition(position);
     block->InsertInstructionBefore(move, last);
   } else {
     move = previous->AsParallelMove();
@@ -1062,6 +1135,8 @@
       switch (source.GetKind()) {
         case Location::kRegister: {
           locations->AddLiveRegister(source);
+          DCHECK_LE(locations->GetNumberOfLiveRegisters(), maximum_number_of_live_registers_);
+
           if (current->GetType() == Primitive::kPrimNot) {
             locations->SetRegisterBit(source.reg());
           }
@@ -1095,12 +1170,10 @@
     return;
   }
 
+  // Intervals end at the lifetime end of a block. The decrement by one
+  // ensures the `Cover` call will return true.
   size_t from_position = from->GetLifetimeEnd() - 1;
-  // When an instruction dies at entry of another, and the latter is the beginning
-  // of a block, the register allocator ensures the former has a register
-  // at block->GetLifetimeStart() + 1. Since this is at a block boundary, it must
-  // must be handled in this method.
-  size_t to_position = to->GetLifetimeStart() + 1;
+  size_t to_position = to->GetLifetimeStart();
 
   LiveInterval* destination = nullptr;
   LiveInterval* source = nullptr;
@@ -1238,9 +1311,27 @@
       current = at;
     }
     LocationSummary* locations = at->GetLocations();
-    DCHECK(temp->GetType() == Primitive::kPrimInt);
-    locations->SetTempAt(
-        temp_index++, Location::RegisterLocation(temp->GetRegister()));
+    switch (temp->GetType()) {
+      case Primitive::kPrimInt:
+        locations->SetTempAt(
+            temp_index++, Location::RegisterLocation(temp->GetRegister()));
+        break;
+
+      case Primitive::kPrimDouble:
+        // TODO: Support the case of ARM, where a double value
+        // requires an FPU register pair (note that the ARM back end
+        // does not yet use this register allocator when a method uses
+        // floats or doubles).
+        DCHECK(codegen_->GetInstructionSet() != kArm
+               && codegen_->GetInstructionSet() != kThumb2);
+        locations->SetTempAt(
+            temp_index++, Location::FpuRegisterLocation(temp->GetRegister()));
+        break;
+
+      default:
+        LOG(FATAL) << "Unexpected type for temporary location "
+                   << temp->GetType();
+    }
   }
 }
 
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index b2cc119..edfafcd 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -18,6 +18,7 @@
 
 #include "nodes.h"
 #include "ssa_type_propagation.h"
+#include "ssa_phi_elimination.h"
 
 namespace art {
 
@@ -41,11 +42,20 @@
     }
   }
 
-  // 3) Propagate types of phis.
+  // 3) Remove dead phis. This will remove phis that are only used by environments:
+  // at the DEX level, the type of these phis does not need to be consistent, but
+  // our code generator will complain if the inputs of a phi do not have the same
+  // type (modulo the special case of `null`).
+  SsaDeadPhiElimination dead_phis(GetGraph());
+  dead_phis.Run();
+
+  // 4) Propagate types of phis. At this point, phis are typed void in the general
+  // case, or float or double when we created a floating-point equivalent. So we
+  // need to propagate the types across phis to give them a correct type.
   SsaTypePropagation type_propagation(GetGraph());
   type_propagation.Run();
 
-  // 4) Clear locals.
+  // 5) Clear locals.
   // TODO: Move this to a dead code eliminator phase.
   for (HInstructionIterator it(GetGraph()->GetEntryBlock()->GetInstructions());
        !it.Done();
diff --git a/compiler/optimizing/ssa_builder.h b/compiler/optimizing/ssa_builder.h
index 2207cd6..5ab328f 100644
--- a/compiler/optimizing/ssa_builder.h
+++ b/compiler/optimizing/ssa_builder.h
@@ -18,9 +18,24 @@
 #define ART_COMPILER_OPTIMIZING_SSA_BUILDER_H_
 
 #include "nodes.h"
+#include "optimization.h"
 
 namespace art {
 
+class TransformToSsa : public HOptimization {
+ public:
+  explicit TransformToSsa(HGraph* graph) : HOptimization(graph, true, "ssa transform") {}
+
+  void Run() OVERRIDE {
+    graph_->BuildDominatorTree();
+    graph_->TransformToSSA();
+    graph_->FindNaturalLoops();
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(TransformToSsa);
+};
+
 static constexpr int kDefaultNumberOfLoops = 2;
 
 class SsaBuilder : public HGraphVisitor {
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 0085b27..660a5c5 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -28,11 +28,6 @@
   ComputeLiveness();
 }
 
-static bool IsLoopExit(HLoopInformation* current, HLoopInformation* to) {
-  // `to` is either not part of a loop, or `current` is an inner loop of `to`.
-  return to == nullptr || (current != to && current->IsIn(*to));
-}
-
 static bool IsLoop(HLoopInformation* info) {
   return info != nullptr;
 }
@@ -48,46 +43,64 @@
       && inner->IsIn(*outer);
 }
 
-static void VisitBlockForLinearization(HBasicBlock* block,
-                                       GrowableArray<HBasicBlock*>* order,
-                                       ArenaBitVector* visited) {
-  if (visited->IsBitSet(block->GetBlockId())) {
-    return;
-  }
-  visited->SetBit(block->GetBlockId());
-  size_t number_of_successors = block->GetSuccessors().Size();
-  if (number_of_successors == 0) {
-    // Nothing to do.
-  } else if (number_of_successors == 1) {
-    VisitBlockForLinearization(block->GetSuccessors().Get(0), order, visited);
-  } else {
-    DCHECK_EQ(number_of_successors, 2u);
-    HBasicBlock* first_successor = block->GetSuccessors().Get(0);
-    HBasicBlock* second_successor = block->GetSuccessors().Get(1);
-    HLoopInformation* my_loop = block->GetLoopInformation();
-    HLoopInformation* first_loop = first_successor->GetLoopInformation();
-    HLoopInformation* second_loop = second_successor->GetLoopInformation();
-
-    if (!IsLoop(my_loop)) {
-      // Nothing to do. Current order is fine.
-    } else if (IsLoopExit(my_loop, second_loop) && InSameLoop(my_loop, first_loop)) {
-      // Visit the loop exit first in post order.
-      std::swap(first_successor, second_successor);
-    } else if (IsInnerLoop(my_loop, first_loop) && !IsInnerLoop(my_loop, second_loop)) {
-      // Visit the inner loop last in post order.
-      std::swap(first_successor, second_successor);
+static void AddToListForLinearization(GrowableArray<HBasicBlock*>* worklist, HBasicBlock* block) {
+  size_t insert_at = worklist->Size();
+  HLoopInformation* block_loop = block->GetLoopInformation();
+  for (; insert_at > 0; --insert_at) {
+    HBasicBlock* current = worklist->Get(insert_at - 1);
+    HLoopInformation* current_loop = current->GetLoopInformation();
+    if (InSameLoop(block_loop, current_loop)
+        || !IsLoop(current_loop)
+        || IsInnerLoop(current_loop, block_loop)) {
+      // The block can be processed immediately.
+      break;
     }
-    VisitBlockForLinearization(first_successor, order, visited);
-    VisitBlockForLinearization(second_successor, order, visited);
   }
-  order->Add(block);
+  worklist->InsertAt(insert_at, block);
 }
 
 void SsaLivenessAnalysis::LinearizeGraph() {
-  // For simplicity of the implementation, we create post linear order. The order for
-  // computing live ranges is the reverse of that order.
-  ArenaBitVector visited(graph_.GetArena(), graph_.GetBlocks().Size(), false);
-  VisitBlockForLinearization(graph_.GetEntryBlock(), &linear_post_order_, &visited);
+  // Create a reverse post ordering with the following properties:
+  // - Blocks in a loop are consecutive,
+  // - Back-edge is the last block before loop exits.
+
+  // (1): Record the number of forward predecessors for each block. This is to
+  //      ensure the resulting order is reverse post order. We could use the
+  //      current reverse post order in the graph, but it would require making
+  //      order queries to a GrowableArray, which is not the best data structure
+  //      for it.
+  GrowableArray<uint32_t> forward_predecessors(graph_.GetArena(), graph_.GetBlocks().Size());
+  forward_predecessors.SetSize(graph_.GetBlocks().Size());
+  for (size_t i = 0, e = graph_.GetBlocks().Size(); i < e; ++i) {
+    HBasicBlock* block = graph_.GetBlocks().Get(i);
+    size_t number_of_forward_predecessors = block->GetPredecessors().Size();
+    if (block->IsLoopHeader()) {
+      // We rely on having simplified the CFG.
+      DCHECK_EQ(1u, block->GetLoopInformation()->NumberOfBackEdges());
+      number_of_forward_predecessors--;
+    }
+    forward_predecessors.Put(block->GetBlockId(), number_of_forward_predecessors);
+  }
+
+  // (2): Following a worklist approach, first start with the entry block, and
+  //      iterate over the successors. When all non-back edge predecessors of a
+  //      successor block are visited, the successor block is added in the worklist
+  //      following an order that satisfies the requirements to build our linear graph.
+  GrowableArray<HBasicBlock*> worklist(graph_.GetArena(), 1);
+  worklist.Add(graph_.GetEntryBlock());
+  do {
+    HBasicBlock* current = worklist.Pop();
+    linear_order_.Add(current);
+    for (size_t i = 0, e = current->GetSuccessors().Size(); i < e; ++i) {
+      HBasicBlock* successor = current->GetSuccessors().Get(i);
+      int block_id = successor->GetBlockId();
+      size_t number_of_remaining_predecessors = forward_predecessors.Get(block_id);
+      if (number_of_remaining_predecessors == 1) {
+        AddToListForLinearization(&worklist, successor);
+      }
+      forward_predecessors.Put(block_id, number_of_remaining_predecessors - 1);
+    }
+  } while (!worklist.IsEmpty());
 }
 
 void SsaLivenessAnalysis::NumberInstructions() {
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index ca08d5b..2312389 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -582,7 +582,7 @@
   SsaLivenessAnalysis(const HGraph& graph, CodeGenerator* codegen)
       : graph_(graph),
         codegen_(codegen),
-        linear_post_order_(graph.GetArena(), graph.GetBlocks().Size()),
+        linear_order_(graph.GetArena(), graph.GetBlocks().Size()),
         block_infos_(graph.GetArena(), graph.GetBlocks().Size()),
         instructions_from_ssa_index_(graph.GetArena(), 0),
         instructions_from_lifetime_position_(graph.GetArena(), 0),
@@ -604,8 +604,8 @@
     return &block_infos_.Get(block.GetBlockId())->kill_;
   }
 
-  const GrowableArray<HBasicBlock*>& GetLinearPostOrder() const {
-    return linear_post_order_;
+  const GrowableArray<HBasicBlock*>& GetLinearOrder() const {
+    return linear_order_;
   }
 
   HInstruction* GetInstructionFromSsaIndex(size_t index) const {
@@ -661,7 +661,7 @@
 
   const HGraph& graph_;
   CodeGenerator* const codegen_;
-  GrowableArray<HBasicBlock*> linear_post_order_;
+  GrowableArray<HBasicBlock*> linear_order_;
   GrowableArray<BlockInfo*> block_infos_;
 
   // Temporary array used when computing live_in, live_out, and kill sets.
@@ -674,38 +674,43 @@
   DISALLOW_COPY_AND_ASSIGN(SsaLivenessAnalysis);
 };
 
-class HLinearOrderIterator : public ValueObject {
- public:
-  explicit HLinearOrderIterator(const SsaLivenessAnalysis& liveness)
-      : post_order_(liveness.GetLinearPostOrder()), index_(liveness.GetLinearPostOrder().Size()) {}
-
-  bool Done() const { return index_ == 0; }
-  HBasicBlock* Current() const { return post_order_.Get(index_ -1); }
-  void Advance() { --index_; DCHECK_GE(index_, 0U); }
-
- private:
-  const GrowableArray<HBasicBlock*>& post_order_;
-  size_t index_;
-
-  DISALLOW_COPY_AND_ASSIGN(HLinearOrderIterator);
-};
-
 class HLinearPostOrderIterator : public ValueObject {
  public:
   explicit HLinearPostOrderIterator(const SsaLivenessAnalysis& liveness)
-      : post_order_(liveness.GetLinearPostOrder()), index_(0) {}
+      : order_(liveness.GetLinearOrder()), index_(liveness.GetLinearOrder().Size()) {}
 
-  bool Done() const { return index_ == post_order_.Size(); }
-  HBasicBlock* Current() const { return post_order_.Get(index_); }
-  void Advance() { ++index_; }
+  bool Done() const { return index_ == 0; }
+
+  HBasicBlock* Current() const { return order_.Get(index_ -1); }
+
+  void Advance() {
+    DCHECK_GT(index_, 0U);  // Check before decrementing: `index_` is unsigned and would wrap.
+    --index_;
+  }
 
  private:
-  const GrowableArray<HBasicBlock*>& post_order_;
+  const GrowableArray<HBasicBlock*>& order_;
   size_t index_;
 
   DISALLOW_COPY_AND_ASSIGN(HLinearPostOrderIterator);
 };
 
+class HLinearOrderIterator : public ValueObject {
+ public:
+  explicit HLinearOrderIterator(const SsaLivenessAnalysis& liveness)
+      : order_(liveness.GetLinearOrder()), index_(0) {}
+
+  bool Done() const { return index_ == order_.Size(); }
+  HBasicBlock* Current() const { return order_.Get(index_); }
+  void Advance() { ++index_; }
+
+ private:
+  const GrowableArray<HBasicBlock*>& order_;
+  size_t index_;
+
+  DISALLOW_COPY_AND_ASSIGN(HLinearOrderIterator);
+};
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_SSA_LIVENESS_ANALYSIS_H_
diff --git a/compiler/optimizing/ssa_phi_elimination.cc b/compiler/optimizing/ssa_phi_elimination.cc
index 56979e1..58cea77 100644
--- a/compiler/optimizing/ssa_phi_elimination.cc
+++ b/compiler/optimizing/ssa_phi_elimination.cc
@@ -24,6 +24,8 @@
     HBasicBlock* block = it.Current();
     for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) {
       HPhi* phi = inst_it.Current()->AsPhi();
+      // Mark the phi dead before scanning its uses: a phi without any use must end up dead too.
+      phi->SetDead();
       for (HUseIterator<HInstruction> use_it(phi->GetUses()); !use_it.Done(); use_it.Advance()) {
         HUseListNode<HInstruction>* current = use_it.Current();
         HInstruction* user = current->GetUser();
@@ -31,8 +33,6 @@
           worklist_.Add(phi);
           phi->SetLive();
           break;
-        } else {
-          phi->SetDead();
         }
       }
     }
@@ -65,8 +65,8 @@
                use_it.Advance()) {
             HUseListNode<HInstruction>* user_node = use_it.Current();
             HInstruction* user = user_node->GetUser();
-            DCHECK(user->IsLoopHeaderPhi());
-            DCHECK(user->AsPhi()->IsDead());
+            DCHECK(user->IsLoopHeaderPhi()) << user->GetId();
+            DCHECK(user->AsPhi()->IsDead()) << user->GetId();
             // Just put itself as an input. The phi will be removed in this loop anyway.
             user->SetRawInputAt(user_node->GetIndex(), user);
             current->RemoveUser(user, user_node->GetIndex());
diff --git a/compiler/optimizing/ssa_phi_elimination.h b/compiler/optimizing/ssa_phi_elimination.h
index 5274f09..b789971 100644
--- a/compiler/optimizing/ssa_phi_elimination.h
+++ b/compiler/optimizing/ssa_phi_elimination.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_SSA_PHI_ELIMINATION_H_
 
 #include "nodes.h"
+#include "optimization.h"
 
 namespace art {
 
@@ -25,15 +26,15 @@
  * Optimization phase that removes dead phis from the graph. Dead phis are unused
  * phis, or phis only used by other phis.
  */
-class SsaDeadPhiElimination : public ValueObject {
+class SsaDeadPhiElimination : public HOptimization {
  public:
   explicit SsaDeadPhiElimination(HGraph* graph)
-      : graph_(graph), worklist_(graph->GetArena(), kDefaultWorklistSize) {}
+      : HOptimization(graph, true, "dead_phi_elimination"),
+        worklist_(graph->GetArena(), kDefaultWorklistSize) {}
 
-  void Run();
+  void Run() OVERRIDE;
 
  private:
-  HGraph* const graph_;
   GrowableArray<HPhi*> worklist_;
 
   static constexpr size_t kDefaultWorklistSize = 8;
@@ -47,15 +48,15 @@
  * registers might be updated with the same value, or not updated at all. We can just
  * replace the phi with the value when entering the loop.
  */
-class SsaRedundantPhiElimination : public ValueObject {
+class SsaRedundantPhiElimination : public HOptimization {
  public:
   explicit SsaRedundantPhiElimination(HGraph* graph)
-      : graph_(graph), worklist_(graph->GetArena(), kDefaultWorklistSize) {}
+      : HOptimization(graph, true, "redundant_phi_elimination"),
+        worklist_(graph->GetArena(), kDefaultWorklistSize) {}
 
-  void Run();
+  void Run() OVERRIDE;
 
  private:
-  HGraph* const graph_;
   GrowableArray<HPhi*> worklist_;
 
   static constexpr size_t kDefaultWorklistSize = 8;
diff --git a/compiler/optimizing/ssa_test.cc b/compiler/optimizing/ssa_test.cc
index fffe5c2..6174dd4 100644
--- a/compiler/optimizing/ssa_test.cc
+++ b/compiler/optimizing/ssa_test.cc
@@ -199,29 +199,31 @@
   // Test that we create a phi for an initialized local at entry of a loop.
   const char* expected =
     "BasicBlock 0, succ: 1\n"
-    "  0: IntConstant 0 [6, 4, 2, 2]\n"
-    "  1: Goto\n"
-    "BasicBlock 1, pred: 0, succ: 5, 6\n"
-    "  2: Equal(0, 0) [3]\n"
-    "  3: If(2)\n"
-    "BasicBlock 2, pred: 3, 6, succ: 3\n"
-    "  4: Phi(6, 0) [6]\n"
+    "  0: IntConstant 0 [6, 3, 3]\n"
+    "  1: IntConstant 4 [6]\n"
+    "  2: Goto\n"
+    "BasicBlock 1, pred: 0, succ: 4, 2\n"
+    "  3: Equal(0, 0) [4]\n"
+    "  4: If(3)\n"
+    "BasicBlock 2, pred: 1, succ: 3\n"
     "  5: Goto\n"
-    "BasicBlock 3, pred: 5, 2, succ: 2\n"
-    "  6: Phi(0, 4) [4]\n"
+    "BasicBlock 3, pred: 2, 4, succ: 5\n"
+    "  6: Phi(1, 0) [9]\n"
     "  7: Goto\n"
-    "BasicBlock 4\n"
-    // Synthesized blocks to avoid critical edge.
-    "BasicBlock 5, pred: 1, succ: 3\n"
+    "BasicBlock 4, pred: 1, succ: 3\n"
     "  8: Goto\n"
-    "BasicBlock 6, pred: 1, succ: 2\n"
-    "  9: Goto\n";
+    "BasicBlock 5, pred: 3, succ: 6\n"
+    "  9: Return(6)\n"
+    "BasicBlock 6, pred: 5\n"
+    "  10: Exit\n";
 
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
-    Instruction::IF_EQ, 3,
-    Instruction::GOTO | 0x100,
-    Instruction::GOTO | 0xFF00);
+    Instruction::IF_EQ, 4,
+    Instruction::CONST_4 | 4 << 12 | 0,
+    Instruction::GOTO | 0x200,
+    Instruction::GOTO | 0xFF00,
+    Instruction::RETURN | 0 << 8);
 
   TestCode(data, expected);
 }
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index 733b58f..cb07ffa 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -83,6 +83,7 @@
       break;
   }
 
+  assembler->EmitSlowPaths();
   size_t cs = assembler->CodeSize();
   std::unique_ptr<std::vector<uint8_t>> entry_stub(new std::vector<uint8_t>(cs));
   MemoryRegion code(&(*entry_stub)[0], entry_stub->size());
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index 591d461..0528773 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -92,16 +92,29 @@
       break;
     case kRegister:
       if (is_shift_) {
+        uint32_t shift_type;
+        switch (shift_) {
+          case arm::Shift::ROR:
+            shift_type = static_cast<uint32_t>(shift_);
+            CHECK_NE(immed_, 0U);
+            break;
+          case arm::Shift::RRX:
+            shift_type = static_cast<uint32_t>(arm::Shift::ROR);  // Same encoding as ROR.
+            CHECK_EQ(immed_, 0U);
+            break;
+          default:
+            shift_type = static_cast<uint32_t>(shift_);
+        }
         // Shifted immediate or register.
         if (rs_ == kNoRegister) {
           // Immediate shift.
           return immed_ << kShiftImmShift |
-                          static_cast<uint32_t>(shift_) << kShiftShift |
+                          shift_type << kShiftShift |
                           static_cast<uint32_t>(rm_);
         } else {
           // Register shift.
           return static_cast<uint32_t>(rs_) << kShiftRegisterShift |
-              static_cast<uint32_t>(shift_) << kShiftShift | (1 << 4) |
+              shift_type << kShiftShift | (1 << 4) |
               static_cast<uint32_t>(rm_);
         }
       } else {
@@ -152,36 +165,6 @@
   return 0;
 }
 
-bool ShifterOperand::CanHoldThumb(Register rd, Register rn, Opcode opcode,
-                                  uint32_t immediate, ShifterOperand* shifter_op) {
-  shifter_op->type_ = kImmediate;
-  shifter_op->immed_ = immediate;
-  shifter_op->is_shift_ = false;
-  shifter_op->is_rotate_ = false;
-  switch (opcode) {
-    case ADD:
-    case SUB:
-      if (rn == SP) {
-        if (rd == SP) {
-          return immediate < (1 << 9);    // 9 bits allowed.
-        } else {
-          return immediate < (1 << 12);   // 12 bits.
-        }
-      }
-      if (immediate < (1 << 12)) {    // Less than (or equal to) 12 bits can always be done.
-        return true;
-      }
-      return ArmAssembler::ModifiedImmediate(immediate) != kInvalidModifiedImmediate;
-
-    case MOV:
-      // TODO: Support less than or equal to 12bits.
-      return ArmAssembler::ModifiedImmediate(immediate) != kInvalidModifiedImmediate;
-    case MVN:
-    default:
-      return ArmAssembler::ModifiedImmediate(immediate) != kInvalidModifiedImmediate;
-  }
-}
-
 uint32_t Address::encodingArm() const {
   CHECK(IsAbsoluteUint(12, offset_));
   uint32_t encoding;
@@ -192,10 +175,9 @@
       encoding =  am_ | offset_;
     }
   } else {
-    uint32_t imm5 = offset_;
     uint32_t shift = shift_;
     if (shift == RRX) {
-      imm5 = 0;
+      CHECK_EQ(offset_, 0);
       shift = ROR;
     }
     encoding = am_ | static_cast<uint32_t>(rm_) | shift << 5 | offset_ << 7 | B25;
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index d288b70..c86ec4b 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -30,6 +30,9 @@
 namespace art {
 namespace arm {
 
+class Arm32Assembler;
+class Thumb2Assembler;
+
 class ShifterOperand {
  public:
   ShifterOperand() : type_(kUnknown), rm_(kNoRegister), rs_(kNoRegister),
@@ -103,33 +106,6 @@
     kImmediate
   };
 
-  static bool CanHoldArm(uint32_t immediate, ShifterOperand* shifter_op) {
-    // Avoid the more expensive test for frequent small immediate values.
-    if (immediate < (1 << kImmed8Bits)) {
-      shifter_op->type_ = kImmediate;
-      shifter_op->is_rotate_ = true;
-      shifter_op->rotate_ = 0;
-      shifter_op->immed_ = immediate;
-      return true;
-    }
-    // Note that immediate must be unsigned for the test to work correctly.
-    for (int rot = 0; rot < 16; rot++) {
-      uint32_t imm8 = (immediate << 2*rot) | (immediate >> (32 - 2*rot));
-      if (imm8 < (1 << kImmed8Bits)) {
-        shifter_op->type_ = kImmediate;
-        shifter_op->is_rotate_ = true;
-        shifter_op->rotate_ = rot;
-        shifter_op->immed_ = imm8;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  static bool CanHoldThumb(Register rd, Register rn, Opcode opcode,
-                           uint32_t immediate, ShifterOperand* shifter_op);
-
-
  private:
   Type type_;
   Register rm_;
@@ -140,6 +116,9 @@
   uint32_t rotate_;
   uint32_t immed_;
 
+  friend class Arm32Assembler;
+  friend class Thumb2Assembler;
+
 #ifdef SOURCE_ASSEMBLER_SUPPORT
   friend class BinaryAssembler;
 #endif
@@ -611,6 +590,14 @@
   virtual void Ror(Register rd, Register rm, Register rn, bool setcc = false,
                    Condition cond = AL) = 0;
 
+  // Returns whether the `immediate` can fit in a `ShifterOperand`. If yes,
+  // `shifter_op` contains the operand.
+  virtual bool ShifterOperandCanHold(Register rd,
+                                     Register rn,
+                                     Opcode opcode,
+                                     uint32_t immediate,
+                                     ShifterOperand* shifter_op) = 0;
+
   static bool IsInstructionForExceptionHandling(uintptr_t pc);
 
   virtual void Bind(Label* label) = 0;
diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc
index 39ebf68..8f6d45a 100644
--- a/compiler/utils/arm/assembler_arm32.cc
+++ b/compiler/utils/arm/assembler_arm32.cc
@@ -25,6 +25,37 @@
 namespace art {
 namespace arm {
 
+bool Arm32Assembler::ShifterOperandCanHoldArm32(uint32_t immediate, ShifterOperand* shifter_op) {
+  // Fills `shifter_op` and returns true iff `immediate` is an 8-bit value rotated by 2*rot bits.
+  if (immediate < (1 << kImmed8Bits)) {  // Fast path for frequent small values (rotation 0).
+    shifter_op->type_ = ShifterOperand::kImmediate;
+    shifter_op->is_rotate_ = true;
+    shifter_op->rotate_ = 0;
+    shifter_op->immed_ = immediate;
+    return true;
+  }
+  // `immediate` must be unsigned. Skip rot 0: handled above, and `>> 32` would be undefined.
+  for (int rot = 1; rot < 16; rot++) {
+    uint32_t imm8 = (immediate << 2*rot) | (immediate >> (32 - 2*rot));
+    if (imm8 < (1 << kImmed8Bits)) {
+      shifter_op->type_ = ShifterOperand::kImmediate;
+      shifter_op->is_rotate_ = true;
+      shifter_op->rotate_ = rot;
+      shifter_op->immed_ = imm8;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Arm32Assembler::ShifterOperandCanHold(Register rd ATTRIBUTE_UNUSED,
+                                           Register rn ATTRIBUTE_UNUSED,
+                                           Opcode opcode ATTRIBUTE_UNUSED,
+                                           uint32_t immediate,
+                                           ShifterOperand* shifter_op) {
+  return ShifterOperandCanHoldArm32(immediate, shifter_op);
+}
+
 void Arm32Assembler::and_(Register rd, Register rn, const ShifterOperand& so,
                         Condition cond) {
   EmitType01(cond, so.type(), AND, 0, rn, rd, so);
@@ -1079,7 +1110,7 @@
 
 void Arm32Assembler::Lsl(Register rd, Register rm, uint32_t shift_imm,
                          bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Do not use Lsl if no shift is wanted.
+  CHECK_LE(shift_imm, 31u);
   if (setcc) {
     movs(rd, ShifterOperand(rm, LSL, shift_imm), cond);
   } else {
@@ -1090,7 +1121,7 @@
 
 void Arm32Assembler::Lsr(Register rd, Register rm, uint32_t shift_imm,
                          bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Do not use Lsr if no shift is wanted.
+  CHECK(1u <= shift_imm && shift_imm <= 32u);
   if (shift_imm == 32) shift_imm = 0;  // Comply to UAL syntax.
   if (setcc) {
     movs(rd, ShifterOperand(rm, LSR, shift_imm), cond);
@@ -1102,7 +1133,7 @@
 
 void Arm32Assembler::Asr(Register rd, Register rm, uint32_t shift_imm,
                          bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Do not use Asr if no shift is wanted.
+  CHECK(1u <= shift_imm && shift_imm <= 32u);
   if (shift_imm == 32) shift_imm = 0;  // Comply to UAL syntax.
   if (setcc) {
     movs(rd, ShifterOperand(rm, ASR, shift_imm), cond);
@@ -1114,7 +1145,7 @@
 
 void Arm32Assembler::Ror(Register rd, Register rm, uint32_t shift_imm,
                          bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Use Rrx instruction.
+  CHECK(1u <= shift_imm && shift_imm <= 31u);
   if (setcc) {
     movs(rd, ShifterOperand(rm, ROR, shift_imm), cond);
   } else {
@@ -1291,16 +1322,16 @@
   // positive values and sub for negatives ones, which would slightly improve
   // the readability of generated code for some constants.
   ShifterOperand shifter_op;
-  if (ShifterOperand::CanHoldArm(value, &shifter_op)) {
+  if (ShifterOperandCanHoldArm32(value, &shifter_op)) {
     add(rd, rn, shifter_op, cond);
-  } else if (ShifterOperand::CanHoldArm(-value, &shifter_op)) {
+  } else if (ShifterOperandCanHoldArm32(-value, &shifter_op)) {
     sub(rd, rn, shifter_op, cond);
   } else {
     CHECK(rn != IP);
-    if (ShifterOperand::CanHoldArm(~value, &shifter_op)) {
+    if (ShifterOperandCanHoldArm32(~value, &shifter_op)) {
       mvn(IP, shifter_op, cond);
       add(rd, rn, ShifterOperand(IP), cond);
-    } else if (ShifterOperand::CanHoldArm(~(-value), &shifter_op)) {
+    } else if (ShifterOperandCanHoldArm32(~(-value), &shifter_op)) {
       mvn(IP, shifter_op, cond);
       sub(rd, rn, ShifterOperand(IP), cond);
     } else {
@@ -1318,16 +1349,16 @@
 void Arm32Assembler::AddConstantSetFlags(Register rd, Register rn, int32_t value,
                                          Condition cond) {
   ShifterOperand shifter_op;
-  if (ShifterOperand::CanHoldArm(value, &shifter_op)) {
+  if (ShifterOperandCanHoldArm32(value, &shifter_op)) {
     adds(rd, rn, shifter_op, cond);
-  } else if (ShifterOperand::CanHoldArm(-value, &shifter_op)) {
+  } else if (ShifterOperandCanHoldArm32(-value, &shifter_op)) {
     subs(rd, rn, shifter_op, cond);
   } else {
     CHECK(rn != IP);
-    if (ShifterOperand::CanHoldArm(~value, &shifter_op)) {
+    if (ShifterOperandCanHoldArm32(~value, &shifter_op)) {
       mvn(IP, shifter_op, cond);
       adds(rd, rn, ShifterOperand(IP), cond);
-    } else if (ShifterOperand::CanHoldArm(~(-value), &shifter_op)) {
+    } else if (ShifterOperandCanHoldArm32(~(-value), &shifter_op)) {
       mvn(IP, shifter_op, cond);
       subs(rd, rn, ShifterOperand(IP), cond);
     } else {
@@ -1343,9 +1374,9 @@
 
 void Arm32Assembler::LoadImmediate(Register rd, int32_t value, Condition cond) {
   ShifterOperand shifter_op;
-  if (ShifterOperand::CanHoldArm(value, &shifter_op)) {
+  if (ShifterOperandCanHoldArm32(value, &shifter_op)) {
     mov(rd, shifter_op, cond);
-  } else if (ShifterOperand::CanHoldArm(~value, &shifter_op)) {
+  } else if (ShifterOperandCanHoldArm32(~value, &shifter_op)) {
     mvn(rd, shifter_op, cond);
   } else {
     movw(rd, Low16Bits(value), cond);
@@ -1513,10 +1544,8 @@
 
 
 void Arm32Assembler::dmb(DmbOptions flavor) {
-#if ANDROID_SMP != 0
   int32_t encoding = 0xf57ff05f;  // dmb
   Emit(encoding | flavor);
-#endif
 }
 
 
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
index 0b009e1..6c8d415 100644
--- a/compiler/utils/arm/assembler_arm32.h
+++ b/compiler/utils/arm/assembler_arm32.h
@@ -273,6 +273,12 @@
                       int32_t offset,
                       Condition cond = AL) OVERRIDE;
 
+  bool ShifterOperandCanHold(Register rd,
+                             Register rn,
+                             Opcode opcode,
+                             uint32_t immediate,
+                             ShifterOperand* shifter_op) OVERRIDE;
+
 
   static bool IsInstructionForExceptionHandling(uintptr_t pc);
 
@@ -359,6 +365,7 @@
   static int DecodeBranchOffset(int32_t inst);
   int32_t EncodeTstOffset(int offset, int32_t inst);
   int DecodeTstOffset(int32_t inst);
+  bool ShifterOperandCanHoldArm32(uint32_t immediate, ShifterOperand* shifter_op);
 };
 
 }  // namespace arm
diff --git a/compiler/utils/arm/assembler_arm32_test.cc b/compiler/utils/arm/assembler_arm32_test.cc
index 277a9eb..951792d 100644
--- a/compiler/utils/arm/assembler_arm32_test.cc
+++ b/compiler/utils/arm/assembler_arm32_test.cc
@@ -16,49 +16,209 @@
 
 #include "assembler_arm32.h"
 
+#include <functional>
+#include <type_traits>
+
+#include "base/macros.h"
 #include "base/stl_util.h"
-#include "utils/assembler_test.h"
+#include "utils/arm/assembler_arm_test.h"
 
 namespace art {
 
-class AssemblerArm32Test : public AssemblerTest<arm::Arm32Assembler,
-                                                arm::Register, arm::SRegister,
-                                                uint32_t> {
+using std::placeholders::_1;
+using std::placeholders::_2;
+using std::placeholders::_3;
+using std::placeholders::_4;
+using std::placeholders::_5;
+
+// To speed up tests, don't use all register combinations.
+static constexpr bool kUseSparseRegisterList = true;
+
+// To speed up tests, don't use all condition codes.
+static constexpr bool kUseSparseConditionList = true;
+
+// To speed up tests, don't use all shift immediates.
+static constexpr bool kUseSparseShiftImmediates = true;
+
+class AssemblerArm32Test : public AssemblerArmTest<arm::Arm32Assembler,
+                                                   arm::Register, arm::SRegister,
+                                                   uint32_t, arm::ShifterOperand, arm::Condition> {
  protected:
   std::string GetArchitectureString() OVERRIDE {
     return "arm";
   }
 
+  std::string GetAssemblerParameters() OVERRIDE {
+    // Arm-v7a, cortex-a15 (means we have sdiv).
+    return " -march=armv7-a -mcpu=cortex-a15 -mfpu=neon";
+  }
+
+  const char* GetAssemblyHeader() OVERRIDE {
+    return kArm32AssemblyHeader;
+  }
+
   std::string GetDisassembleParameters() OVERRIDE {
     return " -D -bbinary -marm --no-show-raw-insn";
   }
 
   void SetUpHelpers() OVERRIDE {
     if (registers_.size() == 0) {
-      registers_.insert(end(registers_),
-                        {  // NOLINT(whitespace/braces)
-                          new arm::Register(arm::R0),
-                          new arm::Register(arm::R1),
-                          new arm::Register(arm::R2),
-                          new arm::Register(arm::R3),
-                          new arm::Register(arm::R4),
-                          new arm::Register(arm::R5),
-                          new arm::Register(arm::R6),
-                          new arm::Register(arm::R7),
-                          new arm::Register(arm::R8),
-                          new arm::Register(arm::R9),
-                          new arm::Register(arm::R10),
-                          new arm::Register(arm::R11),
-                          new arm::Register(arm::R12),
-                          new arm::Register(arm::R13),
-                          new arm::Register(arm::R14),
-                          new arm::Register(arm::R15)
-                        });
+      if (kUseSparseRegisterList) {
+        registers_.insert(end(registers_),
+                          {  // NOLINT(whitespace/braces)
+                              new arm::Register(arm::R0),
+                              new arm::Register(arm::R1),
+                              new arm::Register(arm::R4),
+                              new arm::Register(arm::R8),
+                              new arm::Register(arm::R11),
+                              new arm::Register(arm::R12),
+                              new arm::Register(arm::R13),
+                              new arm::Register(arm::R14),
+                              new arm::Register(arm::R15)
+                          });
+      } else {
+        registers_.insert(end(registers_),
+                          {  // NOLINT(whitespace/braces)
+                              new arm::Register(arm::R0),
+                              new arm::Register(arm::R1),
+                              new arm::Register(arm::R2),
+                              new arm::Register(arm::R3),
+                              new arm::Register(arm::R4),
+                              new arm::Register(arm::R5),
+                              new arm::Register(arm::R6),
+                              new arm::Register(arm::R7),
+                              new arm::Register(arm::R8),
+                              new arm::Register(arm::R9),
+                              new arm::Register(arm::R10),
+                              new arm::Register(arm::R11),
+                              new arm::Register(arm::R12),
+                              new arm::Register(arm::R13),
+                              new arm::Register(arm::R14),
+                              new arm::Register(arm::R15)
+                          });
+      }
+    }
+
+    if (!kUseSparseConditionList) {
+      conditions_.push_back(arm::Condition::EQ);
+      conditions_.push_back(arm::Condition::NE);
+      conditions_.push_back(arm::Condition::CS);
+      conditions_.push_back(arm::Condition::CC);
+      conditions_.push_back(arm::Condition::MI);
+      conditions_.push_back(arm::Condition::PL);
+      conditions_.push_back(arm::Condition::VS);
+      conditions_.push_back(arm::Condition::VC);
+      conditions_.push_back(arm::Condition::HI);
+      conditions_.push_back(arm::Condition::LS);
+      conditions_.push_back(arm::Condition::GE);
+      conditions_.push_back(arm::Condition::LT);
+      conditions_.push_back(arm::Condition::GT);
+      conditions_.push_back(arm::Condition::LE);
+      conditions_.push_back(arm::Condition::AL);
+    } else {
+      conditions_.push_back(arm::Condition::EQ);
+      conditions_.push_back(arm::Condition::NE);
+      conditions_.push_back(arm::Condition::CC);
+      conditions_.push_back(arm::Condition::VC);
+      conditions_.push_back(arm::Condition::HI);
+      conditions_.push_back(arm::Condition::LT);
+      conditions_.push_back(arm::Condition::AL);
+    }
+
+    shifter_operands_.push_back(arm::ShifterOperand(0));
+    shifter_operands_.push_back(arm::ShifterOperand(1));
+    shifter_operands_.push_back(arm::ShifterOperand(2));
+    shifter_operands_.push_back(arm::ShifterOperand(3));
+    shifter_operands_.push_back(arm::ShifterOperand(4));
+    shifter_operands_.push_back(arm::ShifterOperand(5));
+    shifter_operands_.push_back(arm::ShifterOperand(127));
+    shifter_operands_.push_back(arm::ShifterOperand(128));
+    shifter_operands_.push_back(arm::ShifterOperand(254));
+    shifter_operands_.push_back(arm::ShifterOperand(255));
+
+    if (!kUseSparseRegisterList) {
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R0));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R1));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R2));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R3));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R4));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R5));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R6));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R7));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R8));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R9));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R10));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R11));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R12));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R13));
+    } else {
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R0));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R1));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R4));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R8));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R11));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R12));
+      shifter_operands_.push_back(arm::ShifterOperand(arm::R13));
+    }
+
+    std::vector<arm::Shift> shifts {
+      arm::Shift::LSL, arm::Shift::LSR, arm::Shift::ASR, arm::Shift::ROR, arm::Shift::RRX
+    };
+
+    // ShifterOperands of form "reg shift-type imm."
+    for (arm::Shift shift : shifts) {
+      for (arm::Register* reg : registers_) {  // Note: this will pick up the sparse set.
+        if (*reg == arm::R15) {  // Skip PC.
+          continue;
+        }
+        if (shift != arm::Shift::RRX) {
+          if (!kUseSparseShiftImmediates) {
+            for (uint32_t imm = 1; imm < 32; ++imm) {
+              shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, imm));
+            }
+          } else {
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 1));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 2));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 3));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 7));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 15));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 16));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 30));
+            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 31));
+          }
+        } else {
+          // RRX doesn't have an immediate.
+          shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 0));
+        }
+      }
     }
   }
 
+  std::vector<arm::ShifterOperand> CreateRegisterShifts(std::vector<arm::Register*>& base_regs,
+                                                        int32_t shift_min, int32_t shift_max) {
+    std::vector<arm::ShifterOperand> res;
+    static constexpr arm::Shift kShifts[] = { arm::Shift::LSL, arm::Shift::LSR, arm::Shift::ASR,
+                                              arm::Shift::ROR };
+
+    for (arm::Shift shift : kShifts) {
+      for (arm::Register* reg : base_regs) {
+        // Take the min, the max, and three values in between.
+        res.push_back(arm::ShifterOperand(*reg, shift, shift_min));
+        if (shift_min != shift_max) {
+          res.push_back(arm::ShifterOperand(*reg, shift, shift_max));
+          int32_t middle = (shift_min + shift_max) / 2;
+          res.push_back(arm::ShifterOperand(*reg, shift, middle));
+          res.push_back(arm::ShifterOperand(*reg, shift, middle - 1));
+          res.push_back(arm::ShifterOperand(*reg, shift, middle + 1));
+        }
+      }
+    }
+
+    return res;
+  }
+
   void TearDown() OVERRIDE {
-    AssemblerTest::TearDown();
+    AssemblerArmTest::TearDown();
     STLDeleteElements(&registers_);
   }
 
@@ -70,8 +230,281 @@
     return imm_value;
   }
 
+  std::vector<arm::Condition>& GetConditions() OVERRIDE {
+    return conditions_;
+  }
+
+  std::string GetConditionString(arm::Condition c) OVERRIDE {
+    std::ostringstream oss;
+    oss << c;
+    return oss.str();
+  }
+
+  arm::Register GetPCRegister() OVERRIDE {
+    return arm::R15;
+  }
+
+  std::vector<arm::ShifterOperand>& GetShiftOperands() OVERRIDE {
+    return shifter_operands_;
+  }
+
+  std::string GetShiftString(arm::ShifterOperand sop) OVERRIDE {
+    std::ostringstream oss;
+    if (sop.IsShift()) {
+      // Shifted register: RRX is printed without an amount, other shifts as "<shift> #<imm>".
+      if (sop.GetShift() == arm::Shift::RRX) {
+        oss << sop.GetRegister() << ", " << sop.GetShift();
+      } else {
+        oss << sop.GetRegister() << ", " << sop.GetShift() << " #" << sop.GetImmediate();
+      }
+    } else if (sop.IsRegister()) {
+      oss << sop.GetRegister();
+    } else {
+      CHECK(sop.IsImmediate());
+      oss << "#" << sop.GetImmediate();
+    }
+    return oss.str();
+  }
+
+  static const char* GetRegTokenFromDepth(int depth) {
+    switch (depth) {
+      case 0:
+        return Base::REG1_TOKEN;
+      case 1:
+        return Base::REG2_TOKEN;
+      case 2:
+        return REG3_TOKEN;
+      case 3:
+        return REG4_TOKEN;
+      default:
+        LOG(FATAL) << "Depth problem.";
+        UNREACHABLE();
+    }
+  }
+
+  void ExecuteAndPrint(std::function<void()> f, std::string fmt, std::ostringstream& oss) {
+    if (first_) {
+      first_ = false;
+    } else {
+      oss << "\n";
+    }
+    oss << fmt;
+
+    f();
+  }
+
+  // Terminal case (last argument is a register); depth picks the {regN} token to expand.
+  void TemplateHelper(std::function<void(arm::Register)> f, int depth, bool without_pc,
+                      std::string fmt, std::ostringstream& oss) {
+    std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
+    for (auto reg : registers) {
+      std::string after_reg = fmt;
+
+      std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
+      size_t reg_index;
+      const char* reg_token = GetRegTokenFromDepth(depth);
+      // Replace every occurrence of this depth's register token with the register name.
+      while ((reg_index = after_reg.find(reg_token)) != std::string::npos) {
+        after_reg.replace(reg_index, strlen(reg_token), reg_string);
+      }
+      // No arguments remain: append the expanded line and run the bound call.
+      ExecuteAndPrint([&] () { f(*reg); }, after_reg, oss);
+    }
+  }
+
+  void TemplateHelper(std::function<void(const arm::ShifterOperand&)> f, int depth ATTRIBUTE_UNUSED,
+                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::ostringstream& oss) {
+    for (const arm::ShifterOperand& shift : GetShiftOperands()) {
+      std::string after_shift = fmt;
+
+      std::string shift_string = GetShiftString(shift);
+      size_t shift_index;
+      while ((shift_index = after_shift.find(SHIFT_TOKEN)) != std::string::npos) {
+        after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+      }
+
+      ExecuteAndPrint([&] () { f(shift); }, after_shift, oss);
+    }
+  }
+
+  void TemplateHelper(std::function<void(arm::Condition)> f, int depth ATTRIBUTE_UNUSED,
+                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::ostringstream& oss) {
+    for (arm::Condition c : GetConditions()) {
+      std::string after_cond = fmt;
+      // Expand the first condition token, if present, using the token's own length.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+      // Terminal case: append the expanded line and run the bound call.
+      ExecuteAndPrint([&] () { f(c); }, after_cond, oss);
+    }
+  }
+
+  template <typename... Args>
+  void TemplateHelper(std::function<void(arm::Register, Args...)> f, int depth, bool without_pc,
+                      std::string fmt, std::ostringstream& oss) {
+    std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
+    for (auto reg : registers) {
+      std::string after_reg = fmt;
+
+      std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
+      size_t reg_index;
+      const char* reg_token = GetRegTokenFromDepth(depth);
+
+      while ((reg_index = after_reg.find(reg_token)) != std::string::npos) {
+        after_reg.replace(reg_index, strlen(reg_token), reg_string);
+      }
+
+      auto lambda = [&] (Args... args) { f(*reg, args...); };  // NOLINT [readability/braces] [4]
+      TemplateHelper(std::function<void(Args...)>(lambda), depth + 1, without_pc,
+          after_reg, oss);
+    }
+  }
+
+  template <typename... Args>
+  void TemplateHelper(std::function<void(const arm::ShifterOperand&, Args...)> f, int depth,
+                      bool without_pc, std::string fmt, std::ostringstream& oss) {
+    for (const arm::ShifterOperand& shift : GetShiftOperands()) {
+      std::string after_shift = fmt;
+
+      std::string shift_string = GetShiftString(shift);
+      size_t shift_index;
+      while ((shift_index = after_shift.find(SHIFT_TOKEN)) != std::string::npos) {
+        after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+      }
+
+      auto lambda = [&] (Args... args) { f(shift, args...); };  // NOLINT [readability/braces] [4]
+      TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
+          after_shift, oss);
+    }
+  }
+
+  template <typename... Args>
+  void TemplateHelper(std::function<void(arm::Condition, Args...)> f, int depth, bool without_pc,
+                      std::string fmt, std::ostringstream& oss) {
+    for (arm::Condition c : GetConditions()) {
+      std::string after_cond = fmt;
+      // Expand the first condition token, if present, using the token's own length.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+      // Bind this condition and recurse over the remaining arguments; depth is unchanged.
+      auto lambda = [&] (Args... args) { f(c, args...); };  // NOLINT [readability/braces] [4]
+      TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
+          after_cond, oss);
+    }
+  }
+
+  template <typename T1, typename T2>
+  std::function<void(T1, T2)> GetBoundFunction2(void (arm::Arm32Assembler::*f)(T1, T2)) {
+    return std::bind(f, GetAssembler(), _1, _2);
+  }
+
+  template <typename T1, typename T2, typename T3>
+  std::function<void(T1, T2, T3)> GetBoundFunction3(void (arm::Arm32Assembler::*f)(T1, T2, T3)) {
+    return std::bind(f, GetAssembler(), _1, _2, _3);
+  }
+
+  template <typename T1, typename T2, typename T3, typename T4>
+  std::function<void(T1, T2, T3, T4)> GetBoundFunction4(
+      void (arm::Arm32Assembler::*f)(T1, T2, T3, T4)) {
+    return std::bind(f, GetAssembler(), _1, _2, _3, _4);
+  }
+
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  std::function<void(T1, T2, T3, T4, T5)> GetBoundFunction5(
+      void (arm::Arm32Assembler::*f)(T1, T2, T3, T4, T5)) {
+    return std::bind(f, GetAssembler(), _1, _2, _3, _4, _5);
+  }
+
+  template <typename... Args>
+  void GenericTemplateHelper(std::function<void(Args...)> f, bool without_pc,
+                             std::string fmt, std::string test_name) {
+    // Mark the next emitted line as the first so ExecuteAndPrint skips the leading "\n".
+    first_ = true;
+    WarnOnCombinations(CountHelper<Args...>(without_pc));
+
+    std::ostringstream oss;
+    // Expand fmt over all argument combinations, starting at register depth 0.
+    TemplateHelper(f, 0, without_pc, fmt, oss);
+
+    oss << "\n";  // Trailing newline.
+    DriverStr(oss.str(), test_name);
+  }
+
+  template <typename... Args>
+  void T2Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
+                std::string test_name) {
+    GenericTemplateHelper(GetBoundFunction2(f), without_pc, fmt, test_name);
+  }
+
+  template <typename... Args>
+  void T3Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
+      std::string test_name) {
+    GenericTemplateHelper(GetBoundFunction3(f), without_pc, fmt, test_name);
+  }
+
+  template <typename... Args>
+  void T4Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
+      std::string test_name) {
+    GenericTemplateHelper(GetBoundFunction4(f), without_pc, fmt, test_name);
+  }
+
+  template <typename... Args>
+  void T5Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
+      std::string test_name) {
+    GenericTemplateHelper(GetBoundFunction5(f), without_pc, fmt, test_name);
+  }
+
  private:
+  // Number of values a single argument of type T expands to (1, with a warning, if unknown).
+  template <typename T>
+  size_t CountHelper(bool without_pc) {
+    if (std::is_same<T, arm::Register>::value) {
+      size_t tmp = GetRegisters().size();
+      if (without_pc) {
+        tmp--;  // Approximation...
+      }
+      return tmp;
+    } else if (std::is_same<T, const arm::ShifterOperand&>::value) {
+      return GetShiftOperands().size();
+    } else if (std::is_same<T, arm::Condition>::value) {
+      return GetConditions().size();
+    } else {
+      LOG(WARNING) << "Unknown type while counting.";
+      return 1;
+    }
+  }
+
+  // Product of the per-argument counts for T1 and all following argument types.
+  template <typename T1, typename T2, typename... Args>
+  size_t CountHelper(bool without_pc) {
+    size_t tmp;
+    if (std::is_same<T1, arm::Register>::value) {
+      tmp = GetRegisters().size();
+      if (without_pc) {
+        tmp--;  // Approximation...
+      }
+    } else if (std::is_same<T1, const arm::ShifterOperand&>::value) {
+      tmp = GetShiftOperands().size();
+    } else if (std::is_same<T1, arm::Condition>::value) {
+      tmp = GetConditions().size();
+    } else {
+      LOG(WARNING) << "Unknown type while counting.";
+      tmp = 1;
+    }
+    return tmp * CountHelper<T2, Args...>(without_pc);
+  }
+
+  bool first_;
+
+  static constexpr const char* kArm32AssemblyHeader = ".arm\n";
+
   std::vector<arm::Register*> registers_;
+  std::vector<arm::Condition> conditions_;
+  std::vector<arm::ShifterOperand> shifter_operands_;
 };
 
 
@@ -79,77 +512,189 @@
   EXPECT_TRUE(CheckTools());
 }
 
-
 TEST_F(AssemblerArm32Test, Sbfx) {
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 16);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 32);
+  std::vector<std::pair<uint32_t, uint32_t>> immediates;
+  immediates.push_back({0, 1});
+  immediates.push_back({0, 8});
+  immediates.push_back({0, 15});
+  immediates.push_back({0, 16});
+  immediates.push_back({0, 31});
+  immediates.push_back({0, 32});
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 16);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 24);
+  immediates.push_back({1, 1});
+  immediates.push_back({1, 15});
+  immediates.push_back({1, 31});
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 16);
+  immediates.push_back({8, 1});
+  immediates.push_back({8, 15});
+  immediates.push_back({8, 16});
+  immediates.push_back({8, 24});
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 31, 1);
+  immediates.push_back({31, 1});
 
-  const char* expected =
-      "sbfx r0, r1, #0, #1\n"
-      "sbfx r0, r1, #0, #8\n"
-      "sbfx r0, r1, #0, #16\n"
-      "sbfx r0, r1, #0, #32\n"
-
-      "sbfx r0, r1, #8, #1\n"
-      "sbfx r0, r1, #8, #8\n"
-      "sbfx r0, r1, #8, #16\n"
-      "sbfx r0, r1, #8, #24\n"
-
-      "sbfx r0, r1, #16, #1\n"
-      "sbfx r0, r1, #16, #8\n"
-      "sbfx r0, r1, #16, #16\n"
-
-      "sbfx r0, r1, #31, #1\n";
-  DriverStr(expected, "sbfx");
+  DriverStr(RepeatRRiiC(&arm::Arm32Assembler::sbfx, immediates,
+                        "sbfx{cond} {reg1}, {reg2}, #{imm1}, #{imm2}"), "sbfx");
 }
 
 TEST_F(AssemblerArm32Test, Ubfx) {
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 16);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 32);
+  std::vector<std::pair<uint32_t, uint32_t>> immediates;
+  immediates.push_back({0, 1});
+  immediates.push_back({0, 8});
+  immediates.push_back({0, 15});
+  immediates.push_back({0, 16});
+  immediates.push_back({0, 31});
+  immediates.push_back({0, 32});
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 16);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 24);
+  immediates.push_back({1, 1});
+  immediates.push_back({1, 15});
+  immediates.push_back({1, 31});
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 16);
+  immediates.push_back({8, 1});
+  immediates.push_back({8, 15});
+  immediates.push_back({8, 16});
+  immediates.push_back({8, 24});
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 31, 1);
+  immediates.push_back({31, 1});
 
-  const char* expected =
-      "ubfx r0, r1, #0, #1\n"
-      "ubfx r0, r1, #0, #8\n"
-      "ubfx r0, r1, #0, #16\n"
-      "ubfx r0, r1, #0, #32\n"
+  DriverStr(RepeatRRiiC(&arm::Arm32Assembler::ubfx, immediates,
+                        "ubfx{cond} {reg1}, {reg2}, #{imm1}, #{imm2}"), "ubfx");
+}
 
-      "ubfx r0, r1, #8, #1\n"
-      "ubfx r0, r1, #8, #8\n"
-      "ubfx r0, r1, #8, #16\n"
-      "ubfx r0, r1, #8, #24\n"
+TEST_F(AssemblerArm32Test, Mul) {
+  T4Helper(&arm::Arm32Assembler::mul, true, "mul{cond} {reg1}, {reg2}, {reg3}", "mul");
+}
 
-      "ubfx r0, r1, #16, #1\n"
-      "ubfx r0, r1, #16, #8\n"
-      "ubfx r0, r1, #16, #16\n"
+TEST_F(AssemblerArm32Test, Mla) {
+  T5Helper(&arm::Arm32Assembler::mla, true, "mla{cond} {reg1}, {reg2}, {reg3}, {reg4}", "mla");
+}
 
-      "ubfx r0, r1, #31, #1\n";
-  DriverStr(expected, "ubfx");
+/* TODO: Needs support to filter out register combinations, as rdhi must not be equal to rdlo.
+TEST_F(AssemblerArm32Test, Umull) {
+  T5Helper(&arm::Arm32Assembler::umull, true, "umull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
+           "umull");
+}
+*/
+
+TEST_F(AssemblerArm32Test, Sdiv) {
+  T4Helper(&arm::Arm32Assembler::sdiv, true, "sdiv{cond} {reg1}, {reg2}, {reg3}", "sdiv");
+}
+
+TEST_F(AssemblerArm32Test, Udiv) {
+  T4Helper(&arm::Arm32Assembler::udiv, true, "udiv{cond} {reg1}, {reg2}, {reg3}", "udiv");
+}
+
+TEST_F(AssemblerArm32Test, And) {
+  T4Helper(&arm::Arm32Assembler::and_, true, "and{cond} {reg1}, {reg2}, {shift}", "and");
+}
+
+TEST_F(AssemblerArm32Test, Eor) {
+  T4Helper(&arm::Arm32Assembler::eor, true, "eor{cond} {reg1}, {reg2}, {shift}", "eor");
+}
+
+TEST_F(AssemblerArm32Test, Orr) {
+  T4Helper(&arm::Arm32Assembler::orr, true, "orr{cond} {reg1}, {reg2}, {shift}", "orr");
+}
+
+TEST_F(AssemblerArm32Test, Orrs) {
+  T4Helper(&arm::Arm32Assembler::orrs, true, "orr{cond}s {reg1}, {reg2}, {shift}", "orrs");
+}
+
+TEST_F(AssemblerArm32Test, Bic) {
+  T4Helper(&arm::Arm32Assembler::bic, true, "bic{cond} {reg1}, {reg2}, {shift}", "bic");
+}
+
+TEST_F(AssemblerArm32Test, Mov) {
+  T3Helper(&arm::Arm32Assembler::mov, true, "mov{cond} {reg1}, {shift}", "mov");
+}
+
+TEST_F(AssemblerArm32Test, Movs) {
+  T3Helper(&arm::Arm32Assembler::movs, true, "mov{cond}s {reg1}, {shift}", "movs");
+}
+
+TEST_F(AssemblerArm32Test, Mvn) {
+  T3Helper(&arm::Arm32Assembler::mvn, true, "mvn{cond} {reg1}, {shift}", "mvn");
+}
+
+TEST_F(AssemblerArm32Test, Mvns) {
+  T3Helper(&arm::Arm32Assembler::mvns, true, "mvn{cond}s {reg1}, {shift}", "mvns");
+}
+
+TEST_F(AssemblerArm32Test, Add) {
+  T4Helper(&arm::Arm32Assembler::add, false, "add{cond} {reg1}, {reg2}, {shift}", "add");
+}
+
+TEST_F(AssemblerArm32Test, Adds) {
+  T4Helper(&arm::Arm32Assembler::adds, false, "add{cond}s {reg1}, {reg2}, {shift}", "adds");
+}
+
+TEST_F(AssemblerArm32Test, Adc) {
+  T4Helper(&arm::Arm32Assembler::adc, false, "adc{cond} {reg1}, {reg2}, {shift}", "adc");
+}
+
+TEST_F(AssemblerArm32Test, Sub) {
+  T4Helper(&arm::Arm32Assembler::sub, false, "sub{cond} {reg1}, {reg2}, {shift}", "sub");
+}
+
+TEST_F(AssemblerArm32Test, Subs) {
+  T4Helper(&arm::Arm32Assembler::subs, false, "sub{cond}s {reg1}, {reg2}, {shift}", "subs");
+}
+
+TEST_F(AssemblerArm32Test, Sbc) {
+  T4Helper(&arm::Arm32Assembler::sbc, false, "sbc{cond} {reg1}, {reg2}, {shift}", "sbc");
+}
+
+TEST_F(AssemblerArm32Test, Rsb) {
+  T4Helper(&arm::Arm32Assembler::rsb, true, "rsb{cond} {reg1}, {reg2}, {shift}", "rsb");
+}
+
+TEST_F(AssemblerArm32Test, Rsbs) {
+  T4Helper(&arm::Arm32Assembler::rsbs, true, "rsb{cond}s {reg1}, {reg2}, {shift}", "rsbs");
+}
+
+TEST_F(AssemblerArm32Test, Rsc) {
+  T4Helper(&arm::Arm32Assembler::rsc, true, "rsc{cond} {reg1}, {reg2}, {shift}", "rsc");
+}
+
+/* TODO: Needs support to filter out register combinations, as reg1 must not be equal to reg3.
+TEST_F(AssemblerArm32Test, Strex) {
+  RRRCWithoutPCHelper(&arm::Arm32Assembler::strex, "strex{cond} {reg1}, {reg2}, [{reg3}]", "strex");
+}
+*/
+
+TEST_F(AssemblerArm32Test, Clz) {
+  T3Helper(&arm::Arm32Assembler::clz, true, "clz{cond} {reg1}, {reg2}", "clz");
+}
+
+TEST_F(AssemblerArm32Test, Tst) {
+  T3Helper(&arm::Arm32Assembler::tst, true, "tst{cond} {reg1}, {shift}", "tst");
+}
+
+TEST_F(AssemblerArm32Test, Teq) {
+  T3Helper(&arm::Arm32Assembler::teq, true, "teq{cond} {reg1}, {shift}", "teq");
+}
+
+TEST_F(AssemblerArm32Test, Cmp) {
+  T3Helper(&arm::Arm32Assembler::cmp, true, "cmp{cond} {reg1}, {shift}", "cmp");
+}
+
+TEST_F(AssemblerArm32Test, Cmn) {
+  T3Helper(&arm::Arm32Assembler::cmn, true, "cmn{cond} {reg1}, {shift}", "cmn");
+}
+
+TEST_F(AssemblerArm32Test, Blx) {
+  T2Helper(&arm::Arm32Assembler::blx, true, "blx{cond} {reg1}", "blx");
+}
+
+TEST_F(AssemblerArm32Test, Bx) {
+  T2Helper(&arm::Arm32Assembler::bx, true, "bx{cond} {reg1}", "bx");
+}
+
+TEST_F(AssemblerArm32Test, Vmstat) {
+  GetAssembler()->vmstat();
+
+  const char* expected = "vmrs APSR_nzcv, FPSCR\n";
+
+  DriverStr(expected, "vmrs");
 }
 
 }  // namespace art
diff --git a/compiler/utils/arm/assembler_arm_test.h b/compiler/utils/arm/assembler_arm_test.h
new file mode 100644
index 0000000..838abb6
--- /dev/null
+++ b/compiler/utils/arm/assembler_arm_test.h
@@ -0,0 +1,545 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_TEST_H_
+#define ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_TEST_H_
+
+#include "utils/assembler_test.h"
+
+namespace art {
+
+template<typename Ass, typename Reg, typename FPReg, typename Imm, typename SOp, typename Cond>
+class AssemblerArmTest : public AssemblerTest<Ass, Reg, FPReg, Imm> {
+ public:
+  typedef AssemblerTest<Ass, Reg, FPReg, Imm> Base;
+
+  using Base::GetRegisters;
+  using Base::GetRegName;
+  using Base::CreateImmediate;
+  using Base::WarnOnCombinations;
+
+  static constexpr int64_t kFullImmRangeThreshold = 32;
+
+  virtual void FillImmediates(std::vector<Imm>& immediates, int64_t imm_min, int64_t imm_max) {
+    // Appends test immediates for the inclusive range [imm_min, imm_max] to |immediates|.
+    const bool enumerate_all = (imm_max - imm_min <= kFullImmRangeThreshold);
+    if (enumerate_all) {
+      // Narrow range: every value is covered.
+      int64_t value = imm_min;
+      while (value <= imm_max) {
+        immediates.push_back(CreateImmediate(value));
+        ++value;
+      }
+      return;
+    }
+    // Wide range: both bounds, then (guarded by the remaining width) the two values above the
+    // lower bound, the value below the upper bound and the midpoint, in that order.
+    immediates.push_back(CreateImmediate(imm_min));
+    immediates.push_back(CreateImmediate(imm_max));
+    const int64_t extras[] = { imm_min + 1, imm_min + 2, imm_max - 1, (imm_min + imm_max) / 2 };
+    for (int64_t k = 1; k <= 4; ++k) {
+      if (imm_min < imm_max - k) {
+        immediates.push_back(CreateImmediate(extras[k - 1]));
+      }
+    }
+  }
+
+  std::string RepeatRRIIC(void (Ass::*f)(Reg, Reg, Imm, Imm, Cond),
+                          int64_t imm1_min, int64_t imm1_max,  // Range for the first immediate.
+                          int64_t imm2_min, int64_t imm2_max,  // Range for the second immediate.
+                          std::string fmt) {  // Template with the tokens to substitute.
+    return RepeatTemplatedRRIIC(f, GetRegisters(), GetRegisters(),  // Same list for both regs.
+                                &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+                                &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+                                imm1_min, imm1_max, imm2_min, imm2_max,
+                                fmt);  // Forwards to the templated helper; primary register names.
+  }
+
+  // Calls f once for every (condition, imm1, imm2, reg1, reg2) combination, in that nesting
+  // order, and returns the expected assembly: one line per call, ending with a newline. Each
+  // line is fmt with the first COND_TOKEN, IMM1_TOKEN and IMM2_TOKEN occurrence and every
+  // Base::REG1_TOKEN / Base::REG2_TOKEN occurrence replaced. Immediates are chosen by
+  // FillImmediates() from [immN_min, immN_max]; register names come from GetName1 / GetName2.
+  template <typename Reg1, typename Reg2>
+  std::string RepeatTemplatedRRIIC(void (Ass::*f)(Reg1, Reg2, Imm, Imm, Cond),
+                                   const std::vector<Reg1*> reg1_registers,
+                                   const std::vector<Reg2*> reg2_registers,
+                                   std::string (AssemblerArmTest::*GetName1)(const Reg1&),
+                                   std::string (AssemblerArmTest::*GetName2)(const Reg2&),
+                                   int64_t imm1_min, int64_t imm1_max,
+                                   int64_t imm2_min, int64_t imm2_max,
+                                   std::string fmt) {
+    std::vector<Imm> immediates1;
+    FillImmediates(immediates1, imm1_min, imm1_max);
+    std::vector<Imm> immediates2;
+    FillImmediates(immediates2, imm2_min, imm2_max);
+
+    std::vector<Cond>& cond = GetConditions();
+    WarnOnCombinations(cond.size() * immediates1.size() * immediates2.size() *
+                       reg1_registers.size() * reg2_registers.size());
+
+    std::ostringstream oss;
+    bool first = true;
+    for (Cond& c : cond) {
+      std::string after_cond = fmt;
+      // The replaced span must be the length of the token that was searched for.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+
+      for (Imm i : immediates1) {
+        std::string base = after_cond;
+        size_t imm1_index = base.find(IMM1_TOKEN);
+        if (imm1_index != std::string::npos) {
+          std::ostringstream sreg;
+          sreg << i;
+          std::string imm_string = sreg.str();
+          base.replace(imm1_index, ConstexprStrLen(IMM1_TOKEN), imm_string);
+        }
+
+        for (Imm j : immediates2) {
+          std::string base2 = base;
+          size_t imm2_index = base2.find(IMM2_TOKEN);
+          if (imm2_index != std::string::npos) {
+            std::ostringstream sreg;
+            sreg << j;
+            std::string imm_string = sreg.str();
+            base2.replace(imm2_index, ConstexprStrLen(IMM2_TOKEN), imm_string);
+          }
+
+          for (auto reg1 : reg1_registers) {
+            std::string base3 = base2;
+            std::string reg1_string = (this->*GetName1)(*reg1);
+            size_t reg1_index;
+            while ((reg1_index = base3.find(Base::REG1_TOKEN)) != std::string::npos) {
+              base3.replace(reg1_index, ConstexprStrLen(Base::REG1_TOKEN), reg1_string);
+            }
+
+            for (auto reg2 : reg2_registers) {
+              std::string base4 = base3;
+              std::string reg2_string = (this->*GetName2)(*reg2);
+              size_t reg2_index;
+              while ((reg2_index = base4.find(Base::REG2_TOKEN)) != std::string::npos) {
+                base4.replace(reg2_index, ConstexprStrLen(Base::REG2_TOKEN), reg2_string);
+              }
+
+              if (first) {
+                first = false;
+              } else {
+                oss << "\n";
+              }
+              oss << base4;
+
+              (Base::GetAssembler()->*f)(*reg1, *reg2, i, j, c);
+            }
+          }
+        }
+      }
+    }
+    // Add a newline at the end.
+    oss << "\n";
+
+    return oss.str();
+  }
+
+  std::string RepeatRRiiC(void (Ass::*f)(Reg, Reg, Imm, Imm, Cond),
+                          std::vector<std::pair<Imm, Imm>>& immediates,  // (imm1, imm2) pairs.
+                          std::string fmt) {  // Template with the tokens to substitute.
+    return RepeatTemplatedRRiiC<Reg, Reg>(f, GetRegisters(), GetRegisters(),
+        &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+        &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+        immediates, fmt);  // Forwards to the templated helper; same register list for both regs.
+  }
+
+  // Calls f once for every (condition, immediate pair, reg1, reg2) combination, in that nesting
+  // order, and returns the expected assembly: one line per call, ending with a newline. The
+  // immediates are taken pairwise from |immediates| (first -> IMM1_TOKEN, second -> IMM2_TOKEN),
+  // not as a cross product. Only the first COND/IMM1/IMM2 token occurrence in fmt is replaced;
+  // every Base::REG1_TOKEN / Base::REG2_TOKEN occurrence is replaced via GetName1 / GetName2.
+  template <typename Reg1, typename Reg2>
+  std::string RepeatTemplatedRRiiC(void (Ass::*f)(Reg1, Reg2, Imm, Imm, Cond),
+        const std::vector<Reg1*> reg1_registers,
+        const std::vector<Reg2*> reg2_registers,
+        std::string (AssemblerArmTest::*GetName1)(const Reg1&),
+        std::string (AssemblerArmTest::*GetName2)(const Reg2&),
+        std::vector<std::pair<Imm, Imm>>& immediates,
+        std::string fmt) {
+    std::vector<Cond>& cond = GetConditions();
+    WarnOnCombinations(cond.size() * immediates.size() * reg1_registers.size() *
+                       reg2_registers.size());
+
+    std::ostringstream oss;
+    bool first = true;
+    for (Cond& c : cond) {
+      std::string after_cond = fmt;
+      // The replaced span must be the length of the token that was searched for.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+
+      for (std::pair<Imm, Imm>& pair : immediates) {
+        Imm i = pair.first;
+        Imm j = pair.second;
+        std::string after_imm1 = after_cond;
+        size_t imm1_index = after_imm1.find(IMM1_TOKEN);
+        if (imm1_index != std::string::npos) {
+          std::ostringstream sreg;
+          sreg << i;
+          std::string imm_string = sreg.str();
+          after_imm1.replace(imm1_index, ConstexprStrLen(IMM1_TOKEN), imm_string);
+        }
+
+        std::string after_imm2 = after_imm1;
+        size_t imm2_index = after_imm2.find(IMM2_TOKEN);
+        if (imm2_index != std::string::npos) {
+          std::ostringstream sreg;
+          sreg << j;
+          std::string imm_string = sreg.str();
+          after_imm2.replace(imm2_index, ConstexprStrLen(IMM2_TOKEN), imm_string);
+        }
+
+        for (auto reg1 : reg1_registers) {
+          std::string after_reg1 = after_imm2;
+          std::string reg1_string = (this->*GetName1)(*reg1);
+          size_t reg1_index;
+          while ((reg1_index = after_reg1.find(Base::REG1_TOKEN)) != std::string::npos) {
+            after_reg1.replace(reg1_index, ConstexprStrLen(Base::REG1_TOKEN), reg1_string);
+          }
+
+          for (auto reg2 : reg2_registers) {
+            std::string after_reg2 = after_reg1;
+            std::string reg2_string = (this->*GetName2)(*reg2);
+            size_t reg2_index;
+            while ((reg2_index = after_reg2.find(Base::REG2_TOKEN)) != std::string::npos) {
+              after_reg2.replace(reg2_index, ConstexprStrLen(Base::REG2_TOKEN), reg2_string);
+            }
+
+            if (first) {
+              first = false;
+            } else {
+              oss << "\n";
+            }
+            oss << after_reg2;
+
+            (Base::GetAssembler()->*f)(*reg1, *reg2, i, j, c);
+          }
+        }
+      }
+    }
+    // Add a newline at the end.
+    oss << "\n";
+
+    return oss.str();
+  }
+
+  std::string RepeatRRC(void (Ass::*f)(Reg, Reg, Cond), std::string fmt) {
+    return RepeatTemplatedRRC(f, GetRegisters(), GetRegisters(), GetConditions(),
+        &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+        &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+        fmt);  // Forwards to the templated helper: all registers for both operands, all conditions.
+  }
+
+  // Calls f once per (condition, reg1, reg2) combination, in that nesting order, and returns
+  // the expected assembly, one line per call plus a final newline. Each line is fmt with the
+  // first COND_TOKEN and every Base::REG1_TOKEN / Base::REG2_TOKEN occurrence substituted.
+  template <typename Reg1, typename Reg2>
+  std::string RepeatTemplatedRRC(void (Ass::*f)(Reg1, Reg2, Cond),
+                                 const std::vector<Reg1*>& reg1_registers,
+                                 const std::vector<Reg2*>& reg2_registers,
+                                 const std::vector<Cond>& cond,
+                                 std::string (AssemblerArmTest::*GetName1)(const Reg1&),
+                                 std::string (AssemblerArmTest::*GetName2)(const Reg2&),
+                                 std::string fmt) {
+    WarnOnCombinations(cond.size() * reg1_registers.size() * reg2_registers.size());
+    std::ostringstream oss;
+    bool first = true;
+    for (const Cond& c : cond) {
+      std::string after_cond = fmt;
+      // The replaced span must be the length of the token that was searched for.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+
+      for (auto reg1 : reg1_registers) {
+        std::string after_reg1 = after_cond;
+        std::string reg1_string = (this->*GetName1)(*reg1);
+        size_t reg1_index;
+        while ((reg1_index = after_reg1.find(Base::REG1_TOKEN)) != std::string::npos) {
+          after_reg1.replace(reg1_index, ConstexprStrLen(Base::REG1_TOKEN), reg1_string);
+        }
+
+        for (auto reg2 : reg2_registers) {
+          std::string after_reg2 = after_reg1;
+          std::string reg2_string = (this->*GetName2)(*reg2);
+          size_t reg2_index;
+          while ((reg2_index = after_reg2.find(Base::REG2_TOKEN)) != std::string::npos) {
+            after_reg2.replace(reg2_index, ConstexprStrLen(Base::REG2_TOKEN), reg2_string);
+          }
+
+          if (first) {
+            first = false;
+          } else {
+            oss << "\n";
+          }
+          oss << after_reg2;
+
+          (Base::GetAssembler()->*f)(*reg1, *reg2, c);
+        }
+      }
+    }
+    // Add a newline at the end.
+    oss << "\n";
+
+    return oss.str();
+  }
+
+  std::string RepeatRRRC(void (Ass::*f)(Reg, Reg, Reg, Cond), std::string fmt) {
+    return RepeatTemplatedRRRC(f, GetRegisters(), GetRegisters(), GetRegisters(), GetConditions(),
+                               &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+                               &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+                               &AssemblerArmTest::template GetRegName<RegisterView::kUsePrimaryName>,
+                               fmt);  // All registers for each operand, all conditions.
+  }
+
+  // Calls f once for every (condition, reg1, reg2, reg3) combination, in that nesting order,
+  // and returns the expected assembly: one line per call, ending with a newline. Each line is
+  // fmt with the first COND_TOKEN occurrence replaced by GetConditionString() and every
+  // Base::REG1_TOKEN / Base::REG2_TOKEN / REG3_TOKEN occurrence replaced via GetName1..3.
+  template <typename Reg1, typename Reg2, typename Reg3>
+  std::string RepeatTemplatedRRRC(void (Ass::*f)(Reg1, Reg2, Reg3, Cond),
+                                  const std::vector<Reg1*>& reg1_registers,
+                                  const std::vector<Reg2*>& reg2_registers,
+                                  const std::vector<Reg3*>& reg3_registers,
+                                  const std::vector<Cond>& cond,
+                                  std::string (AssemblerArmTest::*GetName1)(const Reg1&),
+                                  std::string (AssemblerArmTest::*GetName2)(const Reg2&),
+                                  std::string (AssemblerArmTest::*GetName3)(const Reg3&),
+                                  std::string fmt) {
+    WarnOnCombinations(cond.size() * reg1_registers.size() * reg2_registers.size() *
+                       reg3_registers.size());
+    std::ostringstream oss;
+    bool first = true;
+    for (const Cond& c : cond) {
+      std::string after_cond = fmt;
+      // The replaced span must be the length of the token that was searched for.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+
+      for (auto reg1 : reg1_registers) {
+        std::string after_reg1 = after_cond;
+        std::string reg1_string = (this->*GetName1)(*reg1);
+        size_t reg1_index;
+        while ((reg1_index = after_reg1.find(Base::REG1_TOKEN)) != std::string::npos) {
+          after_reg1.replace(reg1_index, ConstexprStrLen(Base::REG1_TOKEN), reg1_string);
+        }
+
+        for (auto reg2 : reg2_registers) {
+          std::string after_reg2 = after_reg1;
+          std::string reg2_string = (this->*GetName2)(*reg2);
+          size_t reg2_index;
+          while ((reg2_index = after_reg2.find(Base::REG2_TOKEN)) != std::string::npos) {
+            after_reg2.replace(reg2_index, ConstexprStrLen(Base::REG2_TOKEN), reg2_string);
+          }
+
+          for (auto reg3 : reg3_registers) {
+            std::string after_reg3 = after_reg2;
+            std::string reg3_string = (this->*GetName3)(*reg3);
+            size_t reg3_index;
+            while ((reg3_index = after_reg3.find(REG3_TOKEN)) != std::string::npos) {
+              after_reg3.replace(reg3_index, ConstexprStrLen(REG3_TOKEN), reg3_string);
+            }
+
+            if (first) {
+              first = false;
+            } else {
+              oss << "\n";
+            }
+            oss << after_reg3;
+
+            (Base::GetAssembler()->*f)(*reg1, *reg2, *reg3, c);
+          }
+        }
+      }
+    }
+    // Add a newline at the end.
+    oss << "\n";
+
+    return oss.str();
+  }
+
+  // Calls f once per (condition, shifter operand, register) combination, in that nesting order,
+  // and returns the expected assembly, one line per call plus a final newline. Each line is fmt
+  // with the first COND_TOKEN and every SHIFT_TOKEN / Base::REG_TOKEN occurrence substituted.
+  template <typename RegT>
+  std::string RepeatTemplatedRSC(void (Ass::*f)(RegT, SOp, Cond),
+                                 const std::vector<RegT*>& registers,
+                                 const std::vector<SOp>& shifts,
+                                 const std::vector<Cond>& cond,
+                                 std::string (AssemblerArmTest::*GetName)(const RegT&),
+                                 std::string fmt) {
+    WarnOnCombinations(cond.size() * registers.size() * shifts.size());
+    std::ostringstream oss;
+    bool first = true;
+    for (const Cond& c : cond) {
+      std::string after_cond = fmt;
+      // The replaced span must be the length of the token that was searched for.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+
+      for (const SOp& shift : shifts) {
+        std::string after_shift = after_cond;
+        // SHIFT_TOKEN is declared by this class, so it is referenced unqualified.
+        std::string shift_string = GetShiftString(shift);
+        size_t shift_index;
+        while ((shift_index = after_shift.find(SHIFT_TOKEN)) != std::string::npos) {
+          after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+        }
+
+        for (auto reg : registers) {
+          std::string after_reg = after_shift;
+          std::string reg_string = (this->*GetName)(*reg);
+          size_t reg_index;
+          while ((reg_index = after_reg.find(Base::REG_TOKEN)) != std::string::npos) {
+            after_reg.replace(reg_index, ConstexprStrLen(Base::REG_TOKEN), reg_string);
+          }
+
+          if (first) {
+            first = false;
+          } else {
+            oss << "\n";
+          }
+          oss << after_reg;
+          (Base::GetAssembler()->*f)(*reg, shift, c);
+        }
+      }
+    }
+    // Add a newline at the end.
+    oss << "\n";
+
+    return oss.str();
+  }
+
+  // Calls f once for every (condition, shifter operand, reg1, reg2) combination, in that
+  // nesting order, and returns the expected assembly: one line per call, ending with a newline.
+  // Each line is fmt with the first COND_TOKEN replaced by GetConditionString(), every
+  // SHIFT_TOKEN by GetShiftString() and every Base::REG1_TOKEN / Base::REG2_TOKEN via GetNameN.
+  template <typename Reg1, typename Reg2>
+  std::string RepeatTemplatedRRSC(void (Ass::*f)(Reg1, Reg2, const SOp&, Cond),
+                                  const std::vector<Reg1*>& reg1_registers,
+                                  const std::vector<Reg2*>& reg2_registers,
+                                  const std::vector<SOp>& shifts,
+                                  const std::vector<Cond>& cond,
+                                  std::string (AssemblerArmTest::*GetName1)(const Reg1&),
+                                  std::string (AssemblerArmTest::*GetName2)(const Reg2&),
+                                  std::string fmt) {
+    WarnOnCombinations(cond.size() * reg1_registers.size() * reg2_registers.size() * shifts.size());
+    std::ostringstream oss;
+    bool first = true;
+    for (const Cond& c : cond) {
+      std::string after_cond = fmt;
+      // The replaced span must be the length of the token that was searched for.
+      size_t cond_index = after_cond.find(COND_TOKEN);
+      if (cond_index != std::string::npos) {
+        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+
+      for (const SOp& shift : shifts) {
+        std::string after_shift = after_cond;
+        std::string shift_string = GetShiftString(shift);
+        size_t shift_index;
+        while ((shift_index = after_shift.find(SHIFT_TOKEN)) != std::string::npos) {
+          after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+        }
+
+        for (auto reg1 : reg1_registers) {
+          std::string after_reg1 = after_shift;
+          std::string reg1_string = (this->*GetName1)(*reg1);
+          size_t reg1_index;
+          while ((reg1_index = after_reg1.find(Base::REG1_TOKEN)) != std::string::npos) {
+            after_reg1.replace(reg1_index, ConstexprStrLen(Base::REG1_TOKEN), reg1_string);
+          }
+
+          for (auto reg2 : reg2_registers) {
+            std::string after_reg2 = after_reg1;
+            std::string reg2_string = (this->*GetName2)(*reg2);
+            size_t reg2_index;
+            while ((reg2_index = after_reg2.find(Base::REG2_TOKEN)) != std::string::npos) {
+              after_reg2.replace(reg2_index, ConstexprStrLen(Base::REG2_TOKEN), reg2_string);
+            }
+
+            if (first) {
+              first = false;
+            } else {
+              oss << "\n";
+            }
+            oss << after_reg2;
+
+            (Base::GetAssembler()->*f)(*reg1, *reg2, shift, c);
+          }
+        }
+      }
+    }
+    // Add a newline at the end.
+    oss << "\n";
+
+    return oss.str();
+  }
+
+ protected:
+  explicit AssemblerArmTest() {}
+
+  virtual std::vector<Cond>& GetConditions() = 0;  // Conditions used by the Repeat*C wrappers.
+  virtual std::string GetConditionString(Cond c) = 0;  // Text substituted for COND_TOKEN.
+
+  virtual std::vector<SOp>& GetShiftOperands() = 0;  // Not called by the helpers in this header.
+  virtual std::string GetShiftString(SOp sop) = 0;  // Text substituted for the shift token.
+
+  virtual Reg GetPCRegister() = 0;  // Register that GetRegistersWithoutPC() removes.
+  virtual std::vector<Reg*> GetRegistersWithoutPC() {
+    // Copy of GetRegisters() with the first entry equal to GetPCRegister() dropped, if any.
+    std::vector<Reg*> result = GetRegisters();
+    Reg pc = GetPCRegister();
+    auto pos = result.begin();
+    while (pos != result.end() && !(**pos == pc)) {
+      ++pos;
+    }
+    if (pos != result.end()) {
+      result.erase(pos);
+    }
+    return result;
+  }
+
+  static constexpr const char* IMM1_TOKEN = "{imm1}";  // Placeholder for the first immediate.
+  static constexpr const char* IMM2_TOKEN = "{imm2}";  // Placeholder for the second immediate.
+  static constexpr const char* REG3_TOKEN = "{reg3}";  // Third register (RepeatTemplatedRRRC).
+  static constexpr const char* REG4_TOKEN = "{reg4}";  // Not used by the helpers in this header.
+  static constexpr const char* COND_TOKEN = "{cond}";  // Replaced with GetConditionString().
+  static constexpr const char* SHIFT_TOKEN = "{shift}";  // Replaced with GetShiftString().
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(AssemblerArmTest);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_TEST_H_
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 3ab9b2b..479186c 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -25,6 +25,39 @@
 namespace art {
 namespace arm {
 
+// Sets up |shifter_op| as the immediate operand |immediate| and returns whether that value is
+// accepted for |opcode| with the given registers:
+//  - ADD/SUB with rn == SP: below 2^9 if rd == SP, else below 2^12;
+//  - other ADD/SUB: below 2^12, or anything ModifiedImmediate() can encode;
+//  - everything else: only what ModifiedImmediate() can encode.
+bool Thumb2Assembler::ShifterOperandCanHold(Register rd,
+                                            Register rn,
+                                            Opcode opcode,
+                                            uint32_t immediate,
+                                            ShifterOperand* shifter_op) {
+  // The operand is always filled in as a plain immediate, whatever the verdict.
+  shifter_op->type_ = ShifterOperand::kImmediate;
+  shifter_op->immed_ = immediate;
+  shifter_op->is_shift_ = false;
+  shifter_op->is_rotate_ = false;
+
+  if (opcode == ADD || opcode == SUB) {
+    if (rn == SP) {
+      // 9 bits allowed when SP is also the destination, 12 bits otherwise.
+      const uint32_t limit = (rd == SP) ? (1u << 9) : (1u << 12);
+      return immediate < limit;
+    }
+    if (immediate < (1u << 12)) {
+      return true;  // Less than (or equal to) 12 bits can always be done.
+    }
+    // Wider ADD/SUB immediates fall through to the modified-immediate check below.
+  }
+
+  // MOV (TODO: Support less than or equal to 12bits), MVN and all remaining opcodes accept
+  // exactly the values for which ModifiedImmediate() yields a valid encoding.
+  return ArmAssembler::ModifiedImmediate(immediate) != kInvalidModifiedImmediate;
+}
+
 void Thumb2Assembler::and_(Register rd, Register rn, const ShifterOperand& so,
                            Condition cond) {
   EmitDataProcessing(cond, AND, 0, rn, rd, so);
@@ -374,16 +407,11 @@
                           Register base,
                           RegList regs,
                           Condition cond) {
-  if (__builtin_popcount(regs) == 1) {
+  CHECK_NE(regs, 0u);  // Do not use ldm if there's nothing to load.
+  if (IsPowerOfTwo(regs)) {
     // Thumb doesn't support one reg in the list.
     // Find the register number.
-    int reg = 0;
-    while (reg < 16) {
-      if ((regs & (1 << reg)) != 0) {
-         break;
-      }
-      ++reg;
-    }
+    int reg = CTZ(static_cast<uint32_t>(regs));
     CHECK_LT(reg, 16);
     CHECK(am == DB_W);      // Only writeback is supported.
     ldr(static_cast<Register>(reg), Address(base, kRegisterSize, Address::PostIndex), cond);
@@ -397,16 +425,11 @@
                           Register base,
                           RegList regs,
                           Condition cond) {
-  if (__builtin_popcount(regs) == 1) {
+  CHECK_NE(regs, 0u);  // Do not use stm if there's nothing to store.
+  if (IsPowerOfTwo(regs)) {
     // Thumb doesn't support one reg in the list.
     // Find the register number.
-    int reg = 0;
-    while (reg < 16) {
-      if ((regs & (1 << reg)) != 0) {
-         break;
-      }
-      ++reg;
-    }
+    int reg = CTZ(static_cast<uint32_t>(regs));
     CHECK_LT(reg, 16);
     CHECK(am == IA || am == IA_W);
     Address::Mode strmode = am == IA ? Address::PreIndex : Address::Offset;
@@ -813,6 +836,7 @@
 
   if (thumb_opcode == 255U /* 0b11111111 */) {
     LOG(FATAL) << "Invalid thumb2 opcode " << opcode;
+    UNREACHABLE();
   }
 
   int32_t encoding = 0;
@@ -842,6 +866,7 @@
       uint32_t imm = ModifiedImmediate(so.encodingThumb());
       if (imm == kInvalidModifiedImmediate) {
         LOG(FATAL) << "Immediate value cannot fit in thumb2 modified immediate";
+        UNREACHABLE();
       }
       encoding = B31 | B30 | B29 | B28 |
           thumb_opcode << 21 |
@@ -979,6 +1004,7 @@
 
   if (thumb_opcode == 255U /* 0b11111111 */) {
     LOG(FATAL) << "Invalid thumb1 opcode " << opcode;
+    UNREACHABLE();
   }
 
   int16_t encoding = dp_opcode << 14 |
@@ -1116,7 +1142,7 @@
       break;
     default:
       LOG(FATAL) << "This opcode is not an ADD or SUB: " << opcode;
-      return;
+      UNREACHABLE();
   }
 
   int16_t encoding = dp_opcode << 14 |
@@ -1157,6 +1183,7 @@
       case RRX: opcode = 3U /* 0b11 */; amount = 0; break;
       default:
         LOG(FATAL) << "Unsupported thumb2 shift opcode";
+        UNREACHABLE();
     }
     // 32 bit.
     int32_t encoding = B31 | B30 | B29 | B27 | B25 | B22 |
@@ -1174,7 +1201,8 @@
       case LSR: opcode = 1U /* 0b01 */; break;
       case ASR: opcode = 2U /* 0b10 */; break;
       default:
-         LOG(FATAL) << "Unsupported thumb2 shift opcode";
+        LOG(FATAL) << "Unsupported thumb2 shift opcode";
+        UNREACHABLE();
     }
     int16_t encoding = opcode << 11 | amount << 6 | static_cast<int16_t>(rm) << 3 |
         static_cast<int16_t>(rd);
@@ -1198,6 +1226,7 @@
        case ROR: opcode = 3U /* 0b11 */; break;
        default:
          LOG(FATAL) << "Unsupported thumb2 shift opcode";
+         UNREACHABLE();
      }
      // 32 bit.
      int32_t encoding = B31 | B30 | B29 | B28 | B27 | B25 |
@@ -1212,7 +1241,8 @@
       case LSR: opcode = 3U /* 0b0011 */; break;
       case ASR: opcode = 4U /* 0b0100 */; break;
       default:
-         LOG(FATAL) << "Unsupported thumb2 shift opcode";
+        LOG(FATAL) << "Unsupported thumb2 shift opcode";
+        UNREACHABLE();
     }
     int16_t encoding = B14 | opcode << 6 | static_cast<int16_t>(rm) << 3 |
         static_cast<int16_t>(rd);
@@ -1241,6 +1271,7 @@
     } else {
       if (x) {
         LOG(FATAL) << "Invalid use of BX";
+        UNREACHABLE();
       } else {
         if (cond_ == AL) {
           // Can use the T4 encoding allowing a 24 bit offset.
@@ -1469,6 +1500,15 @@
   CheckCondition(cond);
   bool must_be_32bit = force_32bit_;
 
+  if (!must_be_32bit && base == SP && bam == (load ? IA_W : DB_W) &&
+      (regs & 0xff00 & ~(1 << (load ? PC : LR))) == 0) {
+    // Use 16-bit PUSH/POP.
+    int16_t encoding = B15 | B13 | B12 | (load ? B11 : 0) | B10 |
+        ((regs & (1 << (load ? PC : LR))) != 0 ? B8 : 0) | (regs & 0x00ff);
+    Emit16(encoding);
+    return;
+  }
+
   if ((regs & 0xff00) != 0) {
     must_be_32bit = true;
   }
@@ -1495,6 +1535,7 @@
       case DA_W:
       case IB_W:
         LOG(FATAL) << "LDM/STM mode not supported on thumb: " << bam;
+        UNREACHABLE();
     }
     if (load) {
       // Cannot have SP in the list.
@@ -1981,8 +2022,13 @@
 
 
 void Thumb2Assembler::vmstat(Condition cond) {  // VMRS APSR_nzcv, FPSCR.
+  CHECK_NE(cond, kNoCondition);
   CheckCondition(cond);
-  UNIMPLEMENTED(FATAL) << "Unimplemented thumb instruction";
+  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
+      B27 | B26 | B25 | B23 | B22 | B21 | B20 | B16 |
+      (static_cast<int32_t>(PC)*B12) |
+      B11 | B9 | B4;
+  Emit32(encoding);
 }
 
 
@@ -2068,6 +2114,7 @@
   CheckCondition(AL);
   if (label->IsBound()) {
     LOG(FATAL) << "cbz can only be used to branch forwards";
+    UNREACHABLE();
   } else {
     uint16_t branchid = EmitCompareAndBranch(rn, static_cast<uint16_t>(label->position_), false);
     label->LinkTo(branchid);
@@ -2079,6 +2126,7 @@
   CheckCondition(AL);
   if (label->IsBound()) {
     LOG(FATAL) << "cbnz can only be used to branch forwards";
+    UNREACHABLE();
   } else {
     uint16_t branchid = EmitCompareAndBranch(rn, static_cast<uint16_t>(label->position_), true);
     label->LinkTo(branchid);
@@ -2210,7 +2258,7 @@
 
 void Thumb2Assembler::Lsl(Register rd, Register rm, uint32_t shift_imm,
                           bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Do not use Lsl if no shift is wanted.
+  CHECK_LE(shift_imm, 31u);
   CheckCondition(cond);
   EmitShift(rd, rm, LSL, shift_imm, setcc);
 }
@@ -2218,7 +2266,7 @@
 
 void Thumb2Assembler::Lsr(Register rd, Register rm, uint32_t shift_imm,
                           bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Do not use Lsr if no shift is wanted.
+  CHECK(1u <= shift_imm && shift_imm <= 32u);
   if (shift_imm == 32) shift_imm = 0;  // Comply to UAL syntax.
   CheckCondition(cond);
   EmitShift(rd, rm, LSR, shift_imm, setcc);
@@ -2227,7 +2275,7 @@
 
 void Thumb2Assembler::Asr(Register rd, Register rm, uint32_t shift_imm,
                           bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Do not use Asr if no shift is wanted.
+  CHECK(1u <= shift_imm && shift_imm <= 32u);
   if (shift_imm == 32) shift_imm = 0;  // Comply to UAL syntax.
   CheckCondition(cond);
   EmitShift(rd, rm, ASR, shift_imm, setcc);
@@ -2236,7 +2284,7 @@
 
 void Thumb2Assembler::Ror(Register rd, Register rm, uint32_t shift_imm,
                           bool setcc, Condition cond) {
-  CHECK_NE(shift_imm, 0u);  // Use Rrx instruction.
+  CHECK(1u <= shift_imm && shift_imm <= 31u);
   CheckCondition(cond);
   EmitShift(rd, rm, ROR, shift_imm, setcc);
 }
@@ -2360,16 +2408,16 @@
   // positive values and sub for negatives ones, which would slightly improve
   // the readability of generated code for some constants.
   ShifterOperand shifter_op;
-  if (ShifterOperand::CanHoldThumb(rd, rn, ADD, value, &shifter_op)) {
+  if (ShifterOperandCanHold(rd, rn, ADD, value, &shifter_op)) {
     add(rd, rn, shifter_op, cond);
-  } else if (ShifterOperand::CanHoldThumb(rd, rn, SUB, -value, &shifter_op)) {
+  } else if (ShifterOperandCanHold(rd, rn, SUB, -value, &shifter_op)) {
     sub(rd, rn, shifter_op, cond);
   } else {
     CHECK(rn != IP);
-    if (ShifterOperand::CanHoldThumb(rd, rn, MVN, ~value, &shifter_op)) {
+    if (ShifterOperandCanHold(rd, rn, MVN, ~value, &shifter_op)) {
       mvn(IP, shifter_op, cond);
       add(rd, rn, ShifterOperand(IP), cond);
-    } else if (ShifterOperand::CanHoldThumb(rd, rn, MVN, ~(-value), &shifter_op)) {
+    } else if (ShifterOperandCanHold(rd, rn, MVN, ~(-value), &shifter_op)) {
       mvn(IP, shifter_op, cond);
       sub(rd, rn, ShifterOperand(IP), cond);
     } else {
@@ -2387,16 +2435,16 @@
 void Thumb2Assembler::AddConstantSetFlags(Register rd, Register rn, int32_t value,
                                           Condition cond) {
   ShifterOperand shifter_op;
-  if (ShifterOperand::CanHoldThumb(rd, rn, ADD, value, &shifter_op)) {
+  if (ShifterOperandCanHold(rd, rn, ADD, value, &shifter_op)) {
     adds(rd, rn, shifter_op, cond);
-  } else if (ShifterOperand::CanHoldThumb(rd, rn, ADD, -value, &shifter_op)) {
+  } else if (ShifterOperandCanHold(rd, rn, ADD, -value, &shifter_op)) {
     subs(rd, rn, shifter_op, cond);
   } else {
     CHECK(rn != IP);
-    if (ShifterOperand::CanHoldThumb(rd, rn, MVN, ~value, &shifter_op)) {
+    if (ShifterOperandCanHold(rd, rn, MVN, ~value, &shifter_op)) {
       mvn(IP, shifter_op, cond);
       adds(rd, rn, ShifterOperand(IP), cond);
-    } else if (ShifterOperand::CanHoldThumb(rd, rn, MVN, ~(-value), &shifter_op)) {
+    } else if (ShifterOperandCanHold(rd, rn, MVN, ~(-value), &shifter_op)) {
       mvn(IP, shifter_op, cond);
       subs(rd, rn, ShifterOperand(IP), cond);
     } else {
@@ -2410,11 +2458,12 @@
   }
 }
 
+
 void Thumb2Assembler::LoadImmediate(Register rd, int32_t value, Condition cond) {
   ShifterOperand shifter_op;
-  if (ShifterOperand::CanHoldThumb(rd, R0, MOV, value, &shifter_op)) {
+  if (ShifterOperandCanHold(rd, R0, MOV, value, &shifter_op)) {
     mov(rd, shifter_op, cond);
-  } else if (ShifterOperand::CanHoldThumb(rd, R0, MVN, ~value, &shifter_op)) {
+  } else if (ShifterOperandCanHold(rd, R0, MVN, ~value, &shifter_op)) {
     mvn(rd, shifter_op, cond);
   } else {
     movw(rd, Low16Bits(value), cond);
@@ -2425,6 +2474,7 @@
   }
 }
 
+
 // Implementation note: this method must emit at most one instruction when
 // Address::CanHoldLoadOffsetThumb.
 void Thumb2Assembler::LoadFromOffset(LoadOperandType type,
@@ -2599,10 +2649,8 @@
 
 
 void Thumb2Assembler::dmb(DmbOptions flavor) {
-#if ANDROID_SMP != 0
   int32_t encoding = 0xf3bf8f50;  // dmb in T1 encoding.
   Emit32(encoding | flavor);
-#endif
 }
 
 
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index cfa251a..48a3a7e 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -304,6 +304,12 @@
                       int32_t offset,
                       Condition cond = AL) OVERRIDE;
 
+  bool ShifterOperandCanHold(Register rd,
+                             Register rn,
+                             Opcode opcode,
+                             uint32_t immediate,
+                             ShifterOperand* shifter_op) OVERRIDE;
+
 
   static bool IsInstructionForExceptionHandling(uintptr_t pc);
 
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index 65d6d45..6ae95a4 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -30,7 +30,7 @@
   }
 
   std::string GetAssemblerParameters() OVERRIDE {
-    return " -mthumb";
+    return " -mthumb -mfpu=neon";
   }
 
   std::string GetDisassembleParameters() OVERRIDE {
@@ -156,4 +156,12 @@
   DriverStr(expected, "ubfx");
 }
 
+TEST_F(AssemblerThumb2Test, Vmstat) {
+  GetAssembler()->vmstat();
+
+  const char* expected = "vmrs APSR_nzcv, FPSCR\n";
+
+  DriverStr(expected, "vmrs");
+}
+
 }  // namespace art
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index 02011b8..21014c8 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -329,12 +329,12 @@
     if (dst.IsXRegister()) {
       if (size == 4) {
         CHECK(src.IsWRegister());
-        ___ Mov(reg_x(dst.AsXRegister()), reg_w(src.AsWRegister()));
+        ___ Mov(reg_w(dst.AsOverlappingWRegister()), reg_w(src.AsWRegister()));
       } else {
         if (src.IsXRegister()) {
           ___ Mov(reg_x(dst.AsXRegister()), reg_x(src.AsXRegister()));
         } else {
-          ___ Mov(reg_x(dst.AsXRegister()), reg_w(src.AsWRegister()));
+          ___ Mov(reg_x(dst.AsXRegister()), reg_x(src.AsOverlappingXRegister()));
         }
       }
     } else if (dst.IsWRegister()) {
@@ -476,9 +476,7 @@
 
 void Arm64Assembler::MemoryBarrier(ManagedRegister m_scratch ATTRIBUTE_UNUSED) {
   // TODO: Should we check that m_scratch is IP? - see arm.
-#if ANDROID_SMP != 0
   ___ Dmb(vixl::InnerShareable, vixl::BarrierAll);
-#endif
 }
 
 void Arm64Assembler::SignExtend(ManagedRegister mreg, size_t size) {
@@ -486,9 +484,9 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsWRegister()) << reg;
   if (size == 1) {
-    ___ sxtb(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+    ___ Sxtb(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
   } else {
-    ___ sxth(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+    ___ Sxth(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
   }
 }
 
@@ -497,9 +495,9 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsWRegister()) << reg;
   if (size == 1) {
-    ___ uxtb(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+    ___ Uxtb(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
   } else {
-    ___ uxth(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
+    ___ Uxth(reg_w(reg.AsWRegister()), reg_w(reg.AsWRegister()));
   }
 }
 
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index 9d3fa01..2b55120 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -38,14 +38,14 @@
 // temp directory.
 static std::string tmpnam_;
 
+enum class RegisterView {  // private
+  kUsePrimaryName,
+  kUseSecondaryName
+};
+
 template<typename Ass, typename Reg, typename FPReg, typename Imm>
 class AssemblerTest : public testing::Test {
  public:
-  enum class RegisterView {  // private
-    kUsePrimaryName,
-    kUseSecondaryName
-  };
-
   Ass* GetAssembler() {
     return assembler_.get();
   }
@@ -159,6 +159,9 @@
                       bool as_uint = false) {
     std::string str;
     std::vector<int64_t> imms = CreateImmediateValues(imm_bytes, as_uint);
+
+    WarnOnCombinations(imms.size());
+
     for (int64_t imm : imms) {
       Imm new_imm = CreateImmediate(imm);
       (assembler_.get()->*f)(new_imm);
@@ -184,12 +187,12 @@
 
   // This is intended to be run as a test.
   bool CheckTools() {
-    if (!FileExists(GetAssemblerCommand())) {
+    if (!FileExists(FindTool(GetAssemblerCmdName()))) {
       return false;
     }
     LOG(INFO) << "Chosen assembler command: " << GetAssemblerCommand();
 
-    if (!FileExists(GetObjdumpCommand())) {
+    if (!FileExists(FindTool(GetObjdumpCmdName()))) {
       return false;
     }
     LOG(INFO) << "Chosen objdump command: " << GetObjdumpCommand();
@@ -197,7 +200,7 @@
     // Disassembly is optional.
     std::string disassembler = GetDisassembleCommand();
     if (disassembler.length() != 0) {
-      if (!FileExists(disassembler)) {
+      if (!FileExists(FindTool(GetDisassembleCmdName()))) {
         return false;
       }
       LOG(INFO) << "Chosen disassemble command: " << GetDisassembleCommand();
@@ -223,6 +226,10 @@
     UNREACHABLE();
   }
 
+  std::string GetRegisterName(const Reg& reg) {
+    return GetRegName<RegisterView::kUsePrimaryName>(reg);
+  }
+
  protected:
   explicit AssemblerTest() {}
 
@@ -271,7 +278,7 @@
 
     resolved_assembler_cmd_ = line + GetAssemblerParameters();
 
-    return line;
+    return resolved_assembler_cmd_;
   }
 
   // Get the name of the objdump, e.g., "objdump" by default.
@@ -298,7 +305,7 @@
 
     resolved_objdump_cmd_ = line + GetObjdumpParameters();
 
-    return line;
+    return resolved_objdump_cmd_;
   }
 
   // Get the name of the objdump, e.g., "objdump" by default.
@@ -324,7 +331,7 @@
 
     resolved_disassemble_cmd_ = line + GetDisassembleParameters();
 
-    return line;
+    return resolved_disassemble_cmd_;
   }
 
   // Create a couple of immediate values up to the number of bytes given.
@@ -406,6 +413,8 @@
                                        std::string (AssemblerTest::*GetName1)(const Reg1&),
                                        std::string (AssemblerTest::*GetName2)(const Reg2&),
                                        std::string fmt) {
+    WarnOnCombinations(reg1_registers.size() * reg2_registers.size());
+
     std::string str;
     for (auto reg1 : reg1_registers) {
       for (auto reg2 : reg2_registers) {
@@ -435,7 +444,6 @@
     return str;
   }
 
- private:
   template <RegisterView kRegView>
   std::string GetRegName(const Reg& reg) {
     std::ostringstream sreg;
@@ -457,12 +465,32 @@
     return sreg.str();
   }
 
+  // Text written ahead of the generated assembly; sub-classes override it, nullptr means none.
+  virtual const char* GetAssemblyHeader() {
+    return nullptr;
+  }
+
+  // Logs a gtest warning when a test is about to generate more cases than the threshold.
+  void WarnOnCombinations(size_t count) {
+    if (count <= kWarnManyCombinationsThreshold) return;
+    GTEST_LOG_(WARNING) << "Many combinations (" << count << "), test generation might be slow.";
+  }
+
+  static constexpr const char* REG_TOKEN = "{reg}";
+  static constexpr const char* REG1_TOKEN = "{reg1}";
+  static constexpr const char* REG2_TOKEN = "{reg2}";
+  static constexpr const char* IMM_TOKEN = "{imm}";
+
+ private:
   template <RegisterView kRegView>
   std::string RepeatRegisterImm(void (Ass::*f)(Reg, const Imm&), size_t imm_bytes,
                                   std::string fmt) {
     const std::vector<Reg*> registers = GetRegisters();
     std::string str;
     std::vector<int64_t> imms = CreateImmediateValues(imm_bytes);
+
+    WarnOnCombinations(registers.size() * imms.size());
+
     for (auto reg : registers) {
       for (int64_t imm : imms) {
         Imm new_imm = CreateImmediate(imm);
@@ -547,7 +575,7 @@
 
   // Compile the assembly file from_file to a binary file to_file. Returns true on success.
   bool Assemble(const char* from_file, const char* to_file, std::string* error_msg) {
-    bool have_assembler = FileExists(GetAssemblerCommand());
+    bool have_assembler = FileExists(FindTool(GetAssemblerCmdName()));
     EXPECT_TRUE(have_assembler) << "Cannot find assembler:" << GetAssemblerCommand();
     if (!have_assembler) {
       return false;
@@ -569,13 +597,20 @@
     args.push_back("-c");
     args.push_back(cmd);
 
-    return Exec(args, error_msg);
+    bool success = Exec(args, error_msg);
+    if (!success) {
+      LOG(INFO) << "Assembler command line:";
+      for (std::string arg : args) {
+        LOG(INFO) << arg;
+      }
+    }
+    return success;
   }
 
   // Runs objdump -h on the binary file and extracts the first line with .text.
   // Returns "" on failure.
   std::string Objdump(std::string file) {
-    bool have_objdump = FileExists(GetObjdumpCommand());
+    bool have_objdump = FileExists(FindTool(GetObjdumpCmdName()));
     EXPECT_TRUE(have_objdump) << "Cannot find objdump: " << GetObjdumpCommand();
     if (!have_objdump) {
       return "";
@@ -652,10 +687,10 @@
 
     // If you want to take a look at the differences between the ART assembler and GCC, comment
     // out the removal code.
-    std::remove(data_name.c_str());
-    std::remove(as_name.c_str());
-    std::remove((data_name + ".dis").c_str());
-    std::remove((as_name + ".dis").c_str());
+//    std::remove(data_name.c_str());
+//    std::remove(as_name.c_str());
+//    std::remove((data_name + ".dis").c_str());
+//    std::remove((as_name + ".dis").c_str());
 
     return result;
   }
@@ -714,6 +749,10 @@
     // TODO: Lots of error checking.
 
     std::ofstream s_out(res->base_name + ".S");
+    const char* header = GetAssemblyHeader();
+    if (header != nullptr) {
+      s_out << header;
+    }
     s_out << assembly_code;
     s_out.close();
 
@@ -862,13 +901,9 @@
     return tmpnam_;
   }
 
+  static constexpr size_t kWarnManyCombinationsThreshold = 500;
   static constexpr size_t OBJDUMP_SECTION_LINE_MIN_TOKENS = 6;
 
-  static constexpr const char* REG_TOKEN = "{reg}";
-  static constexpr const char* REG1_TOKEN = "{reg1}";
-  static constexpr const char* REG2_TOKEN = "{reg2}";
-  static constexpr const char* IMM_TOKEN = "{imm}";
-
   std::unique_ptr<Ass> assembler_;
 
   std::string resolved_assembler_cmd_;
diff --git a/compiler/utils/dex_instruction_utils.h b/compiler/utils/dex_instruction_utils.h
new file mode 100644
index 0000000..2c6e525
--- /dev/null
+++ b/compiler/utils/dex_instruction_utils.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_DEX_INSTRUCTION_UTILS_H_
+#define ART_COMPILER_UTILS_DEX_INSTRUCTION_UTILS_H_
+
+#include "dex_instruction.h"
+
+namespace art {
+
+// Dex invoke type corresponds to the ordering of INVOKE instructions;
+// this order is the same for range and non-range invokes.
+enum DexInvokeType : uint8_t {
+  kDexInvokeVirtual = 0,  // invoke-virtual, invoke-virtual-range
+  kDexInvokeSuper,        // invoke-super, invoke-super-range
+  kDexInvokeDirect,       // invoke-direct, invoke-direct-range
+  kDexInvokeStatic,       // invoke-static, invoke-static-range
+  kDexInvokeInterface,    // invoke-interface, invoke-interface-range
+  kDexInvokeTypeCount
+};
+
+// Dex instruction memory access types correspond to the ordering of GET/PUT instructions;
+// this order is the same for IGET, IPUT, SGET, SPUT, AGET and APUT.
+enum DexMemAccessType : uint8_t {
+  kDexMemAccessWord = 0,  // op         0; int or float, the actual type is not encoded.
+  kDexMemAccessWide,      // op_WIDE    1; long or double, the actual type is not encoded.
+  kDexMemAccessObject,    // op_OBJECT  2; the actual reference type is not encoded.
+  kDexMemAccessBoolean,   // op_BOOLEAN 3
+  kDexMemAccessByte,      // op_BYTE    4
+  kDexMemAccessChar,      // op_CHAR    5
+  kDexMemAccessShort,     // op_SHORT   6
+  kDexMemAccessTypeCount
+};
+
+std::ostream& operator<<(std::ostream& os, const DexMemAccessType& type);
+
+// NOTE: The following functions disregard quickened instructions.
+
+constexpr bool IsInstructionReturn(Instruction::Code opcode) {
+  return opcode >= Instruction::RETURN_VOID && opcode <= Instruction::RETURN_OBJECT;
+}
+
+constexpr bool IsInstructionInvoke(Instruction::Code opcode) {
+  return opcode != Instruction::RETURN_VOID_BARRIER &&
+      opcode >= Instruction::INVOKE_VIRTUAL && opcode <= Instruction::INVOKE_INTERFACE_RANGE;
+}
+
+constexpr bool IsInstructionInvokeStatic(Instruction::Code opcode) {
+  return opcode == Instruction::INVOKE_STATIC_RANGE || opcode == Instruction::INVOKE_STATIC;
+}
+
+constexpr bool IsInstructionGoto(Instruction::Code opcode) {
+  return opcode >= Instruction::GOTO && opcode <= Instruction::GOTO_32;
+}
+
+constexpr bool IsInstructionIfCc(Instruction::Code opcode) {
+  return opcode >= Instruction::IF_EQ && opcode <= Instruction::IF_LE;
+}
+
+constexpr bool IsInstructionIfCcZ(Instruction::Code opcode) {
+  return opcode >= Instruction::IF_EQZ && opcode <= Instruction::IF_LEZ;
+}
+
+constexpr bool IsInstructionIGet(Instruction::Code code) {
+  return code >= Instruction::IGET && code <= Instruction::IGET_SHORT;
+}
+
+constexpr bool IsInstructionIPut(Instruction::Code code) {
+  return code >= Instruction::IPUT && code <= Instruction::IPUT_SHORT;
+}
+
+constexpr bool IsInstructionSGet(Instruction::Code code) {
+  return code >= Instruction::SGET && code <= Instruction::SGET_SHORT;
+}
+
+constexpr bool IsInstructionSPut(Instruction::Code code) {
+  return code >= Instruction::SPUT && code <= Instruction::SPUT_SHORT;
+}
+
+constexpr bool IsInstructionAGet(Instruction::Code code) {
+  return code >= Instruction::AGET && code <= Instruction::AGET_SHORT;
+}
+
+constexpr bool IsInstructionAPut(Instruction::Code code) {
+  return code >= Instruction::APUT && code <= Instruction::APUT_SHORT;
+}
+
+constexpr bool IsInstructionIGetOrIPut(Instruction::Code code) {
+  return code >= Instruction::IGET && code <= Instruction::IPUT_SHORT;
+}
+
+constexpr bool IsInstructionSGetOrSPut(Instruction::Code code) {
+  return code >= Instruction::SGET && code <= Instruction::SPUT_SHORT;
+}
+
+constexpr bool IsInstructionAGetOrAPut(Instruction::Code code) {
+  return code >= Instruction::AGET && code <= Instruction::APUT_SHORT;
+}
+
+// TODO: Remove the #if guards below when we fully migrate to C++14.
+
+constexpr bool IsInvokeInstructionRange(Instruction::Code opcode) {
+#if __cplusplus >= 201402  // DCHECK() is only usable in constexpr functions from C++14 on.
+  DCHECK(IsInstructionInvoke(opcode));
+#endif
+  return Instruction::INVOKE_VIRTUAL_RANGE <= opcode;
+}
+
+constexpr DexInvokeType InvokeInstructionType(Instruction::Code opcode) {
+#if __cplusplus >= 201402  // DCHECK() is only usable in constexpr functions from C++14 on.
+  DCHECK(IsInstructionInvoke(opcode));
+#endif
+  return static_cast<DexInvokeType>(
+      opcode - (IsInvokeInstructionRange(opcode) ? Instruction::INVOKE_VIRTUAL_RANGE
+                                                 : Instruction::INVOKE_VIRTUAL));
+}
+
+constexpr DexMemAccessType IGetMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionIGet(code));
+#endif
+  return static_cast<DexMemAccessType>(code - Instruction::IGET);  // Offset from plain IGET.
+}
+
+constexpr DexMemAccessType IPutMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionIPut(code));
+#endif
+  return static_cast<DexMemAccessType>(code - Instruction::IPUT);  // Offset from plain IPUT.
+}
+
+constexpr DexMemAccessType SGetMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionSGet(code));
+#endif
+  return static_cast<DexMemAccessType>(code - Instruction::SGET);  // Offset from plain SGET.
+}
+
+constexpr DexMemAccessType SPutMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionSPut(code));
+#endif
+  return static_cast<DexMemAccessType>(code - Instruction::SPUT);  // Offset from plain SPUT.
+}
+
+constexpr DexMemAccessType AGetMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionAGet(code));
+#endif
+  return static_cast<DexMemAccessType>(code - Instruction::AGET);  // Offset from plain AGET.
+}
+
+constexpr DexMemAccessType APutMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionAPut(code));
+#endif
+  return static_cast<DexMemAccessType>(code - Instruction::APUT);  // Offset from plain APUT.
+}
+
+constexpr DexMemAccessType IGetOrIPutMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionIGetOrIPut(code));
+#endif
+  return (code >= Instruction::IPUT) ? IPutMemAccessType(code) : IGetMemAccessType(code);
+}
+
+constexpr DexMemAccessType SGetOrSPutMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionSGetOrSPut(code));
+#endif
+  return (code >= Instruction::SPUT) ? SPutMemAccessType(code) : SGetMemAccessType(code);
+}
+
+constexpr DexMemAccessType AGetOrAPutMemAccessType(Instruction::Code code) {
+#if __cplusplus >= 201402  // C++14 allows the DCHECK() in constexpr functions.
+  DCHECK(IsInstructionAGetOrAPut(code));
+#endif
+  return (code >= Instruction::APUT) ? APutMemAccessType(code) : AGetMemAccessType(code);
+}
+
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_DEX_INSTRUCTION_UTILS_H_
diff --git a/compiler/utils/scoped_arena_allocator.cc b/compiler/utils/scoped_arena_allocator.cc
index 2616150..d9e0619 100644
--- a/compiler/utils/scoped_arena_allocator.cc
+++ b/compiler/utils/scoped_arena_allocator.cc
@@ -96,6 +96,7 @@
   uint8_t* ptr = top_ptr_;
   if (UNLIKELY(static_cast<size_t>(top_end_ - ptr) < rounded_bytes)) {
     ptr = AllocateFromNextArena(rounded_bytes);
+    CHECK(ptr != nullptr) << "Failed to allocate memory";
   }
   CurrentStats()->RecordAlloc(bytes, kind);
   top_ptr_ = ptr + rounded_bytes;
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 8ebb40e..f0353f6 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -613,6 +613,23 @@
 }
 
 
+void X86Assembler::ucomiss(XmmRegister a, XmmRegister b) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x2E);
+  EmitXmmRegisterOperand(a, b);
+}
+
+
+void X86Assembler::ucomisd(XmmRegister a, XmmRegister b) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);  // Prefix byte; the only difference from the ucomiss encoding.
+  EmitUint8(0x0F);
+  EmitUint8(0x2E);
+  EmitXmmRegisterOperand(a, b);
+}
+
+
 void X86Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
@@ -1126,7 +1143,8 @@
 }
 
 
-void X86Assembler::shld(Register dst, Register src) {
+void X86Assembler::shld(Register dst, Register src, Register shifter) {
+  DCHECK_EQ(ECX, shifter);
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
   EmitUint8(0xA5);
@@ -1134,6 +1152,15 @@
 }
 
 
+void X86Assembler::shrd(Register dst, Register src, Register shifter) {
+  DCHECK_EQ(ECX, shifter);  // shifter is only validated here; it is not emitted below.
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xAD);
+  EmitRegisterOperand(src, dst);
+}
+
+
 void X86Assembler::negl(Register reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF7);
@@ -1308,13 +1335,19 @@
 }
 
 
+void X86Assembler::LoadLongConstant(XmmRegister dst, int64_t value) {
+  // TODO: Need to have a code constants table.
+  pushl(Immediate(High32Bits(value)));  // High half first, so the low half ends up at ESP + 0.
+  pushl(Immediate(Low32Bits(value)));
+  movsd(dst, Address(ESP, 0));  // Load the 8 bytes that were just pushed.
+  addl(ESP, Immediate(2 * sizeof(int32_t)));  // Release the two pushed 32-bit words.
+}
+
+
 void X86Assembler::LoadDoubleConstant(XmmRegister dst, double value) {
   // TODO: Need to have a code constants table.
   int64_t constant = bit_cast<int64_t, double>(value);
-  pushl(Immediate(High32Bits(constant)));
-  pushl(Immediate(Low32Bits(constant)));
-  movsd(dst, Address(ESP, 0));
-  addl(ESP, Immediate(2 * sizeof(intptr_t)));
+  LoadLongConstant(dst, constant);
 }
 
 
@@ -1830,9 +1863,7 @@
 }
 
 void X86Assembler::MemoryBarrier(ManagedRegister) {
-#if ANDROID_SMP != 0
   mfence();
-#endif
 }
 
 void X86Assembler::CreateHandleScopeEntry(ManagedRegister mout_reg,
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 8aed934..9fecf1e 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -42,8 +42,6 @@
 
  private:
   const int32_t value_;
-
-  DISALLOW_COPY_AND_ASSIGN(Immediate);
 };
 
 
@@ -301,6 +299,8 @@
 
   void comiss(XmmRegister a, XmmRegister b);
   void comisd(XmmRegister a, XmmRegister b);
+  void ucomiss(XmmRegister a, XmmRegister b);
+  void ucomisd(XmmRegister a, XmmRegister b);
 
   void sqrtsd(XmmRegister dst, XmmRegister src);
   void sqrtss(XmmRegister dst, XmmRegister src);
@@ -405,7 +405,8 @@
   void shrl(Register operand, Register shifter);
   void sarl(Register reg, const Immediate& imm);
   void sarl(Register operand, Register shifter);
-  void shld(Register dst, Register src);
+  void shld(Register dst, Register src, Register shifter);
+  void shrd(Register dst, Register src, Register shifter);
 
   void negl(Register reg);
   void notl(Register reg);
@@ -440,6 +441,7 @@
 
   void AddImmediate(Register reg, const Immediate& imm);
 
+  void LoadLongConstant(XmmRegister dst, int64_t value);
   void LoadDoubleConstant(XmmRegister dst, double value);
 
   void DoubleNegate(XmmRegister d);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 5d8a3b1..d901673 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -16,7 +16,8 @@
 
 #include "assembler_x86.h"
 
-#include "gtest/gtest.h"
+#include "base/stl_util.h"
+#include "utils/assembler_test.h"
 
 namespace art {
 
@@ -29,4 +30,89 @@
   ASSERT_EQ(static_cast<size_t>(5), buffer.Size());
 }
 
+class AssemblerX86Test : public AssemblerTest<x86::X86Assembler, x86::Register,
+                                              x86::XmmRegister, x86::Immediate> {
+ protected:
+  std::string GetArchitectureString() OVERRIDE {
+    return "x86";
+  }
+
+  std::string GetAssemblerParameters() OVERRIDE {
+    return " --32";
+  }
+
+  std::string GetDisassembleParameters() OVERRIDE {
+    return " -D -bbinary -mi386 --no-show-raw-insn";
+  }
+
+  void SetUpHelpers() OVERRIDE {
+    if (registers_.empty()) {
+      registers_.insert(registers_.end(),
+                        {  // NOLINT(whitespace/braces)
+                          new x86::Register(x86::EAX),
+                          new x86::Register(x86::EBX),
+                          new x86::Register(x86::ECX),
+                          new x86::Register(x86::EDX),
+                          new x86::Register(x86::EBP),
+                          new x86::Register(x86::ESP),
+                          new x86::Register(x86::ESI),
+                          new x86::Register(x86::EDI)
+                        });
+    }
+
+    if (fp_registers_.empty()) {
+      fp_registers_.insert(fp_registers_.end(),
+                           {  // NOLINT(whitespace/braces)
+                             new x86::XmmRegister(x86::XMM0),
+                             new x86::XmmRegister(x86::XMM1),
+                             new x86::XmmRegister(x86::XMM2),
+                             new x86::XmmRegister(x86::XMM3),
+                             new x86::XmmRegister(x86::XMM4),
+                             new x86::XmmRegister(x86::XMM5),
+                             new x86::XmmRegister(x86::XMM6),
+                             new x86::XmmRegister(x86::XMM7)
+                           });
+    }
+  }
+
+  void TearDown() OVERRIDE {
+    AssemblerTest::TearDown();
+    STLDeleteElements(&registers_);  // The vectors own the heap-allocated register objects.
+    STLDeleteElements(&fp_registers_);
+  }
+
+  std::vector<x86::Register*> GetRegisters() OVERRIDE {
+    return registers_;
+  }
+
+  std::vector<x86::XmmRegister*> GetFPRegisters() OVERRIDE {
+    return fp_registers_;
+  }
+
+  x86::Immediate CreateImmediate(int64_t imm_value) OVERRIDE {
+    return x86::Immediate(imm_value);
+  }
+
+ private:
+  std::vector<x86::Register*> registers_;  // Filled on first SetUpHelpers(), freed in TearDown().
+  std::vector<x86::XmmRegister*> fp_registers_;
+};
+
+
+TEST_F(AssemblerX86Test, Movl) {
+  GetAssembler()->movl(x86::EAX, x86::EBX);
+  const char* expected = "mov %ebx, %eax\n";
+  DriverStr(expected, "movl");
+}
+
+TEST_F(AssemblerX86Test, LoadLongConstant) {
+  GetAssembler()->LoadLongConstant(x86::XMM0, 51);
+  const char* expected =
+      "push $0x0\n"
+      "push $0x33\n"
+      "movsd 0(%esp), %xmm0\n"
+      "add $8, %esp\n";
+  DriverStr(expected, "LoadLongConstant");
+}
+
 }  // namespace art
diff --git a/compiler/utils/x86/constants_x86.h b/compiler/utils/x86/constants_x86.h
index 45c3834..2dfb65c 100644
--- a/compiler/utils/x86/constants_x86.h
+++ b/compiler/utils/x86/constants_x86.h
@@ -96,7 +96,8 @@
   kZero         = kEqual,
   kNotZero      = kNotEqual,
   kNegative     = kSign,
-  kPositive     = kNotSign
+  kPositive     = kNotSign,
+  kUnordered    = kParityEven
 };
 
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 2bb2ed8..474d8a9 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -593,9 +593,19 @@
 
 
 void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src) {
+  cvtsi2ss(dst, src, false);
+}
+
+
+void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src, bool is64bit) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
-  EmitOptionalRex32(dst, src);
+  if (is64bit) {
+    // Emit a REX.W prefix if the operand size is 64 bits.
+    EmitRex64(dst, src);
+  } else {
+    EmitOptionalRex32(dst, src);
+  }
   EmitUint8(0x0F);
   EmitUint8(0x2A);
   EmitOperand(dst.LowBits(), Operand(src));
@@ -603,9 +613,19 @@
 
 
 void X86_64Assembler::cvtsi2sd(XmmRegister dst, CpuRegister src) {
+  cvtsi2sd(dst, src, false);
+}
+
+
+void X86_64Assembler::cvtsi2sd(XmmRegister dst, CpuRegister src, bool is64bit) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
-  EmitOptionalRex32(dst, src);
+  if (is64bit) {
+    // Emit a REX.W prefix if the operand size is 64 bits.
+    EmitRex64(dst, src);
+  } else {
+    EmitOptionalRex32(dst, src);
+  }
   EmitUint8(0x0F);
   EmitUint8(0x2A);
   EmitOperand(dst.LowBits(), Operand(src));
@@ -700,6 +720,24 @@
   EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
+void X86_64Assembler::ucomiss(XmmRegister a, XmmRegister b) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(a, b);
+  EmitUint8(0x0F);
+  EmitUint8(0x2E);
+  EmitXmmRegisterOperand(a.LowBits(), b);
+}
+
+
+void X86_64Assembler::ucomisd(XmmRegister a, XmmRegister b) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitOptionalRex32(a, b);
+  EmitUint8(0x0F);
+  EmitUint8(0x2E);
+  EmitXmmRegisterOperand(a.LowBits(), b);
+}
+
 
 void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1451,8 +1489,18 @@
 }
 
 
+void X86_64Assembler::shlq(CpuRegister reg, const Immediate& imm) {
+  EmitGenericShift(true, 4, reg, imm);
+}
+
+
 void X86_64Assembler::shll(CpuRegister operand, CpuRegister shifter) {
-  EmitGenericShift(4, operand, shifter);
+  EmitGenericShift(false, 4, operand, shifter);
+}
+
+
+void X86_64Assembler::shlq(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(true, 4, operand, shifter);
 }
 
 
@@ -1467,7 +1515,12 @@
 
 
 void X86_64Assembler::shrl(CpuRegister operand, CpuRegister shifter) {
-  EmitGenericShift(5, operand, shifter);
+  EmitGenericShift(false, 5, operand, shifter);
+}
+
+
+void X86_64Assembler::shrq(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(true, 5, operand, shifter);
 }
 
 
@@ -1477,7 +1530,17 @@
 
 
 void X86_64Assembler::sarl(CpuRegister operand, CpuRegister shifter) {
-  EmitGenericShift(7, operand, shifter);
+  EmitGenericShift(false, 7, operand, shifter);
+}
+
+
+void X86_64Assembler::sarq(CpuRegister reg, const Immediate& imm) {
+  EmitGenericShift(true, 7, reg, imm);
+}
+
+
+void X86_64Assembler::sarq(CpuRegister operand, CpuRegister shifter) {
+  EmitGenericShift(true, 7, operand, shifter);
 }
 
 
@@ -1826,12 +1889,17 @@
 }
 
 
-void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
+void X86_64Assembler::EmitGenericShift(bool wide,
+                                       int reg_or_opcode,
                                        CpuRegister operand,
                                        CpuRegister shifter) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   CHECK_EQ(shifter.AsRegister(), RCX);
-  EmitOptionalRex32(operand);
+  if (wide) {
+    EmitRex64(operand);  // Taken by the shlq/shrq/sarq callers, which pass wide = true.
+  } else {
+    EmitOptionalRex32(operand);  // Taken by the shll/shrl/sarl callers (wide = false).
+  }
   EmitUint8(0xD3);
   EmitOperand(reg_or_opcode, Operand(operand));
 }
@@ -2371,9 +2439,7 @@
 }
 
 void X86_64Assembler::MemoryBarrier(ManagedRegister) {
-#if ANDROID_SMP != 0
   mfence();
-#endif
 }
 
 void X86_64Assembler::CreateHandleScopeEntry(ManagedRegister mout_reg,
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 4dd70e2..6e71e4a 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -329,7 +329,9 @@
   void divsd(XmmRegister dst, const Address& src);
 
   void cvtsi2ss(XmmRegister dst, CpuRegister src);  // Note: this is the r/m32 version.
+  void cvtsi2ss(XmmRegister dst, CpuRegister src, bool is64bit);
   void cvtsi2sd(XmmRegister dst, CpuRegister src);  // Note: this is the r/m32 version.
+  void cvtsi2sd(XmmRegister dst, CpuRegister src, bool is64bit);
 
   void cvtss2si(CpuRegister dst, XmmRegister src);  // Note: this is the r32 version.
   void cvtss2sd(XmmRegister dst, XmmRegister src);
@@ -344,6 +346,8 @@
 
   void comiss(XmmRegister a, XmmRegister b);
   void comisd(XmmRegister a, XmmRegister b);
+  void ucomiss(XmmRegister a, XmmRegister b);
+  void ucomisd(XmmRegister a, XmmRegister b);
 
   void sqrtsd(XmmRegister dst, XmmRegister src);
   void sqrtss(XmmRegister dst, XmmRegister src);
@@ -460,7 +464,12 @@
   void sarl(CpuRegister reg, const Immediate& imm);
   void sarl(CpuRegister operand, CpuRegister shifter);
 
+  void shlq(CpuRegister reg, const Immediate& imm);
+  void shlq(CpuRegister operand, CpuRegister shifter);
   void shrq(CpuRegister reg, const Immediate& imm);
+  void shrq(CpuRegister operand, CpuRegister shifter);
+  void sarq(CpuRegister reg, const Immediate& imm);
+  void sarq(CpuRegister operand, CpuRegister shifter);
 
   void negl(CpuRegister reg);
   void negq(CpuRegister reg);
@@ -657,7 +666,7 @@
   void EmitNearLabelLink(Label* label);
 
   void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm);
-  void EmitGenericShift(int rm, CpuRegister operand, CpuRegister shifter);
+  void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter);
 
   // If any input is not false, output the necessary rex prefix.
   void EmitOptionalRex(bool force, bool w, bool r, bool x, bool b);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index af389e6..c8e923c 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -296,7 +296,7 @@
   DriverStr(Repeatri(&x86_64::X86_64Assembler::subl, 4U, "sub ${imm}, %{reg}"), "subli");
 }
 
-// Shll only allows CL as the shift register.
+// Shll only allows CL as the shift count.
 std::string shll_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
 
@@ -319,7 +319,31 @@
   DriverStr(Repeatri(&x86_64::X86_64Assembler::shll, 1U, "shll ${imm}, %{reg}"), "shlli");
 }
 
-// Shrl only allows CL as the shift register.
+// Shlq only allows CL as the shift count.
+std::string shlq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->shlq(*reg, shifter);
+    str << "shlq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  // Expected assembly text: one "shlq %cl, %<reg>" line per register, in emission order.
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, ShlqReg) {
+  DriverFn(&shlq_fn, "shlq");
+}
+
+TEST_F(AssemblerX86_64Test, ShlqImm) {
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::shlq, 1U, "shlq ${imm}, %{reg}"), "shlqi");
+}
+
+// Shrl only allows CL as the shift count.
 std::string shrl_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
 
@@ -342,7 +366,30 @@
   DriverStr(Repeatri(&x86_64::X86_64Assembler::shrl, 1U, "shrl ${imm}, %{reg}"), "shrli");
 }
 
-// Sarl only allows CL as the shift register.
+// Shrq only allows CL as the shift count.
+std::string shrq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->shrq(*reg, shifter);
+    str << "shrq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, ShrqReg) {
+  DriverFn(&shrq_fn, "shrq");
+}
+
+TEST_F(AssemblerX86_64Test, ShrqImm) {
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::shrq, 1U, "shrq ${imm}, %{reg}"), "shrqi");
+}
+
+// Sarl only allows CL as the shift count.
 std::string sarl_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
 
@@ -365,6 +412,29 @@
   DriverStr(Repeatri(&x86_64::X86_64Assembler::sarl, 1U, "sarl ${imm}, %{reg}"), "sarli");
 }
 
+// Sarq only allows CL as the shift count.
+std::string sarq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
+  std::ostringstream str;
+
+  std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
+
+  x86_64::CpuRegister shifter(x86_64::RCX);
+  for (auto reg : registers) {
+    assembler->sarq(*reg, shifter);
+    str << "sarq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
+  }
+
+  return str.str();
+}
+
+TEST_F(AssemblerX86_64Test, SarqReg) {
+  DriverFn(&sarq_fn, "sarq");
+}
+
+TEST_F(AssemblerX86_64Test, SarqImm) {
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::sarq, 1U, "sarq ${imm}, %{reg}"), "sarqi");
+}
+
 TEST_F(AssemblerX86_64Test, CmpqRegs) {
   DriverStr(RepeatRR(&x86_64::X86_64Assembler::cmpq, "cmpq %{reg2}, %{reg1}"), "cmpq");
 }
@@ -590,6 +660,14 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::comisd, "comisd %{reg2}, %{reg1}"), "comisd");
 }
 
+TEST_F(AssemblerX86_64Test, Ucomiss) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::ucomiss, "ucomiss %{reg2}, %{reg1}"), "ucomiss");
+}
+
+TEST_F(AssemblerX86_64Test, Ucomisd) {
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::ucomisd, "ucomisd %{reg2}, %{reg1}"), "ucomisd");
+}
+
 TEST_F(AssemblerX86_64Test, Sqrtss) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::sqrtss, "sqrtss %{reg2}, %{reg1}"), "sqrtss");
 }
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index 2a5b43d..0c782d4 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -105,7 +105,8 @@
   kZero         = kEqual,
   kNotZero      = kNotEqual,
   kNegative     = kSign,
-  kPositive     = kNotSign
+  kPositive     = kNotSign,
+  kUnordered    = kParityEven
 };
 
 
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 2d2a82e..869c822 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -165,7 +165,13 @@
   UsageError("  --compiler-backend=(Quick|Optimizing|Portable): select compiler backend");
   UsageError("      set.");
   UsageError("      Example: --compiler-backend=Portable");
-  UsageError("      Default: Quick");
+  if (kUsePortableCompiler) {
+    UsageError("      Default: Portable");
+  } else if (kUseOptimizingCompiler) {
+    UsageError("      Default: Optimizing");
+  } else {
+    UsageError("      Default: Quick");
+  }
   UsageError("");
   UsageError("  --compiler-filter="
                 "(verify-none"
@@ -419,7 +425,9 @@
 class Dex2Oat FINAL {
  public:
   explicit Dex2Oat(TimingLogger* timings) :
-      compiler_kind_(kUsePortableCompiler ? Compiler::kPortable : Compiler::kQuick),
+      compiler_kind_(kUsePortableCompiler
+          ? Compiler::kPortable
+          : (kUseOptimizingCompiler ? Compiler::kOptimizing : Compiler::kQuick)),
       instruction_set_(kRuntimeISA),
       // Take the default set of instruction features from the build.
       method_inliner_map_(),
@@ -597,7 +605,6 @@
           compiler_kind_ = Compiler::kQuick;
         } else if (backend_str == "Optimizing") {
           compiler_kind_ = Compiler::kOptimizing;
-          compile_pic = true;
         } else if (backend_str == "Portable") {
           compiler_kind_ = Compiler::kPortable;
         } else {
@@ -707,6 +714,11 @@
       }
     }
 
+    if (compiler_kind_ == Compiler::kOptimizing) {
+      // Optimizing only supports PIC mode.
+      compile_pic = true;
+    }
+
     if (oat_filename_.empty() && oat_fd_ == -1) {
       Usage("Output must be supplied with either --oat-file or --oat-fd");
     }
@@ -942,9 +954,11 @@
         oat_location_ = oat_filename_;
       }
     } else {
-      oat_file_.reset(new File(oat_fd_, oat_location_));
+      oat_file_.reset(new File(oat_fd_, oat_location_, true));
       oat_file_->DisableAutoClose();
-      oat_file_->SetLength(0);
+      if (oat_file_->SetLength(0) != 0) {
+        PLOG(WARNING) << "Truncating oat file " << oat_location_ << " failed.";
+      }
     }
     if (oat_file_.get() == nullptr) {
       PLOG(ERROR) << "Failed to create oat file: " << oat_location_;
@@ -952,6 +966,7 @@
     }
     if (create_file && fchmod(oat_file_->Fd(), 0644) != 0) {
       PLOG(ERROR) << "Failed to make oat file world readable: " << oat_location_;
+      oat_file_->Erase();
       return false;
     }
     return true;
@@ -1075,7 +1090,10 @@
                 << ". Try: adb shell chmod 777 /data/local/tmp";
             continue;
           }
-          tmp_file->WriteFully(dex_file->Begin(), dex_file->Size());
+          // This is just dumping files for debugging. Ignore errors, and leave remnants.
+          UNUSED(tmp_file->WriteFully(dex_file->Begin(), dex_file->Size()));
+          UNUSED(tmp_file->Flush());
+          UNUSED(tmp_file->Close());
           LOG(INFO) << "Wrote input to " << tmp_file_name;
         }
       }
@@ -1214,6 +1232,8 @@
 
   // Write out the generated code part. Calls the OatWriter and ElfBuilder. Also prepares the
   // ImageWriter, if necessary.
+  // Note: Flushing (and closing) the file is the caller's responsibility, except for the failure
+  //       case (when the file will be explicitly erased).
   bool CreateOatFile() {
     CHECK(key_value_store_.get() != nullptr);
 
@@ -1266,15 +1286,7 @@
       if (!driver_->WriteElf(android_root_, is_host_, dex_files_, oat_writer.get(),
                              oat_file_.get())) {
         LOG(ERROR) << "Failed to write ELF file " << oat_file_->GetPath();
-        return false;
-      }
-    }
-
-    // Flush result to disk.
-    {
-      TimingLogger::ScopedTiming t2("dex2oat Flush ELF", timings_);
-      if (oat_file_->Flush() != 0) {
-        LOG(ERROR) << "Failed to flush ELF file " << oat_file_->GetPath();
+        oat_file_->Erase();
         return false;
       }
     }
@@ -1295,14 +1307,19 @@
     return true;
   }
 
-  // Strip the oat file, if requested. This first creates a copy from unstripped to stripped, and
-  // then runs the ElfStripper. Currently only relevant for the portable compiler.
-  bool Strip() {
+  // Create a copy from unstripped to stripped.
+  bool CopyUnstrippedToStripped() {
     // If we don't want to strip in place, copy from unstripped location to stripped location.
     // We need to strip after image creation because FixupElf needs to use .strtab.
     if (oat_unstripped_ != oat_stripped_) {
+      // If the oat file is still open, flush it.
+      if (oat_file_.get() != nullptr && oat_file_->IsOpened()) {
+        if (!FlushCloseOatFile()) {
+          return false;
+        }
+      }
+
       TimingLogger::ScopedTiming t("dex2oat OatFile copy", timings_);
-      oat_file_.reset();
       std::unique_ptr<File> in(OS::OpenFileForReading(oat_unstripped_.c_str()));
       std::unique_ptr<File> out(OS::CreateEmptyFile(oat_stripped_.c_str()));
       size_t buffer_size = 8192;
@@ -1315,14 +1332,27 @@
         bool write_ok = out->WriteFully(buffer.get(), bytes_read);
         CHECK(write_ok);
       }
-      oat_file_.reset(out.release());
+      if (kUsePortableCompiler) {
+        oat_file_.reset(out.release());
+      } else {
+        if (out->FlushCloseOrErase() != 0) {
+          PLOG(ERROR) << "Failed to flush and close copied oat file: " << oat_stripped_;
+          return false;
+        }
+      }
       VLOG(compiler) << "Oat file copied successfully (stripped): " << oat_stripped_;
     }
+    return true;
+  }
 
+  // Run the ElfStripper. Currently only relevant for the portable compiler.
+  bool Strip() {
     if (kUsePortableCompiler) {
       // Portable includes debug symbols unconditionally. If we are not supposed to create them,
       // strip them now. Quick generates debug symbols only when the flag(s) are set.
       if (!compiler_options_->GetIncludeDebugSymbols()) {
+        CHECK(oat_file_.get() != nullptr && oat_file_->IsOpened());
+
         TimingLogger::ScopedTiming t("dex2oat ElfStripper", timings_);
         // Strip unneeded sections for target
         off_t seek_actual = lseek(oat_file_->Fd(), 0, SEEK_SET);
@@ -1330,6 +1360,11 @@
         std::string error_msg;
         if (!ElfFile::Strip(oat_file_.get(), &error_msg)) {
           LOG(ERROR) << "Failed to strip elf file: " << error_msg;
+          oat_file_->Erase();
+          return false;
+        }
+
+        if (!FlushCloseOatFile()) {
           return false;
         }
 
@@ -1343,6 +1378,31 @@
     return true;
   }
 
+  bool FlushOatFile() {
+    if (oat_file_.get() != nullptr) {
+      TimingLogger::ScopedTiming t2("dex2oat Flush ELF", timings_);
+      if (oat_file_->Flush() != 0) {
+        PLOG(ERROR) << "Failed to flush oat file: " << oat_location_ << " / "
+            << oat_filename_;
+        oat_file_->Erase();
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool FlushCloseOatFile() {
+    if (oat_file_.get() != nullptr) {
+      std::unique_ptr<File> tmp(oat_file_.release());
+      if (tmp->FlushCloseOrErase() != 0) {
+        PLOG(ERROR) << "Failed to flush and close oat file: " << oat_location_ << " / "
+            << oat_filename_;
+        return false;
+      }
+    }
+    return true;
+  }
+
   void DumpTiming() {
     if (dump_timing_ || (dump_slow_timing_ && timings_->GetTotalNs() > MsToNs(1000))) {
       LOG(INFO) << Dumpable<TimingLogger>(*timings_);
@@ -1356,6 +1416,10 @@
     return compiler_options_.get();
   }
 
+  bool IsImage() const {
+    return image_;
+  }
+
   bool IsHost() const {
     return is_host_;
   }
@@ -1451,18 +1515,24 @@
     // Destroy ImageWriter before doing FixupElf.
     image_writer_.reset();
 
-    std::unique_ptr<File> oat_file(OS::OpenFileReadWrite(oat_unstripped_.c_str()));
-    if (oat_file.get() == nullptr) {
-      PLOG(ERROR) << "Failed to open ELF file: " << oat_unstripped_;
-      return false;
-    }
-
     // Do not fix up the ELF file if we are --compile-pic
     if (!compiler_options_->GetCompilePic()) {
+      std::unique_ptr<File> oat_file(OS::OpenFileReadWrite(oat_unstripped_.c_str()));
+      if (oat_file.get() == nullptr) {
+        PLOG(ERROR) << "Failed to open ELF file: " << oat_unstripped_;
+        return false;
+      }
+
       if (!ElfWriter::Fixup(oat_file.get(), oat_data_begin)) {
+        oat_file->Erase();
         LOG(ERROR) << "Failed to fixup ELF file " << oat_file->GetPath();
         return false;
       }
+
+      if (oat_file->FlushCloseOrErase()) {
+        PLOG(ERROR) << "Failed to flush and close fixed ELF file " << oat_file->GetPath();
+        return false;
+      }
     }
 
     return true;
@@ -1609,6 +1679,94 @@
 #endif
 }
 
+static int CompileImage(Dex2Oat& dex2oat) {
+  dex2oat.Compile();
+
+  // Create the boot.oat.
+  if (!dex2oat.CreateOatFile()) {
+    return EXIT_FAILURE;
+  }
+
+  // Flush and close the boot.oat. We always expect the output file by name, and it will be
+  // re-opened from the unstripped name.
+  if (!dex2oat.FlushCloseOatFile()) {
+    return EXIT_FAILURE;
+  }
+
+  // Creates the boot.art and patches the boot.oat.
+  if (!dex2oat.HandleImage()) {
+    return EXIT_FAILURE;
+  }
+
+  // When given --host, finish early without stripping.
+  if (dex2oat.IsHost()) {
+    dex2oat.DumpTiming();
+    return EXIT_SUCCESS;
+  }
+
+  // Copy unstripped to stripped location, if necessary.
+  if (!dex2oat.CopyUnstrippedToStripped()) {
+    return EXIT_FAILURE;
+  }
+
+  // Strip, if necessary.
+  if (!dex2oat.Strip()) {
+    return EXIT_FAILURE;
+  }
+
+  // FlushClose again, as stripping might have re-opened the oat file.
+  if (!dex2oat.FlushCloseOatFile()) {
+    return EXIT_FAILURE;
+  }
+
+  dex2oat.DumpTiming();
+  return EXIT_SUCCESS;
+}
+
+static int CompileApp(Dex2Oat& dex2oat) {
+  dex2oat.Compile();
+
+  // Create the app oat.
+  if (!dex2oat.CreateOatFile()) {
+    return EXIT_FAILURE;
+  }
+
+  // Do not close the oat file here. We might have gotten the output file by file descriptor,
+  // which we would lose.
+  if (!dex2oat.FlushOatFile()) {
+    return EXIT_FAILURE;
+  }
+
+  // When given --host, finish early without stripping.
+  if (dex2oat.IsHost()) {
+    if (!dex2oat.FlushCloseOatFile()) {
+      return EXIT_FAILURE;
+    }
+
+    dex2oat.DumpTiming();
+    return EXIT_SUCCESS;
+  }
+
+  // Copy unstripped to stripped location, if necessary. This will implicitly flush & close the
+  // unstripped version. If this is given, we expect to be able to open writable files by name.
+  if (!dex2oat.CopyUnstrippedToStripped()) {
+    return EXIT_FAILURE;
+  }
+
+  // Strip, if necessary.
+  if (!dex2oat.Strip()) {
+    return EXIT_FAILURE;
+  }
+
+  // Flush and close the file.
+  if (!dex2oat.FlushCloseOatFile()) {
+    return EXIT_FAILURE;
+  }
+
+  dex2oat.DumpTiming();
+  return EXIT_SUCCESS;
+}
+
 static int dex2oat(int argc, char** argv) {
   b13564922();
 
@@ -1630,27 +1788,11 @@
     return EXIT_FAILURE;
   }
 
-  dex2oat.Compile();
-
-  if (!dex2oat.CreateOatFile()) {
-    return EXIT_FAILURE;
+  if (dex2oat.IsImage()) {
+    return CompileImage(dex2oat);
+  } else {
+    return CompileApp(dex2oat);
   }
-
-  if (!dex2oat.HandleImage()) {
-    return EXIT_FAILURE;
-  }
-
-  if (dex2oat.IsHost()) {
-    dex2oat.DumpTiming();
-    return EXIT_SUCCESS;
-  }
-
-  if (!dex2oat.Strip()) {
-    return EXIT_FAILURE;
-  }
-
-  dex2oat.DumpTiming();
-  return EXIT_SUCCESS;
 }
 }  // namespace art
 
diff --git a/disassembler/Android.mk b/disassembler/Android.mk
index f2dd1ee..3ad2941 100644
--- a/disassembler/Android.mk
+++ b/disassembler/Android.mk
@@ -85,7 +85,11 @@
   LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.mk
   include external/libcxx/libcxx.mk
   # For disassembler_arm64.
-  LOCAL_SHARED_LIBRARIES += libvixl
+  ifeq ($$(art_ndebug_or_debug),debug)
+     LOCAL_SHARED_LIBRARIES += libvixld
+  else
+     LOCAL_SHARED_LIBRARIES += libvixl
+  endif
   ifeq ($$(art_target_or_host),target)
     include $(BUILD_SHARED_LIBRARY)
   else # host
diff --git a/disassembler/disassembler.h b/disassembler/disassembler.h
index 9cd631c..966ee3a 100644
--- a/disassembler/disassembler.h
+++ b/disassembler/disassembler.h
@@ -34,8 +34,14 @@
   // Base addess for calculating relative code offsets when absolute_addresses_ is false.
   const uint8_t* const base_address_;
 
-  DisassemblerOptions(bool absolute_addresses, const uint8_t* base_address)
-      : absolute_addresses_(absolute_addresses), base_address_(base_address) {}
+  // If set, the disassembler is allowed to look at load targets in literal
+  // pools.
+  const bool can_read_literals_;
+
+  DisassemblerOptions(bool absolute_addresses, const uint8_t* base_address,
+                      bool can_read_literals)
+      : absolute_addresses_(absolute_addresses), base_address_(base_address),
+        can_read_literals_(can_read_literals) {}
 
  private:
   DISALLOW_COPY_AND_ASSIGN(DisassemblerOptions);
diff --git a/disassembler/disassembler_arm64.cc b/disassembler/disassembler_arm64.cc
index 229ac97..bd3bebf 100644
--- a/disassembler/disassembler_arm64.cc
+++ b/disassembler/disassembler_arm64.cc
@@ -27,10 +27,88 @@
 namespace art {
 namespace arm64 {
 
+void CustomDisassembler::AppendRegisterNameToOutput(
+    const vixl::Instruction* instr,
+    const vixl::CPURegister& reg) {
+  USE(instr);
+  if (reg.IsRegister()) {
+    // This enumeration should mirror the declarations in
+    // runtime/arch/arm64/registers_arm64.h. We do not include that file to
+    // avoid a dependency on libart.
+    enum {
+      TR  = 18,
+      ETR = 21,
+      IP0 = 16,
+      IP1 = 17,
+      FP  = 29,
+      LR  = 30
+    };
+    switch (reg.code()) {
+      case IP0: AppendToOutput(reg.Is64Bits() ? "ip0" : "wip0"); return;
+      case IP1: AppendToOutput(reg.Is64Bits() ? "ip1" : "wip1"); return;
+      case TR:  AppendToOutput(reg.Is64Bits() ? "tr"  :  "w18"); return;
+      case ETR: AppendToOutput(reg.Is64Bits() ? "etr" :  "w21"); return;
+      case FP:  AppendToOutput(reg.Is64Bits() ? "fp"  :  "w29"); return;
+      case LR:  AppendToOutput(reg.Is64Bits() ? "lr"  :  "w30"); return;
+      default:
+        // Not an aliased register; use the default name below.
+        break;
+    }
+  }
+  // Print other register names as usual.
+  Disassembler::AppendRegisterNameToOutput(instr, reg);
+}
+
+void CustomDisassembler::VisitLoadLiteral(const vixl::Instruction* instr) {
+  Disassembler::VisitLoadLiteral(instr);  // Emit the standard disassembly first.
+
+  if (!read_literals_) {
+    return;
+  }
+
+  char* buffer = buffer_;
+  char* buffer_end = buffer_ + buffer_size_;
+
+  // Find the end of the text already in the buffer. Check the bound before dereferencing.
+  while ((buffer < buffer_end) && (*buffer != 0)) {
+    ++buffer;
+  }
+
+  void* data_address = instr->LiteralAddress<void*>();
+  ptrdiff_t buf_size_remaining = buffer_end - buffer;
+  vixl::Instr op = instr->Mask(vixl::LoadLiteralMask);
+
+  switch (op) {
+    case vixl::LDR_w_lit:
+    case vixl::LDR_x_lit:
+    case vixl::LDRSW_x_lit: {
+      int64_t data = op == vixl::LDR_x_lit ? *reinterpret_cast<int64_t*>(data_address)
+                                           : *reinterpret_cast<int32_t*>(data_address);
+      snprintf(buffer, buf_size_remaining, " (0x%" PRIx64 " / %" PRId64 ")", data, data);
+      break;
+    }
+    case vixl::LDR_s_lit:
+    case vixl::LDR_d_lit: {
+      double data = (op == vixl::LDR_s_lit) ? *reinterpret_cast<float*>(data_address)
+                                            : *reinterpret_cast<double*>(data_address);
+      snprintf(buffer, buf_size_remaining, " (%g)", data);
+      break;
+    }
+    default:
+      break;
+  }
+}
+
 size_t DisassemblerArm64::Dump(std::ostream& os, const uint8_t* begin) {
   const vixl::Instruction* instr = reinterpret_cast<const vixl::Instruction*>(begin);
   decoder.Decode(instr);
-  os << FormatInstructionPointer(begin)
+  // TODO: Use FormatInstructionPointer() once VIXL provides the appropriate
+  // features.
+  // VIXL does not yet allow remapping addresses disassembled. Using
+  // FormatInstructionPointer() would show incoherences between the instruction
+  // location addresses and the target addresses disassembled by VIXL (eg. for
+  // branch instructions).
+  os << StringPrintf("%p", instr)
      << StringPrintf(": %08x\t%s\n", instr->InstructionBits(), disasm.GetOutput());
   return vixl::kInstructionSize;
 }
diff --git a/disassembler/disassembler_arm64.h b/disassembler/disassembler_arm64.h
index e56fe4f..a370b8d 100644
--- a/disassembler/disassembler_arm64.h
+++ b/disassembler/disassembler_arm64.h
@@ -28,9 +28,35 @@
 namespace art {
 namespace arm64 {
 
+class CustomDisassembler FINAL : public vixl::Disassembler {
+ public:
+  explicit CustomDisassembler(bool read_literals) :
+      vixl::Disassembler(), read_literals_(read_literals) {}
+
+  // Use register aliases in the disassembly.
+  virtual void AppendRegisterNameToOutput(const vixl::Instruction* instr,
+                                          const vixl::CPURegister& reg) OVERRIDE;
+
+  // Improve the disassembly of literal load instructions.
+  virtual void VisitLoadLiteral(const vixl::Instruction* instr) OVERRIDE;
+
+ private:
+  // Indicate if the disassembler should read data loaded from literal pools.
+  // This should only be enabled if reading the target of literal loads is safe.
+  // Here are possible outputs when the option is on or off:
+  // read_literals_ | disassembly
+  //           true | 0x72681558: 1c000acb  ldr s11, pc+344 (addr 0x726816b0) (3.40282e+38)
+  //          false | 0x72681558: 1c000acb  ldr s11, pc+344 (addr 0x726816b0)
+  const bool read_literals_;
+};
+
 class DisassemblerArm64 FINAL : public Disassembler {
  public:
-  explicit DisassemblerArm64(DisassemblerOptions* options) : Disassembler(options) {
+  // TODO: Update this code once VIXL provides the ability to map code addresses
+  // to disassemble as a different address (the way FormatInstructionPointer()
+  // does).
+  explicit DisassemblerArm64(DisassemblerOptions* options) :
+      Disassembler(options), disasm(options->can_read_literals_) {
     decoder.AppendVisitor(&disasm);
   }
 
@@ -39,7 +65,7 @@
 
  private:
   vixl::Decoder decoder;
-  vixl::Disassembler disasm;
+  CustomDisassembler disasm;
 
   DISALLOW_COPY_AND_ASSIGN(DisassemblerArm64);
 };
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index d6309f7..d28b626 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -182,8 +182,8 @@
 
     bool result = builder_->Write();
 
-    elf_output_->Flush();
-    elf_output_->Close();
+    // Ignore I/O errors.
+    UNUSED(elf_output_->FlushClose());
 
     return result;
   }
@@ -386,9 +386,11 @@
     : oat_file_(oat_file),
       oat_dex_files_(oat_file.GetOatDexFiles()),
       options_(options),
-      disassembler_(Disassembler::Create(oat_file_.GetOatHeader().GetInstructionSet(),
+      instruction_set_(oat_file_.GetOatHeader().GetInstructionSet()),
+      disassembler_(Disassembler::Create(instruction_set_,
                                          new DisassemblerOptions(options_->absolute_addresses_,
-                                                                 oat_file.Begin()))) {
+                                                                 oat_file.Begin(),
+                                                                 true /* can_read_literals_ */))) {
     CHECK(options_->class_loader_ != nullptr);
     AddAllOffsets();
   }
@@ -398,6 +400,10 @@
     delete disassembler_;
   }
 
+  InstructionSet GetInstructionSet() {
+    return instruction_set_;
+  }
+
   bool Dump(std::ostream& os) {
     bool success = true;
     const OatHeader& oat_header = oat_file_.GetOatHeader();
@@ -514,7 +520,7 @@
     return end_offset - begin_offset;
   }
 
-  InstructionSet GetInstructionSet() {
+  InstructionSet GetOatInstructionSet() {
     return oat_file_.GetOatHeader().GetInstructionSet();
   }
 
@@ -599,7 +605,7 @@
     offsets_.insert(code_offset);
     offsets_.insert(oat_method.GetMappingTableOffset());
     offsets_.insert(oat_method.GetVmapTableOffset());
-    offsets_.insert(oat_method.GetNativeGcMapOffset());
+    offsets_.insert(oat_method.GetGcMapOffset());
   }
 
   bool DumpOatDexFile(std::ostream& os, const OatFile::OatDexFile& oat_dex_file) {
@@ -749,9 +755,9 @@
 
       *indent2_os << "gc_map: ";
       if (options_->absolute_addresses_) {
-        *indent2_os << StringPrintf("%p ", oat_method.GetNativeGcMap());
+        *indent2_os << StringPrintf("%p ", oat_method.GetGcMap());
       }
-      uint32_t gc_map_offset = oat_method.GetNativeGcMapOffset();
+      uint32_t gc_map_offset = oat_method.GetGcMapOffset();
       *indent2_os << StringPrintf("(offset=0x%08x)\n", gc_map_offset);
       if (gc_map_offset > oat_file_.Size()) {
         *indent2_os << StringPrintf("WARNING: "
@@ -929,7 +935,7 @@
     // If the native GC map is null, then this method has been compiled with the
     // optimizing compiler. The optimizing compiler currently outputs its stack map
     // in the vmap table, and the code below does not work with such a stack map.
-    if (oat_method.GetNativeGcMap() == nullptr) {
+    if (oat_method.GetGcMap() == nullptr) {
       return;
     }
     const uint8_t* raw_table = oat_method.GetVmapTable();
@@ -1044,7 +1050,7 @@
   }
   void DumpGcMap(std::ostream& os, const OatFile::OatMethod& oat_method,
                  const DexFile::CodeItem* code_item) {
-    const uint8_t* gc_map_raw = oat_method.GetNativeGcMap();
+    const uint8_t* gc_map_raw = oat_method.GetGcMap();
     if (gc_map_raw == nullptr) {
       return;  // No GC map.
     }
@@ -1122,7 +1128,7 @@
 
   void DumpGcMapAtNativePcOffset(std::ostream& os, const OatFile::OatMethod& oat_method,
                                  const DexFile::CodeItem* code_item, size_t native_pc_offset) {
-    const uint8_t* gc_map_raw = oat_method.GetNativeGcMap();
+    const uint8_t* gc_map_raw = oat_method.GetGcMap();
     if (gc_map_raw != nullptr) {
       NativePcOffsetToReferenceMap map(gc_map_raw);
       if (map.HasEntry(native_pc_offset)) {
@@ -1259,6 +1265,7 @@
   const OatFile& oat_file_;
   const std::vector<const OatFile::OatDexFile*> oat_dex_files_;
   const OatDumperOptions* options_;
+  InstructionSet instruction_set_;
   std::set<uintptr_t> offsets_;
   Disassembler* disassembler_;
 };
@@ -1520,7 +1527,8 @@
 
   const void* GetQuickOatCodeBegin(mirror::ArtMethod* m)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    const void* quick_code = m->GetEntryPointFromQuickCompiledCode();
+    const void* quick_code = m->GetEntryPointFromQuickCompiledCodePtrSize(
+        InstructionSetPointerSize(oat_dumper_->GetOatInstructionSet()));
     if (Runtime::Current()->GetClassLinker()->IsQuickResolutionStub(quick_code)) {
       quick_code = oat_dumper_->GetQuickOatCode(m);
     }
@@ -1621,11 +1629,13 @@
         }
       }
     } else if (obj->IsArtMethod()) {
+      const size_t image_pointer_size = InstructionSetPointerSize(
+          state->oat_dumper_->GetOatInstructionSet());
       mirror::ArtMethod* method = obj->AsArtMethod();
       if (method->IsNative()) {
         // TODO: portable dumping.
-        DCHECK(method->GetNativeGcMap() == nullptr) << PrettyMethod(method);
-        DCHECK(method->GetMappingTable() == nullptr) << PrettyMethod(method);
+        DCHECK(method->GetNativeGcMap(image_pointer_size) == nullptr) << PrettyMethod(method);
+        DCHECK(method->GetMappingTable(image_pointer_size) == nullptr) << PrettyMethod(method);
         bool first_occurrence;
         const void* quick_oat_code = state->GetQuickOatCodeBegin(method);
         uint32_t quick_oat_code_size = state->GetQuickOatCodeSize(method);
@@ -1633,33 +1643,35 @@
         if (first_occurrence) {
           state->stats_.native_to_managed_code_bytes += quick_oat_code_size;
         }
-        if (quick_oat_code != method->GetEntryPointFromQuickCompiledCode()) {
+        if (quick_oat_code != method->GetEntryPointFromQuickCompiledCodePtrSize(
+            image_pointer_size)) {
           indent_os << StringPrintf("OAT CODE: %p\n", quick_oat_code);
         }
       } else if (method->IsAbstract() || method->IsCalleeSaveMethod() ||
           method->IsResolutionMethod() || method->IsImtConflictMethod() ||
           method->IsImtUnimplementedMethod() || method->IsClassInitializer()) {
-        DCHECK(method->GetNativeGcMap() == nullptr) << PrettyMethod(method);
-        DCHECK(method->GetMappingTable() == nullptr) << PrettyMethod(method);
+        DCHECK(method->GetNativeGcMap(image_pointer_size) == nullptr) << PrettyMethod(method);
+        DCHECK(method->GetMappingTable(image_pointer_size) == nullptr) << PrettyMethod(method);
       } else {
         const DexFile::CodeItem* code_item = method->GetCodeItem();
         size_t dex_instruction_bytes = code_item->insns_size_in_code_units_ * 2;
         state->stats_.dex_instruction_bytes += dex_instruction_bytes;
 
         bool first_occurrence;
-        size_t gc_map_bytes = state->ComputeOatSize(method->GetNativeGcMap(), &first_occurrence);
+        size_t gc_map_bytes =
+            state->ComputeOatSize(method->GetNativeGcMap(image_pointer_size), &first_occurrence);
         if (first_occurrence) {
           state->stats_.gc_map_bytes += gc_map_bytes;
         }
 
         size_t pc_mapping_table_bytes =
-            state->ComputeOatSize(method->GetMappingTable(), &first_occurrence);
+            state->ComputeOatSize(method->GetMappingTable(image_pointer_size), &first_occurrence);
         if (first_occurrence) {
           state->stats_.pc_mapping_table_bytes += pc_mapping_table_bytes;
         }
 
         size_t vmap_table_bytes =
-            state->ComputeOatSize(method->GetVmapTable(), &first_occurrence);
+            state->ComputeOatSize(method->GetVmapTable(image_pointer_size), &first_occurrence);
         if (first_occurrence) {
           state->stats_.vmap_table_bytes += vmap_table_bytes;
         }
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index 281649e..68fd15b 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -565,12 +565,6 @@
     copy->SetEntryPointFromJniPtrSize(reinterpret_cast<void*>(native_method + delta_),
                                       pointer_size);
   }
-
-  uintptr_t native_gc_map = reinterpret_cast<uintptr_t>(
-      object->GetNativeGcMapPtrSize(pointer_size));
-  if (native_gc_map != 0) {
-    copy->SetNativeGcMapPtrSize(reinterpret_cast<uint8_t*>(native_gc_map + delta_), pointer_size);
-  }
 }
 
 bool PatchOat::Patch(File* input_oat, off_t delta, File* output_oat, TimingLogger* timings,
@@ -904,6 +898,20 @@
   }
 }
 
+// Flush and close the file (close=true), or erase it. Returns true only if closing succeeded.
+static bool FinishFile(File* file, bool close) {
+  if (close) {
+    if (file->FlushCloseOrErase() != 0) {
+      PLOG(ERROR) << "Failed to flush and close file.";
+      return false;
+    }
+    return true;
+  } else {
+    file->Erase();
+    return false;
+  }
+}
+
 static int patchoat(int argc, char **argv) {
   InitLogging(argv);
   MemMap::Init();
@@ -1175,7 +1183,7 @@
       if (output_image_filename.empty()) {
         output_image_filename = "output-image-file";
       }
-      output_image.reset(new File(output_image_fd, output_image_filename));
+      output_image.reset(new File(output_image_fd, output_image_filename, true));
     } else {
       CHECK(!output_image_filename.empty());
       output_image.reset(CreateOrOpen(output_image_filename.c_str(), &new_image_out));
@@ -1189,7 +1197,7 @@
       if (input_oat_filename.empty()) {
         input_oat_filename = "input-oat-file";
       }
-      input_oat.reset(new File(input_oat_fd, input_oat_filename));
+      input_oat.reset(new File(input_oat_fd, input_oat_filename, false));
       if (input_oat == nullptr) {
         // Unlikely, but ensure exhaustive logging in non-0 exit code case
         LOG(ERROR) << "Failed to open input oat file by its FD" << input_oat_fd;
@@ -1208,7 +1216,7 @@
       if (output_oat_filename.empty()) {
         output_oat_filename = "output-oat-file";
       }
-      output_oat.reset(new File(output_oat_fd, output_oat_filename));
+      output_oat.reset(new File(output_oat_fd, output_oat_filename, true));
       if (output_oat == nullptr) {
         // Unlikely, but ensure exhaustive logging in non-0 exit code case
         LOG(ERROR) << "Failed to open output oat file by its FD" << output_oat_fd;
@@ -1281,14 +1289,20 @@
                           output_oat.get(), output_image.get(), isa, &timings,
                           output_oat_fd >= 0,  // was it opened from FD?
                           new_oat_out);
+    // Always call FinishFile so failed outputs get erased. The order here doesn't matter: if one
+    // is saved and the other erased, ImageSpace will still detect a problem and not use the files.
+    ret = FinishFile(output_image.get(), ret);
+    ret = FinishFile(output_oat.get(), ret);
   } else if (have_oat_files) {
     TimingLogger::ScopedTiming pt("patch oat", &timings);
     ret = PatchOat::Patch(input_oat.get(), base_delta, output_oat.get(), &timings,
                           output_oat_fd >= 0,  // was it opened from FD?
                           new_oat_out);
+    ret = FinishFile(output_oat.get(), ret);
   } else if (have_image_files) {
     TimingLogger::ScopedTiming pt("patch image", &timings);
     ret = PatchOat::Patch(input_image_location, base_delta, output_image.get(), isa, &timings);
+    ret = FinishFile(output_image.get(), ret);
   } else {
     CHECK(false);
     ret = true;
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 25fe45f..087c0ea 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -302,6 +302,7 @@
   base/allocator.h \
   base/mutex.h \
   debugger.h \
+  base/unix_file/fd_file.h \
   dex_file.h \
   dex_instruction.h \
   gc/allocator/rosalloc.h \
@@ -389,7 +390,9 @@
   LOCAL_CPP_EXTENSION := $$(ART_CPP_EXTENSION)
   ifeq ($$(art_ndebug_or_debug),ndebug)
     LOCAL_MODULE := libart
-    LOCAL_FDO_SUPPORT := true
+    ifeq ($$(art_target_or_host),target)
+      LOCAL_FDO_SUPPORT := true
+    endif
   else # debug
     LOCAL_MODULE := libartd
   endif
diff --git a/runtime/arch/arm/instruction_set_features_arm.cc b/runtime/arch/arm/instruction_set_features_arm.cc
index f49c037..f8590d3 100644
--- a/runtime/arch/arm/instruction_set_features_arm.cc
+++ b/runtime/arch/arm/instruction_set_features_arm.cc
@@ -108,12 +108,7 @@
 }
 
 const ArmInstructionSetFeatures* ArmInstructionSetFeatures::FromCppDefines() {
-#if defined(HAVE_ANDROID_OS) && (ANDROID_SMP == 0)
-  const bool smp = false;
-#else
   const bool smp = true;
-#endif
-
 #if defined(__ARM_ARCH_EXT_IDIV__)
   const bool has_div = true;
 #else
@@ -204,11 +199,8 @@
 }
 
 const ArmInstructionSetFeatures* ArmInstructionSetFeatures::FromAssembly() {
-#if defined(HAVE_ANDROID_OS) && (ANDROID_SMP == 0)
-  const bool smp = false;
-#else
   const bool smp = true;
-#endif
+
   // See if have a sdiv instruction.  Register a signal handler and try to execute an sdiv
   // instruction.  If we get a SIGILL then it's not supported.
   struct sigaction sa, osa;
diff --git a/runtime/arch/arm64/instruction_set_features_arm64.cc b/runtime/arch/arm64/instruction_set_features_arm64.cc
index 5bc943c..a1270dc 100644
--- a/runtime/arch/arm64/instruction_set_features_arm64.cc
+++ b/runtime/arch/arm64/instruction_set_features_arm64.cc
@@ -58,12 +58,7 @@
 }
 
 const Arm64InstructionSetFeatures* Arm64InstructionSetFeatures::FromCppDefines() {
-#if defined(HAVE_ANDROID_OS) && (ANDROID_SMP == 0)
-  const bool smp = false;
-#else
   const bool smp = true;
-#endif
-
   const bool is_a53 = true;  // Pessimistically assume all ARM64s are A53s.
   return new Arm64InstructionSetFeatures(smp, is_a53);
 }
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 5bece18..02c0982 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -19,8 +19,8 @@
 
 #include "asm_support.h"
 
-#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 64
-#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 64
+#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 48
+#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 48
 #define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 64
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
diff --git a/runtime/arch/mips/instruction_set_features_mips.cc b/runtime/arch/mips/instruction_set_features_mips.cc
index efec993..11be2a8 100644
--- a/runtime/arch/mips/instruction_set_features_mips.cc
+++ b/runtime/arch/mips/instruction_set_features_mips.cc
@@ -44,11 +44,7 @@
 }
 
 const MipsInstructionSetFeatures* MipsInstructionSetFeatures::FromCppDefines() {
-#if defined(HAVE_ANDROID_OS) && (ANDROID_SMP == 0)
-  const bool smp = false;
-#else
   const bool smp = true;
-#endif
 
   // TODO: here we assume the FPU is always 32-bit.
   const bool fpu_32bit = true;
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 4824857..44feee6 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -26,44 +26,48 @@
     /* Deliver an exception pending on a thread */
     .extern artDeliverPendingExceptionFromCode
 
+#define ARG_SLOT_SIZE   32    // space for a0-a3 plus 4 more words
+
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveAll)
-     * Callee-save: $s0-$s8 + $gp + $ra, 11 total + 1 word padding + 4 open words for args
-     * Clobbers $t0 and $gp
+     * Callee-save: $s0-$s8 + $gp + $ra, 11 total + 1 word for Method*
+     * Clobbers $t0 and $sp
+     * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
+     * Reserves FRAME_SIZE_SAVE_ALL_CALLEE_SAVE + ARG_SLOT_SIZE bytes on the stack
      */
 .macro SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
-    addiu  $sp, $sp, -64
-    .cfi_adjust_cfa_offset 64
+    addiu  $sp, $sp, -48
+    .cfi_adjust_cfa_offset 48
 
      // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 64)
+#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 48)
 #error "SAVE_ALL_CALLEE_SAVE_FRAME(MIPS) size not as expected."
 #endif
 
-    sw     $ra, 60($sp)
-    .cfi_rel_offset 31, 60
-    sw     $s8, 56($sp)
-    .cfi_rel_offset 30, 56
-    sw     $gp, 52($sp)
-    .cfi_rel_offset 28, 52
-    sw     $s7, 48($sp)
-    .cfi_rel_offset 23, 48
-    sw     $s6, 44($sp)
-    .cfi_rel_offset 22, 44
-    sw     $s5, 40($sp)
-    .cfi_rel_offset 21, 40
-    sw     $s4, 36($sp)
-    .cfi_rel_offset 20, 36
-    sw     $s3, 32($sp)
-    .cfi_rel_offset 19, 32
-    sw     $s2, 28($sp)
-    .cfi_rel_offset 18, 28
-    sw     $s1, 24($sp)
-    .cfi_rel_offset 17, 24
-    sw     $s0, 20($sp)
-    .cfi_rel_offset 16, 20
-    # 1 word for alignment, 4 open words for args $a0-$a3, bottom will hold Method*
+    sw     $ra, 44($sp)
+    .cfi_rel_offset 31, 44
+    sw     $s8, 40($sp)
+    .cfi_rel_offset 30, 40
+    sw     $gp, 36($sp)
+    .cfi_rel_offset 28, 36
+    sw     $s7, 32($sp)
+    .cfi_rel_offset 23, 32
+    sw     $s6, 28($sp)
+    .cfi_rel_offset 22, 28
+    sw     $s5, 24($sp)
+    .cfi_rel_offset 21, 24
+    sw     $s4, 20($sp)
+    .cfi_rel_offset 20, 20
+    sw     $s3, 16($sp)
+    .cfi_rel_offset 19, 16
+    sw     $s2, 12($sp)
+    .cfi_rel_offset 18, 12
+    sw     $s1, 8($sp)
+    .cfi_rel_offset 17, 8
+    sw     $s0, 4($sp)
+    .cfi_rel_offset 16, 4
+    # 1 word for holding Method*
 
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
@@ -71,42 +75,47 @@
     lw $t0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($t0)
     sw $t0, 0($sp)                                # Place Method* at bottom of stack.
     sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
+    addiu  $sp, $sp, -ARG_SLOT_SIZE               # reserve argument slots on the stack
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
 .endm
 
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kRefsOnly). Restoration assumes non-moving GC.
      * Does not include rSUSPEND or rSELF
-     * callee-save: $s2-$s8 + $gp + $ra, 9 total + 3 words padding + 4 open words for args
+     * callee-save: $s2-$s8 + $gp + $ra, 9 total + 2 words padding + 1 word to hold Method*
+     * Clobbers $t0 and $sp
+     * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
+     * Reserves FRAME_SIZE_REFS_ONLY_CALLEE_SAVE + ARG_SLOT_SIZE bytes on the stack
      */
 .macro SETUP_REFS_ONLY_CALLEE_SAVE_FRAME
-    addiu  $sp, $sp, -64
-    .cfi_adjust_cfa_offset 64
+    addiu  $sp, $sp, -48
+    .cfi_adjust_cfa_offset 48
 
     // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_REFS_ONLY_CALLEE_SAVE != 64)
+#if (FRAME_SIZE_REFS_ONLY_CALLEE_SAVE != 48)
 #error "REFS_ONLY_CALLEE_SAVE_FRAME(MIPS) size not as expected."
 #endif
 
-    sw     $ra, 60($sp)
-    .cfi_rel_offset 31, 60
-    sw     $s8, 56($sp)
-    .cfi_rel_offset 30, 56
-    sw     $gp, 52($sp)
-    .cfi_rel_offset 28, 52
-    sw     $s7, 48($sp)
-    .cfi_rel_offset 23, 48
-    sw     $s6, 44($sp)
-    .cfi_rel_offset 22, 44
-    sw     $s5, 40($sp)
-    .cfi_rel_offset 21, 40
-    sw     $s4, 36($sp)
-    .cfi_rel_offset 20, 36
-    sw     $s3, 32($sp)
-    .cfi_rel_offset 19, 32
-    sw     $s2, 28($sp)
-    .cfi_rel_offset 18, 28
-    # 3 words for alignment and extra args, 4 open words for args $a0-$a3, bottom will hold Method*
+    sw     $ra, 44($sp)
+    .cfi_rel_offset 31, 44
+    sw     $s8, 40($sp)
+    .cfi_rel_offset 30, 40
+    sw     $gp, 36($sp)
+    .cfi_rel_offset 28, 36
+    sw     $s7, 32($sp)
+    .cfi_rel_offset 23, 32
+    sw     $s6, 28($sp)
+    .cfi_rel_offset 22, 28
+    sw     $s5, 24($sp)
+    .cfi_rel_offset 21, 24
+    sw     $s4, 20($sp)
+    .cfi_rel_offset 20, 20
+    sw     $s3, 16($sp)
+    .cfi_rel_offset 19, 16
+    sw     $s2, 12($sp)
+    .cfi_rel_offset 18, 12
+    # 2 words for alignment and bottom word will hold Method*
 
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
@@ -114,61 +123,47 @@
     lw $t0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($t0)
     sw $t0, 0($sp)                                # Place Method* at bottom of stack.
     sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
+    addiu  $sp, $sp, -ARG_SLOT_SIZE               # reserve argument slots on the stack
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
 .endm
 
 .macro RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
-    lw     $ra, 60($sp)
+    addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
+    .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
+    lw     $ra, 44($sp)
     .cfi_restore 31
-    lw     $s8, 56($sp)
+    lw     $s8, 40($sp)
     .cfi_restore 30
-    lw     $gp, 52($sp)
+    lw     $gp, 36($sp)
     .cfi_restore 28
-    lw     $s7, 48($sp)
+    lw     $s7, 32($sp)
     .cfi_restore 23
-    lw     $s6, 44($sp)
+    lw     $s6, 28($sp)
     .cfi_restore 22
-    lw     $s5, 40($sp)
+    lw     $s5, 24($sp)
     .cfi_restore 21
-    lw     $s4, 36($sp)
+    lw     $s4, 20($sp)
     .cfi_restore 20
-    lw     $s3, 32($sp)
+    lw     $s3, 16($sp)
     .cfi_restore 19
-    lw     $s2, 28($sp)
+    lw     $s2, 12($sp)
     .cfi_restore 18
-    addiu  $sp, $sp, 64
-    .cfi_adjust_cfa_offset -64
+    addiu  $sp, $sp, 48
+    .cfi_adjust_cfa_offset -48
 .endm
 
 .macro RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
-    lw     $ra, 60($sp)
-    .cfi_restore 31
-    lw     $s8, 56($sp)
-    .cfi_restore 30
-    lw     $gp, 52($sp)
-    .cfi_restore 28
-    lw     $s7, 48($sp)
-    .cfi_restore 23
-    lw     $s6, 44($sp)
-    .cfi_restore 22
-    lw     $s5, 40($sp)
-    .cfi_restore 21
-    lw     $s4, 36($sp)
-    .cfi_restore 20
-    lw     $s3, 32($sp)
-    .cfi_restore 19
-    lw     $s2, 28($sp)
-    .cfi_restore 18
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
     jr     $ra
-    addiu  $sp, $sp, 64
-    .cfi_adjust_cfa_offset -64
+    nop
 .endm
 
     /*
      * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(kRefsAndArgs). Restoration assumes non-moving GC.
+     * Runtime::CreateCalleeSaveMethod(kRefsAndArgs).
      * callee-save: $a1-$a3, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
      */
-.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
     addiu  $sp, $sp, -64
     .cfi_adjust_cfa_offset 64
 
@@ -202,16 +197,48 @@
     sw     $a1, 4($sp)
     .cfi_rel_offset 5, 4
     # bottom will hold Method*
+.endm
 
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kRefsAndArgs). Restoration assumes non-moving GC.
+     * callee-save: $a1-$a3, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * Clobbers $t0 and $sp
+     * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
+     * Reserves FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE + ARG_SLOT_SIZE bytes on the stack
+     */
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
+    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
     THIS_LOAD_REQUIRES_READ_BARRIER
     lw $t0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($t0)
     sw $t0, 0($sp)                                # Place Method* at bottom of stack.
     sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
+    addiu  $sp, $sp, -ARG_SLOT_SIZE               # reserve argument slots on the stack
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
+.endm
+
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kRefsAndArgs). Restoration assumes non-moving GC.
+     * callee-save: $a1-$a3, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * Clobbers $sp
+     * Uses $a0 as the Method* and stores it at the bottom of the stack.
+     * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
+     * Reserves FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE + ARG_SLOT_SIZE bytes on the stack
+     */
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_A0
+    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
+    sw $a0, 0($sp)                                # Place Method* at bottom of stack.
+    sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
+    addiu  $sp, $sp, -ARG_SLOT_SIZE               # reserve argument slots on the stack
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
 .endm
 
 .macro RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
+    addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
+    .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
     lw     $ra, 60($sp)
     .cfi_restore 31
     lw     $s8, 56($sp)
@@ -444,21 +471,15 @@
     .extern \cxx_name
 ENTRY \c_name
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME  # save callee saves in case allocation triggers GC
-    lw    $a2, FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE($sp)                    # pass caller Method*
-    move  $t0, $sp                        # save $sp
-    addiu $sp, $sp, -32                   # make space for extra args
-    .cfi_adjust_cfa_offset 32
+    lw    $a2, FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE+ARG_SLOT_SIZE($sp)    # pass caller Method*
+    addiu $t0, $sp, ARG_SLOT_SIZE         # save $sp (remove arg slots)
     move  $a3, rSELF                      # pass Thread::Current
-    .cfi_rel_offset 28, 12
     jal   \cxx_name                       # (method_idx, this, caller, Thread*, $sp)
     sw    $t0, 16($sp)                    # pass $sp
-    addiu $sp, $sp, 32                    # release out args
-    .cfi_adjust_cfa_offset -32
     move  $a0, $v0                        # save target Method*
-    move  $t9, $v1                        # save $v0->code_
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     beqz  $v0, 1f
-    nop
+    move  $t9, $v1                        # save $v0->code_
     jr    $t9
     nop
 1:
@@ -500,10 +521,10 @@
     .cfi_def_cfa_register 30
     move  $s1, $a3              # move managed thread pointer into s1
     addiu $s0, $zero, SUSPEND_CHECK_INTERVAL  # reset s0 to suspend check interval
-    addiu $t0, $a2, 16          # create space for method pointer in frame
-    srl   $t0, $t0, 4           # shift the frame size right 4
-    sll   $t0, $t0, 4           # shift the frame size left 4 to align to 16 bytes
-    subu  $sp, $sp, $t0         # reserve stack space for argument array
+    addiu $t0, $a2, 4           # create space for method pointer in frame.
+    subu  $t0, $sp, $t0         # reserve & align *stack* to 16 bytes:
+    srl   $t0, $t0, 4           # native calling convention only aligns to 8B,
+    sll   $sp, $t0, 4           # so we have to ensure ART 16B alignment ourselves.
     addiu $a0, $sp, 4           # pass stack pointer + method ptr as dest for memcpy
     jal   memcpy                # (dest, src, bytes)
     addiu $sp, $sp, -16         # make space for argument slots for memcpy
@@ -548,8 +569,8 @@
      */
     .extern artHandleFillArrayDataFromCode
 ENTRY art_quick_handle_fill_data
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  # save callee saves in case exception allocation triggers GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)                   # pass referrer's Method*
+    lw     $a2, 0($sp)                    # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case exception allocation triggers GC
     jal    artHandleFillArrayDataFromCode # (payload offset, Array*, method, Thread*)
     move   $a3, rSELF                     # pass Thread::Current
     RETURN_IF_ZERO
@@ -562,7 +583,7 @@
 ENTRY art_quick_lock_object
     beqz    $a0, .Lart_quick_throw_null_pointer_exception_gp_set
     nop
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME      # save callee saves in case we block
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case we block
     jal     artLockObjectFromCode         # (Object* obj, Thread*)
     move    $a1, rSELF                    # pass Thread::Current
     RETURN_IF_ZERO
@@ -575,7 +596,7 @@
 ENTRY art_quick_unlock_object
     beqz    $a0, .Lart_quick_throw_null_pointer_exception_gp_set
     nop
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  # save callee saves in case exception allocation triggers GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME # save callee saves in case exception allocation triggers GC
     jal     artUnlockObjectFromCode   # (Object* obj, Thread*)
     move    $a1, rSELF                # pass Thread::Current
     RETURN_IF_ZERO
@@ -594,7 +615,8 @@
     sw     $a1, 4($sp)
     sw     $a0, 0($sp)
     jal    artIsAssignableFromCode
-    nop
+    addiu  $sp, $sp, -16             # reserve argument slots on the stack
+    addiu  $sp, $sp, 16
     beqz   $v0, .Lthrow_class_cast_exception
     lw     $ra, 12($sp)
     jr     $ra
@@ -670,7 +692,8 @@
     move   $a1, $t1
     move   $a0, $t0
     jal    artIsAssignableFromCode  # (Class*, Class*)
-    nop
+    addiu $sp, $sp, -16     # reserve argument slots on the stack
+    addiu $sp, $sp, 16
     lw     $ra, 28($sp)
     lw     $t9, 12($sp)
     lw     $a2, 8($sp)
@@ -694,7 +717,7 @@
      */
     .extern artInitializeStaticStorageFromCode
 ENTRY art_quick_initialize_static_storage
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME            # save callee saves in case of GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME           # save callee saves in case of GC
     # artInitializeStaticStorageFromCode(uint32_t type_idx, Method* referrer, Thread*)
     jal     artInitializeStaticStorageFromCode
     move    $a2, rSELF                          # pass Thread::Current
@@ -706,7 +729,7 @@
      */
     .extern artInitializeTypeFromCode
 ENTRY art_quick_initialize_type
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME           # save callee saves in case of GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME          # save callee saves in case of GC
     # artInitializeTypeFromCode(uint32_t type_idx, Method* referrer, Thread*)
     jal     artInitializeTypeFromCode
     move    $a2, rSELF                         # pass Thread::Current
@@ -719,7 +742,7 @@
      */
     .extern artInitializeTypeAndVerifyAccessFromCode
 ENTRY art_quick_initialize_type_and_verify_access
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME           # save callee saves in case of GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME          # save callee saves in case of GC
     # artInitializeTypeFromCode(uint32_t type_idx, Method* referrer, Thread*)
     jal     artInitializeTypeAndVerifyAccessFromCode
     move    $a2, rSELF                         # pass Thread::Current
@@ -730,8 +753,8 @@
      */
     .extern artGetBooleanStaticFromCode
 ENTRY art_quick_get_boolean_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetBooleanStaticFromCode   # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -741,8 +764,8 @@
      */
     .extern artGetByteStaticFromCode
 ENTRY art_quick_get_byte_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetByteStaticFromCode      # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -753,8 +776,8 @@
      */
     .extern artGetCharStaticFromCode
 ENTRY art_quick_get_char_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetCharStaticFromCode      # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -764,8 +787,8 @@
      */
     .extern artGetShortStaticFromCode
 ENTRY art_quick_get_short_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetShortStaticFromCode     # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -776,8 +799,8 @@
      */
     .extern artGet32StaticFromCode
 ENTRY art_quick_get32_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGet32StaticFromCode        # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -788,8 +811,8 @@
      */
     .extern artGet64StaticFromCode
 ENTRY art_quick_get64_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGet64StaticFromCode        # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -800,8 +823,8 @@
      */
     .extern artGetObjStaticFromCode
 ENTRY art_quick_get_obj_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetObjStaticFromCode       # (uint32_t field_idx, const Method* referrer, Thread*)
     move   $a2, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -812,8 +835,8 @@
      */
     .extern artGetBooleanInstanceFromCode
 ENTRY art_quick_get_boolean_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetBooleanInstanceFromCode # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -823,8 +846,8 @@
      */
     .extern artGetByteInstanceFromCode
 ENTRY art_quick_get_byte_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetByteInstanceFromCode    # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -835,8 +858,8 @@
      */
     .extern artGetCharInstanceFromCode
 ENTRY art_quick_get_char_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetCharInstanceFromCode    # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -846,9 +869,9 @@
      */
     .extern artGetShortInstanceFromCode
 ENTRY art_quick_get_short_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
-    jal    artGetShortInstanceFromCode      # (field_idx, Object*, referrer, Thread*)
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
+    jal    artGetShortInstanceFromCode   # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
 END art_quick_get_short_instance
@@ -858,11 +881,10 @@
      */
     .extern artGet32InstanceFromCode
 ENTRY art_quick_get32_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
+    jal    artGet32InstanceFromCode      # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
-    jal    artGet32InstanceFromCode      # (field_idx, Object*, referrer, Thread*, $sp)
-    sw     $sp, 16($sp)                  # pass $sp
     RETURN_IF_NO_EXCEPTION
 END art_quick_get32_instance
 
@@ -871,11 +893,10 @@
      */
     .extern artGet64InstanceFromCode
 ENTRY art_quick_get64_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
+    jal    artGet64InstanceFromCode      # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
-    jal    artGet64InstanceFromCode      # (field_idx, Object*, referrer, Thread*, $sp)
-    sw     $sp, 16($sp)                  # pass $sp
     RETURN_IF_NO_EXCEPTION
 END art_quick_get64_instance
 
@@ -884,8 +905,8 @@
      */
     .extern artGetObjInstanceFromCode
 ENTRY art_quick_get_obj_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artGetObjInstanceFromCode     # (field_idx, Object*, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_NO_EXCEPTION
@@ -896,8 +917,8 @@
      */
     .extern artSet8StaticFromCode
 ENTRY art_quick_set8_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSet8StaticFromCode         # (field_idx, new_val, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_ZERO
@@ -908,8 +929,8 @@
      */
     .extern artSet16StaticFromCode
 ENTRY art_quick_set16_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSet16StaticFromCode        # (field_idx, new_val, referrer, Thread*, $sp)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_ZERO
@@ -920,8 +941,8 @@
      */
     .extern artSet32StaticFromCode
 ENTRY art_quick_set32_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSet32StaticFromCode        # (field_idx, new_val, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_ZERO
@@ -932,8 +953,8 @@
      */
     .extern artSet64StaticFromCode
 ENTRY art_quick_set64_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a1, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a1, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSet64StaticFromCode        # (field_idx, referrer, new_val, Thread*)
     sw     rSELF, 16($sp)                # pass Thread::Current
     RETURN_IF_ZERO
@@ -944,10 +965,10 @@
      */
     .extern artSetObjStaticFromCode
 ENTRY art_quick_set_obj_static
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a2, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a2, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSetObjStaticFromCode       # (field_idx, new_val, referrer, Thread*)
     move   $a3, rSELF                    # pass Thread::Current
     RETURN_IF_ZERO
 END art_quick_set_obj_static
 
@@ -956,9 +977,9 @@
      */
     .extern artSet8InstanceFromCode
 ENTRY art_quick_set8_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a3, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
-    jal    artSet8InstanceFromCode      # (field_idx, Object*, new_val, referrer, Thread*)
+    lw     $a3, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
+    jal    artSet8InstanceFromCode       # (field_idx, Object*, new_val, referrer, Thread*)
     sw     rSELF, 16($sp)                # pass Thread::Current
     RETURN_IF_ZERO
 END art_quick_set8_instance
@@ -968,8 +989,8 @@
      */
     .extern artSet16InstanceFromCode
 ENTRY art_quick_set16_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a3, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a3, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSet16InstanceFromCode      # (field_idx, Object*, new_val, referrer, Thread*)
     sw     rSELF, 16($sp)                # pass Thread::Current
     RETURN_IF_ZERO
@@ -980,8 +1001,8 @@
      */
     .extern artSet32InstanceFromCode
 ENTRY art_quick_set32_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a3, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a3, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSet32InstanceFromCode      # (field_idx, Object*, new_val, referrer, Thread*)
     sw     rSELF, 16($sp)                # pass Thread::Current
     RETURN_IF_ZERO
@@ -992,11 +1013,11 @@
      */
     .extern artSet64InstanceFromCode
 ENTRY art_quick_set64_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $t0, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # load referrer's Method*
+    lw     $t1, 0($sp)                   # load referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     sw     rSELF, 20($sp)                # pass Thread::Current
     jal    artSet64InstanceFromCode      # (field_idx, Object*, new_val, referrer, Thread*)
-    sw     $t0, 16($sp)                  # pass referrer's Method*
+    sw     $t1, 16($sp)                  # pass referrer's Method*
     RETURN_IF_ZERO
 END art_quick_set64_instance
 
@@ -1005,8 +1026,8 @@
      */
     .extern artSetObjInstanceFromCode
 ENTRY art_quick_set_obj_instance
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME     # save callee saves in case of GC
-    lw     $a3, FRAME_SIZE_REFS_ONLY_CALLEE_SAVE($sp)  # pass referrer's Method*
+    lw     $a3, 0($sp)                   # pass referrer's Method*
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    # save callee saves in case of GC
     jal    artSetObjInstanceFromCode     # (field_idx, Object*, new_val, referrer, Thread*)
     sw     rSELF, 16($sp)                # pass Thread::Current
     RETURN_IF_ZERO
@@ -1020,7 +1041,7 @@
      */
     .extern artResolveStringFromCode
 ENTRY art_quick_resolve_string
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  # save callee saves in case of GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME # save callee saves in case of GC
     # artResolveStringFromCode(Method* referrer, uint32_t string_idx, Thread*)
     jal     artResolveStringFromCode
     move    $a2, rSELF                # pass Thread::Current
@@ -1032,7 +1053,7 @@
 .macro TWO_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  # save callee saves in case of GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME # save callee saves in case of GC
     jal     \entrypoint
     move    $a2, rSELF                # pass Thread::Current
     \return
@@ -1042,7 +1063,7 @@
 .macro THREE_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  # save callee saves in case of GC
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME # save callee saves in case of GC
     jal     \entrypoint
     move    $a3, rSELF                # pass Thread::Current
     \return
@@ -1075,13 +1096,12 @@
      */
     .extern artQuickProxyInvokeHandler
 ENTRY art_quick_proxy_invoke_handler
-    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
-    sw      $a0, 0($sp)            # place proxy method at bottom of frame
-    move    $a2, rSELF             # pass Thread::Current
+    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_A0
+    move    $a2, rSELF                  # pass Thread::Current
     jal     artQuickProxyInvokeHandler  # (Method* proxy method, receiver, Thread*, SP)
-    move    $a3, $sp               # pass $sp
+    addiu   $a3, $sp, ARG_SLOT_SIZE     # pass $sp (remove arg slots)
     lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     bnez    $t0, 1f
     mtc1    $v0, $f0               # place return value to FP return value
     jr      $ra
@@ -1107,11 +1127,11 @@
     .extern artQuickResolutionTrampoline
 ENTRY art_quick_resolution_trampoline
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
-    move    $a2, rSELF             # pass Thread::Current
+    move    $a2, rSELF                    # pass Thread::Current
     jal     artQuickResolutionTrampoline  # (Method* called, receiver, Thread*, SP)
-    move    $a3, $sp               # pass $sp
+    addiu   $a3, $sp, ARG_SLOT_SIZE       # pass $sp (remove arg slots)
     beqz    $v0, 1f
-    lw      $a0, 0($sp)            # load resolved method to $a0
+    lw      $a0, ARG_SLOT_SIZE($sp)       # load resolved method to $a0
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     move    $t9, $v0               # code pointer must be in $t9 to generate the global pointer
     jr      $v0                    # tail call to method
@@ -1121,16 +1141,75 @@
     DELIVER_PENDING_EXCEPTION
 END art_quick_resolution_trampoline
 
-UNIMPLEMENTED art_quick_generic_jni_trampoline
+    .extern artQuickGenericJniTrampoline
+    .extern artQuickGenericJniEndTrampoline
+ENTRY art_quick_generic_jni_trampoline
+    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_A0
+    move    $s8, $sp               # save $sp to $s8 (used below to tear down the alloca)
+    move    $s3, $gp               # save $gp to $s3 (restored after the native call)
+
+    # prepare for call to artQuickGenericJniTrampoline(Thread*, SP)
+    move    $a0, rSELF                     # pass Thread::Current
+    addiu   $a1, $sp, ARG_SLOT_SIZE        # pass $sp (remove arg slots)
+    jal     artQuickGenericJniTrampoline   # (Thread*, SP)
+    addiu   $sp, $sp, -5120                # reserve space on the stack (jal delay slot)
+
+    # The C call will have registered the complete save-frame on success.
+    # The result of the call is:
+    # v0: ptr to native code, 0 on error.
+    # v1: ptr to the bottom of the used area of the alloca, can restore stack till here.
+    beq     $v0, $zero, 1f         # check entry error
+    move    $t9, $v0               # save the code ptr (branch delay slot)
+    move    $sp, $v1               # release part of the alloca
+
+    # Load parameters from stack into registers
+    lw      $a0,   0($sp)
+    lw      $a1,   4($sp)
+    lw      $a2,   8($sp)
+
+    # Load FPRs the same as GPRs. Look at BuildNativeCallFrameStateMachine.
+    jalr    $t9                    # native call
+    lw      $a3,  12($sp)          # last register argument, loaded in the jalr delay slot
+    addiu   $sp, $sp, 16           # remove arg slots
+
+    move    $gp, $s3               # restore $gp from $s3
+
+    # result sign extension is handled in C code
+    # prepare for call to artQuickGenericJniEndTrampoline(Thread*, result, result_f)
+    move    $a0, rSELF             # pass Thread::Current
+    move    $a2, $v0               # pass result
+    move    $a3, $v1               # pass second result register
+    addiu   $sp, $sp, -24          # reserve arg slots
+    jal     artQuickGenericJniEndTrampoline  # (Thread*, result, result_f)
+    s.d     $f0, 16($sp)           # pass result_f
+    addiu   $sp, $sp, 24           # remove arg slots
+
+    lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
+    bne     $t0, $zero, 2f         # check for pending exceptions
+    move    $sp, $s8               # tear down the alloca (delay slot: runs on both paths)
+
+    # tear down the callee-save frame
+    RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
+
+    mtc1    $v0, $f0               # place return value to FP return value
+    jr      $ra
+    mtc1    $v1, $f1               # place return value to FP return value
+
+1:
+    move    $sp, $s8               # tear down the alloca
+2:
+    RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
+    DELIVER_PENDING_EXCEPTION
+END art_quick_generic_jni_trampoline
 
     .extern artQuickToInterpreterBridge
 ENTRY art_quick_to_interpreter_bridge
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
-    move    $a1, rSELF             # pass Thread::Current
-    jal     artQuickToInterpreterBridge    # (Method* method, Thread*, SP)
-    move    $a2, $sp               # pass $sp
+    move    $a1, rSELF                          # pass Thread::Current
+    jal     artQuickToInterpreterBridge         # (Method* method, Thread*, SP)
+    addiu   $a2, $sp, ARG_SLOT_SIZE             # pass $sp (remove arg slots)
     lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     bnez    $t0, 1f
     mtc1    $v0, $f0               # place return value to FP return value
     jr      $ra
@@ -1146,17 +1225,12 @@
     .extern artInstrumentationMethodExitFromCode
 ENTRY art_quick_instrumentation_entry
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME
-    move     $t0, $sp       # remember bottom of caller's frame
-    addiu    $sp, $sp, -32  # space for saved a0, pad (2 words), arguments (4 words)
-    .cfi_adjust_cfa_offset 32
-    sw       $a0, 28($sp)   # save arg0
+    sw       $a0, 28($sp)   # save arg0 in free arg slot
     move     $a3, $ra       # pass $ra
     jal      artInstrumentationMethodEntryFromCode  # (Method*, Object*, Thread*, LR)
     move     $a2, rSELF     # pass Thread::Current
     move     $t9, $v0       # $t9 holds reference to code
-    lw       $a0, 28($sp)   # restore arg0
-    addiu    $sp, $sp, 32   # remove args
-    .cfi_adjust_cfa_offset -32
+    lw       $a0, 28($sp)   # restore arg0 from free arg slot
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     jalr     $t9            # call method
     nop
@@ -1168,32 +1242,33 @@
     addiu    $t9, $ra, 4    # put current address into $t9 to rebuild $gp
     .cpload  $t9
     move     $ra, $zero     # link register is to here, so clobber with 0 for later checks
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME
-    move     $t0, $sp       # remember bottom of caller's frame
-    addiu    $sp, $sp, -48  # save return values and set up args
-    .cfi_adjust_cfa_offset 48
-    sw       $v0, 32($sp)
+
+    addiu    $sp, $sp, -16  # allocate temp storage on the stack
+    .cfi_adjust_cfa_offset 16
+    sw       $v0, 12($sp)
     .cfi_rel_offset 2, 32
-    sw       $v1, 36($sp)
+    sw       $v1, 8($sp)
     .cfi_rel_offset 3, 36 
-    s.s      $f0, 40($sp)
-    s.s      $f1, 44($sp)
+    s.s      $f0, 4($sp)
+    s.s      $f1, 0($sp)
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME
     s.s      $f0, 16($sp)   # pass fpr result
     s.s      $f1, 20($sp)
     move     $a2, $v0       # pass gpr result
     move     $a3, $v1
-    move     $a1, $t0       # pass $sp
+    addiu    $a1, $sp, ARG_SLOT_SIZE   # pass $sp (remove arg slots)
     jal      artInstrumentationMethodExitFromCode  # (Thread*, SP, gpr_res, fpr_res)
     move     $a0, rSELF     # pass Thread::Current
     move     $t0, $v0       # set aside returned link register
     move     $ra, $v1       # set link register for deoptimization
-    lw       $v0, 32($sp)   # restore return values
-    lw       $v1, 36($sp)
-    l.s      $f0, 40($sp)
-    l.s      $f1, 44($sp)
+    addiu    $sp, $sp, ARG_SLOT_SIZE+FRAME_SIZE_REFS_ONLY_CALLEE_SAVE  # args slot + refs_only callee save frame
+    lw       $v0, 12($sp)   # restore return values
+    lw       $v1, 8($sp)
+    l.s      $f0, 4($sp)
+    l.s      $f1, 0($sp)
     jr       $t0            # return
-    addiu    $sp, $sp, 112  # 48 bytes of args + 64 bytes of callee save frame
-    .cfi_adjust_cfa_offset -112
+    addiu    $sp, $sp, 16   # remove temp storage from stack
+    .cfi_adjust_cfa_offset -16
 END art_quick_instrumentation_exit
 
     /*
diff --git a/runtime/arch/mips/quick_method_frame_info_mips.h b/runtime/arch/mips/quick_method_frame_info_mips.h
index 2a8bcf0..5fbffbc 100644
--- a/runtime/arch/mips/quick_method_frame_info_mips.h
+++ b/runtime/arch/mips/quick_method_frame_info_mips.h
@@ -40,8 +40,7 @@
 
 constexpr uint32_t MipsCalleeSaveFrameSize(Runtime::CalleeSaveType type) {
   return RoundUp((POPCOUNT(MipsCalleeSaveCoreSpills(type)) /* gprs */ +
-                  (type == Runtime::kRefsAndArgs ? 0 : 3) + 1 /* Method* */) *
-                 kMipsPointerSize, kStackAlignment);
+                  1 /* Method* */) * kMipsPointerSize, kStackAlignment);
 }
 
 constexpr QuickMethodFrameInfo MipsCalleeSaveMethodFrameInfo(Runtime::CalleeSaveType type) {
diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc
index 32cf909..a12773d 100644
--- a/runtime/arch/x86/instruction_set_features_x86.cc
+++ b/runtime/arch/x86/instruction_set_features_x86.cc
@@ -70,11 +70,7 @@
 }
 
 const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCppDefines(bool x86_64) {
-#if defined(HAVE_ANDROID_OS) && (ANDROID_SMP == 0)
-  const bool smp = false;
-#else
   const bool smp = true;
-#endif
 
 #ifndef __SSSE3__
   const bool has_SSSE3 = false;
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 4b4c8855..7454cca 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -148,15 +148,15 @@
 ADD_TEST_EQ(MIRROR_ART_METHOD_DEX_CACHE_METHODS_OFFSET,
             art::mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value())
 
-#define MIRROR_ART_METHOD_PORTABLE_CODE_OFFSET_32     (48 + MIRROR_OBJECT_HEADER_SIZE)
+#define MIRROR_ART_METHOD_PORTABLE_CODE_OFFSET_32     (40 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_ART_METHOD_PORTABLE_CODE_OFFSET_32,
             art::mirror::ArtMethod::EntryPointFromPortableCompiledCodeOffset(4).Int32Value())
 
-#define MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32        (40 + MIRROR_OBJECT_HEADER_SIZE)
+#define MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32        (36 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32,
             art::mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(4).Int32Value())
 
-#define MIRROR_ART_METHOD_PORTABLE_CODE_OFFSET_64     (64 + MIRROR_OBJECT_HEADER_SIZE)
+#define MIRROR_ART_METHOD_PORTABLE_CODE_OFFSET_64     (56 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_ART_METHOD_PORTABLE_CODE_OFFSET_64,
             art::mirror::ArtMethod::EntryPointFromPortableCompiledCodeOffset(8).Int32Value())
 
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index c310191..cb69817 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -97,7 +97,9 @@
         }
       }
     }
-    CHECK(!bad_mutexes_held);
+    if (gAborting == 0) {  // Avoid recursive aborts.
+      CHECK(!bad_mutexes_held);
+    }
   }
   // Don't record monitors as they are outside the scope of analysis. They may be inspected off of
   // the monitor list.
@@ -112,7 +114,7 @@
     return;
   }
   if (level_ != kMonitorLock) {
-    if (kDebugLocking && !gAborting) {
+    if (kDebugLocking && gAborting == 0) {  // Avoid recursive aborts.
       CHECK(self->GetHeldMutex(level_) == this) << "Unlocking on unacquired mutex: " << name_;
     }
     self->SetHeldMutex(level_, NULL);
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 4957988..aa2aefc 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -209,7 +209,9 @@
         }
       }
     }
-    CHECK(!bad_mutexes_held);
+    if (gAborting == 0) {  // Avoid recursive aborts.
+      CHECK(!bad_mutexes_held);
+    }
   }
 }
 
diff --git a/runtime/base/scoped_flock.cc b/runtime/base/scoped_flock.cc
index bf091d0..0e93eee 100644
--- a/runtime/base/scoped_flock.cc
+++ b/runtime/base/scoped_flock.cc
@@ -27,6 +27,9 @@
 
 bool ScopedFlock::Init(const char* filename, std::string* error_msg) {
   while (true) {
+    if (file_.get() != nullptr) {
+      UNUSED(file_->FlushCloseOrErase());  // Ignore result.
+    }
     file_.reset(OS::OpenFileWithFlags(filename, O_CREAT | O_RDWR));
     if (file_.get() == NULL) {
       *error_msg = StringPrintf("Failed to open file '%s': %s", filename, strerror(errno));
@@ -59,7 +62,7 @@
 }
 
 bool ScopedFlock::Init(File* file, std::string* error_msg) {
-  file_.reset(new File(dup(file->Fd())));
+  file_.reset(new File(dup(file->Fd()), true));
   if (file_->Fd() == -1) {
     file_.reset();
     *error_msg = StringPrintf("Failed to duplicate open file '%s': %s",
@@ -89,6 +92,9 @@
   if (file_.get() != NULL) {
     int flock_result = TEMP_FAILURE_RETRY(flock(file_->Fd(), LOCK_UN));
     CHECK_EQ(0, flock_result);
+    if (file_->FlushCloseOrErase() != 0) {
+      PLOG(WARNING) << "Could not close scoped file lock file.";
+    }
   }
 }
 
diff --git a/runtime/base/unix_file/fd_file.cc b/runtime/base/unix_file/fd_file.cc
index f29a7ec..6e5e7a1 100644
--- a/runtime/base/unix_file/fd_file.cc
+++ b/runtime/base/unix_file/fd_file.cc
@@ -14,28 +14,68 @@
  * limitations under the License.
  */
 
-#include "base/logging.h"
 #include "base/unix_file/fd_file.h"
+
 #include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
+#include "base/logging.h"
+
 namespace unix_file {
 
-FdFile::FdFile() : fd_(-1), auto_close_(true) {
+FdFile::FdFile() : guard_state_(GuardState::kClosed), fd_(-1), auto_close_(true) {
 }
 
-FdFile::FdFile(int fd) : fd_(fd), auto_close_(true) {
+FdFile::FdFile(int fd, bool check_usage)
+    : guard_state_(check_usage ? GuardState::kBase : GuardState::kNoCheck),
+      fd_(fd), auto_close_(true) {
 }
 
-FdFile::FdFile(int fd, const std::string& path) : fd_(fd), file_path_(path), auto_close_(true) {
+FdFile::FdFile(int fd, const std::string& path, bool check_usage)
+    : guard_state_(check_usage ? GuardState::kBase : GuardState::kNoCheck),
+      fd_(fd), file_path_(path), auto_close_(true) {
   CHECK_NE(0U, path.size());
 }
 
 FdFile::~FdFile() {
+  if (kCheckSafeUsage && (guard_state_ < GuardState::kNoCheck)) {
+    if (guard_state_ < GuardState::kFlushed) {
+      LOG(::art::ERROR) << "File " << file_path_ << " wasn't explicitly flushed before destruction.";
+    }
+    if (guard_state_ < GuardState::kClosed) {
+      LOG(::art::ERROR) << "File " << file_path_ << " wasn't explicitly closed before destruction.";
+    }
+    CHECK_GE(guard_state_, GuardState::kClosed);
+  }
   if (auto_close_ && fd_ != -1) {
-    Close();
+    if (Close() != 0) {
+      PLOG(::art::WARNING) << "Failed to close file " << file_path_;
+    }
+  }
+}
+
+void FdFile::moveTo(GuardState target, GuardState warn_threshold, const char* warning) {
+  if (kCheckSafeUsage) {
+    if (guard_state_ < GuardState::kNoCheck) {
+      if (warn_threshold < GuardState::kNoCheck && guard_state_ >= warn_threshold) {
+        LOG(::art::ERROR) << warning;
+      }
+      guard_state_ = target;
+    }
+  }
+}
+
+void FdFile::moveUp(GuardState target, const char* warning) {
+  if (kCheckSafeUsage) {
+    if (guard_state_ < GuardState::kNoCheck) {
+      if (guard_state_ < target) {
+        guard_state_ = target;
+      } else if (target < guard_state_) {
+        LOG(::art::ERROR) << warning;
+      }
+    }
   }
 }
 
@@ -54,11 +94,28 @@
     return false;
   }
   file_path_ = path;
+  static_assert(O_RDONLY == 0, "Readonly flag has unexpected value.");
+  if (kCheckSafeUsage && (flags & (O_RDWR | O_CREAT | O_WRONLY)) != 0) {
+    // Start in the base state (not flushed, not closed).
+    guard_state_ = GuardState::kBase;
+  } else {
+    // We are not concerned with read-only files. In that case, proper flushing and closing is
+    // not important.
+    guard_state_ = GuardState::kNoCheck;
+  }
   return true;
 }
 
 int FdFile::Close() {
   int result = TEMP_FAILURE_RETRY(close(fd_));
+
+  // Test here, so the file is closed and not leaked.
+  if (kCheckSafeUsage) {
+    CHECK_GE(guard_state_, GuardState::kFlushed) << "File " << file_path_
+        << " has not been flushed before closing.";
+    moveUp(GuardState::kClosed, nullptr);
+  }
+
   if (result == -1) {
     return -errno;
   } else {
@@ -74,6 +131,7 @@
 #else
   int rc = TEMP_FAILURE_RETRY(fsync(fd_));
 #endif
+  moveUp(GuardState::kFlushed, "Flushing closed file.");
   return (rc == -1) ? -errno : rc;
 }
 
@@ -92,6 +150,7 @@
 #else
   int rc = TEMP_FAILURE_RETRY(ftruncate(fd_, new_length));
 #endif
+  moveTo(GuardState::kBase, GuardState::kClosed, "Truncating closed file.");
   return (rc == -1) ? -errno : rc;
 }
 
@@ -107,6 +166,7 @@
 #else
   int rc = TEMP_FAILURE_RETRY(pwrite(fd_, buf, byte_count, offset));
 #endif
+  moveTo(GuardState::kBase, GuardState::kClosed, "Writing into closed file.");
   return (rc == -1) ? -errno : rc;
 }
 
@@ -135,6 +195,7 @@
 
 bool FdFile::WriteFully(const void* buffer, size_t byte_count) {
   const char* ptr = static_cast<const char*>(buffer);
+  moveTo(GuardState::kBase, GuardState::kClosed, "Writing into closed file.");
   while (byte_count > 0) {
     ssize_t bytes_written = TEMP_FAILURE_RETRY(write(fd_, ptr, byte_count));
     if (bytes_written == -1) {
@@ -146,4 +207,38 @@
   return true;
 }
 
+void FdFile::Erase() {
+  TEMP_FAILURE_RETRY(SetLength(0));
+  TEMP_FAILURE_RETRY(Flush());
+  TEMP_FAILURE_RETRY(Close());
+}
+
+int FdFile::FlushCloseOrErase() {
+  int flush_result = TEMP_FAILURE_RETRY(Flush());
+  if (flush_result != 0) {
+    LOG(::art::ERROR) << "CloseOrErase failed while flushing a file.";
+    Erase();
+    return flush_result;
+  }
+  int close_result = TEMP_FAILURE_RETRY(Close());
+  if (close_result != 0) {
+    LOG(::art::ERROR) << "CloseOrErase failed while closing a file.";
+    Erase();
+    return close_result;
+  }
+  return 0;
+}
+
+int FdFile::FlushClose() {
+  int flush_result = TEMP_FAILURE_RETRY(Flush());
+  if (flush_result != 0) {
+    LOG(::art::ERROR) << "FlushClose failed while flushing a file.";
+  }
+  int close_result = TEMP_FAILURE_RETRY(Close());
+  if (close_result != 0) {
+    LOG(::art::ERROR) << "FlushClose failed while closing a file.";
+  }
+  return (flush_result != 0) ? flush_result : close_result;
+}
+
 }  // namespace unix_file
diff --git a/runtime/base/unix_file/fd_file.h b/runtime/base/unix_file/fd_file.h
index 01f4ca2..8db2ee4 100644
--- a/runtime/base/unix_file/fd_file.h
+++ b/runtime/base/unix_file/fd_file.h
@@ -24,6 +24,9 @@
 
 namespace unix_file {
 
+// If true, check whether Flush and Close are called before destruction.
+static constexpr bool kCheckSafeUsage = true;
+
 // A RandomAccessFile implementation backed by a file descriptor.
 //
 // Not thread safe.
@@ -32,8 +35,8 @@
   FdFile();
   // Creates an FdFile using the given file descriptor. Takes ownership of the
   // file descriptor. (Use DisableAutoClose to retain ownership.)
-  explicit FdFile(int fd);
-  explicit FdFile(int fd, const std::string& path);
+  explicit FdFile(int fd, bool checkUsage);
+  explicit FdFile(int fd, const std::string& path, bool checkUsage);
 
   // Destroys an FdFile, closing the file descriptor if Close hasn't already
   // been called. (If you care about the return value of Close, call it
@@ -47,12 +50,21 @@
   bool Open(const std::string& file_path, int flags, mode_t mode);
 
   // RandomAccessFile API.
-  virtual int Close();
-  virtual int64_t Read(char* buf, int64_t byte_count, int64_t offset) const;
-  virtual int SetLength(int64_t new_length);
+  virtual int Close() WARN_UNUSED;
+  virtual int64_t Read(char* buf, int64_t byte_count, int64_t offset) const WARN_UNUSED;
+  virtual int SetLength(int64_t new_length) WARN_UNUSED;
   virtual int64_t GetLength() const;
-  virtual int64_t Write(const char* buf, int64_t byte_count, int64_t offset);
-  virtual int Flush();
+  virtual int64_t Write(const char* buf, int64_t byte_count, int64_t offset) WARN_UNUSED;
+  virtual int Flush() WARN_UNUSED;
+
+  // Short for SetLength(0); Flush(); Close();
+  void Erase();
+
+  // Try to Flush(), then try to Close(); If either fails, call Erase().
+  int FlushCloseOrErase() WARN_UNUSED;
+
+  // Try to Flush and Close(). Attempts both, but returns the first error.
+  int FlushClose() WARN_UNUSED;
 
   // Bonus API.
   int Fd() const;
@@ -61,8 +73,35 @@
     return file_path_;
   }
   void DisableAutoClose();
-  bool ReadFully(void* buffer, size_t byte_count);
-  bool WriteFully(const void* buffer, size_t byte_count);
+  bool ReadFully(void* buffer, size_t byte_count) WARN_UNUSED;
+  bool WriteFully(const void* buffer, size_t byte_count) WARN_UNUSED;
+
+  // This enum is public so that we can define the << operator over it.
+  enum class GuardState {
+    kBase,           // Base, file has not been flushed or closed.
+    kFlushed,        // File has been flushed, but not closed.
+    kClosed,         // File has been flushed and closed.
+    kNoCheck         // Do not check for the current file instance.
+  };
+
+ protected:
+  // If the guard state indicates checking (!=kNoCheck), go to the target state "target". Print the
+  // given warning if the current state is or exceeds warn_threshold.
+  void moveTo(GuardState target, GuardState warn_threshold, const char* warning);
+
+  // If the guard state indicates checking (<kNoCheck), and is below the target state "target", go
+  // to "target". If the current state is higher (excluding kNoCheck) than the target state, print
+  // the warning.
+  void moveUp(GuardState target, const char* warning);
+
+  // Forcefully sets the state to the given one. This can overwrite kNoCheck.
+  void resetGuard(GuardState new_state) {
+    if (kCheckSafeUsage) {
+      guard_state_ = new_state;
+    }
+  }
+
+  GuardState guard_state_;
 
  private:
   int fd_;
@@ -72,6 +111,8 @@
   DISALLOW_COPY_AND_ASSIGN(FdFile);
 };
 
+std::ostream& operator<<(std::ostream& os, const FdFile::GuardState& kind);
+
 }  // namespace unix_file
 
 #endif  // ART_RUNTIME_BASE_UNIX_FILE_FD_FILE_H_
diff --git a/runtime/base/unix_file/fd_file_test.cc b/runtime/base/unix_file/fd_file_test.cc
index 3481f2f..a7e5b96 100644
--- a/runtime/base/unix_file/fd_file_test.cc
+++ b/runtime/base/unix_file/fd_file_test.cc
@@ -24,7 +24,7 @@
 class FdFileTest : public RandomAccessFileTest {
  protected:
   virtual RandomAccessFile* MakeTestFile() {
-    return new FdFile(fileno(tmpfile()));
+    return new FdFile(fileno(tmpfile()), false);
   }
 };
 
@@ -53,6 +53,7 @@
   ASSERT_TRUE(file.Open(good_path, O_CREAT | O_WRONLY));
   EXPECT_GE(file.Fd(), 0);
   EXPECT_TRUE(file.IsOpened());
+  EXPECT_EQ(0, file.Flush());
   EXPECT_EQ(0, file.Close());
   EXPECT_EQ(-1, file.Fd());
   EXPECT_FALSE(file.IsOpened());
@@ -60,7 +61,7 @@
   EXPECT_GE(file.Fd(), 0);
   EXPECT_TRUE(file.IsOpened());
 
-  file.Close();
+  ASSERT_EQ(file.Close(), 0);
   ASSERT_EQ(unlink(good_path.c_str()), 0);
 }
 
diff --git a/runtime/base/unix_file/random_access_file_test.h b/runtime/base/unix_file/random_access_file_test.h
index 0002433..e7ace4c 100644
--- a/runtime/base/unix_file/random_access_file_test.h
+++ b/runtime/base/unix_file/random_access_file_test.h
@@ -76,6 +76,8 @@
     ASSERT_EQ(content.size(), static_cast<uint64_t>(file->Write(content.data(), content.size(), 0)));
 
     TestReadContent(content, file.get());
+
+    CleanUp(file.get());
   }
 
   void TestReadContent(const std::string& content, RandomAccessFile* file) {
@@ -131,6 +133,8 @@
     ASSERT_EQ(new_length, file->GetLength());
     ASSERT_TRUE(ReadString(file.get(), &new_content));
     ASSERT_EQ('\0', new_content[new_length - 1]);
+
+    CleanUp(file.get());
   }
 
   void TestWrite() {
@@ -163,6 +167,11 @@
     ASSERT_EQ(file->GetLength(), new_length);
     ASSERT_TRUE(ReadString(file.get(), &new_content));
     ASSERT_EQ(std::string("hello\0hello", new_length), new_content);
+
+    CleanUp(file.get());
+  }
+
+  virtual void CleanUp(RandomAccessFile* file ATTRIBUTE_UNUSED) {
   }
 
  protected:
diff --git a/runtime/base/unix_file/random_access_file_utils_test.cc b/runtime/base/unix_file/random_access_file_utils_test.cc
index 6317922..9457d22 100644
--- a/runtime/base/unix_file/random_access_file_utils_test.cc
+++ b/runtime/base/unix_file/random_access_file_utils_test.cc
@@ -37,14 +37,14 @@
 }
 
 TEST_F(RandomAccessFileUtilsTest, BadSrc) {
-  FdFile src(-1);
+  FdFile src(-1, false);
   StringFile dst;
   ASSERT_FALSE(CopyFile(src, &dst));
 }
 
 TEST_F(RandomAccessFileUtilsTest, BadDst) {
   StringFile src;
-  FdFile dst(-1);
+  FdFile dst(-1, false);
 
   // We need some source content to trigger a write.
   // Copying an empty file is a no-op.
diff --git a/runtime/check_reference_map_visitor.h b/runtime/check_reference_map_visitor.h
index 9d2d59c..4fe3852 100644
--- a/runtime/check_reference_map_visitor.h
+++ b/runtime/check_reference_map_visitor.h
@@ -53,7 +53,7 @@
 
   void CheckReferences(int* registers, int number_of_references, uint32_t native_pc_offset)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (GetMethod()->IsOptimized()) {
+    if (GetMethod()->IsOptimized(sizeof(void*))) {
       CheckOptimizedMethod(registers, number_of_references, native_pc_offset);
     } else {
       CheckQuickMethod(registers, number_of_references, native_pc_offset);
@@ -98,7 +98,7 @@
   void CheckQuickMethod(int* registers, int number_of_references, uint32_t native_pc_offset)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     mirror::ArtMethod* m = GetMethod();
-    NativePcOffsetToReferenceMap map(m->GetNativeGcMap());
+    NativePcOffsetToReferenceMap map(m->GetNativeGcMap(sizeof(void*)));
     const uint8_t* ref_bitmap = map.FindBitMap(native_pc_offset);
     CHECK(ref_bitmap);
     for (int i = 0; i < number_of_references; ++i) {
diff --git a/runtime/class_linker-inl.h b/runtime/class_linker-inl.h
index ead3fa5..5198769 100644
--- a/runtime/class_linker-inl.h
+++ b/runtime/class_linker-inl.h
@@ -58,9 +58,9 @@
 
 inline mirror::String* ClassLinker::ResolveString(uint32_t string_idx,
                                                   mirror::ArtMethod* referrer) {
-  mirror::String* resolved_string = referrer->GetDexCacheStrings()->Get(string_idx);
+  mirror::Class* declaring_class = referrer->GetDeclaringClass();
+  mirror::String* resolved_string = declaring_class->GetDexCacheStrings()->Get(string_idx);
   if (UNLIKELY(resolved_string == NULL)) {
-    mirror::Class* declaring_class = referrer->GetDeclaringClass();
     StackHandleScope<1> hs(Thread::Current());
     Handle<mirror::DexCache> dex_cache(hs.NewHandle(declaring_class->GetDexCache()));
     const DexFile& dex_file = *dex_cache->GetDexFile();
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 68e20f2..4bd702d 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -2060,6 +2060,7 @@
                                                        Thread* self, const char* descriptor,
                                                        size_t hash,
                                                        Handle<mirror::ClassLoader> class_loader) {
+  // Can we special case for a well understood PathClassLoader with the BootClassLoader as parent?
   if (class_loader->GetClass() !=
       soa.Decode<mirror::Class*>(WellKnownClasses::dalvik_system_PathClassLoader) ||
       class_loader->GetParent()->GetClass() !=
@@ -2071,17 +2072,21 @@
   if (pair.second != nullptr) {
     mirror::Class* klass = LookupClass(self, descriptor, hash, nullptr);
     if (klass != nullptr) {
-      return EnsureResolved(self, descriptor, klass);
+      // May return null if resolution on another thread fails.
+      klass = EnsureResolved(self, descriptor, klass);
+    } else {
+      // May OOME.
+      klass = DefineClass(self, descriptor, hash, NullHandle<mirror::ClassLoader>(), *pair.first,
+                          *pair.second);
     }
-    klass = DefineClass(self, descriptor, hash, NullHandle<mirror::ClassLoader>(), *pair.first,
-                        *pair.second);
-    if (klass != nullptr) {
-      return klass;
+    if (klass == nullptr) {
+      CHECK(self->IsExceptionPending()) << descriptor;
+      self->ClearException();
     }
-    CHECK(self->IsExceptionPending()) << descriptor;
-    self->ClearException();
+    return klass;
   } else {
-    // RegisterDexFile may allocate dex caches (and cause thread suspension).
+    // Handle as if this is the child PathClassLoader.
+    // Use handles as RegisterDexFile may allocate dex caches (and cause thread suspension).
     StackHandleScope<3> hs(self);
     // The class loader is a PathClassLoader which inherits from BaseDexClassLoader.
     // We need to get the DexPathList and loop through it.
@@ -2138,8 +2143,9 @@
         }
       }
     }
+    self->AssertNoPendingException();
+    return nullptr;
   }
-  return nullptr;
 }
 
 mirror::Class* ClassLinker::FindClass(Thread* self, const char* descriptor,
@@ -2774,6 +2780,7 @@
 
   klass->SetDexClassDefIndex(dex_file.GetIndexForClassDef(dex_class_def));
   klass->SetDexTypeIndex(dex_class_def.class_idx_);
+  CHECK(klass->GetDexCacheStrings() != nullptr);
 
   const uint8_t* class_data = dex_file.GetClassData(dex_class_def);
   if (class_data == nullptr) {
@@ -2929,7 +2936,6 @@
   dst->SetDeclaringClass(klass.Get());
   dst->SetCodeItemOffset(it.GetMethodCodeItemOffset());
 
-  dst->SetDexCacheStrings(klass->GetDexCache()->GetStrings());
   dst->SetDexCacheResolvedMethods(klass->GetDexCache()->GetResolvedMethods());
   dst->SetDexCacheResolvedTypes(klass->GetDexCache()->GetResolvedTypes());
 
@@ -4057,7 +4063,6 @@
 
   // The proxy method doesn't have its own dex cache or dex file and so it steals those of its
   // interface prototype. The exception to this are Constructors and the Class of the Proxy itself.
-  CHECK_EQ(prototype->GetDexCacheStrings(), method->GetDexCacheStrings());
   CHECK(prototype->HasSameDexCacheResolvedMethods(method.Get()));
   CHECK(prototype->HasSameDexCacheResolvedTypes(method.Get()));
   CHECK_EQ(prototype->GetDexMethodIndex(), method->GetDexMethodIndex());
@@ -5731,24 +5736,13 @@
   }
 }
 
-static OatFile::OatMethod CreateOatMethod(const void* code, const uint8_t* gc_map,
-                                          bool is_portable) {
+static OatFile::OatMethod CreateOatMethod(const void* code, bool is_portable) {
   CHECK_EQ(kUsePortableCompiler, is_portable);
   CHECK(code != nullptr);
-  const uint8_t* base;
-  uint32_t code_offset, gc_map_offset;
-  if (gc_map == nullptr) {
-    base = reinterpret_cast<const uint8_t*>(code);  // Base of data points at code.
-    base -= sizeof(void*);  // Move backward so that code_offset != 0.
-    code_offset = sizeof(void*);
-    gc_map_offset = 0;
-  } else {
-    // TODO: 64bit support.
-    base = nullptr;  // Base of data in oat file, ie 0.
-    code_offset = PointerToLowMemUInt32(code);
-    gc_map_offset = PointerToLowMemUInt32(gc_map);
-  }
-  return OatFile::OatMethod(base, code_offset, gc_map_offset);
+  const uint8_t* base = reinterpret_cast<const uint8_t*>(code);  // Base of data points at code.
+  base -= sizeof(void*);  // Move backward so that code_offset != 0.
+  const uint32_t code_offset = sizeof(void*);
+  return OatFile::OatMethod(base, code_offset);
 }
 
 bool ClassLinker::IsPortableResolutionStub(const void* entry_point) const {
@@ -5782,7 +5776,7 @@
 
 void ClassLinker::SetEntryPointsToCompiledCode(mirror::ArtMethod* method, const void* method_code,
                                                bool is_portable) const {
-  OatFile::OatMethod oat_method = CreateOatMethod(method_code, nullptr, is_portable);
+  OatFile::OatMethod oat_method = CreateOatMethod(method_code, is_portable);
   oat_method.LinkMethod(method);
   method->SetEntryPointFromInterpreter(artInterpreterToCompiledCodeBridge);
   // Create bridges to transition between different kinds of compiled bridge.
@@ -5802,7 +5796,7 @@
     method->SetEntryPointFromQuickCompiledCode(GetQuickToInterpreterBridge());
   } else {
     const void* quick_method_code = GetQuickGenericJniStub();
-    OatFile::OatMethod oat_method = CreateOatMethod(quick_method_code, nullptr, false);
+    OatFile::OatMethod oat_method = CreateOatMethod(quick_method_code, false);
     oat_method.LinkMethod(method);
     method->SetEntryPointFromInterpreter(artInterpreterToCompiledCodeBridge);
     method->SetEntryPointFromPortableCompiledCode(GetPortableToQuickBridge());
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 006354f..b78d0b5 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -117,8 +117,8 @@
                            Handle<mirror::ClassLoader> class_loader)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // Find a class in the path class loader, loading it if necessary. Hash function is supposed to
-  // be ComputeModifiedUtf8Hash(descriptor).
+  // Find a class in the path class loader, loading it if necessary without using JNI. Hash
+  // function is supposed to be ComputeModifiedUtf8Hash(descriptor).
   mirror::Class* FindClassInPathClassLoader(ScopedObjectAccessAlreadyRunnable& soa,
                                             Thread* self, const char* descriptor, size_t hash,
                                             Handle<mirror::ClassLoader> class_loader)
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index 0c86761..99d0746 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -164,11 +164,8 @@
     EXPECT_TRUE(method->GetName() != nullptr);
     EXPECT_TRUE(method->GetSignature() != Signature::NoSignature());
 
-    EXPECT_TRUE(method->GetDexCacheStrings() != nullptr);
     EXPECT_TRUE(method->HasDexCacheResolvedMethods());
     EXPECT_TRUE(method->HasDexCacheResolvedTypes());
-    EXPECT_EQ(method->GetDeclaringClass()->GetDexCache()->GetStrings(),
-              method->GetDexCacheStrings());
     EXPECT_TRUE(method->HasSameDexCacheResolvedMethods(
         method->GetDeclaringClass()->GetDexCache()->GetResolvedMethods()));
     EXPECT_TRUE(method->HasSameDexCacheResolvedTypes(
@@ -205,6 +202,8 @@
     EXPECT_FALSE(klass->IsArrayClass());
     EXPECT_TRUE(klass->GetComponentType() == nullptr);
     EXPECT_TRUE(klass->IsInSamePackage(klass.Get()));
+    EXPECT_TRUE(klass->GetDexCacheStrings() != nullptr);
+    EXPECT_EQ(klass->GetDexCacheStrings(), klass->GetDexCache()->GetStrings());
     std::string temp2;
     EXPECT_TRUE(mirror::Class::IsInSamePackage(klass->GetDescriptor(&temp),
                                                klass->GetDescriptor(&temp2)));
@@ -399,7 +398,8 @@
 
     bool error = false;
 
-    if (!klass->IsClassClass() && !is_static) {
+    // ArtMethod has a different size due to the padding field.
+    if (!klass->IsArtMethodClass() && !klass->IsClassClass() && !is_static) {
       size_t expected_size = is_static ? klass->GetClassSize(): klass->GetObjectSize();
       if (sizeof(T) != expected_size) {
         LOG(ERROR) << "Class size mismatch:"
@@ -496,7 +496,6 @@
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, declaring_class_),                      "declaringClass"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, dex_cache_resolved_methods_),           "dexCacheResolvedMethods"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, dex_cache_resolved_types_),             "dexCacheResolvedTypes"));
-    offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, dex_cache_strings_),                    "dexCacheStrings"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, dex_code_item_offset_),           "dexCodeItemOffset"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, dex_method_index_),               "dexMethodIndex"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::ArtMethod, method_index_),                   "methodIndex"));
@@ -511,6 +510,7 @@
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, clinit_thread_id_),              "clinitThreadId"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, component_type_),                "componentType"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, dex_cache_),                     "dexCache"));
+    offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, dex_cache_strings_),             "dexCacheStrings"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, dex_class_def_idx_),             "dexClassDefIndex"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, dex_type_idx_),                  "dexTypeIndex"));
     offsets.push_back(CheckOffset(OFFSETOF_MEMBER(mirror::Class, direct_methods_),                "directMethods"));
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index 6e3ebc2..03b33e9 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -59,7 +59,7 @@
   filename_ += "/TmpFile-XXXXXX";
   int fd = mkstemp(&filename_[0]);
   CHECK_NE(-1, fd);
-  file_.reset(new File(fd, GetFilename()));
+  file_.reset(new File(fd, GetFilename(), true));
 }
 
 ScratchFile::ScratchFile(const ScratchFile& other, const char* suffix) {
@@ -67,7 +67,7 @@
   filename_ += suffix;
   int fd = open(filename_.c_str(), O_RDWR | O_CREAT, 0666);
   CHECK_NE(-1, fd);
-  file_.reset(new File(fd, GetFilename()));
+  file_.reset(new File(fd, GetFilename(), true));
 }
 
 ScratchFile::ScratchFile(File* file) {
@@ -88,6 +88,11 @@
   if (!OS::FileExists(filename_.c_str())) {
     return;
   }
+  if (file_.get() != nullptr) {
+    if (file_->FlushCloseOrErase() != 0) {
+      PLOG(WARNING) << "Error closing scratch file.";
+    }
+  }
   int unlink_result = unlink(filename_.c_str());
   CHECK_EQ(0, unlink_result);
 }
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index e2f6085..49b132d 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -31,7 +31,7 @@
 #include "gc/space/space-inl.h"
 #include "handle_scope.h"
 #include "jdwp/object_registry.h"
-#include "method_helper.h"
+#include "method_helper-inl.h"
 #include "mirror/art_field-inl.h"
 #include "mirror/art_method-inl.h"
 #include "mirror/class.h"
@@ -183,16 +183,20 @@
 
 class Breakpoint {
  public:
-  Breakpoint(mirror::ArtMethod* method, uint32_t dex_pc, bool need_full_deoptimization)
+  Breakpoint(mirror::ArtMethod* method, uint32_t dex_pc,
+             DeoptimizationRequest::Kind deoptimization_kind)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-    : method_(nullptr), dex_pc_(dex_pc), need_full_deoptimization_(need_full_deoptimization) {
+    : method_(nullptr), dex_pc_(dex_pc), deoptimization_kind_(deoptimization_kind) {
+    CHECK(deoptimization_kind_ == DeoptimizationRequest::kNothing ||
+          deoptimization_kind_ == DeoptimizationRequest::kSelectiveDeoptimization ||
+          deoptimization_kind_ == DeoptimizationRequest::kFullDeoptimization);
     ScopedObjectAccessUnchecked soa(Thread::Current());
     method_ = soa.EncodeMethod(method);
   }
 
   Breakpoint(const Breakpoint& other) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
     : method_(nullptr), dex_pc_(other.dex_pc_),
-      need_full_deoptimization_(other.need_full_deoptimization_) {
+      deoptimization_kind_(other.deoptimization_kind_) {
     ScopedObjectAccessUnchecked soa(Thread::Current());
     method_ = soa.EncodeMethod(other.Method());
   }
@@ -206,8 +210,8 @@
     return dex_pc_;
   }
 
-  bool NeedFullDeoptimization() const {
-    return need_full_deoptimization_;
+  DeoptimizationRequest::Kind GetDeoptimizationKind() const {
+    return deoptimization_kind_;
   }
 
  private:
@@ -216,7 +220,7 @@
   uint32_t dex_pc_;
 
-  // Indicates whether breakpoint needs full deoptimization or selective deoptimization.
-  bool need_full_deoptimization_;
+  // Indicates which kind of deoptimization (none, selective or full) this breakpoint requires.
+  DeoptimizationRequest::Kind deoptimization_kind_;
 };
 
 static std::ostream& operator<<(std::ostream& os, const Breakpoint& rhs)
@@ -736,6 +740,12 @@
   return gDisposed;
 }
 
+bool Dbg::RequiresDeoptimization() {
+  // We don't need deoptimization if everything runs with interpreter after
+  // enabling -Xint mode.
+  return !Runtime::Current()->GetInstrumentation()->IsForcedInterpretOnly();
+}
+
 void Dbg::GoActive() {
   // Enable all debugging features, including scans for breakpoints.
   // This is a no-op if we're already active.
@@ -768,7 +778,9 @@
   Thread* self = Thread::Current();
   ThreadState old_state = self->SetStateUnsafe(kRunnable);
   CHECK_NE(old_state, kRunnable);
-  runtime->GetInstrumentation()->EnableDeoptimization();
+  if (RequiresDeoptimization()) {
+    runtime->GetInstrumentation()->EnableDeoptimization();
+  }
   instrumentation_events_ = 0;
   gDebuggerActive = true;
   CHECK_EQ(self->SetStateUnsafe(old_state), kRunnable);
@@ -806,7 +818,9 @@
                                                     instrumentation_events_);
       instrumentation_events_ = 0;
     }
-    runtime->GetInstrumentation()->DisableDeoptimization();
+    if (RequiresDeoptimization()) {
+      runtime->GetInstrumentation()->DisableDeoptimization();
+    }
     gDebuggerActive = false;
   }
   gRegistry->Clear();
@@ -3035,9 +3049,11 @@
 }
 
 void Dbg::DelayFullUndeoptimization() {
-  MutexLock mu(Thread::Current(), *Locks::deoptimization_lock_);
-  ++delayed_full_undeoptimization_count_;
-  DCHECK_LE(delayed_full_undeoptimization_count_, full_deoptimization_event_count_);
+  if (RequiresDeoptimization()) {
+    MutexLock mu(Thread::Current(), *Locks::deoptimization_lock_);
+    ++delayed_full_undeoptimization_count_;
+    DCHECK_LE(delayed_full_undeoptimization_count_, full_deoptimization_event_count_);
+  }
 }
 
 void Dbg::ProcessDelayedFullUndeoptimizations() {
@@ -3196,20 +3212,78 @@
 }
 
 // Sanity checks all existing breakpoints on the same method.
-static void SanityCheckExistingBreakpoints(mirror::ArtMethod* m, bool need_full_deoptimization)
+static void SanityCheckExistingBreakpoints(mirror::ArtMethod* m,
+                                           DeoptimizationRequest::Kind deoptimization_kind)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::breakpoint_lock_) {
   for (const Breakpoint& breakpoint : gBreakpoints) {
-    CHECK_EQ(need_full_deoptimization, breakpoint.NeedFullDeoptimization());
+    if (breakpoint.Method() == m) {
+      CHECK_EQ(deoptimization_kind, breakpoint.GetDeoptimizationKind());
+    }
   }
-  if (need_full_deoptimization) {
+  instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation();
+  if (deoptimization_kind == DeoptimizationRequest::kFullDeoptimization) {
     // We should have deoptimized everything but not "selectively" deoptimized this method.
-    CHECK(Runtime::Current()->GetInstrumentation()->AreAllMethodsDeoptimized());
-    CHECK(!Runtime::Current()->GetInstrumentation()->IsDeoptimized(m));
-  } else {
+    CHECK(instrumentation->AreAllMethodsDeoptimized());
+    CHECK(!instrumentation->IsDeoptimized(m));
+  } else if (deoptimization_kind == DeoptimizationRequest::kSelectiveDeoptimization) {
     // We should have "selectively" deoptimized this method.
     // Note: while we have not deoptimized everything for this method, we may have done it for
     // another event.
-    CHECK(Runtime::Current()->GetInstrumentation()->IsDeoptimized(m));
+    CHECK(instrumentation->IsDeoptimized(m));
+  } else {
+    // This method does not require deoptimization.
+    CHECK_EQ(deoptimization_kind, DeoptimizationRequest::kNothing);
+    CHECK(!instrumentation->IsDeoptimized(m));
+  }
+}
+
+static DeoptimizationRequest::Kind GetRequiredDeoptimizationKind(Thread* self,
+                                                                 mirror::ArtMethod* m)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  if (!Dbg::RequiresDeoptimization()) {
+    // We already run in interpreter-only mode so we don't need to deoptimize anything.
+    VLOG(jdwp) << "No need for deoptimization when fully running with interpreter for method "
+               << PrettyMethod(m);
+    return DeoptimizationRequest::kNothing;
+  }
+  const Breakpoint* existing_breakpoint;
+  {
+    ReaderMutexLock mu(self, *Locks::breakpoint_lock_);
+    existing_breakpoint = FindFirstBreakpointForMethod(m);
+  }
+  if (existing_breakpoint == nullptr) {
+    // There is no breakpoint on this method yet: we need to deoptimize. If this method may be
+    // inlined, we deoptimize everything; otherwise we deoptimize only this method.
+    // Note: IsMethodPossiblyInlined goes into the method verifier and may cause thread suspension.
+    // Therefore we must not hold any lock when we call it.
+    bool need_full_deoptimization = IsMethodPossiblyInlined(self, m);
+    if (need_full_deoptimization) {
+      VLOG(jdwp) << "Need full deoptimization because of possible inlining of method "
+                 << PrettyMethod(m);
+      return DeoptimizationRequest::kFullDeoptimization;
+    } else {
+      // We don't need to deoptimize if the method has not been compiled.
+      ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+      const bool is_compiled = class_linker->GetOatMethodQuickCodeFor(m) != nullptr;
+      if (is_compiled) {
+        VLOG(jdwp) << "Need selective deoptimization for compiled method " << PrettyMethod(m);
+        return DeoptimizationRequest::kSelectiveDeoptimization;
+      } else {
+        // Method is not compiled: we don't need to deoptimize.
+        VLOG(jdwp) << "No need for deoptimization for non-compiled method " << PrettyMethod(m);
+        return DeoptimizationRequest::kNothing;
+      }
+    }
+  } else {
+    // There is at least one breakpoint for this method: we don't need to deoptimize.
+    // Let's check that all breakpoints are configured the same way for deoptimization.
+    VLOG(jdwp) << "Breakpoint already set: no deoptimization is required";
+    DeoptimizationRequest::Kind deoptimization_kind = existing_breakpoint->GetDeoptimizationKind();
+    if (kIsDebugBuild) {
+      ReaderMutexLock mu(self, *Locks::breakpoint_lock_);
+      SanityCheckExistingBreakpoints(m, deoptimization_kind);
+    }
+    return DeoptimizationRequest::kNothing;
   }
 }
 
@@ -3220,40 +3294,27 @@
   mirror::ArtMethod* m = FromMethodId(location->method_id);
   DCHECK(m != nullptr) << "No method for method id " << location->method_id;
 
-  const Breakpoint* existing_breakpoint;
-  {
-    ReaderMutexLock mu(self, *Locks::breakpoint_lock_);
-    existing_breakpoint = FindFirstBreakpointForMethod(m);
-  }
-  bool need_full_deoptimization;
-  if (existing_breakpoint == nullptr) {
-    // There is no breakpoint on this method yet: we need to deoptimize. If this method may be
-    // inlined, we deoptimize everything; otherwise we deoptimize only this method.
-    // Note: IsMethodPossiblyInlined goes into the method verifier and may cause thread suspension.
-    // Therefore we must not hold any lock when we call it.
-    need_full_deoptimization = IsMethodPossiblyInlined(self, m);
-    if (need_full_deoptimization) {
-      req->SetKind(DeoptimizationRequest::kFullDeoptimization);
-      req->SetMethod(nullptr);
-    } else {
-      req->SetKind(DeoptimizationRequest::kSelectiveDeoptimization);
-      req->SetMethod(m);
-    }
+  const DeoptimizationRequest::Kind deoptimization_kind = GetRequiredDeoptimizationKind(self, m);
+  req->SetKind(deoptimization_kind);
+  if (deoptimization_kind == DeoptimizationRequest::kSelectiveDeoptimization) {
+    req->SetMethod(m);
   } else {
-    // There is at least one breakpoint for this method: we don't need to deoptimize.
-    req->SetKind(DeoptimizationRequest::kNothing);
+    CHECK(deoptimization_kind == DeoptimizationRequest::kNothing ||
+          deoptimization_kind == DeoptimizationRequest::kFullDeoptimization);
     req->SetMethod(nullptr);
-
-    need_full_deoptimization = existing_breakpoint->NeedFullDeoptimization();
-    if (kIsDebugBuild) {
-      ReaderMutexLock mu(self, *Locks::breakpoint_lock_);
-      SanityCheckExistingBreakpoints(m, need_full_deoptimization);
-    }
   }
 
   {
     WriterMutexLock mu(self, *Locks::breakpoint_lock_);
-    gBreakpoints.push_back(Breakpoint(m, location->dex_pc, need_full_deoptimization));
+    // If there is at least one existing breakpoint on the same method, the new breakpoint must
+    // record the same deoptimization kind as the existing breakpoint(s): the kind requested
+    // above is kNothing in that case, but UnwatchLocation relies on the stored kind to
+    // undeoptimize once the last breakpoint of the method is removed.
+    const Breakpoint* const existing_breakpoint = FindFirstBreakpointForMethod(m);
+    const DeoptimizationRequest::Kind breakpoint_deoptimization_kind =
+        (existing_breakpoint != nullptr) ? existing_breakpoint->GetDeoptimizationKind()
+                                         : deoptimization_kind;
+    gBreakpoints.push_back(Breakpoint(m, location->dex_pc, breakpoint_deoptimization_kind));
     VLOG(jdwp) << "Set breakpoint #" << (gBreakpoints.size() - 1) << ": "
                << gBreakpoints[gBreakpoints.size() - 1];
   }
@@ -3265,12 +3326,13 @@
   WriterMutexLock mu(Thread::Current(), *Locks::breakpoint_lock_);
   mirror::ArtMethod* m = FromMethodId(location->method_id);
   DCHECK(m != nullptr) << "No method for method id " << location->method_id;
-  bool need_full_deoptimization = false;
+  DeoptimizationRequest::Kind deoptimization_kind = DeoptimizationRequest::kNothing;
   for (size_t i = 0, e = gBreakpoints.size(); i < e; ++i) {
     if (gBreakpoints[i].DexPc() == location->dex_pc && gBreakpoints[i].Method() == m) {
       VLOG(jdwp) << "Removed breakpoint #" << i << ": " << gBreakpoints[i];
-      need_full_deoptimization = gBreakpoints[i].NeedFullDeoptimization();
-      DCHECK_NE(need_full_deoptimization, Runtime::Current()->GetInstrumentation()->IsDeoptimized(m));
+      deoptimization_kind = gBreakpoints[i].GetDeoptimizationKind();
+      DCHECK_EQ(deoptimization_kind == DeoptimizationRequest::kSelectiveDeoptimization,
+                Runtime::Current()->GetInstrumentation()->IsDeoptimized(m));
       gBreakpoints.erase(gBreakpoints.begin() + i);
       break;
     }
@@ -3278,21 +3340,26 @@
   const Breakpoint* const existing_breakpoint = FindFirstBreakpointForMethod(m);
   if (existing_breakpoint == nullptr) {
     // There is no more breakpoint on this method: we need to undeoptimize.
-    if (need_full_deoptimization) {
+    if (deoptimization_kind == DeoptimizationRequest::kFullDeoptimization) {
       // This method required full deoptimization: we need to undeoptimize everything.
       req->SetKind(DeoptimizationRequest::kFullUndeoptimization);
       req->SetMethod(nullptr);
-    } else {
+    } else if (deoptimization_kind == DeoptimizationRequest::kSelectiveDeoptimization) {
       // This method required selective deoptimization: we need to undeoptimize only that method.
       req->SetKind(DeoptimizationRequest::kSelectiveUndeoptimization);
       req->SetMethod(m);
+    } else {
+      // This method had no need for deoptimization: do nothing.
+      CHECK_EQ(deoptimization_kind, DeoptimizationRequest::kNothing);
+      req->SetKind(DeoptimizationRequest::kNothing);
+      req->SetMethod(nullptr);
     }
   } else {
     // There is at least one breakpoint for this method: we don't need to undeoptimize.
     req->SetKind(DeoptimizationRequest::kNothing);
     req->SetMethod(nullptr);
     if (kIsDebugBuild) {
-      SanityCheckExistingBreakpoints(m, need_full_deoptimization);
+      SanityCheckExistingBreakpoints(m, deoptimization_kind);
     }
   }
 }
diff --git a/runtime/debugger.h b/runtime/debugger.h
index 488ba7f..9203163 100644
--- a/runtime/debugger.h
+++ b/runtime/debugger.h
@@ -523,6 +523,9 @@
       LOCKS_EXCLUDED(Locks::breakpoint_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Indicates whether we need deoptimization for debugging.
+  static bool RequiresDeoptimization();
+
   // Records deoptimization request in the queue.
   static void RequestDeoptimization(const DeoptimizationRequest& req)
       LOCKS_EXCLUDED(Locks::deoptimization_lock_)
diff --git a/runtime/dex_file_test.cc b/runtime/dex_file_test.cc
index 134e284..b304779 100644
--- a/runtime/dex_file_test.cc
+++ b/runtime/dex_file_test.cc
@@ -146,6 +146,9 @@
   if (!file->WriteFully(dex_bytes.get(), length)) {
     PLOG(FATAL) << "Failed to write base64 as dex file";
   }
+  if (file->FlushCloseOrErase() != 0) {
+    PLOG(FATAL) << "Could not flush and close test file.";
+  }
   file.reset();
 
   // read dex file
diff --git a/runtime/dex_file_verifier_test.cc b/runtime/dex_file_verifier_test.cc
index addd948..ec1e5f0 100644
--- a/runtime/dex_file_verifier_test.cc
+++ b/runtime/dex_file_verifier_test.cc
@@ -115,6 +115,9 @@
   if (!file->WriteFully(dex_bytes.get(), length)) {
     PLOG(FATAL) << "Failed to write base64 as dex file";
   }
+  if (file->FlushCloseOrErase() != 0) {
+    PLOG(FATAL) << "Could not flush and close test file.";
+  }
   file.reset();
 
   // read dex file
@@ -177,6 +180,9 @@
   if (!file->WriteFully(bytes, length)) {
     PLOG(FATAL) << "Failed to write base64 as dex file";
   }
+  if (file->FlushCloseOrErase() != 0) {
+    PLOG(FATAL) << "Could not flush and close test file.";
+  }
   file.reset();
 
   // read dex file
diff --git a/runtime/elf_file.cc b/runtime/elf_file.cc
index 37c5f9c..6597235 100644
--- a/runtime/elf_file.cc
+++ b/runtime/elf_file.cc
@@ -794,7 +794,7 @@
 Elf_Word ElfFileImpl<Elf_Ehdr, Elf_Phdr, Elf_Shdr, Elf_Word,
     Elf_Sword, Elf_Addr, Elf_Sym, Elf_Rel, Elf_Rela, Elf_Dyn, Elf_Off>
     ::GetHashChain(size_t i, bool* ok) const {
-  if (i >= GetHashBucketNum()) {
+  if (i >= GetHashChainNum()) {
     *ok = false;
     return 0;
   }
diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h
index 1a8ca02..67265a2 100644
--- a/runtime/entrypoints/entrypoint_utils-inl.h
+++ b/runtime/entrypoints/entrypoint_utils-inl.h
@@ -37,9 +37,9 @@
 
 template <const bool kAccessCheck>
 ALWAYS_INLINE
-static inline mirror::Class* CheckObjectAlloc(uint32_t type_idx,
-                                              mirror::ArtMethod* method,
-                                              Thread* self, bool* slow_path) {
+inline mirror::Class* CheckObjectAlloc(uint32_t type_idx,
+                                       mirror::ArtMethod* method,
+                                       Thread* self, bool* slow_path) {
   mirror::Class* klass = method->GetDexCacheResolvedType<false>(type_idx);
   if (UNLIKELY(klass == NULL)) {
     klass = Runtime::Current()->GetClassLinker()->ResolveType(type_idx, method);
@@ -90,9 +90,9 @@
 }
 
 ALWAYS_INLINE
-static inline mirror::Class* CheckClassInitializedForObjectAlloc(mirror::Class* klass,
-                                                                 Thread* self,
-                                                                 bool* slow_path) {
+inline mirror::Class* CheckClassInitializedForObjectAlloc(mirror::Class* klass,
+                                                          Thread* self,
+                                                          bool* slow_path) {
   if (UNLIKELY(!klass->IsInitialized())) {
     StackHandleScope<1> hs(self);
     Handle<mirror::Class> h_class(hs.NewHandle(klass));
@@ -120,10 +120,10 @@
 // check.
 template <bool kAccessCheck, bool kInstrumented>
 ALWAYS_INLINE
-static inline mirror::Object* AllocObjectFromCode(uint32_t type_idx,
-                                                  mirror::ArtMethod* method,
-                                                  Thread* self,
-                                                  gc::AllocatorType allocator_type) {
+inline mirror::Object* AllocObjectFromCode(uint32_t type_idx,
+                                           mirror::ArtMethod* method,
+                                           Thread* self,
+                                           gc::AllocatorType allocator_type) {
   bool slow_path = false;
   mirror::Class* klass = CheckObjectAlloc<kAccessCheck>(type_idx, method, self, &slow_path);
   if (UNLIKELY(slow_path)) {
@@ -139,9 +139,9 @@
 // Given the context of a calling Method and a resolved class, create an instance.
 template <bool kInstrumented>
 ALWAYS_INLINE
-static inline mirror::Object* AllocObjectFromCodeResolved(mirror::Class* klass,
-                                                          Thread* self,
-                                                          gc::AllocatorType allocator_type) {
+inline mirror::Object* AllocObjectFromCodeResolved(mirror::Class* klass,
+                                                   Thread* self,
+                                                   gc::AllocatorType allocator_type) {
   DCHECK(klass != nullptr);
   bool slow_path = false;
   klass = CheckClassInitializedForObjectAlloc(klass, self, &slow_path);
@@ -160,9 +160,9 @@
 // Given the context of a calling Method and an initialized class, create an instance.
 template <bool kInstrumented>
 ALWAYS_INLINE
-static inline mirror::Object* AllocObjectFromCodeInitialized(mirror::Class* klass,
-                                                             Thread* self,
-                                                             gc::AllocatorType allocator_type) {
+inline mirror::Object* AllocObjectFromCodeInitialized(mirror::Class* klass,
+                                                      Thread* self,
+                                                      gc::AllocatorType allocator_type) {
   DCHECK(klass != nullptr);
   // Pass in false since the object can not be finalizable.
   return klass->Alloc<kInstrumented, false>(self, allocator_type);
@@ -171,10 +171,10 @@
 
 template <bool kAccessCheck>
 ALWAYS_INLINE
-static inline mirror::Class* CheckArrayAlloc(uint32_t type_idx,
-                                             mirror::ArtMethod* method,
-                                             int32_t component_count,
-                                             bool* slow_path) {
+inline mirror::Class* CheckArrayAlloc(uint32_t type_idx,
+                                      mirror::ArtMethod* method,
+                                      int32_t component_count,
+                                      bool* slow_path) {
   if (UNLIKELY(component_count < 0)) {
     ThrowNegativeArraySizeException(component_count);
     *slow_path = true;
@@ -207,11 +207,11 @@
 // check.
 template <bool kAccessCheck, bool kInstrumented>
 ALWAYS_INLINE
-static inline mirror::Array* AllocArrayFromCode(uint32_t type_idx,
-                                                mirror::ArtMethod* method,
-                                                int32_t component_count,
-                                                Thread* self,
-                                                gc::AllocatorType allocator_type) {
+inline mirror::Array* AllocArrayFromCode(uint32_t type_idx,
+                                         mirror::ArtMethod* method,
+                                         int32_t component_count,
+                                         Thread* self,
+                                         gc::AllocatorType allocator_type) {
   bool slow_path = false;
   mirror::Class* klass = CheckArrayAlloc<kAccessCheck>(type_idx, method, component_count,
                                                        &slow_path);
@@ -230,11 +230,11 @@
 
 template <bool kAccessCheck, bool kInstrumented>
 ALWAYS_INLINE
-static inline mirror::Array* AllocArrayFromCodeResolved(mirror::Class* klass,
-                                                        mirror::ArtMethod* method,
-                                                        int32_t component_count,
-                                                        Thread* self,
-                                                        gc::AllocatorType allocator_type) {
+inline mirror::Array* AllocArrayFromCodeResolved(mirror::Class* klass,
+                                                 mirror::ArtMethod* method,
+                                                 int32_t component_count,
+                                                 Thread* self,
+                                                 gc::AllocatorType allocator_type) {
   DCHECK(klass != nullptr);
   if (UNLIKELY(component_count < 0)) {
     ThrowNegativeArraySizeException(component_count);
@@ -254,8 +254,8 @@
 }
 
 template<FindFieldType type, bool access_check>
-static inline mirror::ArtField* FindFieldFromCode(uint32_t field_idx, mirror::ArtMethod* referrer,
-                                                  Thread* self, size_t expected_size) {
+inline mirror::ArtField* FindFieldFromCode(uint32_t field_idx, mirror::ArtMethod* referrer,
+                                           Thread* self, size_t expected_size) {
   bool is_primitive;
   bool is_set;
   bool is_static;
@@ -349,9 +349,9 @@
 #undef EXPLICIT_FIND_FIELD_FROM_CODE_TEMPLATE_DECL
 
 template<InvokeType type, bool access_check>
-static inline mirror::ArtMethod* FindMethodFromCode(uint32_t method_idx,
-                                                    mirror::Object** this_object,
-                                                    mirror::ArtMethod** referrer, Thread* self) {
+inline mirror::ArtMethod* FindMethodFromCode(uint32_t method_idx,
+                                             mirror::Object** this_object,
+                                             mirror::ArtMethod** referrer, Thread* self) {
   ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
   mirror::ArtMethod* resolved_method = class_linker->GetResolvedMethod(method_idx, *referrer);
   if (resolved_method == nullptr) {
@@ -475,9 +475,9 @@
 #undef EXPLICIT_FIND_METHOD_FROM_CODE_TEMPLATE_DECL
 
 // Fast path field resolution that can't initialize classes or throw exceptions.
-static inline mirror::ArtField* FindFieldFast(uint32_t field_idx,
-                                              mirror::ArtMethod* referrer,
-                                              FindFieldType type, size_t expected_size) {
+inline mirror::ArtField* FindFieldFast(uint32_t field_idx,
+                                       mirror::ArtMethod* referrer,
+                                       FindFieldType type, size_t expected_size) {
   mirror::ArtField* resolved_field =
       referrer->GetDeclaringClass()->GetDexCache()->GetResolvedField(field_idx);
   if (UNLIKELY(resolved_field == nullptr)) {
@@ -528,10 +528,10 @@
 }
 
 // Fast path method resolution that can't throw exceptions.
-static inline mirror::ArtMethod* FindMethodFast(uint32_t method_idx,
-                                                mirror::Object* this_object,
-                                                mirror::ArtMethod* referrer,
-                                                bool access_check, InvokeType type) {
+inline mirror::ArtMethod* FindMethodFast(uint32_t method_idx,
+                                         mirror::Object* this_object,
+                                         mirror::ArtMethod* referrer,
+                                         bool access_check, InvokeType type) {
   if (UNLIKELY(this_object == NULL && type != kStatic)) {
     return NULL;
   }
@@ -568,7 +568,7 @@
   }
 }
 
-static inline mirror::Class* ResolveVerifyAndClinit(uint32_t type_idx,
+inline mirror::Class* ResolveVerifyAndClinit(uint32_t type_idx,
                                                     mirror::ArtMethod* referrer,
                                                     Thread* self, bool can_run_clinit,
                                                     bool verify_access) {
@@ -604,13 +604,13 @@
   return h_class.Get();
 }
 
-static inline mirror::String* ResolveStringFromCode(mirror::ArtMethod* referrer,
-                                                    uint32_t string_idx) {
+inline mirror::String* ResolveStringFromCode(mirror::ArtMethod* referrer,
+                                             uint32_t string_idx) {
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
   return class_linker->ResolveString(string_idx, referrer);
 }
 
-static inline void UnlockJniSynchronizedMethod(jobject locked, Thread* self) {
+inline void UnlockJniSynchronizedMethod(jobject locked, Thread* self) {
   // Save any pending exception over monitor exit call.
   mirror::Throwable* saved_exception = NULL;
   ThrowLocation saved_throw_location;
@@ -635,7 +635,7 @@
 }
 
 template <typename INT_TYPE, typename FLOAT_TYPE>
-static inline INT_TYPE art_float_to_integral(FLOAT_TYPE f) {
+inline INT_TYPE art_float_to_integral(FLOAT_TYPE f) {
   const INT_TYPE kMaxInt = static_cast<INT_TYPE>(std::numeric_limits<INT_TYPE>::max());
   const INT_TYPE kMinInt = static_cast<INT_TYPE>(std::numeric_limits<INT_TYPE>::min());
   const FLOAT_TYPE kMaxIntAsFloat = static_cast<FLOAT_TYPE>(kMaxInt);
diff --git a/runtime/entrypoints/entrypoint_utils.h b/runtime/entrypoints/entrypoint_utils.h
index 311cafa..0531122 100644
--- a/runtime/entrypoints/entrypoint_utils.h
+++ b/runtime/entrypoints/entrypoint_utils.h
@@ -42,13 +42,14 @@
 class Thread;
 
 template <const bool kAccessCheck>
-ALWAYS_INLINE static inline mirror::Class* CheckObjectAlloc(uint32_t type_idx,
-                                                            mirror::ArtMethod* method,
-                                                            Thread* self, bool* slow_path)
+ALWAYS_INLINE inline mirror::Class* CheckObjectAlloc(uint32_t type_idx,
+                                                     mirror::ArtMethod* method,
+                                                     Thread* self, bool* slow_path)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-ALWAYS_INLINE static inline mirror::Class* CheckClassInitializedForObjectAlloc(mirror::Class* klass,
-                                                                               Thread* self, bool* slow_path)
+ALWAYS_INLINE inline mirror::Class* CheckClassInitializedForObjectAlloc(mirror::Class* klass,
+                                                                        Thread* self,
+                                                                        bool* slow_path)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // Given the context of a calling Method, use its DexCache to resolve a type to a Class. If it
@@ -56,32 +57,32 @@
 // When verification/compiler hasn't been able to verify access, optionally perform an access
 // check.
 template <bool kAccessCheck, bool kInstrumented>
-ALWAYS_INLINE static inline mirror::Object* AllocObjectFromCode(uint32_t type_idx,
-                                                                mirror::ArtMethod* method,
-                                                                Thread* self,
-                                                                gc::AllocatorType allocator_type)
+ALWAYS_INLINE inline mirror::Object* AllocObjectFromCode(uint32_t type_idx,
+                                                         mirror::ArtMethod* method,
+                                                         Thread* self,
+                                                         gc::AllocatorType allocator_type)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // Given the context of a calling Method and a resolved class, create an instance.
 template <bool kInstrumented>
-ALWAYS_INLINE static inline mirror::Object* AllocObjectFromCodeResolved(mirror::Class* klass,
-                                                                        Thread* self,
-                                                                        gc::AllocatorType allocator_type)
+ALWAYS_INLINE inline mirror::Object* AllocObjectFromCodeResolved(mirror::Class* klass,
+                                                                 Thread* self,
+                                                                 gc::AllocatorType allocator_type)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // Given the context of a calling Method and an initialized class, create an instance.
 template <bool kInstrumented>
-ALWAYS_INLINE static inline mirror::Object* AllocObjectFromCodeInitialized(mirror::Class* klass,
-                                                                           Thread* self,
-                                                                           gc::AllocatorType allocator_type)
+ALWAYS_INLINE inline mirror::Object* AllocObjectFromCodeInitialized(mirror::Class* klass,
+                                                                    Thread* self,
+                                                                    gc::AllocatorType allocator_type)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 
 template <bool kAccessCheck>
-ALWAYS_INLINE static inline mirror::Class* CheckArrayAlloc(uint32_t type_idx,
-                                                           mirror::ArtMethod* method,
-                                                           int32_t component_count,
-                                                           bool* slow_path)
+ALWAYS_INLINE inline mirror::Class* CheckArrayAlloc(uint32_t type_idx,
+                                                    mirror::ArtMethod* method,
+                                                    int32_t component_count,
+                                                    bool* slow_path)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // Given the context of a calling Method, use its DexCache to resolve a type to an array Class. If
@@ -89,19 +90,19 @@
 // When verification/compiler hasn't been able to verify access, optionally perform an access
 // check.
 template <bool kAccessCheck, bool kInstrumented>
-ALWAYS_INLINE static inline mirror::Array* AllocArrayFromCode(uint32_t type_idx,
-                                                              mirror::ArtMethod* method,
-                                                              int32_t component_count,
-                                                              Thread* self,
-                                                              gc::AllocatorType allocator_type)
+ALWAYS_INLINE inline mirror::Array* AllocArrayFromCode(uint32_t type_idx,
+                                                       mirror::ArtMethod* method,
+                                                       int32_t component_count,
+                                                       Thread* self,
+                                                       gc::AllocatorType allocator_type)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 template <bool kAccessCheck, bool kInstrumented>
-ALWAYS_INLINE static inline mirror::Array* AllocArrayFromCodeResolved(mirror::Class* klass,
-                                                                      mirror::ArtMethod* method,
-                                                                      int32_t component_count,
-                                                                      Thread* self,
-                                                                      gc::AllocatorType allocator_type)
+ALWAYS_INLINE inline mirror::Array* AllocArrayFromCodeResolved(mirror::Class* klass,
+                                                               mirror::ArtMethod* method,
+                                                               int32_t component_count,
+                                                               Thread* self,
+                                                               gc::AllocatorType allocator_type)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 extern mirror::Array* CheckAndAllocArrayFromCode(uint32_t type_idx, mirror::ArtMethod* method,
@@ -130,43 +131,42 @@
 };
 
 template<FindFieldType type, bool access_check>
-static inline mirror::ArtField* FindFieldFromCode(uint32_t field_idx, mirror::ArtMethod* referrer,
-                                                  Thread* self, size_t expected_size)
+inline mirror::ArtField* FindFieldFromCode(uint32_t field_idx, mirror::ArtMethod* referrer,
+                                           Thread* self, size_t expected_size)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 template<InvokeType type, bool access_check>
-static inline mirror::ArtMethod* FindMethodFromCode(uint32_t method_idx,
-                                                    mirror::Object** this_object,
-                                                    mirror::ArtMethod** referrer, Thread* self)
+inline mirror::ArtMethod* FindMethodFromCode(uint32_t method_idx,
+                                             mirror::Object** this_object,
+                                             mirror::ArtMethod** referrer, Thread* self)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // Fast path field resolution that can't initialize classes or throw exceptions.
-static inline mirror::ArtField* FindFieldFast(uint32_t field_idx,
-                                              mirror::ArtMethod* referrer,
-                                              FindFieldType type, size_t expected_size)
+inline mirror::ArtField* FindFieldFast(uint32_t field_idx,
+                                       mirror::ArtMethod* referrer,
+                                       FindFieldType type, size_t expected_size)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // Fast path method resolution that can't throw exceptions.
-static inline mirror::ArtMethod* FindMethodFast(uint32_t method_idx,
-                                                mirror::Object* this_object,
-                                                mirror::ArtMethod* referrer,
-                                                bool access_check, InvokeType type)
+inline mirror::ArtMethod* FindMethodFast(uint32_t method_idx,
+                                         mirror::Object* this_object,
+                                         mirror::ArtMethod* referrer,
+                                         bool access_check, InvokeType type)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-static inline mirror::Class* ResolveVerifyAndClinit(uint32_t type_idx,
-                                                    mirror::ArtMethod* referrer,
-                                                    Thread* self, bool can_run_clinit,
-                                                    bool verify_access)
+inline mirror::Class* ResolveVerifyAndClinit(uint32_t type_idx,
+                                             mirror::ArtMethod* referrer,
+                                             Thread* self, bool can_run_clinit,
+                                             bool verify_access)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 extern void ThrowStackOverflowError(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-static inline mirror::String* ResolveStringFromCode(mirror::ArtMethod* referrer,
-                                                    uint32_t string_idx)
+inline mirror::String* ResolveStringFromCode(mirror::ArtMethod* referrer, uint32_t string_idx)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 // TODO: annotalysis disabled as monitor semantics are maintained in Java code.
-static inline void UnlockJniSynchronizedMethod(jobject locked, Thread* self)
+inline void UnlockJniSynchronizedMethod(jobject locked, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS;
 
 void CheckReferenceResult(mirror::Object* o, Thread* self)
@@ -181,7 +181,7 @@
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
 template <typename INT_TYPE, typename FLOAT_TYPE>
-static inline INT_TYPE art_float_to_integral(FLOAT_TYPE f);
+inline INT_TYPE art_float_to_integral(FLOAT_TYPE f);
 
 }  // namespace art
 
diff --git a/runtime/entrypoints/portable/portable_thread_entrypoints.cc b/runtime/entrypoints/portable/portable_thread_entrypoints.cc
index ecbc65e..95ac66c 100644
--- a/runtime/entrypoints/portable/portable_thread_entrypoints.cc
+++ b/runtime/entrypoints/portable/portable_thread_entrypoints.cc
@@ -34,7 +34,7 @@
       uint32_t dex_pc = cur_frame->GetDexPC();
       ShadowFrame* new_frame = ShadowFrame::Create(num_regs, NULL, method, dex_pc);
 
-      const uint8_t* gc_map = method->GetNativeGcMap();
+      const uint8_t* gc_map = method->GetNativeGcMap(sizeof(void*));
       verifier::DexPcToReferenceMap dex_gc_map(gc_map);
       const uint8_t* reg_bitmap = dex_gc_map.FindBitMap(dex_pc);
       for (size_t reg = 0; reg < num_regs; ++reg) {
diff --git a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
index d5493bd..54dbd8c 100644
--- a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
@@ -35,7 +35,7 @@
   if (instrumentation->IsDeoptimized(method)) {
     result = GetQuickToInterpreterBridge();
   } else {
-    result = instrumentation->GetQuickCodeFor(method);
+    result = instrumentation->GetQuickCodeFor(method, sizeof(void*));
     DCHECK(!Runtime::Current()->GetClassLinker()->IsQuickToInterpreterBridge(result));
   }
   bool interpreter_entry = (result == GetQuickToInterpreterBridge());
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 0b7d382..93dc62a 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -919,17 +919,16 @@
   static constexpr bool kAlignLongOnStack = false;
   static constexpr bool kAlignDoubleOnStack = false;
 #elif defined(__mips__)
-  // TODO: These are all dummy values!
   static constexpr bool kNativeSoftFloatAbi = true;  // This is a hard float ABI.
-  static constexpr size_t kNumNativeGprArgs = 0;  // 6 arguments passed in GPRs.
-  static constexpr size_t kNumNativeFprArgs = 0;  // 8 arguments passed in FPRs.
+  static constexpr size_t kNumNativeGprArgs = 4;  // 4 arguments passed in GPRs.
+  static constexpr size_t kNumNativeFprArgs = 0;  // 0 arguments passed in FPRs.
 
   static constexpr size_t kRegistersNeededForLong = 2;
   static constexpr size_t kRegistersNeededForDouble = 2;
   static constexpr bool kMultiRegistersAligned = true;
   static constexpr bool kMultiRegistersWidened = true;
-  static constexpr bool kAlignLongOnStack = false;
-  static constexpr bool kAlignDoubleOnStack = false;
+  static constexpr bool kAlignLongOnStack = true;
+  static constexpr bool kAlignDoubleOnStack = true;
 #elif defined(__i386__)
   // TODO: Check these!
   static constexpr bool kNativeSoftFloatAbi = false;  // Not using int registers for fp
diff --git a/runtime/exception_test.cc b/runtime/exception_test.cc
index ee9b221..580b541 100644
--- a/runtime/exception_test.cc
+++ b/runtime/exception_test.cc
@@ -76,7 +76,8 @@
     const std::vector<uint8_t>& fake_mapping_data = fake_mapping_data_.GetData();
     uint32_t vmap_table_offset = sizeof(OatQuickMethodHeader) + fake_vmap_table_data.size();
     uint32_t mapping_table_offset = vmap_table_offset + fake_mapping_data.size();
-    OatQuickMethodHeader method_header(mapping_table_offset, vmap_table_offset,
+    uint32_t gc_map_offset = mapping_table_offset + fake_gc_map_.size();
+    OatQuickMethodHeader method_header(mapping_table_offset, vmap_table_offset, gc_map_offset,
                                        4 * sizeof(void*), 0u, 0u, code_size);
     fake_header_code_and_maps_.resize(sizeof(method_header));
     memcpy(&fake_header_code_and_maps_[0], &method_header, sizeof(method_header));
@@ -84,23 +85,23 @@
                                       fake_vmap_table_data.begin(), fake_vmap_table_data.end());
     fake_header_code_and_maps_.insert(fake_header_code_and_maps_.begin(),
                                       fake_mapping_data.begin(), fake_mapping_data.end());
+    fake_header_code_and_maps_.insert(fake_header_code_and_maps_.begin(),
+                                      fake_gc_map_.begin(), fake_gc_map_.end());
     fake_header_code_and_maps_.insert(fake_header_code_and_maps_.end(),
                                       fake_code_.begin(), fake_code_.end());
 
     // NOTE: Don't align the code (it will not be executed) but check that the Thumb2
     // adjustment will be a NOP, see ArtMethod::EntryPointToCodePointer().
     CHECK_EQ(mapping_table_offset & 1u, 0u);
-    const uint8_t* code_ptr = &fake_header_code_and_maps_[mapping_table_offset];
+    const uint8_t* code_ptr = &fake_header_code_and_maps_[gc_map_offset];
 
     method_f_ = my_klass_->FindVirtualMethod("f", "()I");
     ASSERT_TRUE(method_f_ != nullptr);
     method_f_->SetEntryPointFromQuickCompiledCode(code_ptr);
-    method_f_->SetNativeGcMap(&fake_gc_map_[0]);
 
     method_g_ = my_klass_->FindVirtualMethod("g", "(I)V");
     ASSERT_TRUE(method_g_ != nullptr);
     method_g_->SetEntryPointFromQuickCompiledCode(code_ptr);
-    method_g_->SetNativeGcMap(&fake_gc_map_[0]);
   }
 
   const DexFile* dex_;
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index 65844a5..94753d4 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -366,7 +366,8 @@
   // at the return PC address.
   if (true || kIsDebugBuild) {
     VLOG(signals) << "looking for dex pc for return pc " << std::hex << return_pc;
-    const void* code = Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(method_obj);
+    const void* code = Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(method_obj,
+                                                                                 sizeof(void*));
     uint32_t sought_offset = return_pc - reinterpret_cast<uintptr_t>(code);
     VLOG(signals) << "pc offset: " << std::hex << sought_offset;
   }
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 991b956..7c2474f 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1772,7 +1772,7 @@
     if (i < 4) {
       numOfPages[i] = 1;
     } else if (i < 8) {
-      numOfPages[i] = 2;
+      numOfPages[i] = 1;
     } else if (i < 16) {
       numOfPages[i] = 4;
     } else if (i < 32) {
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 431686a..3269e10 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -414,8 +414,7 @@
 
   // We use thread-local runs for the size Brackets whose indexes
   // are less than this index. We use shared (current) runs for the rest.
-
-  static const size_t kNumThreadLocalSizeBrackets = 11;
+  static const size_t kNumThreadLocalSizeBrackets = 8;
 
  private:
   // The base address of the memory region that's managed by this allocator.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 3101c68..9d2f6d1 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -48,11 +48,20 @@
   }
   // Need to check that we arent the large object allocator since the large object allocation code
   // path this function. If we didn't check we would have an infinite loop.
-  if (kCheckLargeObject && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
-    return AllocLargeObject<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
-                                                            pre_fence_visitor);
-  }
   mirror::Object* obj;
+  if (kCheckLargeObject && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
+    obj = AllocLargeObject<kInstrumented, PreFenceVisitor>(self, &klass, byte_count,
+                                                           pre_fence_visitor);
+    if (obj != nullptr) {
+      return obj;
+    } else {
+      // A pending OOM exception is expected here; clear it since we are about to retry.
+      self->ClearException();
+    }
+    // If the large object allocation failed, try to use the normal spaces (main space,
+    // non moving space). This can happen if there is significant virtual address space
+    // fragmentation.
+  }
   AllocationTimer alloc_timer(this, &obj);
   size_t bytes_allocated;
   size_t usable_size;
@@ -171,10 +180,13 @@
 }
 
 template <bool kInstrumented, typename PreFenceVisitor>
-inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
+inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class** klass,
                                               size_t byte_count,
                                               const PreFenceVisitor& pre_fence_visitor) {
-  return AllocObjectWithAllocator<kInstrumented, false, PreFenceVisitor>(self, klass, byte_count,
+  // Save and restore the class in case it moves.
+  StackHandleScope<1> hs(self);
+  auto klass_wrapper = hs.NewHandleWrapper(klass);
+  return AllocObjectWithAllocator<kInstrumented, false, PreFenceVisitor>(self, *klass, byte_count,
                                                                          kAllocatorTypeLOS,
                                                                          pre_fence_visitor);
 }
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 3f747ee..0fd0a9f 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -96,6 +96,8 @@
 static const char* kDlMallocSpaceName[2] = {"main dlmalloc space", "main dlmalloc space 1"};
 static const char* kRosAllocSpaceName[2] = {"main rosalloc space", "main rosalloc space 1"};
 static const char* kMemMapSpaceName[2] = {"main space", "main space 1"};
+static const char* kNonMovingSpaceName = "non moving space";
+static const char* kZygoteSpaceName = "zygote space";
 static constexpr size_t kGSSBumpPointerSpaceCapacity = 32 * MB;
 
 Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
@@ -258,10 +260,14 @@
   std::string error_str;
   std::unique_ptr<MemMap> non_moving_space_mem_map;
   if (separate_non_moving_space) {
+    // If we are the zygote, the non moving space becomes the zygote space when we run
+    // PreZygoteFork the first time. In this case, call the map "zygote space" since we can't
+    // rename the mem map later.
+    const char* space_name = is_zygote ? kZygoteSpaceName: kNonMovingSpaceName;
     // Reserve the non moving mem map before the other two since it needs to be at a specific
     // address.
     non_moving_space_mem_map.reset(
-        MemMap::MapAnonymous("non moving space", requested_alloc_space_begin,
+        MemMap::MapAnonymous(space_name, requested_alloc_space_begin,
                              non_moving_space_capacity, PROT_READ | PROT_WRITE, true, &error_str));
     CHECK(non_moving_space_mem_map != nullptr) << error_str;
     // Try to reserve virtual memory at a lower address if we have a separate non moving space.
@@ -359,6 +365,7 @@
   uint8_t* heap_end = continuous_spaces_.back()->Limit();
   size_t heap_capacity = heap_end - heap_begin;
   // Remove the main backup space since it slows down the GC to have unused extra spaces.
+  // TODO: Avoid needing to do this.
   if (main_space_backup_.get() != nullptr) {
     RemoveSpace(main_space_backup_.get());
   }
@@ -971,6 +978,22 @@
   Trim();
 }
 
+class TrimIndirectReferenceTableClosure : public Closure {
+ public:
+  explicit TrimIndirectReferenceTableClosure(Barrier* barrier) : barrier_(barrier) {
+  }
+  virtual void Run(Thread* thread) OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    ATRACE_BEGIN("Trimming reference table");
+    thread->GetJniEnv()->locals.Trim();
+    ATRACE_END();
+    barrier_->Pass(Thread::Current());
+  }
+
+ private:
+  Barrier* const barrier_;
+};
+
+
 void Heap::Trim() {
   Thread* self = Thread::Current();
   {
@@ -992,6 +1015,19 @@
     WaitForGcToCompleteLocked(kGcCauseTrim, self);
     collector_type_running_ = kCollectorTypeHeapTrim;
   }
+  // Trim reference tables.
+  {
+    ScopedObjectAccess soa(self);
+    JavaVMExt* vm = soa.Vm();
+    // Trim globals indirect reference table.
+    vm->TrimGlobals();
+    // Trim locals indirect reference tables.
+    Barrier barrier(0);
+    TrimIndirectReferenceTableClosure closure(&barrier);
+    ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
+    size_t barrier_count = Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);
+    barrier.Increment(self, barrier_count);
+  }
   uint64_t start_ns = NanoTime();
   // Trim the managed spaces.
   uint64_t total_alloc_space_allocated = 0;
@@ -1565,6 +1601,8 @@
   to_space->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
   const uint64_t space_size_before_compaction = from_space->Size();
   AddSpace(to_space);
+  // Make sure that we will have enough room to copy.
+  CHECK_GE(to_space->GetFootprintLimit(), from_space->GetFootprintLimit());
   Compact(to_space, from_space, kGcCauseHomogeneousSpaceCompact);
   // Leave as prot read so that we can still run ROSAlloc verification on this space.
   from_space->GetMemMap()->Protect(PROT_READ);
@@ -1683,8 +1721,8 @@
         RemoveSpace(temp_space_);
         temp_space_ = nullptr;
         mem_map->Protect(PROT_READ | PROT_WRITE);
-        CreateMainMallocSpace(mem_map.get(), kDefaultInitialSize, mem_map->Size(),
-                              mem_map->Size());
+        CreateMainMallocSpace(mem_map.get(), kDefaultInitialSize,
+                              std::min(mem_map->Size(), growth_limit_), mem_map->Size());
         mem_map.release();
         // Compact to the main space from the bump pointer space, don't need to swap semispaces.
         AddSpace(main_space_);
@@ -1697,9 +1735,9 @@
         if (kIsDebugBuild && kUseRosAlloc) {
           mem_map->Protect(PROT_READ | PROT_WRITE);
         }
-        main_space_backup_.reset(CreateMallocSpaceFromMemMap(mem_map.get(), kDefaultInitialSize,
-                                                             mem_map->Size(), mem_map->Size(),
-                                                             name, true));
+        main_space_backup_.reset(CreateMallocSpaceFromMemMap(
+            mem_map.get(), kDefaultInitialSize, std::min(mem_map->Size(), growth_limit_),
+            mem_map->Size(), name, true));
         if (kIsDebugBuild && kUseRosAlloc) {
           mem_map->Protect(PROT_NONE);
         }
@@ -1941,7 +1979,8 @@
       MemMap* mem_map = main_space_->ReleaseMemMap();
       RemoveSpace(main_space_);
       space::Space* old_main_space = main_space_;
-      CreateMainMallocSpace(mem_map, kDefaultInitialSize, mem_map->Size(), mem_map->Size());
+      CreateMainMallocSpace(mem_map, kDefaultInitialSize, std::min(mem_map->Size(), growth_limit_),
+                            mem_map->Size());
       delete old_main_space;
       AddSpace(main_space_);
     } else {
@@ -1976,7 +2015,8 @@
     // from this point on.
     RemoveRememberedSet(old_alloc_space);
   }
-  zygote_space_ = old_alloc_space->CreateZygoteSpace("alloc space", low_memory_mode_,
+  // Remaining space becomes the new non moving space.
+  zygote_space_ = old_alloc_space->CreateZygoteSpace(kNonMovingSpaceName, low_memory_mode_,
                                                      &non_moving_space_);
   CHECK(!non_moving_space_->CanMoveObjects());
   if (same_space) {
@@ -2952,7 +2992,18 @@
 
 void Heap::ClearGrowthLimit() {
   growth_limit_ = capacity_;
-  non_moving_space_->ClearGrowthLimit();
+  for (const auto& space : continuous_spaces_) {  // Every malloc space gets its limit lifted.
+    if (space->IsMallocSpace()) {
+      gc::space::MallocSpace* malloc_space = space->AsMallocSpace();
+      malloc_space->ClearGrowthLimit();
+      malloc_space->SetFootprintLimit(malloc_space->Capacity());  // Footprint limit := capacity.
+    }
+  }
+  // main_space_backup_ isn't added to continuous_spaces_ (for performance), so update it here.
+  if (main_space_backup_.get() != nullptr) {
+    main_space_backup_->ClearGrowthLimit();
+    main_space_backup_->SetFootprintLimit(main_space_backup_->Capacity());
+  }
 }
 
 void Heap::AddFinalizerReference(Thread* self, mirror::Object** object) {
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 69a573e..4e1a0ff 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -654,7 +654,7 @@
 
   // We don't force this to be inlined since it is a slow path.
   template <bool kInstrumented, typename PreFenceVisitor>
-  mirror::Object* AllocLargeObject(Thread* self, mirror::Class* klass, size_t byte_count,
+  mirror::Object* AllocLargeObject(Thread* self, mirror::Class** klass, size_t byte_count,
                                    const PreFenceVisitor& pre_fence_visitor)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index b232128..071997f 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -74,17 +74,17 @@
 // out-of-date. We also don't really care if this fails since it is just a convenience.
 // Adapted from prune_dex_cache(const char* subdir) in frameworks/native/cmds/installd/commands.c
 // Note this should only be used during first boot.
-static void RealPruneDexCache(const std::string& cache_dir_path);
+static void RealPruneDalvikCache(const std::string& cache_dir_path);
 
-static void PruneDexCache(InstructionSet isa) {
+static void PruneDalvikCache(InstructionSet isa) {
   CHECK_NE(isa, kNone);
   // Prune the base /data/dalvik-cache.
-  RealPruneDexCache(GetDalvikCacheOrDie(".", false));
+  RealPruneDalvikCache(GetDalvikCacheOrDie(".", false));
   // Prune /data/dalvik-cache/<isa>.
-  RealPruneDexCache(GetDalvikCacheOrDie(GetInstructionSetString(isa), false));
+  RealPruneDalvikCache(GetDalvikCacheOrDie(GetInstructionSetString(isa), false));
 }
 
-static void RealPruneDexCache(const std::string& cache_dir_path) {
+static void RealPruneDalvikCache(const std::string& cache_dir_path) {
   if (!OS::DirectoryExists(cache_dir_path.c_str())) {
     return;
   }
@@ -118,6 +118,28 @@
   CHECK_EQ(0, TEMP_FAILURE_RETRY(closedir(cache_dir))) << "Unable to close directory.";
 }
 
+// We write out an empty file to the zygote's ISA specific cache dir at the start of
+// every zygote boot and delete it when the boot completes. If we find a file already
+// present, it usually means the boot didn't complete. We prune the ISA specific
+// dalvik cache dir if that's the case. Failing to write the marker is not fatal.
+static void MarkZygoteStart(const InstructionSet isa) {
+  const std::string isa_subdir = GetDalvikCacheOrDie(GetInstructionSetString(isa), false);
+  const std::string boot_marker = isa_subdir + "/.booting";
+
+  if (OS::FileExists(boot_marker.c_str())) {
+    LOG(WARNING) << "Incomplete boot detected. Pruning dalvik cache";
+    RealPruneDalvikCache(isa_subdir);
+  }
+
+  VLOG(startup) << "Creating boot start marker: " << boot_marker;
+  std::unique_ptr<File> f(OS::CreateEmptyFile(boot_marker.c_str()));
+  if (f.get() == nullptr) {
+    PLOG(WARNING) << "Failed to create boot marker.";
+  } else if (f->FlushCloseOrErase() != 0) {
+    PLOG(WARNING) << "Failed to write boot marker.";
+  }
+}
+
 static bool GenerateImage(const std::string& image_filename, InstructionSet image_isa,
                           std::string* error_msg) {
   const std::string boot_class_path_string(Runtime::Current()->GetBootClassPathString());
@@ -130,7 +152,7 @@
   // We should clean up so we are more likely to have room for the image.
   if (Runtime::Current()->IsZygote()) {
     LOG(INFO) << "Pruning dalvik-cache since we are generating an image and will need to recompile";
-    PruneDexCache(image_isa);
+    PruneDalvikCache(image_isa);
   }
 
   std::vector<std::string> arg_vector;
@@ -232,7 +254,7 @@
   // We should clean up so we are more likely to have room for the image.
   if (Runtime::Current()->IsZygote()) {
     LOG(INFO) << "Pruning dalvik-cache since we are relocating an image and will need to recompile";
-    PruneDexCache(isa);
+    PruneDalvikCache(isa);
   }
 
   std::string patchoat(Runtime::Current()->GetPatchoatExecutable());
@@ -427,6 +449,10 @@
                                              &has_system, &cache_filename, &dalvik_cache_exists,
                                              &has_cache, &is_global_cache);
 
+  if (Runtime::Current()->IsZygote()) {
+    MarkZygoteStart(image_isa);
+  }
+
   ImageSpace* space;
   bool relocate = Runtime::Current()->ShouldRelocate();
   bool can_compile = Runtime::Current()->IsImageDex2OatEnabled();
@@ -475,7 +501,7 @@
             // Since ImageCreationAllowed was true above, we are the zygote
             // and therefore the only process expected to generate these for
             // the device.
-            PruneDexCache(image_isa);
+            PruneDalvikCache(image_isa);
             return nullptr;
           }
         }
@@ -530,7 +556,7 @@
                                 "but image failed to load: %s",
                                 image_location, cache_filename.c_str(), system_filename.c_str(),
                                 error_msg->c_str());
-      PruneDexCache(image_isa);
+      PruneDalvikCache(image_isa);
       return nullptr;
     } else if (is_system) {
       // If the /system file exists, it should be up-to-date, don't try to generate it.
@@ -558,13 +584,13 @@
     // Since ImageCreationAllowed was true above, we are the zygote
     // and therefore the only process expected to generate these for
     // the device.
-    PruneDexCache(image_isa);
+    PruneDalvikCache(image_isa);
     return nullptr;
   } else {
     // Check whether there is enough space left over after we have generated the image.
     if (!CheckSpace(cache_filename, error_msg)) {
       // No. Delete the generated image and try to run out of the dex files.
-      PruneDexCache(image_isa);
+      PruneDalvikCache(image_isa);
       return nullptr;
     }
 
diff --git a/runtime/gc/space/valgrind_malloc_space-inl.h b/runtime/gc/space/valgrind_malloc_space-inl.h
index 793d798..ae8e892 100644
--- a/runtime/gc/space/valgrind_malloc_space-inl.h
+++ b/runtime/gc/space/valgrind_malloc_space-inl.h
@@ -126,6 +126,30 @@
           size_t kValgrindRedZoneBytes,
           bool kAdjustForRedzoneInAllocSize,
           bool kUseObjSizeForUsable>
+mirror::Object* ValgrindMallocSpace<S,
+                                    kValgrindRedZoneBytes,
+                                    kAdjustForRedzoneInAllocSize,
+                                    kUseObjSizeForUsable>::AllocThreadUnsafe(
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+  // Ask the wrapped space for the object plus two red zones' worth of extra bytes.
+  size_t raw_bytes_allocated;
+  size_t raw_usable_size;
+  void* const raw_block = S::AllocThreadUnsafe(self, num_bytes + 2 * kValgrindRedZoneBytes,
+                                               &raw_bytes_allocated, &raw_usable_size);
+  if (raw_block == nullptr) {
+    return nullptr;  // Underlying allocation failed; out-params are left untouched.
+  }
+  // Hand the raw block and its sizes to AdjustForValgrind, which produces the result.
+  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+      raw_block, num_bytes,
+      raw_bytes_allocated, raw_usable_size,
+      bytes_allocated_out, usable_size_out);
+}
+
+template <typename S,
+          size_t kValgrindRedZoneBytes,
+          bool kAdjustForRedzoneInAllocSize,
+          bool kUseObjSizeForUsable>
 size_t ValgrindMallocSpace<S,
                            kValgrindRedZoneBytes,
                            kAdjustForRedzoneInAllocSize,
diff --git a/runtime/gc/space/valgrind_malloc_space.h b/runtime/gc/space/valgrind_malloc_space.h
index d102f49..707ea69 100644
--- a/runtime/gc/space/valgrind_malloc_space.h
+++ b/runtime/gc/space/valgrind_malloc_space.h
@@ -37,6 +37,9 @@
                                   size_t* usable_size) OVERRIDE;
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
                         size_t* usable_size) OVERRIDE;
+  mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
+                                    size_t* usable_size) OVERRIDE
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE;
 
diff --git a/runtime/globals.h b/runtime/globals.h
index 4d33196..3104229 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -64,6 +64,12 @@
 static constexpr bool kUsePortableCompiler = false;
 #endif
 
+#if defined(ART_USE_OPTIMIZING_COMPILER)
+static constexpr bool kUseOptimizingCompiler = true;
+#else
+static constexpr bool kUseOptimizingCompiler = false;
+#endif
+
 // Garbage collector constants.
 static constexpr bool kMovingCollector = true && !kUsePortableCompiler;
 static constexpr bool kMarkCompactSupport = false && kMovingCollector;
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 14d7432..3069581 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -475,9 +475,14 @@
         }
       }
 
-      std::unique_ptr<File> file(new File(out_fd, filename_));
+      std::unique_ptr<File> file(new File(out_fd, filename_, true));
       okay = file->WriteFully(header_data_ptr_, header_data_size_) &&
-          file->WriteFully(body_data_ptr_, body_data_size_);
+             file->WriteFully(body_data_ptr_, body_data_size_);
+      if (okay) {
+        okay = file->FlushCloseOrErase() == 0;
+      } else {
+        file->Erase();
+      }
       if (!okay) {
         std::string msg(StringPrintf("Couldn't dump heap; writing \"%s\" failed: %s",
                                      filename_.c_str(), strerror(errno)));
diff --git a/runtime/indirect_reference_table.cc b/runtime/indirect_reference_table.cc
index 4d177a3..0d84a1e 100644
--- a/runtime/indirect_reference_table.cc
+++ b/runtime/indirect_reference_table.cc
@@ -162,13 +162,12 @@
   DCHECK(table_ != NULL);
   DCHECK_GE(segment_state_.parts.numHoles, prevState.parts.numHoles);
 
-  int idx = ExtractIndex(iref);
-
   if (GetIndirectRefKind(iref) == kHandleScopeOrInvalid &&
       Thread::Current()->HandleScopeContains(reinterpret_cast<jobject>(iref))) {
     LOG(WARNING) << "Attempt to remove local handle scope entry from IRT, ignoring";
     return true;
   }
+  const int idx = ExtractIndex(iref);
   if (idx < bottomIndex) {
     // Wrong segment.
     LOG(WARNING) << "Attempt to remove index outside index area (" << idx
@@ -236,6 +235,13 @@
   return true;
 }
 
+void IndirectReferenceTable::Trim() {
+  // Advise the kernel that whole pages from slot Capacity() to the map's end are unneeded.
+  uint8_t* const map_end = table_mem_map_->End();
+  auto* const first_page = AlignUp(reinterpret_cast<uint8_t*>(&table_[Capacity()]), kPageSize);
+  madvise(first_page, map_end - first_page, MADV_DONTNEED);
+}
+
 void IndirectReferenceTable::VisitRoots(RootCallback* callback, void* arg, uint32_t tid,
                                         RootType root_type) {
   for (auto ref : *this) {
diff --git a/runtime/indirect_reference_table.h b/runtime/indirect_reference_table.h
index 168f9f2..fbd5714 100644
--- a/runtime/indirect_reference_table.h
+++ b/runtime/indirect_reference_table.h
@@ -331,6 +331,9 @@
     return Offset(OFFSETOF_MEMBER(IndirectReferenceTable, segment_state_));
   }
 
+  // Release pages past the end of the table that may have previously held references.
+  void Trim() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
  private:
   // Extract the table index from an indirect reference.
   static uint32_t ExtractIndex(IndirectRef iref) {
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 003e160..639b0f0 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -869,10 +869,10 @@
   ConfigureStubs(false, false);
 }
 
-const void* Instrumentation::GetQuickCodeFor(mirror::ArtMethod* method) const {
+const void* Instrumentation::GetQuickCodeFor(mirror::ArtMethod* method, size_t pointer_size) const {
   Runtime* runtime = Runtime::Current();
   if (LIKELY(!instrumentation_stubs_installed_)) {
-    const void* code = method->GetEntryPointFromQuickCompiledCode();
+    const void* code = method->GetEntryPointFromQuickCompiledCodePtrSize(pointer_size);
     DCHECK(code != nullptr);
     ClassLinker* class_linker = runtime->GetClassLinker();
     if (LIKELY(!class_linker->IsQuickResolutionStub(code) &&
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 369039d..effa9f7 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -200,7 +200,7 @@
   // Get the quick code for the given method. More efficient than asking the class linker as it
   // will short-cut to GetCode if instrumentation and static method resolution stubs aren't
   // installed.
-  const void* GetQuickCodeFor(mirror::ArtMethod* method) const
+  const void* GetQuickCodeFor(mirror::ArtMethod* method, size_t pointer_size) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void ForceInterpretOnly() {
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 7f6303a..f88d56a 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -201,10 +201,11 @@
     }
   }
   mirror::ArtMethod* method = shadow_frame.GetMethod();
-  mirror::String* s = method->GetDexCacheStrings()->Get(string_idx);
+  mirror::Class* declaring_class = method->GetDeclaringClass();
+  mirror::String* s = declaring_class->GetDexCacheStrings()->Get(string_idx);
   if (UNLIKELY(s == nullptr)) {
     StackHandleScope<1> hs(self);
-    Handle<mirror::DexCache> dex_cache(hs.NewHandle(method->GetDexCache()));
+    Handle<mirror::DexCache> dex_cache(hs.NewHandle(declaring_class->GetDexCache()));
     s = Runtime::Current()->GetClassLinker()->ResolveString(*method->GetDexFile(), string_idx,
                                                             dex_cache);
   }
diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc
index a5abce6..5d04fac 100644
--- a/runtime/java_vm_ext.cc
+++ b/runtime/java_vm_ext.cc
@@ -756,6 +756,11 @@
   }
 }
 
+void JavaVMExt::TrimGlobals() {
+  WriterMutexLock mu(Thread::Current(), globals_lock_);  // Take globals_lock_ as a writer.
+  globals_.Trim();  // Delegates to the globals table's own Trim().
+}
+
 void JavaVMExt::VisitRoots(RootCallback* callback, void* arg) {
   Thread* self = Thread::Current();
   {
diff --git a/runtime/java_vm_ext.h b/runtime/java_vm_ext.h
index 2957ba3..749b9fb 100644
--- a/runtime/java_vm_ext.h
+++ b/runtime/java_vm_ext.h
@@ -131,6 +131,9 @@
     return unchecked_functions_;
   }
 
+  void TrimGlobals() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      LOCKS_EXCLUDED(globals_lock_);
+
  private:
   Runtime* const runtime_;
 
diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc
index 44f713c..1e0a2d2 100644
--- a/runtime/jdwp/jdwp_event.cc
+++ b/runtime/jdwp/jdwp_event.cc
@@ -125,6 +125,10 @@
 };
 
 static bool NeedsFullDeoptimization(JdwpEventKind eventKind) {
+  if (!Dbg::RequiresDeoptimization()) {
+    // We don't need deoptimization for debugging.
+    return false;
+  }
   switch (eventKind) {
       case EK_METHOD_ENTRY:
       case EK_METHOD_EXIT:
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 1dcfcab..4797e69 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -566,7 +566,8 @@
     return soa.AddLocalReference<jobject>(decoded_obj);
   }
 
-  static void DeleteLocalRef(JNIEnv* env, jobject obj) {
+  static void DeleteLocalRef(JNIEnv* env, jobject obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (obj == nullptr) {
       return;
     }
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index c24ef05..62b6b34 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -390,59 +390,72 @@
   void ReleasePrimitiveArrayElementsOfWrongType(bool check_jni) {
     bool old_check_jni = vm_->SetCheckJniEnabled(check_jni);
     CheckJniAbortCatcher jni_abort_catcher;
+    {
+      jbooleanArray array = env_->NewBooleanArray(10);
+      ASSERT_TRUE(array != nullptr);
+      jboolean is_copy;
+      jboolean* elements = env_->GetBooleanArrayElements(array, &is_copy);
+      ASSERT_TRUE(elements != nullptr);
+      env_->ReleaseByteArrayElements(reinterpret_cast<jbyteArray>(array),
+                                     reinterpret_cast<jbyte*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected byte[]"
+              : "attempt to release byte primitive array elements with an object of type boolean[]");
+      env_->ReleaseShortArrayElements(reinterpret_cast<jshortArray>(array),
+                                      reinterpret_cast<jshort*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected short[]"
+              : "attempt to release short primitive array elements with an object of type boolean[]");
+      env_->ReleaseCharArrayElements(reinterpret_cast<jcharArray>(array),
+                                     reinterpret_cast<jchar*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected char[]"
+              : "attempt to release char primitive array elements with an object of type boolean[]");
+      env_->ReleaseIntArrayElements(reinterpret_cast<jintArray>(array),
+                                    reinterpret_cast<jint*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected int[]"
+              : "attempt to release int primitive array elements with an object of type boolean[]");
+      env_->ReleaseLongArrayElements(reinterpret_cast<jlongArray>(array),
+                                     reinterpret_cast<jlong*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected long[]"
+              : "attempt to release long primitive array elements with an object of type boolean[]");
+      env_->ReleaseFloatArrayElements(reinterpret_cast<jfloatArray>(array),
+                                      reinterpret_cast<jfloat*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected float[]"
+              : "attempt to release float primitive array elements with an object of type boolean[]");
+      env_->ReleaseDoubleArrayElements(reinterpret_cast<jdoubleArray>(array),
+                                       reinterpret_cast<jdouble*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type boolean[] expected double[]"
+              : "attempt to release double primitive array elements with an object of type boolean[]");
 
-    jbooleanArray array = env_->NewBooleanArray(10);
-    ASSERT_TRUE(array != nullptr);
-    jboolean is_copy;
-    jboolean* elements = env_->GetBooleanArrayElements(array, &is_copy);
-    ASSERT_TRUE(elements != nullptr);
-    env_->ReleaseByteArrayElements(reinterpret_cast<jbyteArray>(array),
-                                   reinterpret_cast<jbyte*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected byte[]"
-            : "attempt to release byte primitive array elements with an object of type boolean[]");
-    env_->ReleaseShortArrayElements(reinterpret_cast<jshortArray>(array),
-                                    reinterpret_cast<jshort*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected short[]"
-            : "attempt to release short primitive array elements with an object of type boolean[]");
-    env_->ReleaseCharArrayElements(reinterpret_cast<jcharArray>(array),
-                                   reinterpret_cast<jchar*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected char[]"
-            : "attempt to release char primitive array elements with an object of type boolean[]");
-    env_->ReleaseIntArrayElements(reinterpret_cast<jintArray>(array),
-                                  reinterpret_cast<jint*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected int[]"
-            : "attempt to release int primitive array elements with an object of type boolean[]");
-    env_->ReleaseLongArrayElements(reinterpret_cast<jlongArray>(array),
-                                   reinterpret_cast<jlong*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected long[]"
-            : "attempt to release long primitive array elements with an object of type boolean[]");
-    env_->ReleaseFloatArrayElements(reinterpret_cast<jfloatArray>(array),
-                                    reinterpret_cast<jfloat*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected float[]"
-            : "attempt to release float primitive array elements with an object of type boolean[]");
-    env_->ReleaseDoubleArrayElements(reinterpret_cast<jdoubleArray>(array),
-                                     reinterpret_cast<jdouble*>(elements), 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type boolean[] expected double[]"
-            : "attempt to release double primitive array elements with an object of type boolean[]");
-    jbyteArray array2 = env_->NewByteArray(10);
-    env_->ReleaseBooleanArrayElements(reinterpret_cast<jbooleanArray>(array2), elements, 0);
-    jni_abort_catcher.Check(
-        check_jni ? "incompatible array type byte[] expected boolean[]"
-            : "attempt to release boolean primitive array elements with an object of type byte[]");
-    jobject object = env_->NewStringUTF("Test String");
-    env_->ReleaseBooleanArrayElements(reinterpret_cast<jbooleanArray>(object), elements, 0);
-    jni_abort_catcher.Check(
-        check_jni ? "jarray argument has non-array type: java.lang.String"
-            : "attempt to release boolean primitive array elements with an object of type "
+      // Don't leak the elements array.
+      env_->ReleaseBooleanArrayElements(array, elements, 0);
+    }
+    {  // Converse case: byte[] elements released through the boolean[] entry point.
+      jbyteArray array = env_->NewByteArray(10);
+      ASSERT_TRUE(array != nullptr);
+      jbyte* elements = env_->GetByteArrayElements(array, nullptr);  // is_copy is not needed.
+      ASSERT_TRUE(elements != nullptr);
+      env_->ReleaseBooleanArrayElements(reinterpret_cast<jbooleanArray>(array),
+                                        reinterpret_cast<jboolean*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "incompatible array type byte[] expected boolean[]"
+              : "attempt to release boolean primitive array elements with an object of type byte[]");
+      jobject object = env_->NewStringUTF("Test String");
+      env_->ReleaseBooleanArrayElements(reinterpret_cast<jbooleanArray>(object),
+                                        reinterpret_cast<jboolean*>(elements), 0);
+      jni_abort_catcher.Check(
+          check_jni ? "jarray argument has non-array type: java.lang.String"
+              : "attempt to release boolean primitive array elements with an object of type "
               "java.lang.String");
 
+      // Don't leak the elements array.
+      env_->ReleaseByteArrayElements(array, elements, 0);
+    }
     EXPECT_EQ(check_jni, vm_->SetCheckJniEnabled(old_check_jni));
   }
 
diff --git a/runtime/mirror/art_method-inl.h b/runtime/mirror/art_method-inl.h
index 62d17ab..22d55e2 100644
--- a/runtime/mirror/art_method-inl.h
+++ b/runtime/mirror/art_method-inl.h
@@ -82,11 +82,6 @@
   return GetField32(OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_method_index_));
 }
 
-inline ObjectArray<String>* ArtMethod::GetDexCacheStrings() {
-  return GetFieldObject<ObjectArray<String>>(
-      OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_strings_));
-}
-
 inline ObjectArray<ArtMethod>* ArtMethod::GetDexCacheResolvedMethods() {
   return GetFieldObject<ObjectArray<ArtMethod>>(
       OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_resolved_methods_));
@@ -199,17 +194,17 @@
   SetEntryPointFromPortableCompiledCode(reinterpret_cast<void*>(code_offset));
 }
 
-inline const uint8_t* ArtMethod::GetMappingTable() {
-  const void* code_pointer = GetQuickOatCodePointer();
+inline const uint8_t* ArtMethod::GetMappingTable(size_t pointer_size) {
+  const void* code_pointer = GetQuickOatCodePointer(pointer_size);
   if (code_pointer == nullptr) {
     return nullptr;
   }
-  return GetMappingTable(code_pointer);
+  return GetMappingTable(code_pointer, pointer_size);
 }
 
-inline const uint8_t* ArtMethod::GetMappingTable(const void* code_pointer) {
+inline const uint8_t* ArtMethod::GetMappingTable(const void* code_pointer, size_t pointer_size) {
   DCHECK(code_pointer != nullptr);
-  DCHECK(code_pointer == GetQuickOatCodePointer());
+  DCHECK_EQ(code_pointer, GetQuickOatCodePointer(pointer_size));
   uint32_t offset =
       reinterpret_cast<const OatQuickMethodHeader*>(code_pointer)[-1].mapping_table_offset_;
   if (UNLIKELY(offset == 0u)) {
@@ -218,18 +213,18 @@
   return reinterpret_cast<const uint8_t*>(code_pointer) - offset;
 }
 
-inline const uint8_t* ArtMethod::GetVmapTable() {
-  const void* code_pointer = GetQuickOatCodePointer();
+inline const uint8_t* ArtMethod::GetVmapTable(size_t pointer_size) {
+  const void* code_pointer = GetQuickOatCodePointer(pointer_size);
   if (code_pointer == nullptr) {
     return nullptr;
   }
-  return GetVmapTable(code_pointer);
+  return GetVmapTable(code_pointer, pointer_size);
 }
 
-inline const uint8_t* ArtMethod::GetVmapTable(const void* code_pointer) {
-  CHECK(!IsOptimized()) << "Unimplemented vmap table for optimized compiler";
+inline const uint8_t* ArtMethod::GetVmapTable(const void* code_pointer, size_t pointer_size) {
+  CHECK(!IsOptimized(pointer_size)) << "Unimplemented vmap table for optimized compiler";
   DCHECK(code_pointer != nullptr);
-  DCHECK(code_pointer == GetQuickOatCodePointer());
+  DCHECK_EQ(code_pointer, GetQuickOatCodePointer(pointer_size));
   uint32_t offset =
       reinterpret_cast<const OatQuickMethodHeader*>(code_pointer)[-1].vmap_table_offset_;
   if (UNLIKELY(offset == 0u)) {
@@ -243,8 +238,8 @@
 }
 
 inline CodeInfo ArtMethod::GetOptimizedCodeInfo() {
-  DCHECK(IsOptimized());
-  const void* code_pointer = GetQuickOatCodePointer();
+  DCHECK(IsOptimized(sizeof(void*)));
+  const void* code_pointer = GetQuickOatCodePointer(sizeof(void*));
   DCHECK(code_pointer != nullptr);
   uint32_t offset =
       reinterpret_cast<const OatQuickMethodHeader*>(code_pointer)[-1].vmap_table_offset_;
@@ -252,14 +247,23 @@
   return CodeInfo(data);
 }
 
-inline void ArtMethod::SetOatNativeGcMapOffset(uint32_t gc_map_offset) {
-  DCHECK(!Runtime::Current()->IsStarted());
-  SetNativeGcMap(reinterpret_cast<uint8_t*>(gc_map_offset));
+inline const uint8_t* ArtMethod::GetNativeGcMap(size_t pointer_size) {
+  const void* code_pointer = GetQuickOatCodePointer(pointer_size);
+  if (code_pointer == nullptr) {
+    return nullptr;
+  }
+  return GetNativeGcMap(code_pointer, pointer_size);
 }
 
-inline uint32_t ArtMethod::GetOatNativeGcMapOffset() {
-  DCHECK(!Runtime::Current()->IsStarted());
-  return PointerToLowMemUInt32(GetNativeGcMap());
+inline const uint8_t* ArtMethod::GetNativeGcMap(const void* code_pointer, size_t pointer_size) {
+  DCHECK(code_pointer != nullptr);
+  DCHECK_EQ(code_pointer, GetQuickOatCodePointer(pointer_size));
+  uint32_t offset =
+      reinterpret_cast<const OatQuickMethodHeader*>(code_pointer)[-1].gc_map_offset_;
+  if (UNLIKELY(offset == 0u)) {
+    return nullptr;
+  }
+  return reinterpret_cast<const uint8_t*>(code_pointer) - offset;
 }
 
 inline bool ArtMethod::IsRuntimeMethod() {
@@ -303,13 +307,14 @@
 }
 
 inline uintptr_t ArtMethod::NativeQuickPcOffset(const uintptr_t pc) {
-  const void* code = Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(this);
+  const void* code = Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(
+      this, sizeof(void*));
   return pc - reinterpret_cast<uintptr_t>(code);
 }
 
 inline QuickMethodFrameInfo ArtMethod::GetQuickFrameInfo(const void* code_pointer) {
   DCHECK(code_pointer != nullptr);
-  DCHECK_EQ(code_pointer, GetQuickOatCodePointer());
+  DCHECK_EQ(code_pointer, GetQuickOatCodePointer(sizeof(void*)));
   return reinterpret_cast<const OatQuickMethodHeader*>(code_pointer)[-1].frame_info_;
 }
 
@@ -450,11 +455,6 @@
   return interface_method;
 }
 
-inline void ArtMethod::SetDexCacheStrings(ObjectArray<String>* new_dex_cache_strings) {
-  SetFieldObject<false>(OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_strings_),
-                        new_dex_cache_strings);
-}
-
 inline void ArtMethod::SetDexCacheResolvedMethods(ObjectArray<ArtMethod>* new_dex_cache_methods) {
   SetFieldObject<false>(OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_resolved_methods_),
                         new_dex_cache_methods);
@@ -482,7 +482,11 @@
 inline void ArtMethod::CheckObjectSizeEqualsMirrorSize() {
   // Using the default, check the class object size to make sure it matches the size of the
   // object.
-  DCHECK_EQ(GetClass()->GetObjectSize(), sizeof(*this));
+  size_t this_size = sizeof(*this);
+#ifdef ART_METHOD_HAS_PADDING_FIELD_ON_64_BIT
+  this_size += sizeof(void*) - sizeof(uint32_t);
+#endif
+  DCHECK_EQ(GetClass()->GetObjectSize(), this_size);
 }
 
 }  // namespace mirror
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index 3b4d5f3..4f5ca3f 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -165,9 +165,9 @@
     // Portable doesn't use the machine pc, we just use dex pc instead.
     return static_cast<uint32_t>(pc);
   }
-  const void* entry_point = GetQuickOatEntryPoint();
-  MappingTable table(
-      entry_point != nullptr ? GetMappingTable(EntryPointToCodePointer(entry_point)) : nullptr);
+  const void* entry_point = GetQuickOatEntryPoint(sizeof(void*));
+  MappingTable table(entry_point != nullptr ?
+      GetMappingTable(EntryPointToCodePointer(entry_point), sizeof(void*)) : nullptr);
   if (table.TotalSize() == 0) {
     // NOTE: Special methods (see Mir2Lir::GenSpecialCase()) have an empty mapping
     // but they have no suspend checks and, consequently, we never call ToDexPc() for them.
@@ -198,9 +198,9 @@
 }
 
 uintptr_t ArtMethod::ToNativeQuickPc(const uint32_t dex_pc, bool abort_on_failure) {
-  const void* entry_point = GetQuickOatEntryPoint();
-  MappingTable table(
-      entry_point != nullptr ? GetMappingTable(EntryPointToCodePointer(entry_point)) : nullptr);
+  const void* entry_point = GetQuickOatEntryPoint(sizeof(void*));
+  MappingTable table(entry_point != nullptr ?
+      GetMappingTable(EntryPointToCodePointer(entry_point), sizeof(void*)) : nullptr);
   if (table.TotalSize() == 0) {
     DCHECK_EQ(dex_pc, 0U);
     return 0;   // Special no mapping/pc == 0 case
@@ -320,13 +320,13 @@
   }
 }
 
-const void* ArtMethod::GetQuickOatEntryPoint() {
+const void* ArtMethod::GetQuickOatEntryPoint(size_t pointer_size) {
   if (IsPortableCompiled() || IsAbstract() || IsRuntimeMethod() || IsProxyMethod()) {
     return nullptr;
   }
   Runtime* runtime = Runtime::Current();
   ClassLinker* class_linker = runtime->GetClassLinker();
-  const void* code = runtime->GetInstrumentation()->GetQuickCodeFor(this);
+  const void* code = runtime->GetInstrumentation()->GetQuickCodeFor(this, pointer_size);
   // On failure, instead of nullptr we get the quick-generic-jni-trampoline for native method
   // indicating the generic JNI, or the quick-to-interpreter-bridge (but not the trampoline)
   // for non-native methods.
@@ -340,7 +340,7 @@
 #ifndef NDEBUG
 uintptr_t ArtMethod::NativeQuickPcOffset(const uintptr_t pc, const void* quick_entry_point) {
   CHECK_NE(quick_entry_point, GetQuickToInterpreterBridge());
-  CHECK_EQ(quick_entry_point, Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(this));
+  CHECK_EQ(quick_entry_point, Runtime::Current()->GetInstrumentation()->GetQuickCodeFor(this, sizeof(void*)));
   return pc - reinterpret_cast<uintptr_t>(quick_entry_point);
 }
 #endif
@@ -436,18 +436,32 @@
     return QuickMethodFrameInfo(kStackAlignment, 0u, 0u);
   }
   Runtime* runtime = Runtime::Current();
-  // For Proxy method we exclude direct method (there is only one direct method - constructor).
-  // Direct method is cloned from original java.lang.reflect.Proxy class together with code
-  // and as a result it is executed as usual quick compiled method without any stubs.
-  // So the frame info should be returned as it is a quick method not a stub.
-  if (UNLIKELY(IsAbstract()) || UNLIKELY(IsProxyMethod() && !IsDirect())) {
+
+  if (UNLIKELY(IsAbstract())) {
     return runtime->GetCalleeSaveMethodFrameInfo(Runtime::kRefsAndArgs);
   }
+
+  // Proxy methods need special handling for the direct method case (there is only one direct
+  // method - the constructor). The direct method is cloned from the original
+  // java.lang.reflect.Proxy class together with its code, so it is executed as a usual quick
+  // compiled method without any stubs, and its frame info must be that of a quick method, not a
+  // stub. However, if instrumentation stubs are installed, instrumentation->GetQuickCodeFor()
+  // returns the artQuickProxyInvokeHandler instead of an oat code pointer, thus we have to add a
+  // special case here.
+  if (UNLIKELY(IsProxyMethod())) {
+    if (IsDirect()) {
+      CHECK(IsConstructor());
+      return GetQuickFrameInfo(EntryPointToCodePointer(GetEntryPointFromQuickCompiledCode()));
+    } else {
+      return runtime->GetCalleeSaveMethodFrameInfo(Runtime::kRefsAndArgs);
+    }
+  }
+
   if (UNLIKELY(IsRuntimeMethod())) {
     return runtime->GetRuntimeMethodFrameInfo(this);
   }
 
-  const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(this);
+  const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(this, sizeof(void*));
   ClassLinker* class_linker = runtime->GetClassLinker();
   // On failure, instead of nullptr we get the quick-generic-jni-trampoline for native method
   // indicating the generic JNI, or the quick-to-interpreter-bridge (but not the trampoline)
diff --git a/runtime/mirror/art_method.h b/runtime/mirror/art_method.h
index 4a7831f..da494e0 100644
--- a/runtime/mirror/art_method.h
+++ b/runtime/mirror/art_method.h
@@ -42,6 +42,8 @@
 typedef void (EntryPointFromInterpreter)(Thread* self, MethodHelper* mh,
     const DexFile::CodeItem* code_item, ShadowFrame* shadow_frame, JValue* result);
 
+#define ART_METHOD_HAS_PADDING_FIELD_ON_64_BIT
+
 // C++ mirror of java.lang.reflect.ArtMethod.
 class MANAGED ArtMethod FINAL : public Object {
  public:
@@ -146,13 +148,13 @@
     SetAccessFlags(GetAccessFlags() | kAccPreverified);
   }
 
-  bool IsOptimized() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  bool IsOptimized(size_t pointer_size) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Temporary solution for detecting if a method has been optimized: the compiler
     // does not create a GC map. Instead, the vmap table contains the stack map
     // (as in stack_map.h).
-    return (GetEntryPointFromQuickCompiledCode() != nullptr)
-        && (GetQuickOatCodePointer() != nullptr)
-        && (GetNativeGcMap() == nullptr);
+    return GetEntryPointFromQuickCompiledCodePtrSize(pointer_size) != nullptr
+        && GetQuickOatCodePointer(pointer_size) != nullptr
+        && GetNativeGcMap(pointer_size) == nullptr;
   }
 
   bool IsPortableCompiled() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -208,14 +210,6 @@
     SetField32<false>(OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_method_index_), new_idx);
   }
 
-  ObjectArray<String>* GetDexCacheStrings() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void SetDexCacheStrings(ObjectArray<String>* new_dex_cache_strings)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
-  static MemberOffset DexCacheStringsOffset() {
-    return OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_strings_);
-  }
-
   static MemberOffset DexCacheResolvedMethodsOffset() {
     return OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_resolved_methods_);
   }
@@ -278,7 +272,7 @@
   }
 
   ALWAYS_INLINE static MemberOffset EntryPointFromPortableCompiledCodeOffset(size_t pointer_size) {
-    return MemberOffset(PtrSizedFieldsOffset() + OFFSETOF_MEMBER(
+    return MemberOffset(PtrSizedFieldsOffset(pointer_size) + OFFSETOF_MEMBER(
         PtrSizedFields, entry_point_from_portable_compiled_code_) / sizeof(void*) * pointer_size);
   }
 
@@ -381,50 +375,35 @@
     return reinterpret_cast<const void*>(code);
   }
 
-  // Actual entry point pointer to compiled oat code or nullptr if method has no compiled code.
-  const void* GetQuickOatEntryPoint() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
+  // Actual entry point pointer to compiled oat code or nullptr.
+  const void* GetQuickOatEntryPoint(size_t pointer_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   // Actual pointer to compiled oat code or nullptr.
-  const void* GetQuickOatCodePointer() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return EntryPointToCodePointer(GetQuickOatEntryPoint());
+  const void* GetQuickOatCodePointer(size_t pointer_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return EntryPointToCodePointer(GetQuickOatEntryPoint(pointer_size));
   }
 
   // Callers should wrap the uint8_t* in a MappingTable instance for convenient access.
-  const uint8_t* GetMappingTable() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  const uint8_t* GetMappingTable(const void* code_pointer)
+  const uint8_t* GetMappingTable(size_t pointer_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  const uint8_t* GetMappingTable(const void* code_pointer, size_t pointer_size)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Callers should wrap the uint8_t* in a VmapTable instance for convenient access.
-  const uint8_t* GetVmapTable() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  const uint8_t* GetVmapTable(const void* code_pointer)
+  const uint8_t* GetVmapTable(size_t pointer_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  const uint8_t* GetVmapTable(const void* code_pointer, size_t pointer_size)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   StackMap GetStackMap(uint32_t native_pc_offset) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   CodeInfo GetOptimizedCodeInfo() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  const uint8_t* GetNativeGcMap() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    CheckObjectSizeEqualsMirrorSize();
-    return GetNativeGcMapPtrSize(sizeof(void*));
-  }
-  ALWAYS_INLINE const uint8_t* GetNativeGcMapPtrSize(size_t pointer_size)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return GetFieldPtrWithSize<uint8_t*>(GcMapOffset(pointer_size), pointer_size);
-  }
-  template <VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  void SetNativeGcMap(const uint8_t* data) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    CheckObjectSizeEqualsMirrorSize();
-    SetNativeGcMapPtrSize(data, sizeof(void*));
-  }
-  template <VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  ALWAYS_INLINE void SetNativeGcMapPtrSize(const uint8_t* data, size_t pointer_size)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    SetFieldPtrWithSize<false, true, kVerifyFlags>(GcMapOffset(pointer_size), data,
-                                                   pointer_size);
-  }
-
-  // When building the oat need a convenient place to stuff the offset of the native GC map.
-  void SetOatNativeGcMapOffset(uint32_t gc_map_offset) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  uint32_t GetOatNativeGcMapOffset() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  // Callers should wrap the uint8_t* in a GcMap instance for convenient access.
+  const uint8_t* GetNativeGcMap(size_t pointer_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  const uint8_t* GetNativeGcMap(const void* code_pointer, size_t pointer_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   template <bool kCheckFrameSize = true>
   uint32_t GetFrameSizeInBytes() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -460,25 +439,20 @@
   void UnregisterNative() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   static MemberOffset EntryPointFromInterpreterOffset(size_t pointer_size) {
-    return MemberOffset(PtrSizedFieldsOffset() + OFFSETOF_MEMBER(
+    return MemberOffset(PtrSizedFieldsOffset(pointer_size) + OFFSETOF_MEMBER(
         PtrSizedFields, entry_point_from_interpreter_) / sizeof(void*) * pointer_size);
   }
 
   static MemberOffset EntryPointFromJniOffset(size_t pointer_size) {
-    return MemberOffset(PtrSizedFieldsOffset() + OFFSETOF_MEMBER(
+    return MemberOffset(PtrSizedFieldsOffset(pointer_size) + OFFSETOF_MEMBER(
         PtrSizedFields, entry_point_from_jni_) / sizeof(void*) * pointer_size);
   }
 
   static MemberOffset EntryPointFromQuickCompiledCodeOffset(size_t pointer_size) {
-    return MemberOffset(PtrSizedFieldsOffset() + OFFSETOF_MEMBER(
+    return MemberOffset(PtrSizedFieldsOffset(pointer_size) + OFFSETOF_MEMBER(
         PtrSizedFields, entry_point_from_quick_compiled_code_) / sizeof(void*) * pointer_size);
   }
 
-  static MemberOffset GcMapOffset(size_t pointer_size) {
-    return MemberOffset(PtrSizedFieldsOffset() + OFFSETOF_MEMBER(
-        PtrSizedFields, gc_map_) / sizeof(void*) * pointer_size);
-  }
-
   void* GetEntryPointFromJni() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CheckObjectSizeEqualsMirrorSize();
     return GetEntryPointFromJniPtrSize(sizeof(void*));
@@ -601,13 +575,19 @@
 
   ALWAYS_INLINE ArtMethod* GetInterfaceMethodIfProxy() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static size_t SizeWithoutPointerFields() {
-    return sizeof(ArtMethod) - sizeof(PtrSizedFields);
+  static size_t SizeWithoutPointerFields(size_t pointer_size) {
+    size_t total = sizeof(ArtMethod) - sizeof(PtrSizedFields);
+#ifdef ART_METHOD_HAS_PADDING_FIELD_ON_64_BIT
+    // Add 4 bytes if 64 bit, otherwise 0.
+    total += pointer_size - sizeof(uint32_t);
+#endif
+    return total;
   }
 
   // Size of an instance of java.lang.reflect.ArtMethod not including its value array.
   static size_t InstanceSize(size_t pointer_size) {
-    return SizeWithoutPointerFields() + (sizeof(PtrSizedFields) / sizeof(void*)) * pointer_size;
+    return SizeWithoutPointerFields(pointer_size) +
+        (sizeof(PtrSizedFields) / sizeof(void*)) * pointer_size;
   }
 
  protected:
@@ -621,9 +601,6 @@
   // Short cuts to declaring_class_->dex_cache_ member for fast compiled code access.
   HeapReference<ObjectArray<Class>> dex_cache_resolved_types_;
 
-  // Short cuts to declaring_class_->dex_cache_ member for fast compiled code access.
-  HeapReference<ObjectArray<String>> dex_cache_strings_;
-
   // Access flags; low 16 bits are defined by spec.
   uint32_t access_flags_;
 
@@ -642,7 +619,7 @@
   // ifTable.
   uint32_t method_index_;
 
-  // Add alignment word here if necessary.
+  // Fake 4-byte padding field gets inserted here on 64 bit (nothing on 32 bit).
 
   // Must be the last fields in the method.
   struct PACKED(4) PtrSizedFields {
@@ -657,11 +634,6 @@
     // portable compiled code or the interpreter.
     void* entry_point_from_quick_compiled_code_;
 
-    // Pointer to a data structure created by the compiler and used by the garbage collector to
-    // determine which registers hold live references to objects within the heap. Keyed by native PC
-    // offsets for the quick compiler and dex PCs for the portable.
-    void* gc_map_;
-
     // Method dispatch from portable compiled code invokes this pointer which may cause bridging
     // into quick compiled code or the interpreter. Last to simplify entrypoint logic.
     void* entry_point_from_portable_compiled_code_;
@@ -678,8 +650,13 @@
   ALWAYS_INLINE ObjectArray<Class>* GetDexCacheResolvedTypes()
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static size_t PtrSizedFieldsOffset() {
-    return OFFSETOF_MEMBER(ArtMethod, ptr_sized_fields_);
+  static size_t PtrSizedFieldsOffset(size_t pointer_size) {
+    size_t offset = OFFSETOF_MEMBER(ArtMethod, ptr_sized_fields_);
+#ifdef ART_METHOD_HAS_PADDING_FIELD_ON_64_BIT
+    // Add 4 bytes if 64 bit, otherwise 0.
+    offset += pointer_size - sizeof(uint32_t);
+#endif
+    return offset;
   }
 
   friend struct art::ArtMethodOffsets;  // for verifying offset information
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index a69d37e..599f178 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -800,6 +800,14 @@
   }
 }
 
+inline void Class::SetDexCacheStrings(ObjectArray<String>* new_dex_cache_strings) {
+  SetFieldObject<false>(DexCacheStringsOffset(), new_dex_cache_strings);
+}
+
+inline ObjectArray<String>* Class::GetDexCacheStrings() {
+  return GetFieldObject<ObjectArray<String>>(DexCacheStringsOffset());
+}
+
 }  // namespace mirror
 }  // namespace art
 
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 5665059..bd3bfbf 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -149,6 +149,7 @@
 
 void Class::SetDexCache(DexCache* new_dex_cache) {
   SetFieldObject<false>(OFFSET_OF_OBJECT_MEMBER(Class, dex_cache_), new_dex_cache);
+  SetDexCacheStrings(new_dex_cache != nullptr ? new_dex_cache->GetStrings() : nullptr);
 }
 
 void Class::SetClassSize(uint32_t new_class_size) {
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 82425b5..812cfd3 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -29,6 +29,10 @@
 #include "read_barrier_option.h"
 #include "utils.h"
 
+#ifndef IMT_SIZE
+#error IMT_SIZE not defined
+#endif
+
 namespace art {
 
 struct ClassOffsets;
@@ -58,7 +62,7 @@
   // Interface method table size. Increasing this value reduces the chance of two interface methods
   // colliding in the interface method table but increases the size of classes that implement
   // (non-marker) interfaces.
-  static constexpr size_t kImtSize = 64;
+  static constexpr size_t kImtSize = IMT_SIZE;
 
   // imtable entry embedded in class object.
   struct MANAGED ImTableEntry {
@@ -654,6 +658,7 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   DexCache* GetDexCache() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Also updates the dex_cache_strings_ variable from new_dex_cache.
   void SetDexCache(DexCache* new_dex_cache) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ALWAYS_INLINE ObjectArray<ArtMethod>* GetDirectMethods()
@@ -1020,6 +1025,13 @@
   bool GetSlowPathEnabled() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void SetSlowPath(bool enabled) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  ObjectArray<String>* GetDexCacheStrings() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SetDexCacheStrings(ObjectArray<String>* new_dex_cache_strings)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static MemberOffset DexCacheStringsOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(Class, dex_cache_strings_);
+  }
+
   // Used to initialize a class in the allocation code path to ensure it is guarded by a StoreStore
   // fence.
   class InitializeClassVisitor {
@@ -1065,6 +1077,9 @@
   // runtime such as arrays and primitive classes).
   HeapReference<DexCache> dex_cache_;
 
+  // Short cuts to dex_cache_ member for fast compiled code access.
+  HeapReference<ObjectArray<String>> dex_cache_strings_;
+
   // static, private, and <init> methods
   HeapReference<ObjectArray<ArtMethod>> direct_methods_;
 
diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc
index 420e9df..760eb9b 100644
--- a/runtime/native/java_lang_Thread.cc
+++ b/runtime/native/java_lang_Thread.cc
@@ -118,14 +118,12 @@
 
 static void Thread_nativeSetName(JNIEnv* env, jobject peer, jstring java_name) {
   ScopedUtfChars name(env, java_name);
-  Thread* self;
   {
     ScopedObjectAccess soa(env);
     if (soa.Decode<mirror::Object*>(peer) == soa.Self()->GetPeer()) {
       soa.Self()->SetThreadName(name.c_str());
       return;
     }
-    self = soa.Self();
   }
   // Suspend thread to avoid it from killing itself while we set its name. We don't just hold the
   // thread list lock to avoid this, as setting the thread name causes mutator to lock/unlock
diff --git a/runtime/oat.cc b/runtime/oat.cc
index bfb27dd..eab34f7 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -25,7 +25,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '4', '8', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '5', '1', '\0' };
 
 static size_t ComputeOatHeaderSize(const SafeMap<std::string, std::string>* variable_data) {
   size_t estimate = 0U;
@@ -493,35 +493,19 @@
   key_value_store_size_ = data_ptr - reinterpret_cast<char*>(&key_value_store_);
 }
 
-OatMethodOffsets::OatMethodOffsets()
-  : code_offset_(0),
-    gc_map_offset_(0)
-{}
-
-OatMethodOffsets::OatMethodOffsets(uint32_t code_offset,
-                                   uint32_t gc_map_offset
-                                   )
-  : code_offset_(code_offset),
-    gc_map_offset_(gc_map_offset)
-{}
+OatMethodOffsets::OatMethodOffsets(uint32_t code_offset) : code_offset_(code_offset) {
+}
 
 OatMethodOffsets::~OatMethodOffsets() {}
 
-OatQuickMethodHeader::OatQuickMethodHeader()
-  : mapping_table_offset_(0),
-    vmap_table_offset_(0),
-    frame_info_(0, 0, 0),
-    code_size_(0)
-{}
-
 OatQuickMethodHeader::OatQuickMethodHeader(
-    uint32_t mapping_table_offset, uint32_t vmap_table_offset, uint32_t frame_size_in_bytes,
-    uint32_t core_spill_mask, uint32_t fp_spill_mask, uint32_t code_size)
-  : mapping_table_offset_(mapping_table_offset),
-    vmap_table_offset_(vmap_table_offset),
-    frame_info_(frame_size_in_bytes, core_spill_mask, fp_spill_mask),
-    code_size_(code_size)
-{}
+    uint32_t mapping_table_offset, uint32_t vmap_table_offset, uint32_t gc_map_offset,
+    uint32_t frame_size_in_bytes, uint32_t core_spill_mask, uint32_t fp_spill_mask,
+    uint32_t code_size)
+    : mapping_table_offset_(mapping_table_offset), vmap_table_offset_(vmap_table_offset),
+      gc_map_offset_(gc_map_offset),
+      frame_info_(frame_size_in_bytes, core_spill_mask, fp_spill_mask), code_size_(code_size) {
+}
 
 OatQuickMethodHeader::~OatQuickMethodHeader() {}
 
diff --git a/runtime/oat.h b/runtime/oat.h
index 8fb02b8..11ed4fb 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -164,25 +164,20 @@
 
 class PACKED(4) OatMethodOffsets {
  public:
-  OatMethodOffsets();
-
-  OatMethodOffsets(uint32_t code_offset,
-                   uint32_t gc_map_offset);
+  OatMethodOffsets(uint32_t code_offset = 0);
 
   ~OatMethodOffsets();
 
   uint32_t code_offset_;
-  uint32_t gc_map_offset_;
 };
 
 // OatQuickMethodHeader precedes the raw code chunk generated by the Quick compiler.
 class PACKED(4) OatQuickMethodHeader {
  public:
-  OatQuickMethodHeader();
-
-  explicit OatQuickMethodHeader(uint32_t mapping_table_offset, uint32_t vmap_table_offset,
-                                uint32_t frame_size_in_bytes, uint32_t core_spill_mask,
-                                uint32_t fp_spill_mask, uint32_t code_size);
+  OatQuickMethodHeader(uint32_t mapping_table_offset = 0U, uint32_t vmap_table_offset = 0U,
+                       uint32_t gc_map_offset = 0U, uint32_t frame_size_in_bytes = 0U,
+                       uint32_t core_spill_mask = 0U, uint32_t fp_spill_mask = 0U,
+                       uint32_t code_size = 0U);
 
   ~OatQuickMethodHeader();
 
@@ -190,6 +185,8 @@
   uint32_t mapping_table_offset_;
   // The offset in bytes from the start of the vmap table to the end of the header.
   uint32_t vmap_table_offset_;
+  // The offset in bytes from the start of the gc map to the end of the header.
+  uint32_t gc_map_offset_;
   // The stack frame information.
   QuickMethodFrameInfo frame_info_;
   // The code size in bytes.
diff --git a/runtime/oat_file-inl.h b/runtime/oat_file-inl.h
index 6237767..a429c87 100644
--- a/runtime/oat_file-inl.h
+++ b/runtime/oat_file-inl.h
@@ -78,6 +78,31 @@
   return reinterpret_cast<const OatQuickMethodHeader*>(code)[-1].frame_info_.FpSpillMask();
 }
 
+inline const uint8_t* OatFile::OatMethod::GetGcMap() const {  // nullptr if no code or no map.
+  const void* code = mirror::ArtMethod::EntryPointToCodePointer(GetQuickCode());
+  if (code == nullptr) {
+    return nullptr;
+  }
+  uint32_t offset = reinterpret_cast<const OatQuickMethodHeader*>(code)[-1].gc_map_offset_;
+  if (UNLIKELY(offset == 0u)) {  // A zero offset is treated as "no GC map".
+    return nullptr;
+  }
+  return reinterpret_cast<const uint8_t*>(code) - offset;  // Map precedes the code.
+}
+
+inline uint32_t OatFile::OatMethod::GetGcMapOffset() const {  // Relative to begin_; 0 if no map.
+  const uint8_t* gc_map = GetGcMap();
+  return static_cast<uint32_t>(gc_map != nullptr ? gc_map - begin_ : 0u);
+}
+
+inline uint32_t OatFile::OatMethod::GetGcMapOffsetOffset() const {  // 0 if there is no header.
+  const OatQuickMethodHeader* method_header = GetOatQuickMethodHeader();
+  if (method_header == nullptr) {
+    return 0u;
+  }
+  return reinterpret_cast<const uint8_t*>(&method_header->gc_map_offset_) - begin_;
+}
+
 inline uint32_t OatFile::OatMethod::GetMappingTableOffset() const {
   const uint8_t* mapping_table = GetMappingTable();
   return static_cast<uint32_t>(mapping_table != nullptr ? mapping_table - begin_ : 0u);
diff --git a/runtime/oat_file.cc b/runtime/oat_file.cc
index 54f5eab..91e571b 100644
--- a/runtime/oat_file.cc
+++ b/runtime/oat_file.cc
@@ -577,18 +577,15 @@
 const OatFile::OatMethod OatFile::OatClass::GetOatMethod(uint32_t method_index) const {
   const OatMethodOffsets* oat_method_offsets = GetOatMethodOffsets(method_index);
   if (oat_method_offsets == nullptr) {
-    return OatMethod(nullptr, 0, 0);
+    return OatMethod(nullptr, 0);
   }
   if (oat_file_->IsExecutable() ||
       Runtime::Current() == nullptr ||        // This case applies for oatdump.
       Runtime::Current()->IsCompiler()) {
-    return OatMethod(
-        oat_file_->Begin(),
-        oat_method_offsets->code_offset_,
-        oat_method_offsets->gc_map_offset_);
+    return OatMethod(oat_file_->Begin(), oat_method_offsets->code_offset_);
   } else {
     // We aren't allowed to use the compiled code. We just force it down the interpreted version.
-    return OatMethod(oat_file_->Begin(), 0, 0);
+    return OatMethod(oat_file_->Begin(), 0);
   }
 }
 
@@ -596,7 +593,6 @@
   CHECK(method != NULL);
   method->SetEntryPointFromPortableCompiledCode(GetPortableCode());
   method->SetEntryPointFromQuickCompiledCode(GetQuickCode());
-  method->SetNativeGcMap(GetNativeGcMap());  // Used by native methods in work around JNI mode.
 }
 
 bool OatFile::IsPic() const {
diff --git a/runtime/oat_file.h b/runtime/oat_file.h
index 2b94249..a335c94 100644
--- a/runtime/oat_file.h
+++ b/runtime/oat_file.h
@@ -96,9 +96,6 @@
     uint32_t GetCodeOffset() const {
       return code_offset_;
     }
-    uint32_t GetNativeGcMapOffset() const {
-      return native_gc_map_offset_;
-    }
 
     const void* GetPortableCode() const {
       // TODO: encode whether code is portable/quick in flags within OatMethod.
@@ -134,10 +131,6 @@
     const OatQuickMethodHeader* GetOatQuickMethodHeader() const;
     uint32_t GetOatQuickMethodHeaderOffset() const;
 
-    const uint8_t* GetNativeGcMap() const {
-      return GetOatPointer<const uint8_t*>(native_gc_map_offset_);
-    }
-
     size_t GetFrameSizeInBytes() const;
     uint32_t GetCoreSpillMask() const;
     uint32_t GetFpSpillMask() const;
@@ -150,18 +143,20 @@
     uint32_t GetVmapTableOffset() const;
     uint32_t GetVmapTableOffsetOffset() const;
 
+    const uint8_t* GetGcMap() const;
+    uint32_t GetGcMapOffset() const;
+    uint32_t GetGcMapOffsetOffset() const;
+
     // Create an OatMethod with offsets relative to the given base address
-    OatMethod(const uint8_t* base, const uint32_t code_offset, const uint32_t gc_map_offset)
-      : begin_(base),
-        code_offset_(code_offset),
-        native_gc_map_offset_(gc_map_offset) {
+    OatMethod(const uint8_t* base, const uint32_t code_offset)
+        : begin_(base), code_offset_(code_offset) {
     }
     ~OatMethod() {}
 
     // A representation of an invalid OatMethod, used when an OatMethod or OatClass can't be found.
     // See ClassLinker::FindOatMethodFor.
     static const OatMethod Invalid() {
-      return OatMethod(nullptr, -1, -1);
+      return OatMethod(nullptr, -1);
     }
 
    private:
@@ -174,9 +169,7 @@
     }
 
     const uint8_t* const begin_;
-
     const uint32_t code_offset_;
-    const uint32_t native_gc_map_offset_;
 
     friend class OatClass;
   };
diff --git a/runtime/quick/inline_method_analyser.h b/runtime/quick/inline_method_analyser.h
index a8d4308..72b696b 100644
--- a/runtime/quick/inline_method_analyser.h
+++ b/runtime/quick/inline_method_analyser.h
@@ -106,9 +106,7 @@
 };
 
 struct InlineIGetIPutData {
-  // The op_variant below is opcode-Instruction::IGET for IGETs and
-  // opcode-Instruction::IPUT for IPUTs. This is because the runtime
-  // doesn't know the OpSize enumeration.
+  // The op_variant below is DexMemAccessType but the runtime doesn't know that enumeration.
   uint16_t op_variant : 3;
   uint16_t method_is_static : 1;
   uint16_t object_arg : 4;
diff --git a/runtime/signal_catcher.cc b/runtime/signal_catcher.cc
index d448460..e377542 100644
--- a/runtime/signal_catcher.cc
+++ b/runtime/signal_catcher.cc
@@ -110,11 +110,17 @@
     PLOG(ERROR) << "Unable to open stack trace file '" << stack_trace_file_ << "'";
     return;
   }
-  std::unique_ptr<File> file(new File(fd, stack_trace_file_));
-  if (!file->WriteFully(s.data(), s.size())) {
-    PLOG(ERROR) << "Failed to write stack traces to '" << stack_trace_file_ << "'";
+  std::unique_ptr<File> file(new File(fd, stack_trace_file_, true));
+  bool success = file->WriteFully(s.data(), s.size());
+  if (success) {
+    success = file->FlushCloseOrErase() == 0;
   } else {
+    file->Erase();
+  }
+  if (success) {
     LOG(INFO) << "Wrote stack traces to '" << stack_trace_file_ << "'";
+  } else {
+    PLOG(ERROR) << "Failed to write stack traces to '" << stack_trace_file_ << "'";
   }
 }
 
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 4408609..43714b9 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -125,7 +125,7 @@
     } else {
       return cur_shadow_frame_->GetVRegReference(0);
     }
-  } else if (m->IsOptimized()) {
+  } else if (m->IsOptimized(sizeof(void*))) {
     // TODO: Implement, currently only used for exceptions when jdwp is enabled.
     UNIMPLEMENTED(WARNING)
         << "StackVisitor::GetThisObject is unimplemented with the optimizing compiler";
@@ -153,9 +153,9 @@
   if (cur_quick_frame_ != nullptr) {
     DCHECK(context_ != nullptr);  // You can't reliably read registers without a context.
     DCHECK(m == GetMethod());
-    const void* code_pointer = m->GetQuickOatCodePointer();
+    const void* code_pointer = m->GetQuickOatCodePointer(sizeof(void*));
     DCHECK(code_pointer != nullptr);
-    const VmapTable vmap_table(m->GetVmapTable(code_pointer));
+    const VmapTable vmap_table(m->GetVmapTable(code_pointer, sizeof(void*)));
     QuickMethodFrameInfo frame_info = m->GetQuickFrameInfo(code_pointer);
     uint32_t vmap_offset;
     // TODO: IsInContext stops before spotting floating point registers.
@@ -207,9 +207,9 @@
   if (cur_quick_frame_ != nullptr) {
     DCHECK(context_ != nullptr);  // You can't reliably read registers without a context.
     DCHECK(m == GetMethod());
-    const void* code_pointer = m->GetQuickOatCodePointer();
+    const void* code_pointer = m->GetQuickOatCodePointer(sizeof(void*));
     DCHECK(code_pointer != nullptr);
-    const VmapTable vmap_table(m->GetVmapTable(code_pointer));
+    const VmapTable vmap_table(m->GetVmapTable(code_pointer, sizeof(void*)));
     QuickMethodFrameInfo frame_info = m->GetQuickFrameInfo(code_pointer);
     uint32_t vmap_offset_lo, vmap_offset_hi;
     // TODO: IsInContext stops before spotting floating point registers.
@@ -254,9 +254,9 @@
   if (cur_quick_frame_ != nullptr) {
     DCHECK(context_ != nullptr);  // You can't reliably write registers without a context.
     DCHECK(m == GetMethod());
-    const void* code_pointer = m->GetQuickOatCodePointer();
+    const void* code_pointer = m->GetQuickOatCodePointer(sizeof(void*));
     DCHECK(code_pointer != nullptr);
-    const VmapTable vmap_table(m->GetVmapTable(code_pointer));
+    const VmapTable vmap_table(m->GetVmapTable(code_pointer, sizeof(void*)));
     QuickMethodFrameInfo frame_info = m->GetQuickFrameInfo(code_pointer);
     uint32_t vmap_offset;
     // TODO: IsInContext stops before spotting floating point registers.
@@ -318,9 +318,9 @@
   if (cur_quick_frame_ != nullptr) {
     DCHECK(context_ != nullptr);  // You can't reliably write registers without a context.
     DCHECK(m == GetMethod());
-    const void* code_pointer = m->GetQuickOatCodePointer();
+    const void* code_pointer = m->GetQuickOatCodePointer(sizeof(void*));
     DCHECK(code_pointer != nullptr);
-    const VmapTable vmap_table(m->GetVmapTable(code_pointer));
+    const VmapTable vmap_table(m->GetVmapTable(code_pointer, sizeof(void*)));
     QuickMethodFrameInfo frame_info = m->GetQuickFrameInfo(code_pointer);
     uint32_t vmap_offset_lo, vmap_offset_hi;
     // TODO: IsInContext stops before spotting floating point registers.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index c769faf..f7c7106 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -96,8 +96,8 @@
 void Thread::InitTlsEntryPoints() {
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
   uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.interpreter_entrypoints);
-  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(begin) +
-                                                sizeof(tlsPtr_.quick_entrypoints));
+  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) +
+      sizeof(tlsPtr_.quick_entrypoints));
   for (uintptr_t* it = begin; it != end; ++it) {
     *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
   }
@@ -2102,7 +2102,7 @@
     } else {
       // Java method.
       // Portable path use DexGcMap and store in Method.native_gc_map_.
-      const uint8_t* gc_map = m->GetNativeGcMap();
+      const uint8_t* gc_map = m->GetNativeGcMap(sizeof(void*));
       CHECK(gc_map != nullptr) << PrettyMethod(m);
       verifier::DexPcToReferenceMap dex_gc_map(gc_map);
       uint32_t dex_pc = shadow_frame->GetDexPC();
@@ -2136,9 +2136,9 @@
 
     // Process register map (which native and runtime methods don't have)
     if (!m->IsNative() && !m->IsRuntimeMethod() && !m->IsProxyMethod()) {
-      if (m->IsOptimized()) {
+      if (m->IsOptimized(sizeof(void*))) {
         Runtime* runtime = Runtime::Current();
-        const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(m);
+        const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(m, sizeof(void*));
         uintptr_t native_pc_offset = m->NativeQuickPcOffset(GetCurrentQuickFramePc(), entry_point);
         StackMap map = m->GetStackMap(native_pc_offset);
         MemoryRegion mask = map.GetStackMask();
@@ -2157,7 +2157,7 @@
           }
         }
       } else {
-        const uint8_t* native_gc_map = m->GetNativeGcMap();
+        const uint8_t* native_gc_map = m->GetNativeGcMap(sizeof(void*));
         CHECK(native_gc_map != nullptr) << PrettyMethod(m);
         const DexFile::CodeItem* code_item = m->GetCodeItem();
         // Can't be nullptr or how would we compile its instructions?
@@ -2167,12 +2167,12 @@
                                    static_cast<size_t>(code_item->registers_size_));
         if (num_regs > 0) {
           Runtime* runtime = Runtime::Current();
-          const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(m);
+          const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(m, sizeof(void*));
           uintptr_t native_pc_offset = m->NativeQuickPcOffset(GetCurrentQuickFramePc(), entry_point);
           const uint8_t* reg_bitmap = map.FindBitMap(native_pc_offset);
           DCHECK(reg_bitmap != nullptr);
           const void* code_pointer = mirror::ArtMethod::EntryPointToCodePointer(entry_point);
-          const VmapTable vmap_table(m->GetVmapTable(code_pointer));
+          const VmapTable vmap_table(m->GetVmapTable(code_pointer, sizeof(void*)));
           QuickMethodFrameInfo frame_info = m->GetQuickFrameInfo(code_pointer);
           // For all dex registers in the bitmap
           DCHECK(cur_quick_frame != nullptr);
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 29c01e4..b510844 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -431,6 +431,15 @@
                                                     instrumentation::Instrumentation::kMethodExited |
                                                     instrumentation::Instrumentation::kMethodUnwind);
     }
+    if (the_trace->trace_file_.get() != nullptr) {
+      // Do not try to erase, so flush and close explicitly.
+      if (the_trace->trace_file_->Flush() != 0) {
+        PLOG(ERROR) << "Could not flush trace file.";
+      }
+      if (the_trace->trace_file_->Close() != 0) {
+        PLOG(ERROR) << "Could not close trace file.";
+      }
+    }
     delete the_trace;
   }
   runtime->GetThreadList()->ResumeAll();
@@ -726,7 +735,9 @@
   if (the_trace_ != nullptr) {
     std::string name;
     thread->GetThreadName(name);
-    the_trace_->exited_threads_.Put(thread->GetTid(), name);
+    // The same thread/tid may be used multiple times. Since SafeMap::Put does not allow overriding
+    // a previous mapping, use SafeMap::Overwrite.
+    the_trace_->exited_threads_.Overwrite(thread->GetTid(), name);
   }
 }
 
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 9a4c875..ad46be6 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -1120,13 +1120,20 @@
 
 void DumpNativeStack(std::ostream& os, pid_t tid, const char* prefix,
     mirror::ArtMethod* current_method) {
-  // TODO: enable on __linux__ b/15446488.
-#if 0
+#if __linux__
   // b/18119146
   if (RUNNING_ON_VALGRIND != 0) {
     return;
   }
 
+#if !defined(HAVE_ANDROID_OS)
+  if (GetTid() != tid) {
+    // TODO: dumping of other threads is disabled to avoid crashes during stress testing.
+    //       b/15446488.
+    return;
+  }
+#endif
+
   std::unique_ptr<Backtrace> backtrace(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, tid));
   if (!backtrace->Unwind(0)) {
     os << prefix << "(backtrace::Unwind failed for thread " << tid << ")\n";
diff --git a/runtime/utils.h b/runtime/utils.h
index d83013a..668c897 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -165,6 +165,18 @@
   typedef T type;
 };
 
+// Like sizeof, but count how many bits a type takes. Pass type explicitly.
+template <typename T>
+static constexpr size_t BitSizeOf() {
+  return sizeof(T) * CHAR_BIT;
+}
+
+// Like sizeof, but count how many bits a type takes. Infers type from parameter.
+template <typename T>
+static constexpr size_t BitSizeOf(T /*x*/) {
+  return sizeof(T) * CHAR_BIT;
+}
+
 // For rounding integers.
 template<typename T>
 static constexpr T RoundDown(T x, typename TypeIdentity<T>::type n) WARN_UNUSED;
@@ -201,10 +213,39 @@
   return reinterpret_cast<T*>(RoundUp(reinterpret_cast<uintptr_t>(x), n));
 }
 
+namespace utils {
+namespace detail {  // Private, implementation-specific namespace. Do not poke outside of this file.
+template <typename T>
+static constexpr inline T RoundUpToPowerOfTwoRecursive(T x, size_t bit) {
+  return bit == (BitSizeOf<T>()) ? x: RoundUpToPowerOfTwoRecursive(x | x >> bit, bit << 1);
+}
+}  // namespace detail
+}  // namespace utils
+
+// Recursive implementation is from "Hacker's Delight" by Henry S. Warren, Jr.,
+// figure 3-3, page 48, where the function is called clp2.
+template <typename T>
+static constexpr inline T RoundUpToPowerOfTwo(T x) {
+  return art::utils::detail::RoundUpToPowerOfTwoRecursive(x - 1, 1) + 1;
+}
+
+// Find the bit position of the most significant bit (0-based), or -1 if there were no bits set.
+template <typename T>
+static constexpr ssize_t MostSignificantBit(T value) {
+  return (value == 0) ? -1 : (MostSignificantBit(value >> 1) + 1);
+}
+
+// How many bits (minimally) does it take to store the constant 'value'? i.e. 1 for 1, 3 for 5, etc.
+template <typename T>
+static constexpr size_t MinimumBitsToStore(T value) {
+  return static_cast<size_t>(MostSignificantBit(value) + 1);
+}
+
 template<typename T>
 static constexpr int CLZ(T x) {
+  static_assert(sizeof(T) <= sizeof(long long), "T too large, must be smaller than long long");  // NOLINT [runtime/int] [4]
   return (sizeof(T) == sizeof(uint32_t))
-      ? __builtin_clz(x)
+      ? __builtin_clz(x)  // TODO: __builtin_clz[ll] has undefined behavior for x=0
       : __builtin_clzll(x);
 }
 
diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc
index 92323da..a98bc90 100644
--- a/runtime/utils_test.cc
+++ b/runtime/utils_test.cc
@@ -402,4 +402,36 @@
   }
 }
 
+TEST_F(UtilsTest, RoundUpToPowerOfTwo) {
+  // Tests the constexpr variant since all the parameters are constexpr
+  EXPECT_EQ(0, RoundUpToPowerOfTwo(0));
+  EXPECT_EQ(1, RoundUpToPowerOfTwo(1));
+  EXPECT_EQ(2, RoundUpToPowerOfTwo(2));
+  EXPECT_EQ(4, RoundUpToPowerOfTwo(3));
+  EXPECT_EQ(8, RoundUpToPowerOfTwo(7));
+
+  EXPECT_EQ(0b10000L, RoundUpToPowerOfTwo(0b01101L));
+  EXPECT_EQ(1ULL << 63, RoundUpToPowerOfTwo(1ULL << 62 | 1ULL));
+}
+
+TEST_F(UtilsTest, MostSignificantBit) {
+  EXPECT_EQ(-1, MostSignificantBit(0));
+  EXPECT_EQ(0, MostSignificantBit(1));
+  EXPECT_EQ(31, MostSignificantBit(~static_cast<uint32_t>(0)));
+  EXPECT_EQ(2, MostSignificantBit(0b110));
+  EXPECT_EQ(2, MostSignificantBit(0b100));
+}
+
+TEST_F(UtilsTest, MinimumBitsToStore) {
+  EXPECT_EQ(0u, MinimumBitsToStore(0));
+  EXPECT_EQ(1u, MinimumBitsToStore(1));
+  EXPECT_EQ(2u, MinimumBitsToStore(0b10));
+  EXPECT_EQ(2u, MinimumBitsToStore(0b11));
+  EXPECT_EQ(3u, MinimumBitsToStore(0b100));
+  EXPECT_EQ(3u, MinimumBitsToStore(0b110));
+  EXPECT_EQ(3u, MinimumBitsToStore(0b101));
+  EXPECT_EQ(8u, MinimumBitsToStore(0xFF));
+  EXPECT_EQ(32u, MinimumBitsToStore(~static_cast<uint32_t>(0)));
+}
+
 }  // namespace art
diff --git a/runtime/verifier/reg_type-inl.h b/runtime/verifier/reg_type-inl.h
index 480ed40..f445132 100644
--- a/runtime/verifier/reg_type-inl.h
+++ b/runtime/verifier/reg_type-inl.h
@@ -81,6 +81,9 @@
       return rhs.IsLongTypes();
     } else if (lhs.IsDoubleLo()) {
       return rhs.IsDoubleTypes();
+    } else if (lhs.IsConflict()) {
+      LOG(WARNING) << "RegType::AssignableFrom lhs is Conflict!";
+      return false;
     } else {
       CHECK(lhs.IsReferenceTypes())
           << "Unexpected register type in IsAssignableFrom: '"
diff --git a/runtime/zip_archive_test.cc b/runtime/zip_archive_test.cc
index 96abee2..70a4dda 100644
--- a/runtime/zip_archive_test.cc
+++ b/runtime/zip_archive_test.cc
@@ -41,7 +41,7 @@
 
   ScratchFile tmp;
   ASSERT_NE(-1, tmp.GetFd());
-  std::unique_ptr<File> file(new File(tmp.GetFd(), tmp.GetFilename()));
+  std::unique_ptr<File> file(new File(tmp.GetFd(), tmp.GetFilename(), false));
   ASSERT_TRUE(file.get() != NULL);
   bool success = zip_entry->ExtractToFile(*file, &error_msg);
   ASSERT_TRUE(success) << error_msg;
diff --git a/test/083-compiler-regressions/expected.txt b/test/083-compiler-regressions/expected.txt
index 51bf847..78c92fc 100644
--- a/test/083-compiler-regressions/expected.txt
+++ b/test/083-compiler-regressions/expected.txt
@@ -1,3 +1,4 @@
+b17325447 passes
 b17630605 passes
 b17411468 passes
 b2296099 passes
diff --git a/test/083-compiler-regressions/src/Main.java b/test/083-compiler-regressions/src/Main.java
index 9ad8ea7..285c360 100644
--- a/test/083-compiler-regressions/src/Main.java
+++ b/test/083-compiler-regressions/src/Main.java
@@ -30,6 +30,7 @@
     }
 
     public static void main(String args[]) throws Exception {
+        b17325447();
         b17630605();
         b17411468();
         b2296099Test();
@@ -64,6 +65,31 @@
         minDoubleWith3ConstsTest();
     }
 
+    public static double b17325447_i1(int i1, double f) {
+      return f;
+    }
+
+    public static double b17325447_i2(int i1, int i2, double f) {
+      return f;
+    }
+
+    public static double b17325447_i3(int i1, int i2, int i3, double f) {
+      return f;
+    }
+
+    public static void b17325447() {
+      // b/17325447 - x86 handling of special identity method w/ double spanning reg/mem.
+      double d = 0.0;
+      d += b17325447_i1(123, 1.0);
+      d += b17325447_i2(123, 456, 2.0);
+      d += b17325447_i3(123, 456, 789, 3.0);
+      if (d == 6.0) {
+        System.out.println("b17325447 passes");
+      } else {
+        System.out.println("b17325447 fails: " + d);
+      }
+    }
+
     public static void b17630605() {
       // b/17630605 - failure to properly handle min long immediates.
       long a1 = 40455547223404749L;
diff --git a/test/415-optimizing-arith-neg/src/Main.java b/test/415-optimizing-arith-neg/src/Main.java
index d9f8bcf..bd8a158 100644
--- a/test/415-optimizing-arith-neg/src/Main.java
+++ b/test/415-optimizing-arith-neg/src/Main.java
@@ -36,12 +36,24 @@
     }
   }
 
+  public static void assertEquals(String expected, float result) {
+    if (!expected.equals(new Float(result).toString())) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   public static void assertEquals(double expected, double result) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);
     }
   }
 
+  public static void assertEquals(String expected, double result) {
+    if (!expected.equals(new Double(result).toString())) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   public static void assertIsNaN(float result) {
     if (!Float.isNaN(result)) {
       throw new Error("Expected NaN: " + result);
@@ -116,9 +128,10 @@
   }
 
   private static void negFloat() {
+     assertEquals("-0.0", $opt$NegFloat(0F));
+     assertEquals("0.0", $opt$NegFloat(-0F));
      assertEquals(-1F, $opt$NegFloat(1F));
      assertEquals(1F, $opt$NegFloat(-1F));
-     assertEquals(0F, $opt$NegFloat(0F));
      assertEquals(51F, $opt$NegFloat(-51F));
      assertEquals(-51F, $opt$NegFloat(51F));
 
@@ -140,9 +153,10 @@
   }
 
   private static void negDouble() {
+     assertEquals("-0.0", $opt$NegDouble(0D));
+     assertEquals("0.0", $opt$NegDouble(-0D));
      assertEquals(-1D, $opt$NegDouble(1D));
      assertEquals(1D, $opt$NegDouble(-1D));
-     assertEquals(0D, $opt$NegDouble(0D));
      assertEquals(51D, $opt$NegDouble(-51D));
      assertEquals(-51D, $opt$NegDouble(51D));
 
diff --git a/test/422-type-conversion/src/Main.java b/test/422-type-conversion/src/Main.java
index 37bc777..c434db3 100644
--- a/test/422-type-conversion/src/Main.java
+++ b/test/422-type-conversion/src/Main.java
@@ -85,6 +85,12 @@
     // Generate, compile and check long-to-int Dex instructions.
     longToInt();
 
+    // Generate, compile and check long-to-float Dex instructions.
+    longToFloat();
+
+    // Generate, compile and check long-to-double Dex instructions.
+    longToDouble();
+
     // Generate, compile and check int-to-byte Dex instructions.
     shortToByte();
     intToByte();
@@ -267,6 +273,46 @@
     assertLongEquals(-1, $opt$IntToLong($opt$LongToInt(-4294967297L)));  // -(2^32 + 1)
   }
 
+  private static void longToFloat() {
+    assertFloatEquals(1F, $opt$LongToFloat(1L));
+    assertFloatEquals(0F, $opt$LongToFloat(0L));
+    assertFloatEquals(-1F, $opt$LongToFloat(-1L));
+    assertFloatEquals(51F, $opt$LongToFloat(51L));
+    assertFloatEquals(-51F, $opt$LongToFloat(-51L));
+    assertFloatEquals(2147483647F, $opt$LongToFloat(2147483647L));  // 2^31 - 1
+    assertFloatEquals(-2147483647F, $opt$LongToFloat(-2147483647L));  // -(2^31 - 1)
+    assertFloatEquals(-2147483648F, $opt$LongToFloat(-2147483648L));  // -(2^31)
+    assertFloatEquals(2147483648F, $opt$LongToFloat(2147483648L));  // (2^31)
+    assertFloatEquals(-2147483649F, $opt$LongToFloat(-2147483649L));  // -(2^31 + 1)
+    assertFloatEquals(4294967296F, $opt$LongToFloat(4294967296L));  // (2^32)
+    assertFloatEquals(-4294967296F, $opt$LongToFloat(-4294967296L));  // -(2^32)
+    assertFloatEquals(140739635871745F, $opt$LongToFloat(140739635871745L));  // 1 + 2^15 + 2^31 + 2^47
+    assertFloatEquals(-140739635871745F, $opt$LongToFloat(-140739635871745L));  // -(1 + 2^15 + 2^31 + 2^47)
+    assertFloatEquals(9223372036854775807F, $opt$LongToFloat(9223372036854775807L));  // 2^63 - 1
+    assertFloatEquals(-9223372036854775807F, $opt$LongToFloat(-9223372036854775807L));  // -(2^63 - 1)
+    assertFloatEquals(-9223372036854775808F, $opt$LongToFloat(-9223372036854775808L));  // -(2^63)
+  }
+
+  private static void longToDouble() {
+    assertDoubleEquals(1D, $opt$LongToDouble(1L));
+    assertDoubleEquals(0D, $opt$LongToDouble(0L));
+    assertDoubleEquals(-1D, $opt$LongToDouble(-1L));
+    assertDoubleEquals(51D, $opt$LongToDouble(51L));
+    assertDoubleEquals(-51D, $opt$LongToDouble(-51L));
+    assertDoubleEquals(2147483647D, $opt$LongToDouble(2147483647L));  // 2^31 - 1
+    assertDoubleEquals(-2147483647D, $opt$LongToDouble(-2147483647L));  // -(2^31 - 1)
+    assertDoubleEquals(-2147483648D, $opt$LongToDouble(-2147483648L));  // -(2^31)
+    assertDoubleEquals(2147483648D, $opt$LongToDouble(2147483648L));  // (2^31)
+    assertDoubleEquals(-2147483649D, $opt$LongToDouble(-2147483649L));  // -(2^31 + 1)
+    assertDoubleEquals(4294967296D, $opt$LongToDouble(4294967296L));  // (2^32)
+    assertDoubleEquals(-4294967296D, $opt$LongToDouble(-4294967296L));  // -(2^32)
+    assertDoubleEquals(140739635871745D, $opt$LongToDouble(140739635871745L));  // 1 + 2^15 + 2^31 + 2^47
+    assertDoubleEquals(-140739635871745D, $opt$LongToDouble(-140739635871745L));  // -(1 + 2^15 + 2^31 + 2^47)
+    assertDoubleEquals(9223372036854775807D, $opt$LongToDouble(9223372036854775807L));  // 2^63 - 1
+    assertDoubleEquals(-9223372036854775807D, $opt$LongToDouble(-9223372036854775807L));  // -(2^63 - 1)
+    assertDoubleEquals(-9223372036854775808D, $opt$LongToDouble(-9223372036854775808L));  // -(2^63)
+  }
+
   private static void shortToByte() {
     assertByteEquals((byte)1, $opt$ShortToByte((short)1));
     assertByteEquals((byte)0, $opt$ShortToByte((short)0));
@@ -416,6 +462,12 @@
   static int $opt$LongToInt(long a){ return (int)a; }
   static int $opt$LongLiteralToInt(){ return (int)42L; }
 
+  // This method produces a long-to-float Dex instruction.
+  static float $opt$LongToFloat(long a){ return (float)a; }
+
+  // This method produces a long-to-double Dex instruction.
+  static double $opt$LongToDouble(long a){ return (double)a; }
+
   // These methods produce int-to-byte Dex instructions.
   static byte $opt$ShortToByte(short a){ return (byte)a; }
   static byte $opt$IntToByte(int a){ return (byte)a; }
diff --git a/test/430-live-register-slow-path/expected.txt b/test/430-live-register-slow-path/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/430-live-register-slow-path/expected.txt
diff --git a/test/430-live-register-slow-path/info.txt b/test/430-live-register-slow-path/info.txt
new file mode 100644
index 0000000..6f2af28
--- /dev/null
+++ b/test/430-live-register-slow-path/info.txt
@@ -0,0 +1,2 @@
+Regression test for the linear scan register allocator. It used
+to miscompute the number of live registers at a safepoint.
diff --git a/test/430-live-register-slow-path/src/Main.java b/test/430-live-register-slow-path/src/Main.java
new file mode 100644
index 0000000..b84e647
--- /dev/null
+++ b/test/430-live-register-slow-path/src/Main.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+   $opt$TestSlowPath();
+  }
+
+  public static void $opt$TestSlowPath() {
+    Object[] o = bar();
+    assertEquals(0, o.length);
+    // The slowpath of the instanceof requires the live register
+    // holding `o` to be saved before going into runtime. The linear
+    // scan register allocator used to miscompute the number of
+    // live registers at a safepoint, so the place at which the register
+    // was saved was wrong.
+    doCall(o instanceof Interface[], o);
+  }
+
+  public static void assertEquals(int a, int b) {}
+  public static boolean doCall(boolean val, Object o) { return val; }
+
+  static Object[] bar() { return new Object[0]; }
+
+  static interface Interface {}
+}
diff --git a/test/431-optimizing-arith-shifts/expected.txt b/test/431-optimizing-arith-shifts/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/431-optimizing-arith-shifts/expected.txt
diff --git a/test/431-optimizing-arith-shifts/info.txt b/test/431-optimizing-arith-shifts/info.txt
new file mode 100644
index 0000000..14ff264
--- /dev/null
+++ b/test/431-optimizing-arith-shifts/info.txt
@@ -0,0 +1 @@
+Tests for shift operations.
diff --git a/test/431-optimizing-arith-shifts/src/Main.java b/test/431-optimizing-arith-shifts/src/Main.java
new file mode 100644
index 0000000..d8667c6
--- /dev/null
+++ b/test/431-optimizing-arith-shifts/src/Main.java
@@ -0,0 +1,305 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void main(String[] args) {
+    shlInt();
+    shlLong();
+    shrInt();
+    shrLong();
+    ushrInt();
+    ushrLong();
+  }
+
+  private static void shlInt() {
+    expectEquals(48, $opt$ShlConst2(12));
+    expectEquals(12, $opt$ShlConst0(12));
+    expectEquals(-48, $opt$Shl(-12, 2));
+    expectEquals(1024, $opt$Shl(32, 5));
+
+    expectEquals(7, $opt$Shl(7, 0));
+    expectEquals(14, $opt$Shl(7, 1));
+    expectEquals(0, $opt$Shl(0, 30));
+
+    expectEquals(1073741824L, $opt$Shl(1, 30));
+    expectEquals(Integer.MIN_VALUE, $opt$Shl(1, 31));  // overflow
+    expectEquals(Integer.MIN_VALUE, $opt$Shl(1073741824, 1));  // overflow
+    expectEquals(1073741824, $opt$Shl(268435456, 2));
+
+    // Only the 5 lower bits should be used for shifting (& 0x1f).
+    expectEquals(7, $opt$Shl(7, 32));  // 32 & 0x1f = 0
+    expectEquals(14, $opt$Shl(7, 33));  // 33 & 0x1f = 1
+    expectEquals(32, $opt$Shl(1, 101));  // 101 & 0x1f = 5
+
+    expectEquals(Integer.MIN_VALUE, $opt$Shl(1, -1));  // -1 & 0x1f = 31
+    expectEquals(14, $opt$Shl(7, -31));  // -31 & 0x1f = 1
+    expectEquals(7, $opt$Shl(7, -32));  // -32 & 0x1f = 0
+    expectEquals(-536870912, $opt$Shl(7, -3));  // -3 & 0x1f = 29
+
+    expectEquals(Integer.MIN_VALUE, $opt$Shl(7, Integer.MAX_VALUE));
+    expectEquals(7, $opt$Shl(7, Integer.MIN_VALUE));
+  }
+
+  private static void shlLong() {
+    expectEquals(48L, $opt$ShlConst2(12L));
+    expectEquals(12L, $opt$ShlConst0(12L));
+    expectEquals(-48L, $opt$Shl(-12L, 2L));
+    expectEquals(1024L, $opt$Shl(32L, 5L));
+
+    expectEquals(7L, $opt$Shl(7L, 0L));
+    expectEquals(14L, $opt$Shl(7L, 1L));
+    expectEquals(0L, $opt$Shl(0L, 30L));
+
+    expectEquals(1073741824L, $opt$Shl(1L, 30L));
+    expectEquals(2147483648L, $opt$Shl(1L, 31L));
+    expectEquals(2147483648L, $opt$Shl(1073741824L, 1L));
+
+    // Long shifts can use up to 6 lower bits.
+    expectEquals(4294967296L, $opt$Shl(1L, 32L));
+    expectEquals(60129542144L, $opt$Shl(7L, 33L));
+    expectEquals(Long.MIN_VALUE, $opt$Shl(1L, 63L));  // overflow
+
+    // Only the 6 lower bits should be used for shifting (& 0x3f).
+    expectEquals(7L, $opt$Shl(7L, 64L));  // 64 & 0x3f = 0
+    expectEquals(14L, $opt$Shl(7L, 65L));  // 65 & 0x3f = 1
+    expectEquals(137438953472L, $opt$Shl(1L, 101L));  // 101 & 0x3f = 37
+
+    expectEquals(Long.MIN_VALUE, $opt$Shl(1L, -1L));  // -1 & 0x3f = 63
+    expectEquals(14L, $opt$Shl(7L, -63L));  // -63 & 0x3f = 1
+    expectEquals(7L, $opt$Shl(7L, -64L));  // -64 & 0x3f = 0
+    expectEquals(2305843009213693952L, $opt$Shl(1L, -3L));  // -3 & 0x3f = 61
+
+    expectEquals(Long.MIN_VALUE, $opt$Shl(7L, Long.MAX_VALUE));
+    expectEquals(7L, $opt$Shl(7L, Long.MIN_VALUE));
+  }
+
+  private static void shrInt() {
+    expectEquals(3, $opt$ShrConst2(12));
+    expectEquals(12, $opt$ShrConst0(12));
+    expectEquals(-3, $opt$Shr(-12, 2));
+    expectEquals(1, $opt$Shr(32, 5));
+
+    expectEquals(7, $opt$Shr(7, 0));
+    expectEquals(3, $opt$Shr(7, 1));
+    expectEquals(0, $opt$Shr(0, 30));
+    expectEquals(0, $opt$Shr(1, 30));
+    expectEquals(-1, $opt$Shr(-1, 30));
+
+    expectEquals(0, $opt$Shr(Integer.MAX_VALUE, 31));
+    expectEquals(-1, $opt$Shr(Integer.MIN_VALUE, 31));
+
+    // Only the 5 lower bits should be used for shifting (& 0x1f).
+    expectEquals(7, $opt$Shr(7, 32));  // 32 & 0x1f = 0
+    expectEquals(3, $opt$Shr(7, 33));  // 33 & 0x1f = 1
+
+    expectEquals(0, $opt$Shr(1, -1));  // -1 & 0x1f = 31
+    expectEquals(3, $opt$Shr(7, -31));  // -31 & 0x1f = 1
+    expectEquals(7, $opt$Shr(7, -32));  // -32 & 0x1f = 0
+    expectEquals(-4, $opt$Shr(Integer.MIN_VALUE, -3));  // -3 & 0x1f = 29
+
+    expectEquals(0, $opt$Shr(7, Integer.MAX_VALUE));
+    expectEquals(7, $opt$Shr(7, Integer.MIN_VALUE));
+  }
+
+  private static void shrLong() {
+    expectEquals(3L, $opt$ShrConst2(12L));
+    expectEquals(12L, $opt$ShrConst0(12L));
+    expectEquals(-3L, $opt$Shr(-12L, 2L));
+    expectEquals(1, $opt$Shr(32, 5));
+
+    expectEquals(7L, $opt$Shr(7L, 0L));
+    expectEquals(3L, $opt$Shr(7L, 1L));
+    expectEquals(0L, $opt$Shr(0L, 30L));
+    expectEquals(0L, $opt$Shr(1L, 30L));
+    expectEquals(-1L, $opt$Shr(-1L, 30L));
+
+
+    expectEquals(1L, $opt$Shr(1073741824L, 30L));
+    expectEquals(1L, $opt$Shr(2147483648L, 31L));
+    expectEquals(1073741824L, $opt$Shr(2147483648L, 1L));
+
+    // Long shifts can use up to 6 lower bits.
+    expectEquals(1L, $opt$Shr(4294967296L, 32L));
+    expectEquals(7L, $opt$Shr(60129542144L, 33L));
+    expectEquals(0L, $opt$Shr(Long.MAX_VALUE, 63L));
+    expectEquals(-1L, $opt$Shr(Long.MIN_VALUE, 63L));
+
+    // Only the 6 lower bits should be used for shifting (& 0x3f).
+    expectEquals(7L, $opt$Shr(7L, 64L));  // 64 & 0x3f = 0
+    expectEquals(3L, $opt$Shr(7L, 65L));  // 65 & 0x3f = 1
+
+    expectEquals(-1L, $opt$Shr(Long.MIN_VALUE, -1L));  // -1 & 0x3f = 63
+    expectEquals(3L, $opt$Shr(7L, -63L));  // -63 & 0x3f = 1
+    expectEquals(7L, $opt$Shr(7L, -64L));  // -64 & 0x3f = 0
+    expectEquals(1L, $opt$Shr(2305843009213693952L, -3L));  // -3 & 0x3f = 61
+    expectEquals(-4L, $opt$Shr(Integer.MIN_VALUE, -3));  // -3 & 0x1f = 29
+
+    expectEquals(0L, $opt$Shr(7L, Long.MAX_VALUE));
+    expectEquals(7L, $opt$Shr(7L, Long.MIN_VALUE));
+  }
+
+  private static void ushrInt() {
+    expectEquals(3, $opt$UShrConst2(12));
+    expectEquals(12, $opt$UShrConst0(12));
+    expectEquals(1073741821, $opt$UShr(-12, 2));
+    expectEquals(1, $opt$UShr(32, 5));
+
+    expectEquals(7, $opt$UShr(7, 0));
+    expectEquals(3, $opt$UShr(7, 1));
+    expectEquals(0, $opt$UShr(0, 30));
+    expectEquals(0, $opt$UShr(1, 30));
+    expectEquals(3, $opt$UShr(-1, 30));
+
+    expectEquals(0, $opt$UShr(Integer.MAX_VALUE, 31));
+    expectEquals(1, $opt$UShr(Integer.MIN_VALUE, 31));
+
+    // Only the 5 lower bits should be used for shifting (& 0x1f).
+    expectEquals(7, $opt$UShr(7, 32));  // 32 & 0x1f = 0
+    expectEquals(3, $opt$UShr(7, 33));  // 33 & 0x1f = 1
+
+    expectEquals(0, $opt$UShr(1, -1));  // -1 & 0x1f = 31
+    expectEquals(3, $opt$UShr(7, -31));  // -31 & 0x1f = 1
+    expectEquals(7, $opt$UShr(7, -32));  // -32 & 0x1f = 0
+    expectEquals(4, $opt$UShr(Integer.MIN_VALUE, -3));  // -3 & 0x1f = 29
+
+    expectEquals(0, $opt$UShr(7, Integer.MAX_VALUE));
+    expectEquals(7, $opt$UShr(7, Integer.MIN_VALUE));
+  }
+
+  private static void ushrLong() {
+    expectEquals(3L, $opt$UShrConst2(12L));
+    expectEquals(12L, $opt$UShrConst0(12L));
+    expectEquals(4611686018427387901L, $opt$UShr(-12L, 2L));
+    expectEquals(1, $opt$UShr(32, 5));
+
+    expectEquals(7L, $opt$UShr(7L, 0L));
+    expectEquals(3L, $opt$UShr(7L, 1L));
+    expectEquals(0L, $opt$UShr(0L, 30L));
+    expectEquals(0L, $opt$UShr(1L, 30L));
+    expectEquals(17179869183L, $opt$UShr(-1L, 30L));
+
+
+    expectEquals(1L, $opt$UShr(1073741824L, 30L));
+    expectEquals(1L, $opt$UShr(2147483648L, 31L));
+    expectEquals(1073741824L, $opt$UShr(2147483648L, 1L));
+
+    // Long shifts can use up to 6 lower bits.
+    expectEquals(1L, $opt$UShr(4294967296L, 32L));
+    expectEquals(7L, $opt$UShr(60129542144L, 33L));
+    expectEquals(0L, $opt$UShr(Long.MAX_VALUE, 63L));
+    expectEquals(1L, $opt$UShr(Long.MIN_VALUE, 63L));
+
+    // Only the 6 lower bits should be used for shifting (& 0x3f).
+    expectEquals(7L, $opt$UShr(7L, 64L));  // 64 & 0x3f = 0
+    expectEquals(3L, $opt$UShr(7L, 65L));  // 65 & 0x3f = 1
+
+    expectEquals(1L, $opt$UShr(Long.MIN_VALUE, -1L));  // -1 & 0x3f = 63
+    expectEquals(3L, $opt$UShr(7L, -63L));  // -63 & 0x3f = 1
+    expectEquals(7L, $opt$UShr(7L, -64L));  // -64 & 0x3f = 0
+    expectEquals(1L, $opt$UShr(2305843009213693952L, -3L));  // -3 & 0x3f = 61
+    expectEquals(4L, $opt$UShr(Long.MIN_VALUE, -3L));  // -3 & 0x3f = 61
+
+    expectEquals(0L, $opt$UShr(7L, Long.MAX_VALUE));
+    expectEquals(7L, $opt$UShr(7L, Long.MIN_VALUE));
+  }
+
+  static int $opt$Shl(int a, int b) {
+    return a << b;
+  }
+
+  static long $opt$Shl(long a, long b) {
+    return a << b;
+  }
+
+  static int $opt$Shr(int a, int b) {
+    return a >> b;
+  }
+
+  static long $opt$Shr(long a, long b) {
+    return a >> b;
+  }
+
+  static int $opt$UShr(int a, int b) {
+    return a >>> b;
+  }
+
+  static long $opt$UShr(long a, long b) {
+    return a >>> b;
+  }
+
+  static int $opt$ShlConst2(int a) {
+    return a << 2;
+  }
+
+  static long $opt$ShlConst2(long a) {
+    return a << 2L;
+  }
+
+  static int $opt$ShrConst2(int a) {
+    return a >> 2;
+  }
+
+  static long $opt$ShrConst2(long a) {
+    return a >> 2L;
+  }
+
+  static int $opt$UShrConst2(int a) {
+    return a >>> 2;
+  }
+
+  static long $opt$UShrConst2(long a) {
+    return a >>> 2L;
+  }
+
+    static int $opt$ShlConst0(int a) {
+    return a << 0;
+  }
+
+  static long $opt$ShlConst0(long a) {
+    return a << 0L;
+  }
+
+  static int $opt$ShrConst0(int a) {
+    return a >> 0;
+  }
+
+  static long $opt$ShrConst0(long a) {
+    return a >> 0L;
+  }
+
+  static int $opt$UShrConst0(int a) {
+    return a >>> 0;
+  }
+
+  static long $opt$UShrConst0(long a) {
+    return a >>> 0L;
+  }
+
+}
+
diff --git a/test/431-type-propagation/expected.txt b/test/431-type-propagation/expected.txt
new file mode 100644
index 0000000..ccaf6f8
--- /dev/null
+++ b/test/431-type-propagation/expected.txt
@@ -0,0 +1 @@
+Enter
diff --git a/test/431-type-propagation/info.txt b/test/431-type-propagation/info.txt
new file mode 100644
index 0000000..b895e91
--- /dev/null
+++ b/test/431-type-propagation/info.txt
@@ -0,0 +1,2 @@
+Regression test for the SSA building of the optimizing
+compiler. See comment in smali file.
diff --git a/test/431-type-propagation/smali/TypePropagation.smali b/test/431-type-propagation/smali/TypePropagation.smali
new file mode 100644
index 0000000..817f0c5
--- /dev/null
+++ b/test/431-type-propagation/smali/TypePropagation.smali
@@ -0,0 +1,43 @@
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LTypePropagation;
+
+.super Ljava/lang/Object;
+
+.method public static method([I)V
+   .registers 3
+   const/4 v0, 0
+   aget v1, v2, v0
+   add-int v2, v1, v0
+   if-eq v1, v0, :end
+   # Putting a float in v1 will lead to the creation of a phi with one
+   # float input and one integer input. Since the SSA builder trusts
+   # the verifier, it assumes that the integer input must be converted
+   # to float. However, since v0 is not used afterwards, the verifier
+   # hasn't ensured that. Therefore, the compiler must remove
+   # the phi prior to doing type propagation.
+   int-to-float v1, v0
+   :end
+   # Do a call to create an environment that will capture all Dex registers.
+   # This environment is the reason why a phi is created at the join block
+   # of the if.
+   invoke-static {}, LTypePropagation;->emptyMethod()V
+   return-void
+.end method
+
+.method public static emptyMethod()V
+   .registers 0
+   return-void
+.end method
diff --git a/test/431-type-propagation/src/Main.java b/test/431-type-propagation/src/Main.java
new file mode 100644
index 0000000..91dfe10
--- /dev/null
+++ b/test/431-type-propagation/src/Main.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+  // Prints "Enter", then reflectively runs TypePropagation.method(int[]) on a new int[7].
+  public static void main(String[] args) throws Exception {
+    System.out.println("Enter");
+    Method target = Class.forName("TypePropagation").getMethod("method", int[].class);
+    int[] input = new int[7];
+    // The cast makes the array the single reflective argument.
+    target.invoke(null, (Object) input);
+  }
+}
diff --git a/test/432-optimizing-cmp/expected.txt b/test/432-optimizing-cmp/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/432-optimizing-cmp/expected.txt
diff --git a/test/432-optimizing-cmp/info.txt b/test/432-optimizing-cmp/info.txt
new file mode 100644
index 0000000..fad6cee
--- /dev/null
+++ b/test/432-optimizing-cmp/info.txt
@@ -0,0 +1 @@
+Tests for compare operations.
diff --git a/test/432-optimizing-cmp/smali/cmp.smali b/test/432-optimizing-cmp/smali/cmp.smali
new file mode 100644
index 0000000..470d940
--- /dev/null
+++ b/test/432-optimizing-cmp/smali/cmp.smali
@@ -0,0 +1,33 @@
+.class public LTestCmp;
+
+.super Ljava/lang/Object;
+
+.method public static $opt$CmpLong(JJ)I
+   .registers 5
+   cmp-long v0, v1, v3
+   return v0
+.end method
+
+.method public static $opt$CmpGtFloat(FF)I
+   .registers 3
+   cmpg-float v0, v1, v2
+   return v0
+.end method
+
+.method public static $opt$CmpLtFloat(FF)I
+   .registers 3
+   cmpl-float v0, v1, v2
+   return v0
+.end method
+
+.method public static $opt$CmpGtDouble(DD)I
+   .registers 5
+   cmpg-double v0, v1, v3
+   return v0
+.end method
+
+.method public static $opt$CmpLtDouble(DD)I
+   .registers 5
+   cmpl-double v0, v1, v3
+   return v0
+.end method
diff --git a/test/432-optimizing-cmp/src/Main.java b/test/432-optimizing-cmp/src/Main.java
new file mode 100644
index 0000000..3c7b13f
--- /dev/null
+++ b/test/432-optimizing-cmp/src/Main.java
@@ -0,0 +1,227 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class Main {
+
+  public static void main(String[] args) throws Exception {
+    cmpLong();
+    cmpFloat();
+    cmpDouble();
+  }
+
+  private static void cmpLong() throws Exception {
+    expectLt(3L, 5L);  // Java-level '<' and '>' on longs, through $opt$lt / $opt$gt.
+    expectGt(5L, 3L);
+    expectLt(Long.MIN_VALUE, Long.MAX_VALUE);
+    expectGt(Long.MAX_VALUE, Long.MIN_VALUE);
+
+    expectEquals(0, smaliCmpLong(0L, 0L));  // cmp-long from cmp.smali: result is -1, 0 or 1.
+    expectEquals(0, smaliCmpLong(1L, 1L));
+    expectEquals(-1, smaliCmpLong(1L, 2L));
+    expectEquals(1, smaliCmpLong(2L, 1L));
+    expectEquals(-1, smaliCmpLong(Long.MIN_VALUE, Long.MAX_VALUE));
+    expectEquals(1, smaliCmpLong(Long.MAX_VALUE, Long.MIN_VALUE));
+    expectEquals(0, smaliCmpLong(Long.MIN_VALUE, Long.MIN_VALUE));
+    expectEquals(0, smaliCmpLong(Long.MAX_VALUE, Long.MAX_VALUE));
+  }
+
+  private static void cmpFloat() throws Exception {
+    expectLt(3.1F, 5.1F);  // Java-level '<' and '>' on floats, through $opt$lt / $opt$gt.
+    expectGt(5.1F, 3.1F);
+    expectLt(Float.MIN_VALUE, Float.MAX_VALUE);
+    expectGt(Float.MAX_VALUE, Float.MIN_VALUE);
+    expectFalse(3.1F, Float.NaN);  // Neither '<' nor '>' may hold when an operand is NaN.
+    expectFalse(Float.NaN, 3.1F);
+
+    expectEquals(0, smaliCmpGtFloat(0F, 0F));  // cmpg-float from cmp.smali.
+    expectEquals(0, smaliCmpGtFloat(1F, 1F));
+    expectEquals(-1, smaliCmpGtFloat(1.1F, 2.1F));
+    expectEquals(1, smaliCmpGtFloat(2.1F, 1.1F));
+    expectEquals(-1, smaliCmpGtFloat(Float.MIN_VALUE, Float.MAX_VALUE));
+    expectEquals(1, smaliCmpGtFloat(Float.MAX_VALUE, Float.MIN_VALUE));
+    expectEquals(0, smaliCmpGtFloat(Float.MIN_VALUE, Float.MIN_VALUE));
+    expectEquals(0, smaliCmpGtFloat(Float.MAX_VALUE, Float.MAX_VALUE));
+    expectEquals(1, smaliCmpGtFloat(5F, Float.NaN));  // cmpg: a NaN operand yields 1.
+    expectEquals(1, smaliCmpGtFloat(Float.NaN, 5F));
+
+    expectEquals(0, smaliCmpLtFloat(0F, 0F));  // cmpl-float from cmp.smali.
+    expectEquals(0, smaliCmpLtFloat(1F, 1F));
+    expectEquals(-1, smaliCmpLtFloat(1.1F, 2.1F));
+    expectEquals(1, smaliCmpLtFloat(2.1F, 1.1F));
+    expectEquals(-1, smaliCmpLtFloat(Float.MIN_VALUE, Float.MAX_VALUE));
+    expectEquals(1, smaliCmpLtFloat(Float.MAX_VALUE, Float.MIN_VALUE));
+    expectEquals(0, smaliCmpLtFloat(Float.MIN_VALUE, Float.MIN_VALUE));
+    expectEquals(0, smaliCmpLtFloat(Float.MAX_VALUE, Float.MAX_VALUE));
+    expectEquals(-1, smaliCmpLtFloat(5F, Float.NaN));  // cmpl: a NaN operand yields -1.
+    expectEquals(-1, smaliCmpLtFloat(Float.NaN, 5F));
+  }
+
+  private static void cmpDouble() throws Exception {
+    expectLt(3.1D, 5.1D);  // Java-level '<' and '>' on doubles, through $opt$lt / $opt$gt.
+    expectGt(5.1D, 3.1D);
+    expectLt(Double.MIN_VALUE, Double.MAX_VALUE);
+    expectGt(Double.MAX_VALUE, Double.MIN_VALUE);
+    expectFalse(3.1D, Double.NaN);  // Neither '<' nor '>' may hold when an operand is NaN.
+    expectFalse(Double.NaN, 3.1D);
+
+    expectEquals(0, smaliCmpGtDouble(0D, 0D));  // cmpg-double from cmp.smali.
+    expectEquals(0, smaliCmpGtDouble(1D, 1D));
+    expectEquals(-1, smaliCmpGtDouble(1.1D, 2.1D));
+    expectEquals(1, smaliCmpGtDouble(2.1D, 1.1D));
+    expectEquals(-1, smaliCmpGtDouble(Double.MIN_VALUE, Double.MAX_VALUE));
+    expectEquals(1, smaliCmpGtDouble(Double.MAX_VALUE, Double.MIN_VALUE));
+    expectEquals(0, smaliCmpGtDouble(Double.MIN_VALUE, Double.MIN_VALUE));
+    expectEquals(0, smaliCmpGtDouble(Double.MAX_VALUE, Double.MAX_VALUE));
+    expectEquals(1, smaliCmpGtDouble(5D, Double.NaN));  // cmpg: a NaN operand yields 1.
+    expectEquals(1, smaliCmpGtDouble(Double.NaN, 5D));
+
+    expectEquals(0, smaliCmpLtDouble(0D, 0D));  // cmpl-double from cmp.smali.
+    expectEquals(0, smaliCmpLtDouble(1D, 1D));
+    expectEquals(-1, smaliCmpLtDouble(1.1D, 2.1D));
+    expectEquals(1, smaliCmpLtDouble(2.1D, 1.1D));
+    expectEquals(-1, smaliCmpLtDouble(Double.MIN_VALUE, Double.MAX_VALUE));
+    expectEquals(1, smaliCmpLtDouble(Double.MAX_VALUE, Double.MIN_VALUE));
+    expectEquals(0, smaliCmpLtDouble(Double.MIN_VALUE, Double.MIN_VALUE));
+    expectEquals(0, smaliCmpLtDouble(Double.MAX_VALUE, Double.MAX_VALUE));
+    expectEquals(-1, smaliCmpLtDouble(5D, Double.NaN));  // cmpl: a NaN operand yields -1.
+    expectEquals(-1, smaliCmpLtDouble(Double.NaN, 5D));
+  }
+
+  static boolean $opt$lt(long a, long b) {
+    return a < b;  // Primitive long '<'; the result is checked by expectLt(long, long).
+  }
+
+  static boolean $opt$lt(float a, float b) {
+    return a < b;
+  }
+
+  static boolean $opt$lt(double a, double b) {
+    return a < b;
+  }
+
+  static boolean $opt$gt(long a, long b) {
+    return a > b;
+  }
+
+  static boolean $opt$gt(float a, float b) {
+    return a > b;
+  }
+
+  static boolean $opt$gt(double a, double b) {
+    return a > b;
+  }
+
+  // Wrappers around methods located in file cmp.smali.
+
+  // Reflective call to the static method TestCmp.$opt$CmpLong; returns its int result.
+  private static int smaliCmpLong(long a, long b) throws Exception {
+    Method cmp = Class.forName("TestCmp")
+        .getMethod("$opt$CmpLong", long.class, long.class);
+    return (Integer) cmp.invoke(null, a, b);
+  }
+
+  // Reflective call to the static method TestCmp.$opt$CmpGtFloat; returns its int result.
+  private static int smaliCmpGtFloat(float a, float b) throws Exception {
+    Method cmp = Class.forName("TestCmp")
+        .getMethod("$opt$CmpGtFloat", float.class, float.class);
+    return (Integer) cmp.invoke(null, a, b);
+  }
+
+  // Reflective call to the static method TestCmp.$opt$CmpLtFloat; returns its int result.
+  private static int smaliCmpLtFloat(float a, float b) throws Exception {
+    Method cmp = Class.forName("TestCmp")
+        .getMethod("$opt$CmpLtFloat", float.class, float.class);
+    return (Integer) cmp.invoke(null, a, b);
+  }
+
+  // Reflective call to the static method TestCmp.$opt$CmpGtDouble; returns its int result.
+  private static int smaliCmpGtDouble(double a, double b) throws Exception {
+    Method cmp = Class.forName("TestCmp")
+        .getMethod("$opt$CmpGtDouble", double.class, double.class);
+    return (Integer) cmp.invoke(null, a, b);
+  }
+
+  // Reflective call to the static method TestCmp.$opt$CmpLtDouble; returns its int result.
+  private static int smaliCmpLtDouble(double a, double b) throws Exception {
+    Method cmp = Class.forName("TestCmp")
+        .getMethod("$opt$CmpLtDouble", double.class, double.class);
+    return (Integer) cmp.invoke(null, a, b);
+  }
+
+  public static void expectEquals(int expected, int result) {
+    if (expected != result) {  // Mismatch: throw an Error carrying both values.
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void expectLt(long a, long b) {
+    if (!$opt$lt(a, b)) {
+      throw new Error("Expected: " + a + " < " + b);
+    }
+  }
+
+  public static void expectGt(long a, long b) {
+    if (!$opt$gt(a, b)) {
+      throw new Error("Expected: " + a + " > " + b);
+    }
+  }
+
+  public static void expectLt(float a, float b) {
+    if (!$opt$lt(a, b)) {
+      throw new Error("Expected: " + a + " < " + b);
+    }
+  }
+
+  public static void expectGt(float a, float b) {
+    if (!$opt$gt(a, b)) {
+      throw new Error("Expected: " + a + " > " + b);
+    }
+  }
+
+  public static void expectFalse(float a, float b) {
+    if ($opt$lt(a, b)) {
+      throw new Error("Not expecting: " + a + " < " + b);
+    }
+    if ($opt$gt(a, b)) {
+      throw new Error("Not expecting: " + a + " > " + b);
+    }
+  }
+
+  public static void expectLt(double a, double b) {
+    if (!$opt$lt(a, b)) {
+      throw new Error("Expected: " + a + " < " + b);
+    }
+  }
+
+  public static void expectGt(double a, double b) {
+    if (!$opt$gt(a, b)) {
+      throw new Error("Expected: " + a + " > " + b);
+    }
+  }
+
+  public static void expectFalse(double a, double b) {
+    if ($opt$lt(a, b)) {
+      throw new Error("Not expecting: " + a + " < " + b);
+    }
+    if ($opt$gt(a, b)) {
+      throw new Error("Not expecting: " + a + " > " + b);
+    }
+  }
+
+}
+
diff --git a/test/433-gvn/expected.txt b/test/433-gvn/expected.txt
new file mode 100644
index 0000000..d81cc07
--- /dev/null
+++ b/test/433-gvn/expected.txt
@@ -0,0 +1 @@
+42
diff --git a/test/433-gvn/info.txt b/test/433-gvn/info.txt
new file mode 100644
index 0000000..bcdab15
--- /dev/null
+++ b/test/433-gvn/info.txt
@@ -0,0 +1,3 @@
+Regression test for the optimizing compiler's GVN, that
+used to not take into account all side effects between
+a dominator and its dominated blocks.
diff --git a/test/433-gvn/src/Main.java b/test/433-gvn/src/Main.java
new file mode 100644
index 0000000..f9cb594
--- /dev/null
+++ b/test/433-gvn/src/Main.java
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    System.out.println(foo());
+  }
+
+  public static int foo() {
+    Main m = new Main();
+    int a = m.field;
+    if (a == 0) {
+      m.field = 42;
+      if (m.test) {
+        a = 3;
+      }
+    }
+    // The compiler used to GVN this field get with the first read of m.field
+    // (the one stored in 'a'), even though the field is updated in the if.
+    return m.field;
+  }
+
+  public int field;
+  public boolean test = true;
+}
diff --git a/test/434-shifter-operand/expected.txt b/test/434-shifter-operand/expected.txt
new file mode 100644
index 0000000..52289c6
--- /dev/null
+++ b/test/434-shifter-operand/expected.txt
@@ -0,0 +1,3 @@
+false
+false
+true
diff --git a/test/434-shifter-operand/info.txt b/test/434-shifter-operand/info.txt
new file mode 100644
index 0000000..1ec9adc
--- /dev/null
+++ b/test/434-shifter-operand/info.txt
@@ -0,0 +1,2 @@
+Regression test for the arm backend of the optimizing
+compiler, that used to misuse ShifterOperand::CanHold.
diff --git a/test/434-shifter-operand/src/Main.java b/test/434-shifter-operand/src/Main.java
new file mode 100644
index 0000000..4d188eb
--- /dev/null
+++ b/test/434-shifter-operand/src/Main.java
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    System.out.println(foo(42));
+    System.out.println(foo(0xffffffff));
+    System.out.println(foo(0xf0000000));
+  }
+
+  public static boolean foo(int a) {
+    return a < 0xf000000b;
+  }
+}
diff --git a/test/800-smali/expected.txt b/test/800-smali/expected.txt
index 01d7b81..0f7001f 100644
--- a/test/800-smali/expected.txt
+++ b/test/800-smali/expected.txt
@@ -1,3 +1,4 @@
+PackedSwitch
 b/17790197
 b/17978759
 FloatBadArgReg
@@ -5,4 +6,6 @@
 sameFieldNames
 b/18380491
 invoke-super abstract
+BadCaseInOpRegRegReg
+CmpLong
 Done!
diff --git a/test/800-smali/smali/BadCaseInOpRegRegReg.smali b/test/800-smali/smali/BadCaseInOpRegRegReg.smali
new file mode 100644
index 0000000..2683790
--- /dev/null
+++ b/test/800-smali/smali/BadCaseInOpRegRegReg.smali
@@ -0,0 +1,13 @@
+.class public LBadCaseInOpRegRegReg;
+
+.super Ljava/lang/Object;
+
+.method public static getInt()I
+    .registers 2
+    const/4 v0, 0x0
+    const/4 v1, 0x1
+    add-int/2addr v0, v1
+    add-int/lit8 v1, v0, 0x1
+    mul-int v0, v1, v0
+    return v0
+.end method
diff --git a/test/800-smali/smali/CmpLong.smali b/test/800-smali/smali/CmpLong.smali
new file mode 100644
index 0000000..d54812f
--- /dev/null
+++ b/test/800-smali/smali/CmpLong.smali
@@ -0,0 +1,18 @@
+.class public LCmpLong;
+.super Ljava/lang/Object;
+
+
+.method public constructor <init>()V
+.registers 1
+       invoke-direct {p0}, Ljava/lang/Object;-><init>()V
+       return-void
+.end method
+
+.method public static run()I
+.registers 5000
+       const-wide v100, 5678233453L
+       move-wide/from16 v101, v100
+       const-wide v4, 5678233453L
+       cmp-long v0, v101, v4
+       return v0
+.end method
diff --git a/test/800-smali/smali/PackedSwitch.smali b/test/800-smali/smali/PackedSwitch.smali
new file mode 100644
index 0000000..6a3e5f0
--- /dev/null
+++ b/test/800-smali/smali/PackedSwitch.smali
@@ -0,0 +1,26 @@
+.class public LPackedSwitch;
+
+.super Ljava/lang/Object;
+
+.method public static packedSwitch(I)I
+    .registers 2
+
+    const/4 v0, 0
+    packed-switch v0, :switch_data
+    goto :default
+
+    :switch_data
+    .packed-switch 0x0
+        :case
+    .end packed-switch
+
+    :return
+    return v1
+
+    :default
+    goto :return
+
+    :case
+    goto :return
+
+.end method
diff --git a/test/800-smali/src/Main.java b/test/800-smali/src/Main.java
index 3f613ef..f2c1ab5 100644
--- a/test/800-smali/src/Main.java
+++ b/test/800-smali/src/Main.java
@@ -49,6 +49,8 @@
     public Main() {
         // Create the test cases.
         testCases = new LinkedList<TestCase>();
+        testCases.add(new TestCase("PackedSwitch", "PackedSwitch", "packedSwitch",
+          new Object[]{123}, null, 123));
 
         testCases.add(new TestCase("b/17790197", "B17790197", "getInt", null, null, 100));
         testCases.add(new TestCase("b/17978759", "B17978759", "test", null, new VerifyError(), null));
@@ -60,6 +62,8 @@
             new Object[]{42}, null, 42));
         testCases.add(new TestCase("invoke-super abstract", "B18380491ConcreteClass", "foo",
             new Object[]{0}, new AbstractMethodError(), null));
+        testCases.add(new TestCase("BadCaseInOpRegRegReg", "BadCaseInOpRegRegReg", "getInt", null, null, 2));
+        testCases.add(new TestCase("CmpLong", "CmpLong", "run", null, null, 0));
     }
 
     public void runTests() {
diff --git a/test/801-VoidCheckCast/classes.dex b/test/801-VoidCheckCast/classes.dex
new file mode 100644
index 0000000..e6f0f02
--- /dev/null
+++ b/test/801-VoidCheckCast/classes.dex
Binary files differ
diff --git a/test/801-VoidCheckCast/expected.txt b/test/801-VoidCheckCast/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/801-VoidCheckCast/expected.txt
diff --git a/test/801-VoidCheckCast/info.txt b/test/801-VoidCheckCast/info.txt
new file mode 100644
index 0000000..422f740
--- /dev/null
+++ b/test/801-VoidCheckCast/info.txt
@@ -0,0 +1,4 @@
+A test that is only available as a DEX binary.
+
+This tests that an attempt to use check-cast with the void type doesn't
+cause the compiler to crash.
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 11713d4..3c959fb 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -177,15 +177,6 @@
 
 TEST_ART_TIMING_SENSITIVE_RUN_TESTS :=
 
-TEST_ART_BROKEN_RUN_TESTS := \
-  004-ThreadStress
-
-ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
-      $(COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
-      $(IMAGE_TYPES), $(PICTEST_TYPES), $(TEST_ART_BROKEN_RUN_TESTS), $(ALL_ADDRESS_SIZES))
-
-TEST_ART_BROKEN_RUN_TESTS :=
-
 # Note 116-nodex2oat is not broken per-se it just doesn't (and isn't meant to) work with --prebuild.
 TEST_ART_BROKEN_PREBUILD_RUN_TESTS := \
   116-nodex2oat
@@ -308,14 +299,11 @@
 # Known broken tests for the arm64 optimizing compiler backend.
 TEST_ART_BROKEN_OPTIMIZING_ARM64_RUN_TESTS := \
   003-omnibus-opcodes \
-  004-NativeAllocations \
   004-ReferenceMap \
   005-annotations \
   009-instanceof \
   010-instance \
-  012-math \
   023-many-interfaces \
-  037-inherit \
   044-proxy \
   045-reflect-array \
   046-reflect \
@@ -325,19 +313,16 @@
   068-classloader \
   069-field-type \
   071-dexfile \
-  083-compiler-regressions \
   106-exceptions2 \
   107-int-math2 \
-  114-ParallelGC \
   201-built-in-exception-detail-messages \
   407-arrays \
   412-new-array \
   422-instanceof \
-  422-type-conversion \
   424-checkcast \
   427-bounds \
-  428-optimizing-arith-rem \
-  701-easy-div-rem \
+  430-live-register-slow-path \
+  800-smali \
 
 ifneq (,$(filter optimizing,$(COMPILER_TYPES)))
   ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,target,$(RUN_TYPES),$(PREBUILD_TYPES), \
diff --git a/test/etc/default-build b/test/etc/default-build
index ab859ec..6731ad3 100755
--- a/test/etc/default-build
+++ b/test/etc/default-build
@@ -17,6 +17,11 @@
 # Stop if something fails.
 set -e
 
+if [ -e classes.dex ]; then  # Prebuilt DEX: package it as-is and skip the source build.
+  zip "$TEST_NAME.jar" classes.dex
+  exit 0
+fi
+
 mkdir classes
 ${JAVAC} -d classes `find src -name '*.java'`
 
diff --git a/test/run-test b/test/run-test
index 843714b..e9dd86a 100755
--- a/test/run-test
+++ b/test/run-test
@@ -586,7 +586,7 @@
         echo '#################### info'
         cat "${td_info}" | sed 's/^/# /g'
         echo '#################### diffs'
-        diff --strip-trailing-cr -u "$expected" "$output" | tail -n 500
+        diff --strip-trailing-cr -u "$expected" "$output" | tail -n 2000
         echo '####################'
         echo ' '
     fi
diff --git a/tools/run-libcore-tests.sh b/tools/run-libcore-tests.sh
new file mode 100755
index 0000000..5c7e3c5
--- /dev/null
+++ b/tools/run-libcore-tests.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ ! -d libcore ]; then
+  echo "Script needs to be run at the root of the android tree"
+  exit 1
+fi
+
+# Jar containing all the tests (path is relative to the current directory).
+test_jar=out/target/common/obj/JAVA_LIBRARIES/core-tests_intermediates/javalib.jar
+
+if [ ! -f "$test_jar" ]; then
+  echo "Before running, you must build core-tests and vogar: make core-tests vogar vogar.jar"
+  exit 1
+fi
+
+# Packages that currently report no failures.
+working_packages=("libcore.java.lang"
+                  "libcore.java.util"
+                  "org.apache.harmony.annotation"
+                  "org.apache.harmony.regex"
+                  "org.apache.harmony.tests.java.lang"
+                  "org.apache.harmony.tests.java.util"
+                  "tests.java.lang.String")
+
+# Run the tests using vogar; "$@" is quoted so extra arguments are passed through intact.
+echo "Running tests for the following test packages:"
+printf '%s\n' "${working_packages[@]}"
+vogar "$@" --classpath "$test_jar" "${working_packages[@]}"
