Merge "Make patchoat match offset when given a patched image"
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 3a1bd09..9f15294 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -275,6 +275,16 @@
   -DVIXL_DEBUG \
   -UNDEBUG
 
+# The latest clang update trips over many of the files in art and never finishes
+# compiling for aarch64 with -O3 (or -O2). Drop back to -O1 while we investigate
+# to stop punishing the build server.
+ifeq ($(TARGET_ARCH),arm64)
+  ifeq ($(USE_CLANG_PLATFORM_BUILD),true)
+    art_debug_cflags += -O1
+    art_non_debug_cflags += -O1
+  endif
+endif
+
 art_host_non_debug_cflags := $(art_non_debug_cflags)
 art_target_non_debug_cflags := $(art_non_debug_cflags)
 
@@ -299,6 +309,16 @@
 ART_HOST_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=default
 ART_HOST_ASFLAGS += $(art_asflags)
 
+# Disable -Wpessimizing-move: triggered for art/runtime/base/variant_map.h:261.
+# Adding this flag to art_clang_cflags doesn't work because -Wall gets added to
+# ART_HOST_CFLAGS (as part of art_cflags) after -Wno-pessimizing-move. Instead,
+# add the flag here to both ART_TARGET_CLANG_CFLAGS and ART_HOST_CFLAGS.
+ifeq ($(ART_HOST_CLANG),true)
+ART_HOST_CFLAGS += -Wno-pessimizing-move
+endif
+ART_TARGET_CLANG_CFLAGS += -Wno-pessimizing-move
+
 ifndef LIBART_IMG_TARGET_BASE_ADDRESS
   $(error LIBART_IMG_TARGET_BASE_ADDRESS unset)
 endif
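
For context, -Wpessimizing-move fires where an explicit std::move defeats copy elision. A minimal sketch of the flagged pattern (hypothetical code, not the actual variant_map.h:261 line):

    #include <string>
    #include <utility>

    std::string MakeName() {
      std::string name = "art";
      // clang: "moving a local object in a return statement prevents copy elision"
      return std::move(name);  // plain `return name;` is both correct and faster
    }
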
diff --git a/cmdline/cmdline_parse_result.h b/cmdline/cmdline_parse_result.h
index 717642f..982f178 100644
--- a/cmdline/cmdline_parse_result.h
+++ b/cmdline/cmdline_parse_result.h
@@ -126,7 +126,7 @@
     : CmdlineResult(kSuccess), value_(value), has_value_(true) {}
   explicit CmdlineParseResult(T&& value)
     : CmdlineResult(kSuccess), value_(std::forward<T>(value)), has_value_(true) {}
-  explicit CmdlineParseResult()
+  CmdlineParseResult()
     : CmdlineResult(kSuccess), value_(), has_value_(false) {}
 
   T value_;
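
The `explicit` removals in this and the following files all apply the same rule: `explicit` only changes behavior for constructors reachable by an implicit conversion or by copy-list-initialization, so on default and multi-argument constructors it is nearly always dead weight. A hedged sketch with a hypothetical type (not ART code):

    struct Pair {
      Pair() {}                              // explicit would only block "Pair p = {};"
      Pair(int a, int b) : a_(a), b_(b) {}   // explicit would only block "Pair p = {1, 2};"
      explicit Pair(int a) : a_(a) {}        // here it earns its keep: blocks "Pair p = 3;"
      int a_ = 0;
      int b_ = 0;
    };
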
diff --git a/cmdline/cmdline_parser.h b/cmdline/cmdline_parser.h
index cebba65..cfc0967 100644
--- a/cmdline/cmdline_parser.h
+++ b/cmdline/cmdline_parser.h
@@ -497,11 +497,10 @@
   friend struct Builder;
 
   // Construct a new parser from the builder. Move all the arguments.
-  explicit CmdlineParser(bool ignore_unrecognized,
-                         std::vector<const char*>&& ignore_list,
-                         std::shared_ptr<SaveDestination> save_destination,
-                         std::vector<std::unique_ptr<detail::CmdlineParseArgumentAny>>&&
-                             completed_arguments)
+  CmdlineParser(bool ignore_unrecognized,
+                std::vector<const char*>&& ignore_list,
+                std::shared_ptr<SaveDestination> save_destination,
+                std::vector<std::unique_ptr<detail::CmdlineParseArgumentAny>>&& completed_arguments)
     : ignore_unrecognized_(ignore_unrecognized),
       ignore_list_(std::move(ignore_list)),
       save_destination_(save_destination),
diff --git a/cmdline/detail/cmdline_parse_argument_detail.h b/cmdline/detail/cmdline_parse_argument_detail.h
index 81ef36b..3009b32 100644
--- a/cmdline/detail/cmdline_parse_argument_detail.h
+++ b/cmdline/detail/cmdline_parse_argument_detail.h
@@ -300,9 +300,9 @@
     // be able to parse arguments.
     template <typename TArg>
     struct CmdlineParseArgument : CmdlineParseArgumentAny {
-      explicit CmdlineParseArgument(CmdlineParserArgumentInfo<TArg>&& argument_info,
-                                    std::function<void(TArg&)>&& save_argument,
-                                    std::function<TArg&(void)>&& load_argument)
+      CmdlineParseArgument(CmdlineParserArgumentInfo<TArg>&& argument_info,
+                           std::function<void(TArg&)>&& save_argument,
+                           std::function<TArg&(void)>&& load_argument)
           : argument_info_(std::forward<decltype(argument_info)>(argument_info)),
             save_argument_(std::forward<decltype(save_argument)>(save_argument)),
             load_argument_(std::forward<decltype(load_argument)>(load_argument)) {
diff --git a/cmdline/token_range.h b/cmdline/token_range.h
index 5b54384..3358067 100644
--- a/cmdline/token_range.h
+++ b/cmdline/token_range.h
@@ -45,7 +45,7 @@
 
   // Copying-from-iterator constructor
   template <typename ForwardIterator>
-  explicit TokenRange(ForwardIterator it_begin, ForwardIterator it_end)
+  TokenRange(ForwardIterator it_begin, ForwardIterator it_end)
     : token_list_(new TokenList(it_begin, it_end)),
       begin_(token_list_->begin()),
       end_(token_list_->end())
diff --git a/compiler/compiler.h b/compiler/compiler.h
index fcd3434..01ca46e 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -89,7 +89,7 @@
                                  const DexFile& dex_file);
 
  protected:
-  explicit Compiler(CompilerDriver* driver, uint64_t warning) :
+  Compiler(CompilerDriver* driver, uint64_t warning) :
       driver_(driver), maximum_compilation_time_before_warning_(warning) {
   }
 
diff --git a/compiler/dex/compiler_ir.h b/compiler/dex/compiler_ir.h
index d28df1d..5203355 100644
--- a/compiler/dex/compiler_ir.h
+++ b/compiler/dex/compiler_ir.h
@@ -129,7 +129,7 @@
    * Union containing the option value of either type.
    */
   union OptionContainer {
-    explicit OptionContainer(const OptionContainer& c, OptionType t) {
+    OptionContainer(const OptionContainer& c, OptionType t) {
       if (t == kString) {
         DCHECK(c.s != nullptr);
         s = strndup(c.s, kOptionStringMaxLength);
diff --git a/compiler/dex/mir_field_info.h b/compiler/dex/mir_field_info.h
index 04c58ac..053029d 100644
--- a/compiler/dex/mir_field_info.h
+++ b/compiler/dex/mir_field_info.h
@@ -138,7 +138,7 @@
       REQUIRES(!Locks::mutator_lock_);
 
   // Construct an unresolved instance field lowering info.
-  explicit MirIFieldLoweringInfo(uint16_t field_idx, DexMemAccessType type, bool is_quickened)
+  MirIFieldLoweringInfo(uint16_t field_idx, DexMemAccessType type, bool is_quickened)
       : MirFieldInfo(field_idx,
                      kFlagIsVolatile | (is_quickened ? kFlagIsQuickened : 0u),
                      type),  // Without kFlagIsStatic.
@@ -195,7 +195,7 @@
       REQUIRES(!Locks::mutator_lock_);
 
   // Construct an unresolved static field lowering info.
-  explicit MirSFieldLoweringInfo(uint16_t field_idx, DexMemAccessType type)
+  MirSFieldLoweringInfo(uint16_t field_idx, DexMemAccessType type)
       : MirFieldInfo(field_idx, kFlagIsVolatile | kFlagIsStatic, type),
         field_offset_(0u),
         storage_index_(DexFile::kDexNoIndex) {
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 23b7c42..8bf709a 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -261,7 +261,7 @@
     uint32_t arg[5];         /* vC/D/E/F/G in invoke or filled-new-array */
     Instruction::Code opcode;
 
-    explicit DecodedInstruction():vA(0), vB(0), vB_wide(0), vC(0), opcode(Instruction::NOP) {
+    DecodedInstruction() : vA(0), vB(0), vB_wide(0), vC(0), opcode(Instruction::NOP) {
     }
 
     /*
@@ -353,7 +353,7 @@
     uint32_t method_lowering_info;
   } meta;
 
-  explicit MIR() : offset(0), optimization_flags(0), m_unit_index(0), bb(NullBasicBlockId),
+  MIR() : offset(0), optimization_flags(0), m_unit_index(0), bb(NullBasicBlockId),
                  next(nullptr), ssa_rep(nullptr) {
     memset(&meta, 0, sizeof(meta));
   }
diff --git a/compiler/dex/pass_driver_me.h b/compiler/dex/pass_driver_me.h
index cbe4a02..d0af71c 100644
--- a/compiler/dex/pass_driver_me.h
+++ b/compiler/dex/pass_driver_me.h
@@ -36,7 +36,7 @@
 
 class PassDriverME: public PassDriver {
  public:
-  explicit PassDriverME(const PassManager* const pass_manager, CompilationUnit* cu)
+  PassDriverME(const PassManager* const pass_manager, CompilationUnit* cu)
       : PassDriver(pass_manager), pass_me_data_holder_(), dump_cfg_folder_("/sdcard/") {
         pass_me_data_holder_.bb = nullptr;
         pass_me_data_holder_.c_unit = cu;
@@ -314,4 +314,3 @@
 };
 }  // namespace art
 #endif  // ART_COMPILER_DEX_PASS_DRIVER_ME_H_
-
diff --git a/compiler/dex/pass_driver_me_opts.h b/compiler/dex/pass_driver_me_opts.h
index e94c189..c8093d0 100644
--- a/compiler/dex/pass_driver_me_opts.h
+++ b/compiler/dex/pass_driver_me_opts.h
@@ -29,9 +29,9 @@
 
 class PassDriverMEOpts : public PassDriverME {
  public:
-  explicit PassDriverMEOpts(const PassManager* const manager,
-                            const PassManager* const post_opt_pass_manager,
-                            CompilationUnit* cu)
+  PassDriverMEOpts(const PassManager* const manager,
+                   const PassManager* const post_opt_pass_manager,
+                   CompilationUnit* cu)
       : PassDriverME(manager, cu), post_opt_pass_manager_(post_opt_pass_manager) {
   }
 
diff --git a/compiler/dex/pass_driver_me_post_opt.h b/compiler/dex/pass_driver_me_post_opt.h
index 9e03c4e..94176db 100644
--- a/compiler/dex/pass_driver_me_post_opt.h
+++ b/compiler/dex/pass_driver_me_post_opt.h
@@ -28,7 +28,7 @@
 
 class PassDriverMEPostOpt : public PassDriverME {
  public:
-  explicit PassDriverMEPostOpt(const PassManager* const manager, CompilationUnit* cu)
+  PassDriverMEPostOpt(const PassManager* const manager, CompilationUnit* cu)
       : PassDriverME(manager, cu) {
   }
 
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index 7fc6fa2..42b792c 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -56,6 +56,7 @@
     false,  // kIntrinsicReferenceGetReferent
     false,  // kIntrinsicCharAt
     false,  // kIntrinsicCompareTo
+    false,  // kIntrinsicEquals
     false,  // kIntrinsicGetCharsNoCheck
     false,  // kIntrinsicIsEmptyOrLength
     false,  // kIntrinsicIndexOf
@@ -95,6 +96,7 @@
 static_assert(!kIntrinsicIsStatic[kIntrinsicReferenceGetReferent], "Get must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicCharAt], "CharAt must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicCompareTo], "CompareTo must not be static");
+static_assert(!kIntrinsicIsStatic[kIntrinsicEquals], "String equals must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicGetCharsNoCheck], "GetCharsNoCheck must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicIsEmptyOrLength], "IsEmptyOrLength must not be static");
 static_assert(!kIntrinsicIsStatic[kIntrinsicIndexOf], "IndexOf must not be static");
@@ -192,6 +194,7 @@
     "getReferent",           // kNameCacheReferenceGet
     "charAt",                // kNameCacheCharAt
     "compareTo",             // kNameCacheCompareTo
+    "equals",                // kNameCacheEquals
     "getCharsNoCheck",       // kNameCacheGetCharsNoCheck
     "isEmpty",               // kNameCacheIsEmpty
     "indexOf",               // kNameCacheIndexOf
@@ -284,6 +287,8 @@
     { kClassCacheVoid, 2, { kClassCacheLong, kClassCacheLong } },
     // kProtoCacheJS_V
     { kClassCacheVoid, 2, { kClassCacheLong, kClassCacheShort } },
+    // kProtoCacheObject_Z
+    { kClassCacheBoolean, 1, { kClassCacheJavaLangObject } },
     // kProtoCacheObjectJII_Z
     { kClassCacheBoolean, 4, { kClassCacheJavaLangObject, kClassCacheLong,
         kClassCacheInt, kClassCacheInt } },
@@ -418,6 +423,7 @@
 
     INTRINSIC(JavaLangString, CharAt, I_C, kIntrinsicCharAt, 0),
     INTRINSIC(JavaLangString, CompareTo, String_I, kIntrinsicCompareTo, 0),
+    INTRINSIC(JavaLangString, Equals, Object_Z, kIntrinsicEquals, 0),
     INTRINSIC(JavaLangString, GetCharsNoCheck, IICharArrayI_V, kIntrinsicGetCharsNoCheck, 0),
     INTRINSIC(JavaLangString, IsEmpty, _Z, kIntrinsicIsEmptyOrLength, kIntrinsicFlagIsEmpty),
     INTRINSIC(JavaLangString, IndexOf, II_I, kIntrinsicIndexOf, kIntrinsicFlagNone),
@@ -588,6 +594,9 @@
       return backend->GenInlinedCharAt(info);
     case kIntrinsicCompareTo:
       return backend->GenInlinedStringCompareTo(info);
+    case kIntrinsicEquals:
+      // Quick does not implement this intrinsic.
+      return false;
     case kIntrinsicGetCharsNoCheck:
       return backend->GenInlinedStringGetCharsNoCheck(info);
     case kIntrinsicIsEmptyOrLength:
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index bcb9ee5..d6c8bfb 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -170,6 +170,7 @@
       kNameCacheReferenceGetReferent,
       kNameCacheCharAt,
       kNameCacheCompareTo,
+      kNameCacheEquals,
       kNameCacheGetCharsNoCheck,
       kNameCacheIsEmpty,
       kNameCacheIndexOf,
@@ -243,6 +244,7 @@
       kProtoCacheJJ_J,
       kProtoCacheJJ_V,
       kProtoCacheJS_V,
+      kProtoCacheObject_Z,
       kProtoCacheObjectJII_Z,
       kProtoCacheObjectJJJ_Z,
       kProtoCacheObjectJObjectObject_Z,
diff --git a/compiler/dex/quick/lazy_debug_frame_opcode_writer.h b/compiler/dex/quick/lazy_debug_frame_opcode_writer.h
index 94ffd7f..3e9fb96 100644
--- a/compiler/dex/quick/lazy_debug_frame_opcode_writer.h
+++ b/compiler/dex/quick/lazy_debug_frame_opcode_writer.h
@@ -40,12 +40,11 @@
 
   const ArenaVector<uint8_t>* Patch(size_t code_size);
 
-  explicit LazyDebugFrameOpCodeWriter(LIR** last_lir_insn, bool enable_writes,
-                                      ArenaAllocator* allocator)
-    : Base(enable_writes, allocator->Adapter()),
-      last_lir_insn_(last_lir_insn),
-      advances_(allocator->Adapter()),
-      patched_(false) {
+  LazyDebugFrameOpCodeWriter(LIR** last_lir_insn, bool enable_writes, ArenaAllocator* allocator)
+      : Base(enable_writes, allocator->Adapter()),
+        last_lir_insn_(last_lir_insn),
+        advances_(allocator->Adapter()),
+        patched_(false) {
   }
 
  private:
diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc
index b098bc2..ec4bad7 100644
--- a/compiler/dex/quick/mips/target_mips.cc
+++ b/compiler/dex/quick/mips/target_mips.cc
@@ -49,9 +49,11 @@
 static constexpr RegStorage core_temps_arr_32[] =
     {rs_rV0, rs_rV1, rs_rA0, rs_rA1, rs_rA2, rs_rA3, rs_rT0_32, rs_rT1_32, rs_rT2_32, rs_rT3_32,
      rs_rT4_32, rs_rT5_32, rs_rT6_32, rs_rT7_32, rs_rT8};
-static constexpr RegStorage sp_temps_arr_32[] =
+static constexpr RegStorage sp_fr0_temps_arr_32[] =
     {rs_rF0, rs_rF1, rs_rF2, rs_rF3, rs_rF4, rs_rF5, rs_rF6, rs_rF7, rs_rF8, rs_rF9, rs_rF10,
      rs_rF11, rs_rF12, rs_rF13, rs_rF14, rs_rF15};
+static constexpr RegStorage sp_fr1_temps_arr_32[] =
+    {rs_rF0, rs_rF2, rs_rF4, rs_rF6, rs_rF8, rs_rF10, rs_rF12, rs_rF14};
 static constexpr RegStorage dp_fr0_temps_arr_32[] =
     {rs_rD0_fr0, rs_rD1_fr0, rs_rD2_fr0, rs_rD3_fr0, rs_rD4_fr0, rs_rD5_fr0, rs_rD6_fr0,
      rs_rD7_fr0};
@@ -130,7 +132,8 @@
 static constexpr ArrayRef<const RegStorage> dp_fr1_regs_32(dp_fr1_regs_arr_32);
 static constexpr ArrayRef<const RegStorage> reserved_regs_32(reserved_regs_arr_32);
 static constexpr ArrayRef<const RegStorage> core_temps_32(core_temps_arr_32);
-static constexpr ArrayRef<const RegStorage> sp_temps_32(sp_temps_arr_32);
+static constexpr ArrayRef<const RegStorage> sp_fr0_temps_32(sp_fr0_temps_arr_32);
+static constexpr ArrayRef<const RegStorage> sp_fr1_temps_32(sp_fr1_temps_arr_32);
 static constexpr ArrayRef<const RegStorage> dp_fr0_temps_32(dp_fr0_temps_arr_32);
 static constexpr ArrayRef<const RegStorage> dp_fr1_temps_32(dp_fr1_temps_arr_32);
 
@@ -591,22 +594,22 @@
     Clobber(rs_rFP);
     Clobber(rs_rRA);
     Clobber(rs_rF0);
-    Clobber(rs_rF1);
     Clobber(rs_rF2);
-    Clobber(rs_rF3);
     Clobber(rs_rF4);
-    Clobber(rs_rF5);
     Clobber(rs_rF6);
-    Clobber(rs_rF7);
     Clobber(rs_rF8);
-    Clobber(rs_rF9);
     Clobber(rs_rF10);
-    Clobber(rs_rF11);
     Clobber(rs_rF12);
-    Clobber(rs_rF13);
     Clobber(rs_rF14);
-    Clobber(rs_rF15);
     if (fpuIs32Bit_) {
+      Clobber(rs_rF1);
+      Clobber(rs_rF3);
+      Clobber(rs_rF5);
+      Clobber(rs_rF7);
+      Clobber(rs_rF9);
+      Clobber(rs_rF11);
+      Clobber(rs_rF13);
+      Clobber(rs_rF15);
       Clobber(rs_rD0_fr0);
       Clobber(rs_rD1_fr0);
       Clobber(rs_rD2_fr0);
@@ -717,24 +720,26 @@
                                               fpuIs32Bit_ ? dp_fr0_regs_32 : dp_fr1_regs_32,
                                               reserved_regs_32, empty_pool,  // reserved64
                                               core_temps_32, empty_pool,  // core64_temps
-                                              sp_temps_32,
+                                              fpuIs32Bit_ ? sp_fr0_temps_32 : sp_fr1_temps_32,
                                               fpuIs32Bit_ ? dp_fr0_temps_32 : dp_fr1_temps_32));
 
     // Alias single precision floats to appropriate half of overlapping double.
     for (RegisterInfo* info : reg_pool_->sp_regs_) {
       int sp_reg_num = info->GetReg().GetRegNum();
       int dp_reg_num = sp_reg_num & ~1;
-      RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | dp_reg_num);
-      RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
-      // Double precision register's master storage should refer to itself.
-      DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
-      // Redirect single precision's master storage to master.
-      info->SetMaster(dp_reg_info);
-      // Singles should show a single 32-bit mask bit, at first referring to the low half.
-      DCHECK_EQ(info->StorageMask(), 0x1U);
-      if (sp_reg_num & 1) {
-        // For odd singles, change to user the high word of the backing double.
-        info->SetStorageMask(0x2);
+      if (fpuIs32Bit_ || (sp_reg_num == dp_reg_num)) {
+        RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | dp_reg_num);
+        RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
+        // Double precision register's master storage should refer to itself.
+        DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
+        // Redirect single precision's master storage to master.
+        info->SetMaster(dp_reg_info);
+        // Singles should show a single 32-bit mask bit, at first referring to the low half.
+        DCHECK_EQ(info->StorageMask(), 0x1U);
+        if (sp_reg_num & 1) {
+          // For odd singles, change to use the high word of the backing double.
+          info->SetStorageMask(0x2);
+        }
       }
     }
   }
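
The reworked aliasing loop only pairs a single with a double when their storage actually overlaps: in FR=0 mode each double D(n) overlays singles F(2n) and F(2n+1), while in FR=1 mode doubles are full 64-bit registers and only the even single (sp_reg_num == dp_reg_num) shares storage. The mask assignment above, restated as a sketch (illustrative, not the ART API):

    // Which word of the backing double a single occupies, per the
    // SetStorageMask() calls above: even singles take the low word (0x1),
    // odd singles (which alias only in FR=0 mode) the high word (0x2).
    constexpr unsigned StorageMaskForSingle(int sp_reg_num) {
      return (sp_reg_num & 1) ? 0x2u : 0x1u;
    }
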
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 299b995..fa4667e 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -829,6 +829,18 @@
       std::set<std::pair<uint16_t, const DexFile*>>& exceptions_to_resolve)
      : exceptions_to_resolve_(exceptions_to_resolve) {}
 
+  virtual bool Visit(mirror::Class* c) OVERRIDE SHARED_REQUIRES(Locks::mutator_lock_) {
+    const auto pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
+    for (auto& m : c->GetVirtualMethods(pointer_size)) {
+      ResolveExceptionsForMethod(&m);
+    }
+    for (auto& m : c->GetDirectMethods(pointer_size)) {
+      ResolveExceptionsForMethod(&m);
+    }
+    return true;
+  }
+
+ private:
   void ResolveExceptionsForMethod(ArtMethod* method_handle) SHARED_REQUIRES(Locks::mutator_lock_) {
     const DexFile::CodeItem* code_item = method_handle->GetCodeItem();
     if (code_item == nullptr) {
@@ -864,18 +876,6 @@
     }
   }
 
-  virtual bool Visit(mirror::Class* c) OVERRIDE SHARED_REQUIRES(Locks::mutator_lock_) {
-    const auto pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
-    for (auto& m : c->GetVirtualMethods(pointer_size)) {
-      ResolveExceptionsForMethod(&m);
-    }
-    for (auto& m : c->GetDirectMethods(pointer_size)) {
-      ResolveExceptionsForMethod(&m);
-    }
-    return true;
-  }
-
- private:
   std::set<std::pair<uint16_t, const DexFile*>>& exceptions_to_resolve_;
 };
 
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 4de9c73..5718be9 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -89,19 +89,19 @@
   // enabled.  "image_classes" lets the compiler know what classes it
   // can assume will be in the image, with null implying all available
   // classes.
-  explicit CompilerDriver(const CompilerOptions* compiler_options,
-                          VerificationResults* verification_results,
-                          DexFileToMethodInlinerMap* method_inliner_map,
-                          Compiler::Kind compiler_kind,
-                          InstructionSet instruction_set,
-                          const InstructionSetFeatures* instruction_set_features,
-                          bool image, std::unordered_set<std::string>* image_classes,
-                          std::unordered_set<std::string>* compiled_classes,
-                          std::unordered_set<std::string>* compiled_methods,
-                          size_t thread_count, bool dump_stats, bool dump_passes,
-                          const std::string& dump_cfg_file_name,
-                          CumulativeLogger* timer, int swap_fd,
-                          const std::string& profile_file);
+  CompilerDriver(const CompilerOptions* compiler_options,
+                 VerificationResults* verification_results,
+                 DexFileToMethodInlinerMap* method_inliner_map,
+                 Compiler::Kind compiler_kind,
+                 InstructionSet instruction_set,
+                 const InstructionSetFeatures* instruction_set_features,
+                 bool image, std::unordered_set<std::string>* image_classes,
+                 std::unordered_set<std::string>* compiled_classes,
+                 std::unordered_set<std::string>* compiled_methods,
+                 size_t thread_count, bool dump_stats, bool dump_passes,
+                 const std::string& dump_cfg_file_name,
+                 CumulativeLogger* timer, int swap_fd,
+                 const std::string& profile_file);
 
   ~CompilerDriver();
 
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index a03ff75..3a3410c 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -244,8 +244,8 @@
   DCHECK(object != nullptr);
   DCHECK_NE(image_objects_offset_begin_, 0u);
 
-  size_t previous_bin_sizes = bin_slot_previous_sizes_[bin_slot.GetBin()];
-  size_t new_offset = image_objects_offset_begin_ + previous_bin_sizes + bin_slot.GetIndex();
+  size_t bin_slot_offset = bin_slot_offsets_[bin_slot.GetBin()];
+  size_t new_offset = bin_slot_offset + bin_slot.GetIndex();
   DCHECK_ALIGNED(new_offset, kObjectAlignment);
 
   SetImageOffset(object, new_offset);
@@ -866,8 +866,10 @@
         }
         bool any_dirty = false;
         size_t count = 0;
-        const size_t method_size = ArtMethod::ObjectSize(target_ptr_size_);
-        auto iteration_range = MakeIterationRangeFromLengthPrefixedArray(array, method_size);
+        const size_t method_alignment = ArtMethod::Alignment(target_ptr_size_);
+        const size_t method_size = ArtMethod::Size(target_ptr_size_);
+        auto iteration_range =
+            MakeIterationRangeFromLengthPrefixedArray(array, method_size, method_alignment);
         for (auto& m : iteration_range) {
           any_dirty = any_dirty || WillMethodBeDirty(&m);
           ++count;
@@ -876,7 +878,9 @@
             kNativeObjectRelocationTypeArtMethodClean;
         Bin bin_type = BinTypeForNativeRelocationType(type);
         // Forward the entire array at once, but header first.
-        const size_t header_size = LengthPrefixedArray<ArtMethod>::ComputeSize(0, method_size);
+        const size_t header_size = LengthPrefixedArray<ArtMethod>::ComputeSize(0,
+                                                                               method_size,
+                                                                               method_alignment);
         auto it = native_object_relocations_.find(array);
         CHECK(it == native_object_relocations_.end()) << "Method array " << array
             << " already forwarded";
@@ -910,7 +914,7 @@
       << PrettyMethod(method);
   size_t& offset = bin_slot_sizes_[BinTypeForNativeRelocationType(type)];
   native_object_relocations_.emplace(method, NativeObjectRelocation { offset, type });
-  offset += ArtMethod::ObjectSize(target_ptr_size_);
+  offset += ArtMethod::Size(target_ptr_size_);
 }
 
 void ImageWriter::WalkFieldsCallback(mirror::Object* obj, void* arg) {
@@ -972,9 +976,10 @@
   size_t& offset = bin_slot_sizes_[BinTypeForNativeRelocationType(image_method_type)];
   native_object_relocations_.emplace(&image_method_array_,
                                      NativeObjectRelocation { offset, image_method_type });
+  size_t method_alignment = ArtMethod::Alignment(target_ptr_size_);
   const size_t array_size = LengthPrefixedArray<ArtMethod>::ComputeSize(
-      0, ArtMethod::ObjectSize(target_ptr_size_));
-  CHECK_ALIGNED(array_size, 8u);
+      0, ArtMethod::Size(target_ptr_size_), method_alignment);
+  CHECK_ALIGNED_PARAM(array_size, method_alignment);
   offset += array_size;
   for (auto* m : image_methods_) {
     CHECK(m != nullptr);
@@ -982,13 +987,21 @@
     AssignMethodOffset(m, kNativeObjectRelocationTypeArtMethodClean);
   }
 
-  // Calculate cumulative bin slot sizes.
-  size_t previous_sizes = 0u;
+  // Calculate bin slot offsets.
+  size_t bin_offset = image_objects_offset_begin_;
   for (size_t i = 0; i != kBinSize; ++i) {
-    bin_slot_previous_sizes_[i] = previous_sizes;
-    previous_sizes += bin_slot_sizes_[i];
+    bin_slot_offsets_[i] = bin_offset;
+    bin_offset += bin_slot_sizes_[i];
+    if (i == kBinArtField) {
+      static_assert(kBinArtField + 1 == kBinArtMethodClean, "Methods follow fields.");
+      static_assert(alignof(ArtField) == 4u, "ArtField alignment is 4.");
+      DCHECK_ALIGNED(bin_offset, 4u);
+      DCHECK(method_alignment == 4u || method_alignment == 8u);
+      bin_offset = RoundUp(bin_offset, method_alignment);
+    }
   }
-  DCHECK_EQ(previous_sizes, GetBinSizeSum());
+  // NOTE: There may be additional padding between the bin slots and the intern table.
+
   DCHECK_EQ(image_end_, GetBinSizeSum(kBinMirrorCount) + image_objects_offset_begin_);
 
   // Transform each object's bin slot into an offset which will be used to do the final copy.
@@ -1002,7 +1015,7 @@
   for (auto& pair : native_object_relocations_) {
     NativeObjectRelocation& relocation = pair.second;
     Bin bin_type = BinTypeForNativeRelocationType(relocation.type);
-    relocation.offset += image_objects_offset_begin_ + bin_slot_previous_sizes_[bin_type];
+    relocation.offset += bin_slot_offsets_[bin_type];
   }
 
   // Calculate how big the intern table will be after being serialized.
@@ -1029,15 +1042,15 @@
   // Add field section.
   auto* field_section = &sections[ImageHeader::kSectionArtFields];
   *field_section = ImageSection(cur_pos, bin_slot_sizes_[kBinArtField]);
-  CHECK_EQ(image_objects_offset_begin_ + bin_slot_previous_sizes_[kBinArtField],
-           field_section->Offset());
+  CHECK_EQ(bin_slot_offsets_[kBinArtField], field_section->Offset());
   cur_pos = field_section->End();
+  // Round up to the alignment required by the method section.
+  cur_pos = RoundUp(cur_pos, ArtMethod::Alignment(target_ptr_size_));
   // Add method section.
   auto* methods_section = &sections[ImageHeader::kSectionArtMethods];
   *methods_section = ImageSection(cur_pos, bin_slot_sizes_[kBinArtMethodClean] +
                                   bin_slot_sizes_[kBinArtMethodDirty]);
-  CHECK_EQ(image_objects_offset_begin_ + bin_slot_previous_sizes_[kBinArtMethodClean],
-           methods_section->Offset());
+  CHECK_EQ(bin_slot_offsets_[kBinArtMethodClean], methods_section->Offset());
   cur_pos = methods_section->End();
   // Round up to the alignment the string table expects. See HashSet::WriteToMemory.
   cur_pos = RoundUp(cur_pos, sizeof(uint64_t));
@@ -1135,7 +1148,10 @@
       }
       case kNativeObjectRelocationTypeArtMethodArrayClean:
       case kNativeObjectRelocationTypeArtMethodArrayDirty: {
-        memcpy(dest, pair.first, LengthPrefixedArray<ArtMethod>::ComputeSize(0));
+        memcpy(dest, pair.first, LengthPrefixedArray<ArtMethod>::ComputeSize(
+            0,
+            ArtMethod::Size(target_ptr_size_),
+            ArtMethod::Alignment(target_ptr_size_)));
         break;
       }
     }
@@ -1444,7 +1460,7 @@
 }
 
 void ImageWriter::CopyAndFixupMethod(ArtMethod* orig, ArtMethod* copy) {
-  memcpy(copy, orig, ArtMethod::ObjectSize(target_ptr_size_));
+  memcpy(copy, orig, ArtMethod::Size(target_ptr_size_));
 
   copy->SetDeclaringClass(GetImageAddress(orig->GetDeclaringClassUnchecked()));
   copy->SetDexCacheResolvedMethods(GetImageAddress(orig->GetDexCacheResolvedMethods()));
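
The rename from bin_slot_previous_sizes_ to bin_slot_offsets_ is not cosmetic: offsets are now absolute and may include alignment padding, so they can no longer be reconstructed as cumulative sums of bin sizes. A worked example with made-up numbers:

    // image_objects_offset_begin_      = 0x1000
    // bin_slot_sizes_[kBinArtField]    = 0x24   (ArtField is 4-byte aligned)
    // method_alignment                 = 8      (e.g. a 64-bit target)
    // => field bin starts at 0x1000 and ends at 0x1024
    // => method bin starts at RoundUp(0x1024, 8) = 0x1028
    // The 4 padding bytes at 0x1024 are invisible to any "previous sizes" sum,
    // which is why the offsets themselves must now be stored.
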
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index f4e10cc..c8aa82d 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -54,7 +54,7 @@
         quick_imt_conflict_trampoline_offset_(0), quick_resolution_trampoline_offset_(0),
         quick_to_interpreter_bridge_offset_(0), compile_pic_(compile_pic),
         target_ptr_size_(InstructionSetPointerSize(compiler_driver_.GetInstructionSet())),
-        bin_slot_sizes_(), bin_slot_previous_sizes_(), bin_slot_count_(),
+        bin_slot_sizes_(), bin_slot_offsets_(), bin_slot_count_(),
         intern_table_bytes_(0u), image_method_array_(ImageHeader::kImageMethodsCount),
         dirty_methods_(0u), clean_methods_(0u) {
     CHECK_NE(image_begin, 0U);
@@ -359,7 +359,7 @@
 
   // Bin slot tracking for dirty object packing
   size_t bin_slot_sizes_[kBinSize];  // Number of bytes in a bin
-  size_t bin_slot_previous_sizes_[kBinSize];  // Number of bytes in previous bins.
+  size_t bin_slot_offsets_[kBinSize];  // Offset of the bin slot from the beginning of the image.
   size_t bin_slot_count_[kBinSize];  // Number of objects in a bin
 
   // Cached size of the intern table for when we allocate memory.
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index dbecb8e..35b5093 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -48,7 +48,7 @@
 
 class ArmJniCallingConvention FINAL : public JniCallingConvention {
  public:
-  explicit ArmJniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  ArmJniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
   ~ArmJniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 9fd3265..37c92b2 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -48,7 +48,7 @@
 
 class Arm64JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  explicit Arm64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  Arm64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
   ~Arm64JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index c9b595a..243d124 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -348,8 +348,8 @@
     kObjectOrClass = 1
   };
 
-  explicit JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty,
-                                size_t frame_pointer_size)
+  JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty,
+                       size_t frame_pointer_size)
       : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size) {}
 
   // Number of stack slots for outgoing arguments, above which the handle scope is
diff --git a/compiler/jni/quick/mips/calling_convention_mips.h b/compiler/jni/quick/mips/calling_convention_mips.h
index 8d82dce..dc45432 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.h
+++ b/compiler/jni/quick/mips/calling_convention_mips.h
@@ -48,7 +48,7 @@
 
 class MipsJniCallingConvention FINAL : public JniCallingConvention {
  public:
-  explicit MipsJniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  MipsJniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
   ~MipsJniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/mips64/calling_convention_mips64.h b/compiler/jni/quick/mips64/calling_convention_mips64.h
index dc9273b..3d6aab7 100644
--- a/compiler/jni/quick/mips64/calling_convention_mips64.h
+++ b/compiler/jni/quick/mips64/calling_convention_mips64.h
@@ -48,7 +48,7 @@
 
 class Mips64JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  explicit Mips64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  Mips64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
   ~Mips64JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index b1b3598..cdf0956 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -26,8 +26,7 @@
 
 class X86ManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
-  explicit X86ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
-                                              const char* shorty)
+  X86ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
       : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize),
         gpr_arg_count_(0) {}
   ~X86ManagedRuntimeCallingConvention() OVERRIDE {}
@@ -51,7 +50,7 @@
 
 class X86JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  explicit X86JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  X86JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
   ~X86JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index 7a90c6e..6e47c9f 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -26,8 +26,7 @@
 
 class X86_64ManagedRuntimeCallingConvention FINAL : public ManagedRuntimeCallingConvention {
  public:
-  explicit X86_64ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
-                                              const char* shorty)
+  X86_64ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized, const char* shorty)
       : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
   ~X86_64ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
@@ -47,7 +46,7 @@
 
 class X86_64JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  explicit X86_64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  X86_64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
   ~X86_64JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 3baf438..760fb7c 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -178,7 +178,7 @@
 
   class OatDexFile {
    public:
-    explicit OatDexFile(size_t offset, const DexFile& dex_file);
+    OatDexFile(size_t offset, const DexFile& dex_file);
     size_t SizeOf() const;
     void UpdateChecksum(OatHeader* oat_header) const;
     bool Write(OatWriter* oat_writer, OutputStream* out, const size_t file_offset) const;
@@ -200,10 +200,10 @@
 
   class OatClass {
    public:
-    explicit OatClass(size_t offset,
-                      const std::vector<CompiledMethod*>& compiled_methods,
-                      uint32_t num_non_null_compiled_methods,
-                      mirror::Class::Status status);
+    OatClass(size_t offset,
+             const std::vector<CompiledMethod*>& compiled_methods,
+             uint32_t num_non_null_compiled_methods,
+             mirror::Class::Status status);
     ~OatClass();
     size_t GetOatMethodOffsetsOffsetFromOatHeader(size_t class_def_method_index_) const;
     size_t GetOatMethodOffsetsOffsetFromOatClass(size_t class_def_method_index_) const;
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 4607ebe..d0b5ffd 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -1005,7 +1005,39 @@
   GetMoveResolver()->EmitNativeCode(&parallel_move);
 }
 
-void SlowPathCode::RecordPcInfo(CodeGenerator* codegen, HInstruction* instruction, uint32_t dex_pc) {
+void CodeGenerator::ValidateInvokeRuntime(HInstruction* instruction, SlowPathCode* slow_path) {
+  // Ensure that the call kind indication given to the register allocator is
+  // coherent with the runtime call generated, and that the GC side effect is
+  // set when required.
+  if (slow_path == nullptr) {
+    DCHECK(instruction->GetLocations()->WillCall()) << instruction->DebugName();
+    DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()))
+        << instruction->DebugName() << instruction->GetSideEffects().ToString();
+  } else {
+    DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal())
+        << instruction->DebugName() << slow_path->GetDescription();
+    DCHECK(instruction->GetSideEffects().Includes(SideEffects::CanTriggerGC()) ||
+           // Control flow would not come back into the code if a fatal slow
+           // path is taken, so we do not care if it triggers GC.
+           slow_path->IsFatal() ||
+           // HDeoptimize is a special case: we know we are not coming back from
+           // it into the code.
+           instruction->IsDeoptimize())
+        << instruction->DebugName() << instruction->GetSideEffects().ToString()
+        << slow_path->GetDescription();
+  }
+
+  // Check the coherency of leaf information.
+  DCHECK(instruction->IsSuspendCheck()
+         || ((slow_path != nullptr) && slow_path->IsFatal())
+         || instruction->GetLocations()->CanCall()
+         || !IsLeafMethod())
+      << instruction->DebugName() << ((slow_path != nullptr) ? slow_path->GetDescription() : "");
+}
+
+void SlowPathCode::RecordPcInfo(CodeGenerator* codegen,
+                                HInstruction* instruction,
+                                uint32_t dex_pc) {
   codegen->RecordPcInfo(instruction, dex_pc, this);
 }
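
The hoisted checks give every backend one contract instead of a slightly different copy per code generator. Restated as a standalone predicate (illustrative only, mirroring the DCHECKs above):

    bool InvokeRuntimeContractHolds(bool has_slow_path, bool will_call,
                                    bool only_calls_on_slow_path,
                                    bool slow_path_is_fatal,
                                    bool can_trigger_gc, bool is_deoptimize) {
      if (!has_slow_path) {
        // Unconditional runtime call: must be declared to the register
        // allocator and must carry the GC side effect.
        return will_call && can_trigger_gc;
      }
      // With a slow path, the call must be slow-path-only (or the slow path
      // fatal); the GC side effect may be omitted only when control never
      // returns into the compiled code (fatal slow path or HDeoptimize).
      return (only_calls_on_slow_path || slow_path_is_fatal) &&
             (can_trigger_gc || slow_path_is_fatal || is_deoptimize);
    }
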
 
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 540da1c..2582444 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -292,6 +292,8 @@
     return type == Primitive::kPrimNot && !value->IsNullConstant();
   }
 
+  void ValidateInvokeRuntime(HInstruction* instruction, SlowPathCode* slow_path);
+
   void AddAllocatedRegister(Location location) {
     allocated_registers_.Add(location);
   }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 6c0292c..1bd4216 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -953,23 +953,10 @@
                                      HInstruction* instruction,
                                      uint32_t dex_pc,
                                      SlowPathCode* slow_path) {
-  // Ensure that the call kind indication given to the register allocator is
-  // coherent with the runtime call generated.
-  if (slow_path == nullptr) {
-    DCHECK(instruction->GetLocations()->WillCall());
-  } else {
-    DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal());
-  }
-
+  ValidateInvokeRuntime(instruction, slow_path);
   __ LoadFromOffset(kLoadWord, LR, TR, entry_point_offset);
   __ blx(LR);
   RecordPcInfo(instruction, dex_pc, slow_path);
-  DCHECK(instruction->IsSuspendCheck()
-      || instruction->IsBoundsCheck()
-      || instruction->IsNullCheck()
-      || instruction->IsDivZeroCheck()
-      || instruction->GetLocations()->CanCall()
-      || !IsLeafMethod());
 }
 
 void InstructionCodeGeneratorARM::HandleGoto(HInstruction* got, HBasicBlock* successor) {
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b44c5ba..b8ac421 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -360,8 +360,7 @@
 
 class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 {
  public:
-  explicit SuspendCheckSlowPathARM64(HSuspendCheck* instruction,
-                                     HBasicBlock* successor)
+  SuspendCheckSlowPathARM64(HSuspendCheck* instruction, HBasicBlock* successor)
       : instruction_(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
@@ -1103,23 +1102,11 @@
                                        HInstruction* instruction,
                                        uint32_t dex_pc,
                                        SlowPathCode* slow_path) {
-  // Ensure that the call kind indication given to the register allocator is
-  // coherent with the runtime call generated.
-  if (slow_path == nullptr) {
-    DCHECK(instruction->GetLocations()->WillCall());
-  } else {
-    DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal());
-  }
-
+  ValidateInvokeRuntime(instruction, slow_path);
   BlockPoolsScope block_pools(GetVIXLAssembler());
   __ Ldr(lr, MemOperand(tr, entry_point_offset));
   __ Blr(lr);
   RecordPcInfo(instruction, dex_pc, slow_path);
-  DCHECK(instruction->IsSuspendCheck()
-         || instruction->IsBoundsCheck()
-         || instruction->IsNullCheck()
-         || instruction->IsDivZeroCheck()
-         || !IsLeafMethod());
 }
 
 void InstructionCodeGeneratorARM64::GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path,
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 2c61038..ac7ee10 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -191,7 +191,7 @@
 
 class LocationsBuilderARM64 : public HGraphVisitor {
  public:
-  explicit LocationsBuilderARM64(HGraph* graph, CodeGeneratorARM64* codegen)
+  LocationsBuilderARM64(HGraph* graph, CodeGeneratorARM64* codegen)
       : HGraphVisitor(graph), codegen_(codegen) {}
 
 #define DECLARE_VISIT_INSTRUCTION(name, super) \
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index b6ebeb4..167e025 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -294,8 +294,7 @@
 
 class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 {
  public:
-  explicit SuspendCheckSlowPathMIPS64(HSuspendCheck* instruction,
-                                      HBasicBlock* successor)
+  SuspendCheckSlowPathMIPS64(HSuspendCheck* instruction, HBasicBlock* successor)
       : instruction_(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
@@ -977,23 +976,11 @@
                                         HInstruction* instruction,
                                         uint32_t dex_pc,
                                         SlowPathCode* slow_path) {
-  // Ensure that the call kind indication given to the register allocator is
-  // coherent with the runtime call generated.
-  if (slow_path == nullptr) {
-    DCHECK(instruction->GetLocations()->WillCall());
-  } else {
-    DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal());
-  }
-
+  ValidateInvokeRuntime(instruction, slow_path);
   // TODO: anything related to T9/GP/GOT/PIC/.so's?
   __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset);
   __ Jalr(T9);
   RecordPcInfo(instruction, dex_pc, slow_path);
-  DCHECK(instruction->IsSuspendCheck()
-      || instruction->IsBoundsCheck()
-      || instruction->IsNullCheck()
-      || instruction->IsDivZeroCheck()
-      || !IsLeafMethod());
 }
 
 void InstructionCodeGeneratorMIPS64::GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4efdbb9..091a3e5 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -93,7 +93,7 @@
 
 class DivRemMinusOneSlowPathX86 : public SlowPathCodeX86 {
  public:
-  explicit DivRemMinusOneSlowPathX86(Register reg, bool is_div) : reg_(reg), is_div_(is_div) {}
+  DivRemMinusOneSlowPathX86(Register reg, bool is_div) : reg_(reg), is_div_(is_div) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     __ Bind(GetEntryLabel());
@@ -429,21 +429,9 @@
                                      HInstruction* instruction,
                                      uint32_t dex_pc,
                                      SlowPathCode* slow_path) {
-  // Ensure that the call kind indication given to the register allocator is
-  // coherent with the runtime call generated.
-  if (slow_path == nullptr) {
-    DCHECK(instruction->GetLocations()->WillCall());
-  } else {
-    DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal());
-  }
-
+  ValidateInvokeRuntime(instruction, slow_path);
   __ fs()->call(entry_point);
   RecordPcInfo(instruction, dex_pc, slow_path);
-  DCHECK(instruction->IsSuspendCheck()
-         || instruction->IsBoundsCheck()
-         || instruction->IsNullCheck()
-         || instruction->IsDivZeroCheck()
-         || !IsLeafMethod());
 }
 
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph,
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 6991414..2c5cef3 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -95,7 +95,7 @@
 
 class DivRemMinusOneSlowPathX86_64 : public SlowPathCodeX86_64 {
  public:
-  explicit DivRemMinusOneSlowPathX86_64(Register reg, Primitive::Type type, bool is_div)
+  DivRemMinusOneSlowPathX86_64(Register reg, Primitive::Type type, bool is_div)
       : cpu_reg_(CpuRegister(reg)), type_(type), is_div_(is_div) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
@@ -104,7 +104,7 @@
       if (is_div_) {
         __ negl(cpu_reg_);
       } else {
-        __ movl(cpu_reg_, Immediate(0));
+        __ xorl(cpu_reg_, cpu_reg_);
       }
 
     } else {
@@ -129,7 +129,7 @@
 
 class SuspendCheckSlowPathX86_64 : public SlowPathCodeX86_64 {
  public:
-  explicit SuspendCheckSlowPathX86_64(HSuspendCheck* instruction, HBasicBlock* successor)
+  SuspendCheckSlowPathX86_64(HSuspendCheck* instruction, HBasicBlock* successor)
       : instruction_(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
@@ -484,21 +484,9 @@
                                         HInstruction* instruction,
                                         uint32_t dex_pc,
                                         SlowPathCode* slow_path) {
-  // Ensure that the call kind indication given to the register allocator is
-  // coherent with the runtime call generated.
-  if (slow_path == nullptr) {
-    DCHECK(instruction->GetLocations()->WillCall());
-  } else {
-    DCHECK(instruction->GetLocations()->OnlyCallsOnSlowPath() || slow_path->IsFatal());
-  }
-
+  ValidateInvokeRuntime(instruction, slow_path);
   __ gs()->call(entry_point);
   RecordPcInfo(instruction, dex_pc, slow_path);
-  DCHECK(instruction->IsSuspendCheck()
-         || instruction->IsBoundsCheck()
-         || instruction->IsNullCheck()
-         || instruction->IsDivZeroCheck()
-         || !IsLeafMethod());
 }
 
 static constexpr int kNumberOfCpuRegisterPairs = 0;
@@ -749,8 +737,7 @@
         DCHECK(constant->IsLongConstant());
         value = constant->AsLongConstant()->GetValue();
       }
-      Load64BitValue(CpuRegister(TMP), value);
-      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+      Store64BitValueToStack(destination, value);
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -784,8 +771,7 @@
       if (location.IsRegister()) {
         Load64BitValue(location.AsRegister<CpuRegister>(), value);
       } else if (location.IsDoubleStackSlot()) {
-        Load64BitValue(CpuRegister(TMP), value);
-        __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
+        Store64BitValueToStack(location, value);
       } else {
         DCHECK(location.IsConstant());
         DCHECK_EQ(location.GetConstant(), const_to_move);
@@ -1849,14 +1835,12 @@
           // Processing a Dex `float-to-int' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         case Primitive::kPrimDouble:
           // Processing a Dex `double-to-int' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         default:
@@ -1884,14 +1868,12 @@
           // Processing a Dex `float-to-long' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         case Primitive::kPrimDouble:
           // Processing a Dex `double-to-long' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         default:
@@ -2067,14 +2049,11 @@
           // Processing a Dex `float-to-int' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
-          // temp = int-to-float(output)
-          __ cvtsi2ss(temp, output, false);
-          // if input >= temp goto done
-          __ comiss(input, temp);
+          // if input >= (float)INT_MAX goto done
+          __ comiss(input, codegen_->LiteralFloatAddress(kPrimIntMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -2092,14 +2071,11 @@
           // Processing a Dex `double-to-int' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
-          // temp = int-to-double(output)
-          __ cvtsi2sd(temp, output);
-          // if input >= temp goto done
-          __ comisd(input, temp);
+          // if input >= (double)INT_MAX goto done
+          __ comisd(input, codegen_->LiteralDoubleAddress(kPrimIntMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -2137,14 +2113,11 @@
           // Processing a Dex `float-to-long' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           codegen_->Load64BitValue(output, kPrimLongMax);
-          // temp = long-to-float(output)
-          __ cvtsi2ss(temp, output, true);
-          // if input >= temp goto done
-          __ comiss(input, temp);
+          // if input >= (float)LONG_MAX goto done
+          __ comiss(input, codegen_->LiteralFloatAddress(kPrimLongMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -2162,14 +2135,11 @@
           // Processing a Dex `double-to-long' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           codegen_->Load64BitValue(output, kPrimLongMax);
-          // temp = long-to-double(output)
-          __ cvtsi2sd(temp, output, true);
-          // if input >= temp goto done
-          __ comisd(input, temp);
+          // if input >= (double)LONG_MAX goto done
+          __ comisd(input, codegen_->LiteralDoubleAddress(kPrimLongMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
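
Dropping the temp works because the bound can be a compile-time literal, and rounding makes the literal compare safe: kPrimIntMax = 2^31 - 1 is not representable as a float, so (float)kPrimIntMax rounds up to exactly 2^31, and (float)kPrimLongMax and (double)kPrimLongMax likewise round up to 2^63. Any input at or above the integer type's range therefore takes the kAboveEqual branch with the saturated value already in `output`. A quick check of the rounding claim:

    #include <cassert>
    #include <cstdint>

    int main() {
      assert(static_cast<float>(INT32_MAX) == 2147483648.0f);           // 2^31
      assert(static_cast<float>(INT64_MAX) == 9223372036854775808.0f);  // 2^63
      assert(static_cast<double>(INT64_MAX) == 9223372036854775808.0);  // 2^63
    }
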
@@ -4339,8 +4309,7 @@
         codegen_->Load64BitValue(destination.AsRegister<CpuRegister>(), value);
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        codegen_->Load64BitValue(CpuRegister(TMP), value);
-        __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+        codegen_->Store64BitValueToStack(destination, value);
       }
     } else if (constant->IsFloatConstant()) {
       float fp_value = constant->AsFloatConstant()->GetValue();
@@ -4371,8 +4340,7 @@
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        codegen_->Load64BitValue(CpuRegister(TMP), value);
-        __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+        codegen_->Store64BitValueToStack(destination, value);
       }
     }
   } else if (source.IsFpuRegister()) {
@@ -4874,6 +4842,18 @@
   }
 }
 
+void CodeGeneratorX86_64::Store64BitValueToStack(Location dest, int64_t value) {
+  DCHECK(dest.IsDoubleStackSlot());
+  if (IsInt<32>(value)) {
+    // Can move directly as an int32 constant.
+    __ movq(Address(CpuRegister(RSP), dest.GetStackIndex()),
+            Immediate(static_cast<int32_t>(value)));
+  } else {
+    Load64BitValue(CpuRegister(TMP), value);
+    __ movq(Address(CpuRegister(RSP), dest.GetStackIndex()), CpuRegister(TMP));
+  }
+}
+
 void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
   // Generate the constant area if needed.
   X86_64Assembler* assembler = GetAssembler();
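
The IsInt<32>(value) split in the new Store64BitValueToStack exists because x86-64's move-to-memory encoding only carries a 32-bit immediate, which the CPU sign-extends to 64 bits; wider constants need the two-instruction movabs-to-scratch, store-from-scratch sequence. The fast-path condition, restated (sketch, not the ART utility):

    #include <cstdint>
    #include <limits>

    bool FitsInSignExtendedImm32(int64_t value) {
      return value >= std::numeric_limits<int32_t>::min() &&
             value <= std::numeric_limits<int32_t>::max();
    }
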
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 3b3915f..41bebac 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -322,6 +322,9 @@
   // Load a 64 bit value into a register in the most efficient manner.
   void Load64BitValue(CpuRegister dest, int64_t value);
 
+  // Store a 64 bit value into a DoubleStackSlot in the most efficient manner.
+  void Store64BitValueToStack(Location dest, int64_t value);
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc
index 6269d16..5de629d 100644
--- a/compiler/optimizing/dead_code_elimination.cc
+++ b/compiler/optimizing/dead_code_elimination.cc
@@ -128,7 +128,7 @@
     for (i.Advance(); !i.Done(); i.Advance()) {
       HInstruction* inst = i.Current();
       DCHECK(!inst->IsControlFlow());
-      if (!inst->DoesAnyWrite()
+      if (!inst->HasSideEffects()
           && !inst->CanThrow()
           && !inst->IsSuspendCheck()
           // If we added an explicit barrier then we should keep it.
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index 3900646..833dfb0 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -264,7 +264,7 @@
   // odd buckets to speed up deletion.
   size_t HashCode(HInstruction* instruction) const {
     size_t hash_code = instruction->ComputeHashCode();
-    if (instruction->GetSideEffects().DoesAnyRead()) {
+    if (instruction->GetSideEffects().HasDependencies()) {
       return (hash_code << 1) | 0;
     } else {
       return (hash_code << 1) | 1;
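
The bucket-parity trick is unchanged, only the query is renamed from DoesAnyRead() to HasDependencies(): entries whose value can be invalidated by other effects (now including DependsOnGC) hash to even buckets so they can be purged quickly when a conflicting effect appears. In isolation:

    #include <cstddef>

    // Instructions that can be invalidated by other effects land in even
    // buckets; pure values land in odd ones and never need bulk deletion.
    size_t BucketHash(size_t hash_code, bool has_dependencies) {
      return (hash_code << 1) | (has_dependencies ? 0 : 1);
    }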
diff --git a/compiler/optimizing/gvn_test.cc b/compiler/optimizing/gvn_test.cc
index 5c6239b..42ef3ff 100644
--- a/compiler/optimizing/gvn_test.cc
+++ b/compiler/optimizing/gvn_test.cc
@@ -266,6 +266,8 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
+  static const SideEffects kCanTriggerGC = SideEffects::CanTriggerGC();
+
   HGraph* graph = CreateGraph(&allocator);
   HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
   graph->AddBlock(entry);
@@ -309,7 +311,7 @@
   ASSERT_TRUE(inner_loop_header->GetLoopInformation()->IsIn(
       *outer_loop_header->GetLoopInformation()));
 
-  // Check that the loops don't have side effects.
+  // Check that the only side effect of loops is to potentially trigger GC.
   {
     // Make one block with a side effect.
     entry->AddInstruction(new (&allocator) HInstanceFieldSet(parameter,
@@ -327,6 +329,8 @@
     ASSERT_FALSE(side_effects.GetBlockEffects(outer_loop_body).DoesAnyWrite());
     ASSERT_FALSE(side_effects.GetLoopEffects(outer_loop_header).DoesAnyWrite());
     ASSERT_FALSE(side_effects.GetLoopEffects(inner_loop_header).DoesAnyWrite());
+    ASSERT_TRUE(side_effects.GetLoopEffects(outer_loop_header).Equals(kCanTriggerGC));
+    ASSERT_TRUE(side_effects.GetLoopEffects(inner_loop_header).Equals(kCanTriggerGC));
   }
 
   // Check that the side effects of the outer loop do not affect the inner loop.
@@ -348,6 +352,7 @@
     ASSERT_TRUE(side_effects.GetBlockEffects(outer_loop_body).DoesAnyWrite());
     ASSERT_TRUE(side_effects.GetLoopEffects(outer_loop_header).DoesAnyWrite());
     ASSERT_FALSE(side_effects.GetLoopEffects(inner_loop_header).DoesAnyWrite());
+    ASSERT_TRUE(side_effects.GetLoopEffects(inner_loop_header).Equals(kCanTriggerGC));
   }
 
   // Check that the side effects of the inner loop affect the outer loop.
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 931a1c3..df6e550 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -238,12 +238,19 @@
   }
 
   bool outcome;
-  if (TypeCheckHasKnownOutcome(check_cast->InputAt(1)->AsLoadClass(), object, &outcome)) {
+  HLoadClass* load_class = check_cast->InputAt(1)->AsLoadClass();
+  if (TypeCheckHasKnownOutcome(load_class, object, &outcome)) {
     if (outcome) {
       check_cast->GetBlock()->RemoveInstruction(check_cast);
       if (stats_ != nullptr) {
         stats_->RecordStat(MethodCompilationStat::kRemovedCheckedCast);
       }
+      if (!load_class->HasUses()) {
+        // We cannot rely on DCE to remove the class because the `HLoadClass` thinks it can throw.
+        // However, here we know that it cannot because the checkcast was successful, hence
+        // the class was already loaded.
+        load_class->GetBlock()->RemoveInstruction(load_class);
+      }
     } else {
       // Don't do anything for exceptional cases for now. Ideally we should remove
       // all instructions and blocks this instruction dominates.
@@ -268,7 +275,8 @@
   }
 
   bool outcome;
-  if (TypeCheckHasKnownOutcome(instruction->InputAt(1)->AsLoadClass(), object, &outcome)) {
+  HLoadClass* load_class = instruction->InputAt(1)->AsLoadClass();
+  if (TypeCheckHasKnownOutcome(load_class, object, &outcome)) {
     if (outcome && can_be_null) {
       // Type test will succeed, we just need a null test.
       HNotEqual* test = new (graph->GetArena()) HNotEqual(graph->GetNullConstant(), object);
@@ -280,6 +288,12 @@
     }
     RecordSimplification();
     instruction->GetBlock()->RemoveInstruction(instruction);
+    if (outcome && !load_class->HasUses()) {
+      // We cannot rely on DCE to remove the class because the `HLoadClass` thinks it can throw.
+        // However, here we know that it cannot because the instanceof check was successful, hence
+      // the class was already loaded.
+      load_class->GetBlock()->RemoveInstruction(load_class);
+    }
   }
 }
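
Both hunks in this file add the same cleanup: once the type check is statically known to succeed, the class must already be loaded, so the HLoadClass can no longer throw and may be deleted when unused, something DCE cannot conclude on its own. A stubbed, compilable sketch (stub types, not the real ART classes):

    // Stub standing in for HLoadClass; RemoveFromBlock models
    // GetBlock()->RemoveInstruction(this).
    struct HLoadClassStub {
      bool has_uses = false;
      void RemoveFromBlock() { /* unlink from the graph */ }
    };

    // Safe only when the check is known to succeed: that proves the class
    // is already loaded, so the otherwise-throwing load cannot throw.
    void RemoveLoadClassIfUnused(HLoadClassStub* load_class) {
      if (!load_class->has_uses) {
        load_class->RemoveFromBlock();
      }
    }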
 
diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc
index 55e964e..3db9816 100644
--- a/compiler/optimizing/intrinsics.cc
+++ b/compiler/optimizing/intrinsics.cc
@@ -31,7 +31,7 @@
   switch (i) {
     case Intrinsics::kNone:
       return kInterface;  // Non-sensical for intrinsic.
-#define OPTIMIZING_INTRINSICS(Name, IsStatic) \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment) \
     case Intrinsics::k ## Name:               \
       return IsStatic;
 #include "intrinsics_list.h"
@@ -42,7 +42,21 @@
   return kInterface;
 }
 
-
+// Returns whether an intrinsic needs an environment.
+static inline IntrinsicNeedsEnvironment IntrinsicNeedsEnvironment(Intrinsics i) {
+  switch (i) {
+    case Intrinsics::kNone:
+      return kNeedsEnvironment;  // Non-sensical for intrinsic.
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment) \
+    case Intrinsics::k ## Name:               \
+      return NeedsEnvironment;
+#include "intrinsics_list.h"
+INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
+#undef INTRINSICS_LIST
+#undef OPTIMIZING_INTRINSICS
+  }
+  return kNeedsEnvironment;
+}
 
 static Primitive::Type GetType(uint64_t data, bool is_op_size) {
   if (is_op_size) {
@@ -70,7 +84,10 @@
   }
 }
 
-static Intrinsics GetIntrinsic(InlineMethod method) {
+static Intrinsics GetIntrinsic(InlineMethod method, InstructionSet instruction_set) {
+  if (instruction_set == kMips || instruction_set == kMips64) {
+    return Intrinsics::kNone;
+  }
   switch (method.opcode) {
     // Floating-point conversions.
     case kIntrinsicDoubleCvt:
@@ -197,6 +214,8 @@
       return Intrinsics::kStringCharAt;
     case kIntrinsicCompareTo:
       return Intrinsics::kStringCompareTo;
+    case kIntrinsicEquals:
+      return Intrinsics::kStringEquals;
     case kIntrinsicGetCharsNoCheck:
       return Intrinsics::kStringGetCharsNoCheck;
     case kIntrinsicIsEmptyOrLength:
@@ -349,7 +368,7 @@
             driver_->GetMethodInlinerMap()->GetMethodInliner(&invoke->GetDexFile());
         DCHECK(inliner != nullptr);
         if (inliner->IsIntrinsic(invoke->GetDexMethodIndex(), &method)) {
-          Intrinsics intrinsic = GetIntrinsic(method);
+          Intrinsics intrinsic = GetIntrinsic(method, graph_->GetInstructionSet());
 
           if (intrinsic != Intrinsics::kNone) {
             if (!CheckInvokeType(intrinsic, invoke)) {
@@ -357,7 +376,7 @@
                            << intrinsic << " for "
                            << PrettyMethod(invoke->GetDexMethodIndex(), invoke->GetDexFile());
             } else {
-              invoke->SetIntrinsic(intrinsic);
+              invoke->SetIntrinsic(intrinsic, IntrinsicNeedsEnvironment(intrinsic));
             }
           }
         }
@@ -371,7 +390,7 @@
     case Intrinsics::kNone:
       os << "None";
       break;
-#define OPTIMIZING_INTRINSICS(Name, IsStatic) \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment) \
     case Intrinsics::k ## Name: \
       os << # Name; \
       break;
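
Every OPTIMIZING_INTRINSICS definition in the tree must grow the new NeedsEnvironment column in lockstep with intrinsics_list.h, even in consumers that ignore it, which is why so many headers below change identically. A toy, self-contained illustration of the X-macro pattern (names invented for the example):

    #include <iostream>

    // Toy list with the same three-column shape (invented names).
    #define TOY_LIST(V)            \
      V(Alpha, kStatic, kNeedsEnv) \
      V(Beta, kDirect, kNoEnv)

    enum class Toy {
    #define TOY_ENUM(Name, IsStatic, Env) k##Name,
      TOY_LIST(TOY_ENUM)
    #undef TOY_ENUM
    };

    // A consumer that only wants the name must still accept every column,
    // exactly like the Visit## declarations in the headers below.
    #define TOY_PRINT(Name, IsStatic, Env) std::cout << #Name << "\n";
    void PrintAll() { TOY_LIST(TOY_PRINT) }
    #undef TOY_PRINT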
diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h
index 9044982..d1a17b6 100644
--- a/compiler/optimizing/intrinsics.h
+++ b/compiler/optimizing/intrinsics.h
@@ -54,7 +54,7 @@
     switch (invoke->GetIntrinsic()) {
       case Intrinsics::kNone:
         return;
-#define OPTIMIZING_INTRINSICS(Name, IsStatic) \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment) \
       case Intrinsics::k ## Name:             \
         Visit ## Name(invoke);                \
         return;
@@ -69,7 +69,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)                    \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)                    \
   virtual void Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \
   }
 #include "intrinsics_list.h"
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index a797654..1527a6a 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1110,6 +1110,7 @@
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
 UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(StringGetCharsNoCheck)
+UNIMPLEMENTED_INTRINSIC(StringEquals)
 
 #undef UNIMPLEMENTED_INTRINSIC
 
diff --git a/compiler/optimizing/intrinsics_arm.h b/compiler/optimizing/intrinsics_arm.h
index 8bfb7d4..f013bd6 100644
--- a/compiler/optimizing/intrinsics_arm.h
+++ b/compiler/optimizing/intrinsics_arm.h
@@ -33,13 +33,12 @@
 
 class IntrinsicLocationsBuilderARM FINAL : public IntrinsicVisitor {
  public:
-  explicit IntrinsicLocationsBuilderARM(ArenaAllocator* arena,
-                                        const ArmInstructionSetFeatures& features)
+  IntrinsicLocationsBuilderARM(ArenaAllocator* arena, const ArmInstructionSetFeatures& features)
       : arena_(arena), features_(features) {}
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
@@ -65,7 +64,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 2c93fea..a5332ea 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1055,6 +1055,102 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // Temporary registers to store lengths of strings and for calculations.
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+
+  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) {
+  vixl::MacroAssembler* masm = GetVIXLAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register str = WRegisterFrom(locations->InAt(0));
+  Register arg = WRegisterFrom(locations->InAt(1));
+  Register out = XRegisterFrom(locations->Out());
+
+  UseScratchRegisterScope scratch_scope(masm);
+  Register temp = scratch_scope.AcquireW();
+  Register temp1 = WRegisterFrom(locations->GetTemp(0));
+  Register temp2 = WRegisterFrom(locations->GetTemp(1));
+
+  vixl::Label loop;
+  vixl::Label end;
+  vixl::Label return_true;
+  vixl::Label return_false;
+
+  // Get offsets of count, value, and class fields within a string object.
+  const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+  const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+  const int32_t class_offset = mirror::Object::ClassOffset().Int32Value();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // Check if input is null, return false if it is.
+  __ Cbz(arg, &return_false);
+
+  // Reference equality check, return true if same reference.
+  __ Cmp(str, arg);
+  __ B(&return_true, eq);
+
+  // Instanceof check for the argument by comparing class fields.
+  // All string objects must have the same type since String cannot be subclassed.
+  // Receiver must be a string object, so its class field is equal to all strings' class fields.
+  // If the argument is a string object, its class field must be equal to receiver's class field.
+  __ Ldr(temp, MemOperand(str.X(), class_offset));
+  __ Ldr(temp1, MemOperand(arg.X(), class_offset));
+  __ Cmp(temp, temp1);
+  __ B(&return_false, ne);
+
+  // Load lengths of this and argument strings.
+  __ Ldr(temp, MemOperand(str.X(), count_offset));
+  __ Ldr(temp1, MemOperand(arg.X(), count_offset));
+  // Check if lengths are equal, return false if they're not.
+  __ Cmp(temp, temp1);
+  __ B(&return_false, ne);
+  // Store the offset of the string value field in preparation for the comparison loop.
+  __ Mov(temp1, value_offset);
+  // Return true if both strings are empty.
+  __ Cbz(temp, &return_true);
+
+  // Assertions that must hold in order to compare strings 4 characters at a time.
+  DCHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
+
+  temp1 = temp1.X();
+  temp2 = temp2.X();
+
+  // Loop to compare strings 4 characters at a time starting at the beginning of the string.
+  // Ok to do this because strings are zero-padded to be 8-byte aligned.
+  __ Bind(&loop);
+  __ Ldr(out, MemOperand(str.X(), temp1));
+  __ Ldr(temp2, MemOperand(arg.X(), temp1));
+  __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
+  __ Cmp(out, temp2);
+  __ B(&return_false, ne);
+  __ Sub(temp, temp, Operand(4), SetFlags);
+  __ B(&loop, gt);
+
+  // Return true and exit the function.
+  // If loop does not result in returning false, we return true.
+  __ Bind(&return_true);
+  __ Mov(out, 1);
+  __ B(&end);
+
+  // Return false and exit the function.
+  __ Bind(&return_false);
+  __ Mov(out, 0);
+  __ Bind(&end);
+}
+
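
The arm64 fast path compares four 16-bit chars per iteration with one 64-bit load, which is safe because objects are 8-byte aligned and odd-length strings are zero padded. A portable C++ model of the loop structure, assuming equal lengths have already been checked (names hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Model of the comparison loop: `len` chars (already known equal for
    // both strings), data 8-byte aligned and zero padded past the end.
    bool CharsEqual(const uint16_t* a, const uint16_t* b, int32_t len) {
      size_t offset = 0;
      while (len > 0) {                  // Subs temp, temp, #4; b.gt loop
        uint64_t wa, wb;
        std::memcpy(&wa, reinterpret_cast<const char*>(a) + offset, 8);
        std::memcpy(&wb, reinterpret_cast<const char*>(b) + offset, 8);
        if (wa != wb) {
          return false;                  // cmp out, temp2; b.ne return_false
        }
        offset += sizeof(uint64_t);      // add temp1, temp1, #8
        len -= 4;                        // four 16-bit chars per iteration
      }
      return true;
    }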
 static void GenerateVisitStringIndexOf(HInvoke* invoke,
                                        vixl::MacroAssembler* masm,
                                        CodeGeneratorARM64* codegen,
diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h
index ba21889..ebaf5e5 100644
--- a/compiler/optimizing/intrinsics_arm64.h
+++ b/compiler/optimizing/intrinsics_arm64.h
@@ -41,7 +41,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
@@ -65,7 +65,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
diff --git a/compiler/optimizing/intrinsics_list.h b/compiler/optimizing/intrinsics_list.h
index d28c5a3..15ee5d4 100644
--- a/compiler/optimizing/intrinsics_list.h
+++ b/compiler/optimizing/intrinsics_list.h
@@ -18,75 +18,77 @@
 #define ART_COMPILER_OPTIMIZING_INTRINSICS_LIST_H_
 
 // All intrinsics supported by the optimizing compiler. Format is name, then whether it is expected
-// to be a HInvokeStaticOrDirect node (compared to HInvokeVirtual).
+// to be a HInvokeStaticOrDirect node (compared to HInvokeVirtual), then whether it requires an
+// environment.
 
 #define INTRINSICS_LIST(V) \
-  V(DoubleDoubleToRawLongBits, kStatic) \
-  V(DoubleLongBitsToDouble, kStatic) \
-  V(FloatFloatToRawIntBits, kStatic) \
-  V(FloatIntBitsToFloat, kStatic) \
-  V(IntegerReverse, kStatic) \
-  V(IntegerReverseBytes, kStatic) \
-  V(IntegerNumberOfLeadingZeros, kStatic) \
-  V(LongReverse, kStatic) \
-  V(LongReverseBytes, kStatic) \
-  V(LongNumberOfLeadingZeros, kStatic) \
-  V(ShortReverseBytes, kStatic) \
-  V(MathAbsDouble, kStatic) \
-  V(MathAbsFloat, kStatic) \
-  V(MathAbsLong, kStatic) \
-  V(MathAbsInt, kStatic) \
-  V(MathMinDoubleDouble, kStatic) \
-  V(MathMinFloatFloat, kStatic) \
-  V(MathMinLongLong, kStatic) \
-  V(MathMinIntInt, kStatic) \
-  V(MathMaxDoubleDouble, kStatic) \
-  V(MathMaxFloatFloat, kStatic) \
-  V(MathMaxLongLong, kStatic) \
-  V(MathMaxIntInt, kStatic) \
-  V(MathSqrt, kStatic) \
-  V(MathCeil, kStatic) \
-  V(MathFloor, kStatic) \
-  V(MathRint, kStatic) \
-  V(MathRoundDouble, kStatic) \
-  V(MathRoundFloat, kStatic) \
-  V(SystemArrayCopyChar, kStatic) \
-  V(ThreadCurrentThread, kStatic) \
-  V(MemoryPeekByte, kStatic) \
-  V(MemoryPeekIntNative, kStatic) \
-  V(MemoryPeekLongNative, kStatic) \
-  V(MemoryPeekShortNative, kStatic) \
-  V(MemoryPokeByte, kStatic) \
-  V(MemoryPokeIntNative, kStatic) \
-  V(MemoryPokeLongNative, kStatic) \
-  V(MemoryPokeShortNative, kStatic) \
-  V(StringCharAt, kDirect) \
-  V(StringCompareTo, kDirect) \
-  V(StringGetCharsNoCheck, kDirect) \
-  V(StringIndexOf, kDirect) \
-  V(StringIndexOfAfter, kDirect) \
-  V(StringNewStringFromBytes, kStatic) \
-  V(StringNewStringFromChars, kStatic) \
-  V(StringNewStringFromString, kStatic) \
-  V(UnsafeCASInt, kDirect) \
-  V(UnsafeCASLong, kDirect) \
-  V(UnsafeCASObject, kDirect) \
-  V(UnsafeGet, kDirect) \
-  V(UnsafeGetVolatile, kDirect) \
-  V(UnsafeGetObject, kDirect) \
-  V(UnsafeGetObjectVolatile, kDirect) \
-  V(UnsafeGetLong, kDirect) \
-  V(UnsafeGetLongVolatile, kDirect) \
-  V(UnsafePut, kDirect) \
-  V(UnsafePutOrdered, kDirect) \
-  V(UnsafePutVolatile, kDirect) \
-  V(UnsafePutObject, kDirect) \
-  V(UnsafePutObjectOrdered, kDirect) \
-  V(UnsafePutObjectVolatile, kDirect) \
-  V(UnsafePutLong, kDirect) \
-  V(UnsafePutLongOrdered, kDirect) \
-  V(UnsafePutLongVolatile, kDirect) \
-  V(ReferenceGetReferent, kDirect)
+  V(DoubleDoubleToRawLongBits, kStatic, kNeedsEnvironment) \
+  V(DoubleLongBitsToDouble, kStatic, kNeedsEnvironment) \
+  V(FloatFloatToRawIntBits, kStatic, kNeedsEnvironment) \
+  V(FloatIntBitsToFloat, kStatic, kNeedsEnvironment) \
+  V(IntegerReverse, kStatic, kNeedsEnvironment) \
+  V(IntegerReverseBytes, kStatic, kNeedsEnvironment) \
+  V(IntegerNumberOfLeadingZeros, kStatic, kNeedsEnvironment) \
+  V(LongReverse, kStatic, kNeedsEnvironment) \
+  V(LongReverseBytes, kStatic, kNeedsEnvironment) \
+  V(LongNumberOfLeadingZeros, kStatic, kNeedsEnvironment) \
+  V(ShortReverseBytes, kStatic, kNeedsEnvironment) \
+  V(MathAbsDouble, kStatic, kNeedsEnvironment) \
+  V(MathAbsFloat, kStatic, kNeedsEnvironment) \
+  V(MathAbsLong, kStatic, kNeedsEnvironment) \
+  V(MathAbsInt, kStatic, kNeedsEnvironment) \
+  V(MathMinDoubleDouble, kStatic, kNeedsEnvironment) \
+  V(MathMinFloatFloat, kStatic, kNeedsEnvironment) \
+  V(MathMinLongLong, kStatic, kNeedsEnvironment) \
+  V(MathMinIntInt, kStatic, kNeedsEnvironment) \
+  V(MathMaxDoubleDouble, kStatic, kNeedsEnvironment) \
+  V(MathMaxFloatFloat, kStatic, kNeedsEnvironment) \
+  V(MathMaxLongLong, kStatic, kNeedsEnvironment) \
+  V(MathMaxIntInt, kStatic, kNeedsEnvironment) \
+  V(MathSqrt, kStatic, kNeedsEnvironment) \
+  V(MathCeil, kStatic, kNeedsEnvironment) \
+  V(MathFloor, kStatic, kNeedsEnvironment) \
+  V(MathRint, kStatic, kNeedsEnvironment) \
+  V(MathRoundDouble, kStatic, kNeedsEnvironment) \
+  V(MathRoundFloat, kStatic, kNeedsEnvironment) \
+  V(SystemArrayCopyChar, kStatic, kNeedsEnvironment) \
+  V(ThreadCurrentThread, kStatic, kNeedsEnvironment) \
+  V(MemoryPeekByte, kStatic, kNeedsEnvironment) \
+  V(MemoryPeekIntNative, kStatic, kNeedsEnvironment) \
+  V(MemoryPeekLongNative, kStatic, kNeedsEnvironment) \
+  V(MemoryPeekShortNative, kStatic, kNeedsEnvironment) \
+  V(MemoryPokeByte, kStatic, kNeedsEnvironment) \
+  V(MemoryPokeIntNative, kStatic, kNeedsEnvironment) \
+  V(MemoryPokeLongNative, kStatic, kNeedsEnvironment) \
+  V(MemoryPokeShortNative, kStatic, kNeedsEnvironment) \
+  V(StringCharAt, kDirect, kNeedsEnvironment) \
+  V(StringCompareTo, kDirect, kNeedsEnvironment) \
+  V(StringEquals, kDirect, kNeedsEnvironment) \
+  V(StringGetCharsNoCheck, kDirect, kNeedsEnvironment) \
+  V(StringIndexOf, kDirect, kNeedsEnvironment) \
+  V(StringIndexOfAfter, kDirect, kNeedsEnvironment) \
+  V(StringNewStringFromBytes, kStatic, kNeedsEnvironment) \
+  V(StringNewStringFromChars, kStatic, kNeedsEnvironment) \
+  V(StringNewStringFromString, kStatic, kNeedsEnvironment) \
+  V(UnsafeCASInt, kDirect, kNeedsEnvironment) \
+  V(UnsafeCASLong, kDirect, kNeedsEnvironment) \
+  V(UnsafeCASObject, kDirect, kNeedsEnvironment) \
+  V(UnsafeGet, kDirect, kNeedsEnvironment) \
+  V(UnsafeGetVolatile, kDirect, kNeedsEnvironment) \
+  V(UnsafeGetObject, kDirect, kNeedsEnvironment) \
+  V(UnsafeGetObjectVolatile, kDirect, kNeedsEnvironment) \
+  V(UnsafeGetLong, kDirect, kNeedsEnvironment) \
+  V(UnsafeGetLongVolatile, kDirect, kNeedsEnvironment) \
+  V(UnsafePut, kDirect, kNeedsEnvironment) \
+  V(UnsafePutOrdered, kDirect, kNeedsEnvironment) \
+  V(UnsafePutVolatile, kDirect, kNeedsEnvironment) \
+  V(UnsafePutObject, kDirect, kNeedsEnvironment) \
+  V(UnsafePutObjectOrdered, kDirect, kNeedsEnvironment) \
+  V(UnsafePutObjectVolatile, kDirect, kNeedsEnvironment) \
+  V(UnsafePutLong, kDirect, kNeedsEnvironment) \
+  V(UnsafePutLongOrdered, kDirect, kNeedsEnvironment) \
+  V(UnsafePutLongVolatile, kDirect, kNeedsEnvironment) \
+  V(ReferenceGetReferent, kDirect, kNeedsEnvironment)
 
 #endif  // ART_COMPILER_OPTIMIZING_INTRINSICS_LIST_H_
 #undef ART_COMPILER_OPTIMIZING_INTRINSICS_LIST_H_   // #define is only for lint.
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 993c005..4471d71 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -945,6 +945,97 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86::VisitStringEquals(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+
+  // Request temporary registers, ECX and EDI needed for repe_cmpsl instruction.
+  locations->AddTemp(Location::RegisterLocation(ECX));
+  locations->AddTemp(Location::RegisterLocation(EDI));
+
+  // Set output, ESI needed for repe_cmpsl instruction anyway.
+  locations->SetOut(Location::RegisterLocation(ESI), Location::kOutputOverlap);
+}
+
+void IntrinsicCodeGeneratorX86::VisitStringEquals(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  Register str = locations->InAt(0).AsRegister<Register>();
+  Register arg = locations->InAt(1).AsRegister<Register>();
+  Register ecx = locations->GetTemp(0).AsRegister<Register>();
+  Register edi = locations->GetTemp(1).AsRegister<Register>();
+  Register esi = locations->Out().AsRegister<Register>();
+
+  Label end;
+  Label return_true;
+  Label return_false;
+
+  // Get offsets of count, value, and class fields within a string object.
+  const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // Check if input is null, return false if it is.
+  __ testl(arg, arg);
+  __ j(kEqual, &return_false);
+
+  // Instanceof check for the argument by comparing class fields.
+  // All string objects must have the same type since String cannot be subclassed.
+  // Receiver must be a string object, so its class field is equal to all strings' class fields.
+  // If the argument is a string object, its class field must be equal to receiver's class field.
+  __ movl(ecx, Address(str, class_offset));
+  __ cmpl(ecx, Address(arg, class_offset));
+  __ j(kNotEqual, &return_false);
+
+  // Reference equality check, return true if same reference.
+  __ cmpl(str, arg);
+  __ j(kEqual, &return_true);
+
+  // Load length of receiver string.
+  __ movl(ecx, Address(str, count_offset));
+  // Check if lengths are equal, return false if they're not.
+  __ cmpl(ecx, Address(arg, count_offset));
+  __ j(kNotEqual, &return_false);
+  // Return true if both strings are empty.
+  __ testl(ecx, ecx);
+  __ j(kEqual, &return_true);
+
+  // Load starting addresses of string values into ESI/EDI as required for repe_cmpsl instruction.
+  __ leal(esi, Address(str, value_offset));
+  __ leal(edi, Address(arg, value_offset));
+
+  // Divide string length by 2 to compare characters 2 at a time and adjust for odd lengths.
+  __ addl(ecx, Immediate(1));
+  __ shrl(ecx, Immediate(1));
+
+  // Assertions that must hold in order to compare strings 2 characters at a time.
+  DCHECK_ALIGNED(value_offset, 4);
+  static_assert(IsAligned<4>(kObjectAlignment), "String of odd length is not zero padded");
+
+  // Loop to compare strings two characters at a time starting at the beginning of the string.
+  __ repe_cmpsl();
+  // If strings are not equal, zero flag will be cleared.
+  __ j(kNotEqual, &return_false);
+
+  // Return true and exit the function.
+  // If loop does not result in returning false, we return true.
+  __ Bind(&return_true);
+  __ movl(esi, Immediate(1));
+  __ jmp(&end);
+
+  // Return false and exit the function.
+  __ Bind(&return_false);
+  __ xorl(esi, esi);
+  __ Bind(&end);
+}
+
 static void CreateStringIndexOfLocations(HInvoke* invoke,
                                          ArenaAllocator* allocator,
                                          bool start_at_zero) {
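
repe_cmpsl compares ECX doublewords at ESI/EDI and stops early on the first mismatch, leaving the flags for the j(kNotEqual) above; the addl/shrl pair computes ceil(len/2) doublewords of 16-bit chars, with zero padding making any extra tail harmless. A C++ model of the instruction's loop semantics (hypothetical name):

    #include <cstdint>

    // Loop semantics of `repe cmpsl`: compare `count` 32-bit words at
    // s/d, advancing both; returning false corresponds to ZF cleared.
    bool RepeCmpsl(const uint32_t* s, const uint32_t* d, uint32_t count) {
      while (count != 0) {
        if (*s++ != *d++) {
          return false;  // mismatch: loop exits with ZF == 0
        }
        --count;         // ECX is decremented each iteration
      }
      return true;
    }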
diff --git a/compiler/optimizing/intrinsics_x86.h b/compiler/optimizing/intrinsics_x86.h
index 4292ec7..ac68f39 100644
--- a/compiler/optimizing/intrinsics_x86.h
+++ b/compiler/optimizing/intrinsics_x86.h
@@ -36,7 +36,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
@@ -61,7 +61,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 62cdb4c..9ea68ec 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -282,8 +282,6 @@
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::RequiresFpuRegister());
-  // TODO: Allow x86 to work with memory. This requires assembler support, see below.
-  // locations->SetInAt(0, Location::Any());               // X86 can work on memory directly.
   locations->SetOut(Location::SameAsFirstInput());
   locations->AddTemp(Location::RequiresFpuRegister());  // FP reg to hold mask.
 }
@@ -294,34 +292,18 @@
                       CodeGeneratorX86_64* codegen) {
   Location output = locations->Out();
 
-  if (output.IsFpuRegister()) {
-    // In-register
-    XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  DCHECK(output.IsFpuRegister());
+  XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
 
-    // TODO: Can mask directly with constant area using pand if we can guarantee
-    // that the literal is aligned on a 16 byte boundary.  This will avoid a
-    // temporary.
-    if (is64bit) {
-      __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
-      __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
-    } else {
-      __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
-      __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
-    }
+  // TODO: Can mask directly with constant area using pand if we can guarantee
+  // that the literal is aligned on a 16 byte boundary.  This will avoid a
+  // temporary.
+  if (is64bit) {
+    __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
+    __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
   } else {
-    // TODO: update when assember support is available.
-    UNIMPLEMENTED(FATAL) << "Needs assembler support.";
-//  Once assembler support is available, in-memory operations look like this:
-//    if (is64bit) {
-//      DCHECK(output.IsDoubleStackSlot());
-//      // No 64b and with literal.
-//      __ movq(cpu_temp, Immediate(INT64_C(0x7FFFFFFFFFFFFFFF)));
-//      __ andq(Address(CpuRegister(RSP), output.GetStackIndex()), cpu_temp);
-//    } else {
-//      DCHECK(output.IsStackSlot());
-//      // Can use and with a literal directly.
-//      __ andl(Address(CpuRegister(RSP), output.GetStackIndex()), Immediate(INT64_C(0x7FFFFFFF)));
-//    }
+    __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
+    __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
   }
 }
 
@@ -736,6 +718,7 @@
   codegen_->Load64BitValue(out, kPrimIntMax);
 
   // if inPlusPointFive >= maxInt goto done
+  __ movl(out, Immediate(kPrimIntMax));
   __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax)));
   __ j(kAboveEqual, &done);
 
@@ -783,6 +766,7 @@
   codegen_->Load64BitValue(out, kPrimLongMax);
 
   // if inPlusPointFive >= maxLong goto done
+  __ movq(out, Immediate(kPrimLongMax));
   __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax)));
   __ j(kAboveEqual, &done);
 
@@ -870,6 +854,97 @@
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+
+  // Request temporary registers, RCX and RDI needed for repe_cmpsq instruction.
+  locations->AddTemp(Location::RegisterLocation(RCX));
+  locations->AddTemp(Location::RegisterLocation(RDI));
+
+  // Set output, RSI needed for repe_cmpsq instruction anyway.
+  locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+
+  CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
+  CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
+  CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
+  CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();
+
+  Label end;
+  Label return_true;
+  Label return_false;
+
+  // Get offsets of count, value, and class fields within a string object.
+  const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+  const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
+  const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
+
+  // Note that the null check must have been done earlier.
+  DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
+
+  // Check if input is null, return false if it is.
+  __ testl(arg, arg);
+  __ j(kEqual, &return_false);
+
+  // Instanceof check for the argument by comparing class fields.
+  // All string objects must have the same type since String cannot be subclassed.
+  // Receiver must be a string object, so its class field is equal to all strings' class fields.
+  // If the argument is a string object, its class field must be equal to receiver's class field.
+  __ movl(rcx, Address(str, class_offset));
+  __ cmpl(rcx, Address(arg, class_offset));
+  __ j(kNotEqual, &return_false);
+
+  // Reference equality check, return true if same reference.
+  __ cmpl(str, arg);
+  __ j(kEqual, &return_true);
+
+  // Load length of receiver string.
+  __ movl(rcx, Address(str, count_offset));
+  // Check if lengths are equal, return false if they're not.
+  __ cmpl(rcx, Address(arg, count_offset));
+  __ j(kNotEqual, &return_false);
+  // Return true if both strings are empty.
+  __ testl(rcx, rcx);
+  __ j(kEqual, &return_true);
+
+  // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
+  __ leal(rsi, Address(str, value_offset));
+  __ leal(rdi, Address(arg, value_offset));
+
+  // Divide string length by 4 and adjust for lengths not divisible by 4.
+  __ addl(rcx, Immediate(3));
+  __ shrl(rcx, Immediate(2));
+
+  // Assertions that must hold in order to compare strings 4 characters at a time.
+  DCHECK_ALIGNED(value_offset, 8);
+  static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");
+
+  // Loop to compare strings four characters at a time starting at the beginning of the string.
+  __ repe_cmpsq();
+  // If strings are not equal, zero flag will be cleared.
+  __ j(kNotEqual, &return_false);
+
+  // Return true and exit the function.
+  // If loop does not result in returning false, we return true.
+  __ Bind(&return_true);
+  __ movl(rsi, Immediate(1));
+  __ jmp(&end);
+
+  // Return false and exit the function.
+  __ Bind(&return_false);
+  __ xorl(rsi, rsi);
+  __ Bind(&end);
+}
+
 static void CreateStringIndexOfLocations(HInvoke* invoke,
                                          ArenaAllocator* allocator,
                                          bool start_at_zero) {
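
The x86-64 variant widens the stride to quadwords, so the count becomes ceil(len/4), computed by the addl(3)/shrl(2) pair above. A tiny sketch with a worked case (hypothetical name):

    #include <cstdint>

    // ceil(len / 4) quadwords of 16-bit chars, as the addl/shrl pair
    // computes it. QuadwordCount(5) == 2: chars 5..7 are zero padding in
    // both strings, so comparing the tail cannot flip the result.
    uint32_t QuadwordCount(uint32_t len_in_chars) {
      return (len_in_chars + 3) >> 2;
    }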
diff --git a/compiler/optimizing/intrinsics_x86_64.h b/compiler/optimizing/intrinsics_x86_64.h
index 0e0e72c..17293af 100644
--- a/compiler/optimizing/intrinsics_x86_64.h
+++ b/compiler/optimizing/intrinsics_x86_64.h
@@ -36,7 +36,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
@@ -61,7 +61,7 @@
 
   // Define visitor methods.
 
-#define OPTIMIZING_INTRINSICS(Name, IsStatic)   \
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment)   \
   void Visit ## Name(HInvoke* invoke) OVERRIDE;
 #include "intrinsics_list.h"
 INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index ca2c998..0df5d6d 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1215,23 +1215,33 @@
 };
 
 /**
- * Side-effects representation for write/read dependences on fields/arrays.
+ * Side-effects representation.
  *
- * The dependence analysis uses type disambiguation (e.g. a float field write
- * cannot modify the value of an integer field read) and the access type (e.g.
- * a reference array write cannot modify the value of a reference field read
- * [although it may modify the reference fetch prior to reading the field,
- * which is represented by its own write/read dependence]). The analysis
- * makes conservative points-to assumptions on reference types (e.g. two same
- * typed arrays are assumed to be the same, and any reference read depends
- * on any reference read without further regard of its type).
+ * For write/read dependences on fields/arrays, the dependence analysis uses
+ * type disambiguation (e.g. a float field write cannot modify the value of an
+ * integer field read) and the access type (e.g. a reference array write cannot
+ * modify the value of a reference field read [although it may modify the
+ * reference fetch prior to reading the field, which is represented by its own
+ * write/read dependence]). The analysis makes conservative points-to
+ * assumptions on reference types (e.g. two same typed arrays are assumed to be
+ * the same, and any reference read depends on any reference read without
+ * further regard of its type).
  *
- * The internal representation uses the following 36-bit flags assignments:
+ * The internal representation uses 38 bits and is described in the table below.
+ * The first line indicates the side effect, and for field/array accesses the
+ * second line indicates the type of the access (in the order of the
+ * Primitive::Type enum).
+ * The two numbered lines below indicate the bit position in the bitfield (read
+ * vertically).
  *
- *   |ARRAY-R  |FIELD-R  |ARRAY-W  |FIELD-W  |
- *   +---------+---------+---------+---------+
- *   |543210987|654321098|765432109|876543210|
- *   |DFJISCBZL|DFJISCBZL|DFJISCBZL|DFJISCBZL|
+ *   |Depends on GC|ARRAY-R  |FIELD-R  |Can trigger GC|ARRAY-W  |FIELD-W  |
+ *   +-------------+---------+---------+--------------+---------+---------+
+ *   |             |DFJISCBZL|DFJISCBZL|              |DFJISCBZL|DFJISCBZL|
+ *   |      3      |333333322|222222221|       1      |111111110|000000000|
+ *   |      7      |654321098|765432109|       8      |765432109|876543210|
+ *
+ * Note that, to ease the implementation, 'changes' bits are least significant
+ * bits, while 'dependency' bits are most significant bits.
  */
 class SideEffects : public ValueObject {
  public:
@@ -1242,6 +1252,22 @@
   }
 
   static SideEffects All() {
+    return SideEffects(kAllChangeBits | kAllDependOnBits);
+  }
+
+  static SideEffects AllChanges() {
+    return SideEffects(kAllChangeBits);
+  }
+
+  static SideEffects AllDependencies() {
+    return SideEffects(kAllDependOnBits);
+  }
+
+  static SideEffects AllExceptGCDependency() {
+    return AllWritesAndReads().Union(SideEffects::CanTriggerGC());
+  }
+
+  static SideEffects AllWritesAndReads() {
     return SideEffects(kAllWrites | kAllReads);
   }
 
@@ -1255,7 +1281,7 @@
 
   static SideEffects FieldWriteOfType(Primitive::Type type, bool is_volatile) {
     return is_volatile
-        ? All()
+        ? AllWritesAndReads()
         : SideEffects(TypeFlagWithAlias(type, kFieldWriteOffset));
   }
 
@@ -1265,7 +1291,7 @@
 
   static SideEffects FieldReadOfType(Primitive::Type type, bool is_volatile) {
     return is_volatile
-        ? All()
+        ? AllWritesAndReads()
         : SideEffects(TypeFlagWithAlias(type, kFieldReadOffset));
   }
 
@@ -1273,11 +1299,40 @@
     return SideEffects(TypeFlagWithAlias(type, kArrayReadOffset));
   }
 
+  static SideEffects CanTriggerGC() {
+    return SideEffects(1ULL << kCanTriggerGCBit);
+  }
+
+  static SideEffects DependsOnGC() {
+    return SideEffects(1ULL << kDependsOnGCBit);
+  }
+
   // Combines the side-effects of this and the other.
   SideEffects Union(SideEffects other) const {
     return SideEffects(flags_ | other.flags_);
   }
 
+  SideEffects Exclusion(SideEffects other) const {
+    return SideEffects(flags_ & ~other.flags_);
+  }
+
+  bool Includes(SideEffects other) const {
+    return (other.flags_ & flags_) == other.flags_;
+  }
+
+  bool HasSideEffects() const {
+    return (flags_ & kAllChangeBits);
+  }
+
+  bool HasDependencies() const {
+    return (flags_ & kAllDependOnBits);
+  }
+
+  // Returns true if there are no side effects or dependencies.
+  bool DoesNothing() const {
+    return flags_ == 0;
+  }
+
   // Returns true if something is written.
   bool DoesAnyWrite() const {
     return (flags_ & kAllWrites);
@@ -1288,47 +1343,81 @@
     return (flags_ & kAllReads);
   }
 
-  // Returns true if nothing is written or read.
-  bool DoesNothing() const {
-    return flags_ == 0;
-  }
-
   // Returns true if potentially everything is written and read
   // (every type and every kind of access).
+  bool DoesAllReadWrite() const {
+    return (flags_ & (kAllWrites | kAllReads)) == (kAllWrites | kAllReads);
+  }
+
   bool DoesAll() const {
-    return flags_ == (kAllWrites | kAllReads);
+    return flags_ == (kAllChangeBits | kAllDependOnBits);
   }
 
   // Returns true if this may read something written by other.
   bool MayDependOn(SideEffects other) const {
-    const uint64_t reads = (flags_ & kAllReads) >> kFieldReadOffset;
-    return (other.flags_ & reads);
+    const uint64_t depends_on_flags = (flags_ & kAllDependOnBits) >> kChangeBits;
+    return (other.flags_ & depends_on_flags);
   }
 
   // Returns string representation of flags (for debugging only).
-  // Format: |DFJISCBZL|DFJISCBZL|DFJISCBZL|DFJISCBZL|
+  // Format: |x|DFJISCBZL|DFJISCBZL|y|DFJISCBZL|DFJISCBZL|
   std::string ToString() const {
-    static const char *kDebug = "LZBCSIJFD";
     std::string flags = "|";
-    for (int s = 35; s >= 0; s--) {
-      const int t = s % kBits;
-      if ((flags_ >> s) & 1)
-        flags += kDebug[t];
-      if (t == 0)
+    for (int s = kLastBit; s >= 0; s--) {
+      bool current_bit_is_set = ((flags_ >> s) & 1) != 0;
+      if ((s == kDependsOnGCBit) || (s == kCanTriggerGCBit)) {
+        // This is a bit for the GC side effect.
+        if (current_bit_is_set) {
+          flags += "GC";
+        }
         flags += "|";
+      } else {
+        // This is a bit for the array/field analysis.
+        // The underscore character stands for the 'can trigger GC' bit.
+        static const char *kDebug = "LZBCSIJFDLZBCSIJFD_LZBCSIJFDLZBCSIJFD";
+        if (current_bit_is_set) {
+          flags += kDebug[s];
+        }
+        if ((s == kFieldWriteOffset) || (s == kArrayWriteOffset) ||
+            (s == kFieldReadOffset) || (s == kArrayReadOffset)) {
+          flags += "|";
+        }
+      }
     }
     return flags;
   }
 
- private:
-  static constexpr int kBits = 9;
-  static constexpr int kFieldWriteOffset = 0 * kBits;
-  static constexpr int kArrayWriteOffset = 1 * kBits;
-  static constexpr int kFieldReadOffset  = 2 * kBits;
-  static constexpr int kArrayReadOffset  = 3 * kBits;
+  bool Equals(const SideEffects& other) const { return flags_ == other.flags_; }
 
-  static constexpr uint64_t kAllWrites = 0x0003ffff;
-  static constexpr uint64_t kAllReads  = kAllWrites << kFieldReadOffset;
+ private:
+  static constexpr int kFieldArrayAnalysisBits = 9;
+
+  static constexpr int kFieldWriteOffset = 0;
+  static constexpr int kArrayWriteOffset = kFieldWriteOffset + kFieldArrayAnalysisBits;
+  static constexpr int kLastBitForWrites = kArrayWriteOffset + kFieldArrayAnalysisBits - 1;
+  static constexpr int kCanTriggerGCBit = kLastBitForWrites + 1;
+
+  static constexpr int kChangeBits = kCanTriggerGCBit + 1;
+
+  static constexpr int kFieldReadOffset = kCanTriggerGCBit + 1;
+  static constexpr int kArrayReadOffset = kFieldReadOffset + kFieldArrayAnalysisBits;
+  static constexpr int kLastBitForReads = kArrayReadOffset + kFieldArrayAnalysisBits - 1;
+  static constexpr int kDependsOnGCBit = kLastBitForReads + 1;
+
+  static constexpr int kLastBit = kDependsOnGCBit;
+  static constexpr int kDependOnBits = kLastBit + 1 - kChangeBits;
+
+  // Aliases.
+
+  static_assert(kChangeBits == kDependOnBits,
+                "the 'change' bits should match the 'depend on' bits.");
+
+  static constexpr uint64_t kAllChangeBits = ((1ULL << kChangeBits) - 1);
+  static constexpr uint64_t kAllDependOnBits = ((1ULL << kDependOnBits) - 1) << kChangeBits;
+  static constexpr uint64_t kAllWrites =
+      ((1ULL << (kLastBitForWrites + 1 - kFieldWriteOffset)) - 1) << kFieldWriteOffset;
+  static constexpr uint64_t kAllReads =
+      ((1ULL << (kLastBitForReads + 1 - kFieldReadOffset)) - 1) << kFieldReadOffset;
 
   // Work around the fact that HIR aliases I/F and J/D.
   // TODO: remove this interceptor once HIR types are clean
@@ -1610,6 +1699,7 @@
   virtual bool IsControlFlow() const { return false; }
   virtual bool CanThrow() const { return false; }
 
+  bool HasSideEffects() const { return side_effects_.HasSideEffects(); }
   bool DoesAnyWrite() const { return side_effects_.DoesAnyWrite(); }
 
   // Does not apply for all instructions, but having this at top level greatly
@@ -2302,7 +2392,9 @@
  public:
   HBinaryOperation(Primitive::Type result_type,
                    HInstruction* left,
-                   HInstruction* right) : HExpression(result_type, SideEffects::None()) {
+                   HInstruction* right,
+                   SideEffects side_effects = SideEffects::None())
+      : HExpression(result_type, side_effects) {
     SetRawInputAt(0, left);
     SetRawInputAt(1, right);
   }
@@ -2626,7 +2718,9 @@
            HInstruction* second,
            ComparisonBias bias,
            uint32_t dex_pc)
-      : HBinaryOperation(Primitive::kPrimInt, first, second), bias_(bias), dex_pc_(dex_pc) {
+      : HBinaryOperation(Primitive::kPrimInt, first, second, SideEffectsForArchRuntimeCalls(type)),
+        bias_(bias),
+        dex_pc_(dex_pc) {
     DCHECK_EQ(type, first->GetType());
     DCHECK_EQ(type, second->GetType());
   }
@@ -2649,7 +2743,12 @@
 
   bool IsGtBias() { return bias_ == ComparisonBias::kGtBias; }
 
-  uint32_t GetDexPc() const { return dex_pc_; }
+  uint32_t GetDexPc() const OVERRIDE { return dex_pc_; }
+
+  static SideEffects SideEffectsForArchRuntimeCalls(Primitive::Type type) {
+    // MIPS64 uses a runtime call for FP comparisons.
+    return Primitive::IsFloatingPointType(type) ? SideEffects::CanTriggerGC() : SideEffects::None();
+  }
 
   DECLARE_INSTRUCTION(Compare);
 
@@ -2791,7 +2890,7 @@
 };
 
 enum class Intrinsics {
-#define OPTIMIZING_INTRINSICS(Name, IsStatic) k ## Name,
+#define OPTIMIZING_INTRINSICS(Name, IsStatic, NeedsEnvironment) k ## Name,
 #include "intrinsics_list.h"
   kNone,
   INTRINSICS_LIST(OPTIMIZING_INTRINSICS)
@@ -2800,13 +2899,18 @@
 };
 std::ostream& operator<<(std::ostream& os, const Intrinsics& intrinsic);
 
+enum IntrinsicNeedsEnvironment {
+  kNoEnvironment,        // Intrinsic does not require an environment.
+  kNeedsEnvironment      // Intrinsic requires an environment.
+};
+
 class HInvoke : public HInstruction {
  public:
   size_t InputCount() const OVERRIDE { return inputs_.Size(); }
 
   // Runtime needs to walk the stack, so Dex -> Dex calls need to
   // know their environment.
-  bool NeedsEnvironment() const OVERRIDE { return true; }
+  bool NeedsEnvironment() const OVERRIDE { return needs_environment_ == kNeedsEnvironment; }
 
   void SetArgumentAt(size_t index, HInstruction* argument) {
     SetRawInputAt(index, argument);
@@ -2831,8 +2935,9 @@
     return intrinsic_;
   }
 
-  void SetIntrinsic(Intrinsics intrinsic) {
+  void SetIntrinsic(Intrinsics intrinsic, IntrinsicNeedsEnvironment needs_environment) {
     intrinsic_ = intrinsic;
+    needs_environment_ = needs_environment;
   }
 
   bool IsFromInlinedInvoke() const {
@@ -2851,14 +2956,16 @@
           uint32_t dex_pc,
           uint32_t dex_method_index,
           InvokeType original_invoke_type)
-    : HInstruction(SideEffects::All()),  // assume write/read on all fields/arrays
+    : HInstruction(
+          SideEffects::AllExceptGCDependency()),  // Assume write/read on all fields/arrays.
       number_of_arguments_(number_of_arguments),
       inputs_(arena, number_of_arguments),
       return_type_(return_type),
       dex_pc_(dex_pc),
       dex_method_index_(dex_method_index),
       original_invoke_type_(original_invoke_type),
-      intrinsic_(Intrinsics::kNone) {
+      intrinsic_(Intrinsics::kNone),
+      needs_environment_(kNeedsEnvironment) {
     uint32_t number_of_inputs = number_of_arguments + number_of_other_inputs;
     inputs_.SetSize(number_of_inputs);
   }
@@ -2875,6 +2982,7 @@
   const uint32_t dex_method_index_;
   const InvokeType original_invoke_type_;
   Intrinsics intrinsic_;
+  IntrinsicNeedsEnvironment needs_environment_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HInvoke);
@@ -2923,6 +3031,10 @@
     return false;
   }
 
+  bool CanBeNull() const OVERRIDE {
+    return return_type_ == Primitive::kPrimNot && !IsStringInit();
+  }
+
   InvokeType GetInvokeType() const { return invoke_type_; }
   bool IsRecursive() const { return is_recursive_; }
   bool NeedsDexCache() const OVERRIDE { return !IsRecursive(); }
@@ -3068,7 +3180,7 @@
                uint16_t type_index,
                const DexFile& dex_file,
                QuickEntrypointEnum entrypoint)
-      : HExpression(Primitive::kPrimNot, SideEffects::None()),
+      : HExpression(Primitive::kPrimNot, SideEffects::CanTriggerGC()),
         dex_pc_(dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
@@ -3105,7 +3217,7 @@
 
 class HNeg : public HUnaryOperation {
  public:
-  explicit HNeg(Primitive::Type result_type, HInstruction* input)
+  HNeg(Primitive::Type result_type, HInstruction* input)
       : HUnaryOperation(result_type, input) {}
 
   template <typename T> T Compute(T x) const { return -x; }
@@ -3131,7 +3243,7 @@
             uint16_t type_index,
             const DexFile& dex_file,
             QuickEntrypointEnum entrypoint)
-      : HExpression(Primitive::kPrimNot, SideEffects::None()),
+      : HExpression(Primitive::kPrimNot, SideEffects::CanTriggerGC()),
         dex_pc_(dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
@@ -3232,7 +3344,8 @@
 class HDiv : public HBinaryOperation {
  public:
   HDiv(Primitive::Type result_type, HInstruction* left, HInstruction* right, uint32_t dex_pc)
-      : HBinaryOperation(result_type, left, right), dex_pc_(dex_pc) {}
+      : HBinaryOperation(result_type, left, right, SideEffectsForArchRuntimeCalls()),
+        dex_pc_(dex_pc) {}
 
   template <typename T>
   T Compute(T x, T y) const {
@@ -3252,6 +3365,11 @@
 
   uint32_t GetDexPc() const OVERRIDE { return dex_pc_; }
 
+  static SideEffects SideEffectsForArchRuntimeCalls() {
+    // The generated code can use a runtime call.
+    return SideEffects::CanTriggerGC();
+  }
+
   DECLARE_INSTRUCTION(Div);
 
  private:
@@ -3263,7 +3381,8 @@
 class HRem : public HBinaryOperation {
  public:
   HRem(Primitive::Type result_type, HInstruction* left, HInstruction* right, uint32_t dex_pc)
-      : HBinaryOperation(result_type, left, right), dex_pc_(dex_pc) {}
+      : HBinaryOperation(result_type, left, right, SideEffectsForArchRuntimeCalls()),
+        dex_pc_(dex_pc) {}
 
   template <typename T>
   T Compute(T x, T y) const {
@@ -3283,6 +3402,10 @@
 
   uint32_t GetDexPc() const OVERRIDE { return dex_pc_; }
 
+  static SideEffects SideEffectsForArchRuntimeCalls() {
+    return SideEffects::CanTriggerGC();
+  }
+
   DECLARE_INSTRUCTION(Rem);
 
  private:
@@ -3535,7 +3658,7 @@
 
 class HNot : public HUnaryOperation {
  public:
-  explicit HNot(Primitive::Type result_type, HInstruction* input)
+  HNot(Primitive::Type result_type, HInstruction* input)
       : HUnaryOperation(result_type, input) {}
 
   bool CanBeMoved() const OVERRIDE { return true; }
@@ -3593,7 +3716,8 @@
  public:
   // Instantiate a type conversion of `input` to `result_type`.
   HTypeConversion(Primitive::Type result_type, HInstruction* input, uint32_t dex_pc)
-      : HExpression(result_type, SideEffects::None()), dex_pc_(dex_pc) {
+      : HExpression(result_type, SideEffectsForArchRuntimeCalls(input->GetType(), result_type)),
+        dex_pc_(dex_pc) {
     SetRawInputAt(0, input);
     DCHECK_NE(input->GetType(), result_type);
   }
@@ -3613,6 +3737,18 @@
   // containing the result.  If the input cannot be converted, return nullptr.
   HConstant* TryStaticEvaluation() const;
 
+  static SideEffects SideEffectsForArchRuntimeCalls(Primitive::Type input_type,
+                                                    Primitive::Type result_type) {
+    // Some architectures may not require the 'GC' side effects, but at this point
+    // in the compilation process we do not know what architecture we will
+    // generate code for, so we must be conservative.
+    if ((Primitive::IsFloatingPointType(input_type) && Primitive::IsIntegralType(result_type))
+        || (input_type == Primitive::kPrimLong && Primitive::IsFloatingPointType(result_type))) {
+      return SideEffects::CanTriggerGC();
+    }
+    return SideEffects::None();
+  }
+
   DECLARE_INSTRUCTION(TypeConversion);
 
  private:
@@ -3879,7 +4015,9 @@
             HInstruction* value,
             Primitive::Type expected_component_type,
             uint32_t dex_pc)
-      : HTemplateInstruction(SideEffects::ArrayWriteOfType(expected_component_type)),
+      : HTemplateInstruction(
+            SideEffects::ArrayWriteOfType(expected_component_type).Union(
+                SideEffectsForArchRuntimeCalls(value->GetType()))),
         dex_pc_(dex_pc),
         expected_component_type_(expected_component_type),
         needs_type_check_(value->GetType() == Primitive::kPrimNot),
@@ -3932,6 +4070,10 @@
         : expected_component_type_;
   }
 
+  static SideEffects SideEffectsForArchRuntimeCalls(Primitive::Type value_type) {
+    return (value_type == Primitive::kPrimNot) ? SideEffects::CanTriggerGC() : SideEffects::None();
+  }
+
   DECLARE_INSTRUCTION(ArraySet);
 
  private:
@@ -4026,7 +4168,7 @@
 class HSuspendCheck : public HTemplateInstruction<0> {
  public:
   explicit HSuspendCheck(uint32_t dex_pc)
-      : HTemplateInstruction(SideEffects::None()), dex_pc_(dex_pc), slow_path_(nullptr) {}
+      : HTemplateInstruction(SideEffects::CanTriggerGC()), dex_pc_(dex_pc), slow_path_(nullptr) {}
 
   bool NeedsEnvironment() const OVERRIDE {
     return true;
@@ -4058,7 +4200,7 @@
              const DexFile& dex_file,
              bool is_referrers_class,
              uint32_t dex_pc)
-      : HExpression(Primitive::kPrimNot, SideEffects::None()),
+      : HExpression(Primitive::kPrimNot, SideEffectsForArchRuntimeCalls()),
         type_index_(type_index),
         dex_file_(dex_file),
         is_referrers_class_(is_referrers_class),
@@ -4119,6 +4261,10 @@
 
   bool NeedsDexCache() const OVERRIDE { return !is_referrers_class_; }
 
+  static SideEffects SideEffectsForArchRuntimeCalls() {
+    return SideEffects::CanTriggerGC();
+  }
+
   DECLARE_INSTRUCTION(LoadClass);
 
  private:
@@ -4138,7 +4284,7 @@
 class HLoadString : public HExpression<1> {
  public:
   HLoadString(HCurrentMethod* current_method, uint32_t string_index, uint32_t dex_pc)
-      : HExpression(Primitive::kPrimNot, SideEffects::None()),
+      : HExpression(Primitive::kPrimNot, SideEffectsForArchRuntimeCalls()),
         string_index_(string_index),
         dex_pc_(dex_pc) {
     SetRawInputAt(0, current_method);
@@ -4159,6 +4305,10 @@
   bool NeedsEnvironment() const OVERRIDE { return false; }
   bool NeedsDexCache() const OVERRIDE { return true; }
 
+  static SideEffects SideEffectsForArchRuntimeCalls() {
+    return SideEffects::CanTriggerGC();
+  }
+
   DECLARE_INSTRUCTION(LoadString);
 
  private:
@@ -4173,10 +4323,10 @@
  */
 class HClinitCheck : public HExpression<1> {
  public:
-  explicit HClinitCheck(HLoadClass* constant, uint32_t dex_pc)
+  HClinitCheck(HLoadClass* constant, uint32_t dex_pc)
       : HExpression(
-          Primitive::kPrimNot,
-          SideEffects::AllWrites()),  // assume write on all fields/arrays
+            Primitive::kPrimNot,
+            SideEffects::AllChanges()),  // Assume write/read on all fields/arrays.
         dex_pc_(dex_pc) {
     SetRawInputAt(0, constant);
   }
@@ -4305,7 +4455,7 @@
 class HThrow : public HTemplateInstruction<1> {
  public:
   HThrow(HInstruction* exception, uint32_t dex_pc)
-      : HTemplateInstruction(SideEffects::None()), dex_pc_(dex_pc) {
+      : HTemplateInstruction(SideEffects::CanTriggerGC()), dex_pc_(dex_pc) {
     SetRawInputAt(0, exception);
   }
 
@@ -4331,7 +4481,7 @@
               HLoadClass* constant,
               bool class_is_final,
               uint32_t dex_pc)
-      : HExpression(Primitive::kPrimBoolean, SideEffects::None()),
+      : HExpression(Primitive::kPrimBoolean, SideEffectsForArchRuntimeCalls(class_is_final)),
         class_is_final_(class_is_final),
         must_do_null_check_(true),
         dex_pc_(dex_pc) {
@@ -4357,6 +4507,10 @@
   bool MustDoNullCheck() const { return must_do_null_check_; }
   void ClearMustDoNullCheck() { must_do_null_check_ = false; }
 
+  static SideEffects SideEffectsForArchRuntimeCalls(bool class_is_final) {
+    return class_is_final ? SideEffects::None() : SideEffects::CanTriggerGC();
+  }
+
   DECLARE_INSTRUCTION(InstanceOf);
 
  private:
@@ -4416,7 +4570,7 @@
              HLoadClass* constant,
              bool class_is_final,
              uint32_t dex_pc)
-      : HTemplateInstruction(SideEffects::None()),
+      : HTemplateInstruction(SideEffects::CanTriggerGC()),
         class_is_final_(class_is_final),
         must_do_null_check_(true),
         dex_pc_(dex_pc) {
@@ -4458,7 +4612,7 @@
  public:
   explicit HMemoryBarrier(MemBarrierKind barrier_kind)
       : HTemplateInstruction(
-          SideEffects::All()),  // assume write/read on all fields/arrays
+            SideEffects::AllWritesAndReads()),  // Assume write/read on all fields/arrays.
         barrier_kind_(barrier_kind) {}
 
   MemBarrierKind GetBarrierKind() { return barrier_kind_; }
@@ -4479,7 +4633,8 @@
   };
 
   HMonitorOperation(HInstruction* object, OperationKind kind, uint32_t dex_pc)
-    : HTemplateInstruction(SideEffects::All()),  // assume write/read on all fields/arrays
+    : HTemplateInstruction(
+          SideEffects::AllExceptGCDependency()),  // Assume write/read on all fields/arrays.
       kind_(kind), dex_pc_(dex_pc) {
     SetRawInputAt(0, object);
   }
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index d1c1134..1349df9 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -167,6 +167,9 @@
     bound_type->SetReferenceTypeInfo(
         ReferenceTypeInfo::Create(class_rti.GetTypeHandle(), /* is_exact */ false));
   }
+  if (upper_can_be_null) {
+    bound_type->SetCanBeNull(obj->CanBeNull());
+  }
   return bound_type;
 }
 
@@ -485,6 +488,10 @@
               true /* CheckCast succeeds for nulls. */);
           check_cast->GetBlock()->InsertInstructionAfter(bound_type, check_cast);
         } else {
+          // Update nullability of the existing bound type, which may not have known
+          // that its input was not null when it was being created.
+          bound_type = check_cast->GetNext()->AsBoundType();
+          bound_type->SetCanBeNull(obj->CanBeNull());
           // We already have a bound type on the position we would need to insert
           // the new one. The existing bound type should dominate all the users
           // (dchecked) so there's no need to continue.
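
The nullability update in this file follows one rule: a bound type may only claim non-null if its input can never be null, because a check-cast passes nulls through. A minimal sketch of that invariant, with hypothetical types standing in for ART's HBoundType:

    // Sketch only; `Value` and `BoundType` are illustrative stand-ins.
    struct Value {
      bool can_be_null = true;
    };

    struct BoundType : Value {
      Value* input = nullptr;
      // CheckCast succeeds for nulls, so the bound value may be null
      // exactly when the incoming value may be null.
      void UpdateNullability() { can_be_null = input->can_be_null; }
    };
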
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 60f5ab2..9f32a9e 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -952,7 +952,16 @@
 // we spill `current` instead.
 bool RegisterAllocator::AllocateBlockedReg(LiveInterval* current) {
   size_t first_register_use = current->FirstRegisterUse();
-  if (first_register_use == kNoLifetime) {
+  if (current->HasRegister()) {
+    DCHECK(current->IsHighInterval());
+    // The low interval has already allocated the register for this high
+    // interval. If allocating the low interval required splitting both
+    // intervals, the high interval may no longer have a register use. We
+    // must still proceed, in order to split the currently active and
+    // inactive uses of the high interval's register and to put the high
+    // interval in the active set.
+    DCHECK(first_register_use != kNoLifetime || (current->GetNextSibling() != nullptr));
+  } else if (first_register_use == kNoLifetime) {
     AllocateSpillSlotFor(current);
     return false;
   }
@@ -1019,7 +1028,7 @@
     // When allocating the low part, we made sure the high register was available.
     DCHECK_LT(first_use, next_use[reg]);
   } else if (current->IsLowInterval()) {
-    reg = FindAvailableRegisterPair(next_use, first_register_use);
+    reg = FindAvailableRegisterPair(next_use, first_use);
     // We should spill if both registers are not available.
     should_spill = (first_use >= next_use[reg])
       || (first_use >= next_use[GetHighForLowRegister(reg)]);
@@ -1033,16 +1042,28 @@
   if (should_spill) {
     DCHECK(!current->IsHighInterval());
     bool is_allocation_at_use_site = (current->GetStart() >= (first_register_use - 1));
-    if (current->IsLowInterval()
-        && is_allocation_at_use_site
-        && TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(),
-                                                    first_register_use,
-                                                    next_use)) {
+    if (is_allocation_at_use_site) {
+      if (!current->IsLowInterval()) {
+        DumpInterval(std::cerr, current);
+        DumpAllIntervals(std::cerr);
+        // This situation has the potential to infinite loop, so we make it a non-debug CHECK.
+        HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2);
+        CHECK(false) << "There is not enough registers available for "
+          << current->GetParent()->GetDefinedBy()->DebugName() << " "
+          << current->GetParent()->GetDefinedBy()->GetId()
+          << " at " << first_register_use - 1 << " "
+          << (at == nullptr ? "" : at->DebugName());
+      }
+
       // If we're allocating a register for `current` because the instruction at
       // that position requires it, but we think we should spill, then there are
       // non-pair intervals or unaligned pair intervals blocking the allocation.
       // We split the first interval found, and put ourselves first in the
       // `unhandled_` list.
+      bool success = TrySplitNonPairOrUnalignedPairIntervalAt(current->GetStart(),
+                                                              first_register_use,
+                                                              next_use);
+      DCHECK(success);
       LiveInterval* existing = unhandled_->Peek();
       DCHECK(existing->IsHighInterval());
       DCHECK_EQ(existing->GetLowInterval(), current);
@@ -1052,17 +1073,7 @@
       // register, we split this interval just before its first register use.
       AllocateSpillSlotFor(current);
       LiveInterval* split = SplitBetween(current, current->GetStart(), first_register_use - 1);
-      if (current == split) {
-        DumpInterval(std::cerr, current);
-        DumpAllIntervals(std::cerr);
-        // This situation has the potential to infinite loop, so we make it a non-debug CHECK.
-        HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2);
-        CHECK(false) << "There is not enough registers available for "
-          << split->GetParent()->GetDefinedBy()->DebugName() << " "
-          << split->GetParent()->GetDefinedBy()->GetId()
-          << " at " << first_register_use - 1 << " "
-          << (at == nullptr ? "" : at->DebugName());
-      }
+      DCHECK(current != split);
       AddSorted(unhandled_, split);
     }
     return false;
@@ -1243,7 +1254,9 @@
 
 void RegisterAllocator::AllocateSpillSlotFor(LiveInterval* interval) {
   if (interval->IsHighInterval()) {
-    // The low interval will contain the spill slot.
+    // The low interval already took care of allocating the spill slot.
+    DCHECK(!interval->GetLowInterval()->HasRegister());
+    DCHECK(interval->GetLowInterval()->GetParent()->HasSpillSlot());
     return;
   }
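
Background for the pair logic in this file: on 32-bit targets a 64-bit value occupies an aligned register pair, a low interval in an even register plus a high interval in the adjacent one, so allocation decisions must consider both halves. A simplified, self-contained illustration of choosing such a pair from a next-use table (not ART's FindAvailableRegisterPair, which handles more cases):

    #include <algorithm>
    #include <cstddef>

    // Returns the even register starting the best free pair, or -1 to
    // signal that the caller should spill. A pair is usable only if both
    // halves stay free past `first_use`.
    int FindPair(const std::size_t* next_use, int num_regs, std::size_t first_use) {
      int best = -1;
      std::size_t best_free_until = 0;
      for (int low = 0; low + 1 < num_regs; low += 2) {
        // The pair is only as free as its more constrained half.
        std::size_t free_until = std::min(next_use[low], next_use[low + 1]);
        if (free_until > first_use && free_until > best_free_until) {
          best_free_until = free_until;
          best = low;
        }
      }
      return best;
    }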
 
diff --git a/compiler/optimizing/side_effects_analysis.cc b/compiler/optimizing/side_effects_analysis.cc
index 9dbf638..1c3e255 100644
--- a/compiler/optimizing/side_effects_analysis.cc
+++ b/compiler/optimizing/side_effects_analysis.cc
@@ -47,8 +47,8 @@
          inst_it.Advance()) {
       HInstruction* instruction = inst_it.Current();
       effects = effects.Union(instruction->GetSideEffects());
-      // If every possible write/read is represented, scanning further
-      // will not add any more information to side-effects of this block.
+      // If all side effects are represented, scanning further will not add any
+      // more information to side-effects of this block.
       if (effects.DoesAll()) {
         break;
       }
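
The renamed predicate makes the early exit read correctly: the block summary is a monotone union over instruction effects, and once every tracked bit is set, scanning further is pointless. A sketch of that saturation pattern (Effects is a stand-in for ART's SideEffects):

    #include <cstdint>
    #include <vector>

    struct Effects {
      uint64_t bits = 0;
      Effects Union(Effects other) const { return {bits | other.bits}; }
      bool DoesAll() const { return bits == ~uint64_t{0}; }  // Lattice top.
    };

    Effects SummarizeBlock(const std::vector<Effects>& instructions) {
      Effects summary;
      for (const Effects& e : instructions) {
        summary = summary.Union(e);
        if (summary.DoesAll()) {
          break;  // Saturated; further unions add nothing.
        }
      }
      return summary;
    }
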
diff --git a/compiler/optimizing/side_effects_test.cc b/compiler/optimizing/side_effects_test.cc
index 8db5a8a..ec45d6b 100644
--- a/compiler/optimizing/side_effects_test.cc
+++ b/compiler/optimizing/side_effects_test.cc
@@ -77,7 +77,7 @@
   EXPECT_TRUE(all.DoesAnyWrite());
   EXPECT_TRUE(all.DoesAnyRead());
   EXPECT_FALSE(all.DoesNothing());
-  EXPECT_TRUE(all.DoesAll());
+  EXPECT_TRUE(all.DoesAllReadWrite());
 }
 
 TEST(SideEffectsTest, None) {
@@ -85,7 +85,7 @@
   EXPECT_FALSE(none.DoesAnyWrite());
   EXPECT_FALSE(none.DoesAnyRead());
   EXPECT_TRUE(none.DoesNothing());
-  EXPECT_FALSE(none.DoesAll());
+  EXPECT_FALSE(none.DoesAllReadWrite());
 }
 
 TEST(SideEffectsTest, DependencesAndNoDependences) {
@@ -176,33 +176,53 @@
     s = s.Union(SideEffects::FieldReadOfType(type, false));
     s = s.Union(SideEffects::ArrayReadOfType(type));
   }
-  EXPECT_TRUE(s.DoesAll());
+  EXPECT_TRUE(s.DoesAllReadWrite());
+}
+
+TEST(SideEffectsTest, GC) {
+  SideEffects can_trigger_gc = SideEffects::CanTriggerGC();
+  SideEffects depends_on_gc = SideEffects::DependsOnGC();
+  SideEffects all_changes = SideEffects::AllChanges();
+  SideEffects all_dependencies = SideEffects::AllDependencies();
+
+  EXPECT_TRUE(depends_on_gc.MayDependOn(can_trigger_gc));
+  EXPECT_TRUE(depends_on_gc.Union(can_trigger_gc).MayDependOn(can_trigger_gc));
+  EXPECT_FALSE(can_trigger_gc.MayDependOn(depends_on_gc));
+
+  EXPECT_TRUE(depends_on_gc.MayDependOn(all_changes));
+  EXPECT_TRUE(depends_on_gc.Union(can_trigger_gc).MayDependOn(all_changes));
+  EXPECT_FALSE(can_trigger_gc.MayDependOn(all_changes));
+
+  EXPECT_TRUE(all_changes.Includes(can_trigger_gc));
+  EXPECT_FALSE(all_changes.Includes(depends_on_gc));
+  EXPECT_TRUE(all_dependencies.Includes(depends_on_gc));
+  EXPECT_FALSE(all_dependencies.Includes(can_trigger_gc));
 }
 
 TEST(SideEffectsTest, BitStrings) {
   EXPECT_STREQ(
-      "|||||",
+      "|||||||",
       SideEffects::None().ToString().c_str());
   EXPECT_STREQ(
-      "|DFJISCBZL|DFJISCBZL|DFJISCBZL|DFJISCBZL|",
+      "|GC|DFJISCBZL|DFJISCBZL|GC|DFJISCBZL|DFJISCBZL|",
       SideEffects::All().ToString().c_str());
   EXPECT_STREQ(
-      "|||DFJISCBZL|DFJISCBZL|",
+      "|||||DFJISCBZL|DFJISCBZL|",
       SideEffects::AllWrites().ToString().c_str());
   EXPECT_STREQ(
-      "|DFJISCBZL|DFJISCBZL|||",
+      "||DFJISCBZL|DFJISCBZL||||",
       SideEffects::AllReads().ToString().c_str());
   EXPECT_STREQ(
-      "||||L|",
+      "||||||L|",
       SideEffects::FieldWriteOfType(Primitive::kPrimNot, false).ToString().c_str());
   EXPECT_STREQ(
-      "|||Z||",
+      "|||||Z||",
       SideEffects::ArrayWriteOfType(Primitive::kPrimBoolean).ToString().c_str());
   EXPECT_STREQ(
-      "||B|||",
+      "|||B||||",
       SideEffects::FieldReadOfType(Primitive::kPrimByte, false).ToString().c_str());
   EXPECT_STREQ(
-      "|DJ||||",  // note: DJ alias
+      "||DJ|||||",  // note: DJ alias
       SideEffects::ArrayReadOfType(Primitive::kPrimDouble).ToString().c_str());
   SideEffects s = SideEffects::None();
   s = s.Union(SideEffects::FieldWriteOfType(Primitive::kPrimChar, false));
@@ -212,7 +232,7 @@
   s = s.Union(SideEffects::ArrayReadOfType(Primitive::kPrimFloat));
   s = s.Union(SideEffects::ArrayReadOfType(Primitive::kPrimDouble));
   EXPECT_STREQ(
-      "|DFJI|FI|S|DJC|",   // note: DJ/FI alias.
+      "||DFJI|FI||S|DJC|",   // note: DJ/FI alias.
       s.ToString().c_str());
 }
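
Reading the expected strings: the printed form now has six groups between bars, dependency bits first (a GC-dependency flag, then array reads, then field reads, each keyed by primitive kind D F J I S C B Z L), change bits second (a GC-trigger flag, then array writes, then field writes). A small model of the two-halves design these tests imply, with MayDependOn intersecting one side's dependency half against the other's change half (bit layout inferred from the tests, so treat it as a sketch):

    #include <cstdint>

    class Effects {
     public:
      // 19 change bits (9 field writes + 9 array writes + GC trigger),
      // mirrored by 19 dependency bits (reads + GC dependency).
      static constexpr uint64_t kChangeBits = (uint64_t{1} << 19) - 1;

      explicit Effects(uint64_t flags) : flags_(flags) {}
      Effects Union(Effects other) const { return Effects(flags_ | other.flags_); }
      bool Includes(Effects other) const { return (other.flags_ & ~flags_) == 0; }
      bool MayDependOn(Effects other) const {
        // Shift our dependency half down onto the change half of `other`.
        return ((flags_ >> 19) & other.flags_ & kChangeBits) != 0;
      }

     private:
      uint64_t flags_;
    };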
 
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 5d85d11..ef60fef 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -888,7 +888,7 @@
 // Slowpath entered when Thread::Current()->_exception is non-null
 class ArmExceptionSlowPath FINAL : public SlowPath {
  public:
-  explicit ArmExceptionSlowPath(ArmManagedRegister scratch, size_t stack_adjust)
+  ArmExceptionSlowPath(ArmManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {
   }
   void Emit(Assembler *sp_asm) OVERRIDE;
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 05882a3..8e85fe9 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -254,7 +254,7 @@
 
 class Arm64Exception {
  private:
-  explicit Arm64Exception(Arm64ManagedRegister scratch, size_t stack_adjust)
+  Arm64Exception(Arm64ManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {
     }
 
diff --git a/compiler/utils/dedupe_set.h b/compiler/utils/dedupe_set.h
index 8cdb180..2c4a689 100644
--- a/compiler/utils/dedupe_set.h
+++ b/compiler/utils/dedupe_set.h
@@ -99,7 +99,7 @@
     return hashed_key.store_ptr;
   }
 
-  explicit DedupeSet(const char* set_name, SwapAllocator<void>& alloc)
+  DedupeSet(const char* set_name, SwapAllocator<void>& alloc)
       : allocator_(alloc), hash_time_(0) {
     for (HashType i = 0; i < kShard; ++i) {
       std::ostringstream oss;
diff --git a/compiler/utils/dex_cache_arrays_layout.h b/compiler/utils/dex_cache_arrays_layout.h
index 8f98ea1..2a109bd 100644
--- a/compiler/utils/dex_cache_arrays_layout.h
+++ b/compiler/utils/dex_cache_arrays_layout.h
@@ -37,7 +37,7 @@
   }
 
   // Construct a layout for a particular dex file.
-  explicit DexCacheArraysLayout(size_t pointer_size, const DexFile* dex_file);
+  DexCacheArraysLayout(size_t pointer_size, const DexFile* dex_file);
 
   bool Valid() const {
     return Size() != 0u;
diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h
index bb62bca..893daff 100644
--- a/compiler/utils/managed_register.h
+++ b/compiler/utils/managed_register.h
@@ -95,7 +95,7 @@
   explicit ManagedRegisterSpill(const ManagedRegister& other)
       : ManagedRegister(other), size_(-1), spill_offset_(-1) { }
 
-  explicit ManagedRegisterSpill(const ManagedRegister& other, int32_t size)
+  ManagedRegisterSpill(const ManagedRegister& other, int32_t size)
       : ManagedRegister(other), size_(size), spill_offset_(-1) { }
 
   int32_t getSpillOffset() {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 0d1b82c..df95dad 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -283,7 +283,7 @@
 // Slowpath entered when Thread::Current()->_exception is non-null
 class MipsExceptionSlowPath FINAL : public SlowPath {
  public:
-  explicit MipsExceptionSlowPath(MipsManagedRegister scratch, size_t stack_adjust)
+  MipsExceptionSlowPath(MipsManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {}
   virtual void Emit(Assembler *sp_asm) OVERRIDE;
  private:
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 47b146a..31130ea 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -354,7 +354,7 @@
 // Slowpath entered when Thread::Current()->_exception is non-null
 class Mips64ExceptionSlowPath FINAL : public SlowPath {
  public:
-  explicit Mips64ExceptionSlowPath(Mips64ManagedRegister scratch, size_t stack_adjust)
+  Mips64ExceptionSlowPath(Mips64ManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {}
   virtual void Emit(Assembler *sp_asm) OVERRIDE;
  private:
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 8c2a3ed..9b3d792 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -158,6 +158,20 @@
   EmitUint8(0xC8 + dst);
 }
 
+void X86Assembler::bsrl(Register dst, Register src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitRegisterOperand(dst, src);
+}
+
+void X86Assembler::bsrl(Register dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitOperand(dst, src);
+}
+
 void X86Assembler::movzxb(Register dst, ByteRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
@@ -1552,6 +1566,14 @@
 }
 
 
+void X86Assembler::rep_movsw() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0xF3);
+  EmitUint8(0xA5);
+}
+
+
 X86Assembler* X86Assembler::lock() {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF0);
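
On the new encoding: 0xA5 is the string-move opcode (movsl at the default 32-bit operand size), 0xF3 is the REP prefix, and the 0x66 operand-size override narrows the element to 16 bits, which is exactly how the disassembler hunk below distinguishes movsw from movsl. As a sketch, the emitted sequence is just:

    #include <cstdint>
    #include <vector>

    // rep movsw = 66 F3 A5: copies (E)CX 16-bit words from [ESI] to [EDI].
    std::vector<uint8_t> EncodeRepMovsw() {
      return {0x66, 0xF3, 0xA5};
    }
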
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index d9c1b40..a9227f3 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -205,7 +205,7 @@
 
 class X86Assembler FINAL : public Assembler {
  public:
-  explicit X86Assembler() {}
+  X86Assembler() {}
   virtual ~X86Assembler() {}
 
   /*
@@ -234,6 +234,8 @@
   void movntl(const Address& dst, Register src);
 
   void bswapl(Register dst);
+  void bsrl(Register dst, Register src);
+  void bsrl(Register dst, const Address& src);
 
   void movzxb(Register dst, ByteRegister src);
   void movzxb(Register dst, const Address& src);
@@ -470,6 +472,7 @@
   void repne_scasw();
   void repe_cmpsw();
   void repe_cmpsl();
+  void rep_movsw();
 
   X86Assembler* lock();
   void cmpxchgl(const Address& address, Register reg);
@@ -649,7 +652,6 @@
   void EmitComplex(int rm, const Operand& operand, const Immediate& immediate);
   void EmitLabel(Label* label, int instruction_size);
   void EmitLabelLink(Label* label);
-  void EmitNearLabelLink(Label* label);
 
   void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm);
   void EmitGenericShift(int rm, const Operand& operand, Register shifter);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index b664d23..731b5f4 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -218,4 +218,29 @@
   DriverStr(expected, "Repecmpsl");
 }
 
+TEST_F(AssemblerX86Test, RepneScasw) {
+  GetAssembler()->repne_scasw();
+  const char* expected = "repne scasw\n";
+  DriverStr(expected, "repne_scasw");
+}
+
+TEST_F(AssemblerX86Test, RepMovsw) {
+  GetAssembler()->rep_movsw();
+  const char* expected = "rep movsw\n";
+  DriverStr(expected, "rep_movsw");
+}
+
+TEST_F(AssemblerX86Test, Bsrl) {
+  DriverStr(RepeatRR(&x86::X86Assembler::bsrl, "bsrl %{reg2}, %{reg1}"), "bsrl");
+}
+
+TEST_F(AssemblerX86Test, BsrlAddress) {
+  GetAssembler()->bsrl(x86::Register(x86::EDI), x86::Address(
+      x86::Register(x86::EDI), x86::Register(x86::EBX), x86::TIMES_4, 12));
+  const char* expected =
+    "bsrl 0xc(%EDI,%EBX,4), %EDI\n";
+
+  DriverStr(expected, "bsrl_address");
+}
+
 }  // namespace art
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 22e7b9b..dc61c99 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -2006,6 +2006,14 @@
 }
 
 
+void X86_64Assembler::rep_movsw() {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0xF3);
+  EmitUint8(0xA5);
+}
+
+
 X86_64Assembler* X86_64Assembler::lock() {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF0);
@@ -2084,6 +2092,37 @@
   EmitUint8(0xC8 + dst.LowBits());
 }
 
+void X86_64Assembler::bsrl(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+void X86_64Assembler::bsrl(CpuRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitOperand(dst.LowBits(), src);
+}
+
+void X86_64Assembler::bsrq(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+void X86_64Assembler::bsrq(CpuRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xBD);
+  EmitOperand(dst.LowBits(), src);
+}
 
 void X86_64Assembler::repne_scasw() {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
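
bsr (0F BD /r) writes the bit index of the most significant set bit of the source into the destination; the result is undefined for a zero source, so callers must guard that case. On x86-64 the operand width comes from the REX prefix, hence EmitOptionalRex32 for bsrl versus EmitRex64 for bsrq. A sketch of the REX byte those helpers plausibly assemble (they also omit it entirely for 32-bit forms touching no extended register):

    #include <cstdint>

    // REX prefix layout: 0100WRXB. W = 64-bit operand size, R extends the
    // ModRM reg field, X the SIB index, B the ModRM rm/base field.
    uint8_t MakeRex(bool w, bool r, bool x, bool b) {
      return 0x40 | (w << 3) | (r << 2) | (x << 1) | (b << 0);
    }
    // Example: bsrq %rbx, %r10 starts with REX.WR = 0x4C, then 0x0F 0xBD.
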
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index b8e5fb6..da42213 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -332,7 +332,7 @@
   void movq(CpuRegister dst, const Address& src);
   void movl(CpuRegister dst, const Address& src);
   void movq(const Address& dst, CpuRegister src);
-  void movq(const Address& dst, const Immediate& src);
+  void movq(const Address& dst, const Immediate& imm);
   void movl(const Address& dst, CpuRegister src);
   void movl(const Address& dst, const Immediate& imm);
 
@@ -606,10 +606,16 @@
   void bswapl(CpuRegister dst);
   void bswapq(CpuRegister dst);
 
+  void bsrl(CpuRegister dst, CpuRegister src);
+  void bsrl(CpuRegister dst, const Address& src);
+  void bsrq(CpuRegister dst, CpuRegister src);
+  void bsrq(CpuRegister dst, const Address& src);
+
   void repne_scasw();
   void repe_cmpsw();
   void repe_cmpsl();
   void repe_cmpsq();
+  void rep_movsw();
 
   //
   // Macros for High-level operations.
@@ -803,7 +809,6 @@
   void EmitComplex(uint8_t rm, const Operand& operand, const Immediate& immediate);
   void EmitLabel(Label* label, int instruction_size);
   void EmitLabelLink(Label* label);
-  void EmitNearLabelLink(Label* label);
 
   void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm);
   void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 296487e..8673f03 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -836,6 +836,18 @@
   DriverStr(expected, "xorq");
 }
 
+TEST_F(AssemblerX86_64Test, RepneScasw) {
+  GetAssembler()->repne_scasw();
+  const char* expected = "repne scasw\n";
+  DriverStr(expected, "repne_scasw");
+}
+
+TEST_F(AssemblerX86_64Test, RepMovsw) {
+  GetAssembler()->rep_movsw();
+  const char* expected = "rep movsw\n";
+  DriverStr(expected, "rep_movsw");
+}
+
 TEST_F(AssemblerX86_64Test, Movsxd) {
   DriverStr(RepeatRr(&x86_64::X86_64Assembler::movsxd, "movsxd %{reg2}, %{reg1}"), "movsxd");
 }
@@ -1129,6 +1141,44 @@
   DriverStr(RepeatR(&x86_64::X86_64Assembler::bswapq, "bswap %{reg}"), "bswapq");
 }
 
+TEST_F(AssemblerX86_64Test, Bsrl) {
+  DriverStr(Repeatrr(&x86_64::X86_64Assembler::bsrl, "bsrl %{reg2}, %{reg1}"), "bsrl");
+}
+
+TEST_F(AssemblerX86_64Test, BsrlAddress) {
+  GetAssembler()->bsrl(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsrl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsrl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+  const char* expected =
+    "bsrl 0xc(%RDI,%RBX,4), %R10d\n"
+    "bsrl 0xc(%R10,%RBX,4), %edi\n"
+    "bsrl 0xc(%RDI,%R9,4), %edi\n";
+
+  DriverStr(expected, "bsrl_address");
+}
+
+TEST_F(AssemblerX86_64Test, Bsrq) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::bsrq, "bsrq %{reg2}, %{reg1}"), "bsrq");
+}
+
+TEST_F(AssemblerX86_64Test, BsrqAddress) {
+  GetAssembler()->bsrq(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsrq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
+  GetAssembler()->bsrq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
+  const char* expected =
+    "bsrq 0xc(%RDI,%RBX,4), %R10\n"
+    "bsrq 0xc(%R10,%RBX,4), %RDI\n"
+    "bsrq 0xc(%RDI,%R9,4), %RDI\n";
+
+  DriverStr(expected, "bsrq_address");
+}
+
 std::string setcc_test_fn(AssemblerX86_64Test::Base* assembler_test,
                           x86_64::X86_64Assembler* assembler) {
   // From Condition
diff --git a/dexdump/dexdump.cc b/dexdump/dexdump.cc
index 84c465f..282db5d 100644
--- a/dexdump/dexdump.cc
+++ b/dexdump/dexdump.cc
@@ -38,11 +38,14 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#include <iostream>
 #include <memory>
+#include <sstream>
 #include <vector>
 
 #include "dex_file-inl.h"
 #include "dex_instruction-inl.h"
+#include "utils.h"
 
 namespace art {
 
@@ -1046,6 +1049,49 @@
 }
 
 /*
+ * Dumping a CFG. Note that this does duplicate work: utils.h doesn't expose the code-item
+ * version, so DumpMethodCFG has to iterate over the class data again to find it. But dexdump
+ * is a diagnostic tool, so this is not performance-critical.
+ */
+
+static void dumpCfg(const DexFile* dex_file,
+                    uint32_t dex_method_idx,
+                    const DexFile::CodeItem* code_item) {
+  if (code_item != nullptr) {
+    std::ostringstream oss;
+    DumpMethodCFG(dex_file, dex_method_idx, oss);
+    fprintf(gOutFile, "%s", oss.str().c_str());
+  }
+}
+
+static void dumpCfg(const DexFile* dex_file, int idx) {
+  const DexFile::ClassDef& class_def = dex_file->GetClassDef(idx);
+  const uint8_t* class_data = dex_file->GetClassData(class_def);
+  if (class_data == nullptr) {  // empty class such as a marker interface?
+    return;
+  }
+  ClassDataItemIterator it(*dex_file, class_data);
+  while (it.HasNextStaticField()) {
+    it.Next();
+  }
+  while (it.HasNextInstanceField()) {
+    it.Next();
+  }
+  while (it.HasNextDirectMethod()) {
+    dumpCfg(dex_file,
+            it.GetMemberIndex(),
+            it.GetMethodCodeItem());
+    it.Next();
+  }
+  while (it.HasNextVirtualMethod()) {
+    dumpCfg(dex_file,
+            it.GetMemberIndex(),
+            it.GetMethodCodeItem());
+    it.Next();
+  }
+}
+
+/*
  * Dumps the class.
  *
  * Note "idx" is a DexClassDef index, not a DexTypeId index.
@@ -1061,6 +1107,11 @@
     return;
   }
 
+  if (gOptions.cfg) {
+    dumpCfg(pDexFile, idx);
+    return;
+  }
+
   // For the XML output, show the package name.  Ideally we'd gather
   // up the classes, sort them, and dump them alphabetically so the
   // package name wouldn't jump around, but that's not a great plan
diff --git a/dexdump/dexdump.h b/dexdump/dexdump.h
index f2cd16a..50280a9 100644
--- a/dexdump/dexdump.h
+++ b/dexdump/dexdump.h
@@ -45,6 +45,7 @@
   bool showFileHeaders;
   bool showSectionHeaders;
   bool verbose;
+  bool cfg;
   OutputFormat outputFormat;
   const char* outputFileName;
   const char* tempFileName;
diff --git a/dexdump/dexdump_main.cc b/dexdump/dexdump_main.cc
index 9be0922..2466f33 100644
--- a/dexdump/dexdump_main.cc
+++ b/dexdump/dexdump_main.cc
@@ -46,6 +46,7 @@
   fprintf(stderr, " -c : verify checksum and exit\n");
   fprintf(stderr, " -d : disassemble code sections\n");
   fprintf(stderr, " -f : display summary information from file header\n");
+  fprintf(stderr, " -g : dump CFG for dex\n");
   fprintf(stderr, " -h : display file header details\n");
   fprintf(stderr, " -i : ignore checksum failures\n");
   fprintf(stderr, " -l : output layout, either 'plain' or 'xml'\n");
@@ -68,7 +69,7 @@
 
   // Parse all arguments.
   while (1) {
-    const int ic = getopt(argc, argv, "cdfhil:t:o:");
+    const int ic = getopt(argc, argv, "cdfghil:t:o:");
     if (ic < 0) {
       break;  // done
     }
@@ -82,6 +83,9 @@
       case 'f':  // dump outer file header
         gOptions.showFileHeaders = true;
         break;
+      case 'g':  // dump cfg
+        gOptions.cfg = true;
+        break;
       case 'h':  // dump section headers, i.e. all meta-data
         gOptions.showSectionHeaders = true;
         break;
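
Usage note for the new flag: `dexdump -g classes.dex` replaces the normal per-class listing with a control-flow graph for every method, generated by DumpMethodCFG from utils.h; since dumpClass returns early in this mode, nothing else is printed, so redirecting the output to a file for a graph viewer is the likely workflow.
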
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index d1d3481..03fac18 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -201,14 +201,13 @@
 }
 
 struct FpRegister {
-  explicit FpRegister(uint32_t instr, uint16_t at_bit, uint16_t extra_at_bit) {
+  FpRegister(uint32_t instr, uint16_t at_bit, uint16_t extra_at_bit) {
     size = (instr >> 8) & 1;
     uint32_t Vn = (instr >> at_bit) & 0xF;
     uint32_t N = (instr >> extra_at_bit) & 1;
     r = (size != 0 ? ((N << 4) | Vn) : ((Vn << 1) | N));
   }
-  explicit FpRegister(uint32_t instr, uint16_t at_bit, uint16_t extra_at_bit,
-                      uint32_t forced_size) {
+  FpRegister(uint32_t instr, uint16_t at_bit, uint16_t extra_at_bit, uint32_t forced_size) {
     size = forced_size;
     uint32_t Vn = (instr >> at_bit) & 0xF;
     uint32_t N = (instr >> extra_at_bit) & 1;
diff --git a/disassembler/disassembler_mips.h b/disassembler/disassembler_mips.h
index 4f70a9b..b0e49b3 100644
--- a/disassembler/disassembler_mips.h
+++ b/disassembler/disassembler_mips.h
@@ -26,10 +26,11 @@
 
 class DisassemblerMips FINAL : public Disassembler {
  public:
-  explicit DisassemblerMips(DisassemblerOptions* options, bool is64bit) : Disassembler(options),
-      is64bit_(is64bit),
-      last_ptr_(nullptr),
-      last_instr_(0) {}
+  DisassemblerMips(DisassemblerOptions* options, bool is64bit)
+      : Disassembler(options),
+        is64bit_(is64bit),
+        last_ptr_(nullptr),
+        last_instr_(0) {}
 
   size_t Dump(std::ostream& os, const uint8_t* begin) OVERRIDE;
   void Dump(std::ostream& os, const uint8_t* begin, const uint8_t* end) OVERRIDE;
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 44787a7..d4574f4 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -928,6 +928,11 @@
         has_modrm = true;
         load = true;
         break;
+      case 0xBD:
+        opcode1 = "bsr";
+        has_modrm = true;
+        load = true;
+        break;
       case 0xBE:
         opcode1 = "movsxb";
         has_modrm = true;
@@ -1117,6 +1122,9 @@
       opcode1 = opcode_tmp.c_str();
     }
     break;
+  case 0xA5:
+    opcode1 = (prefix[2] == 0x66 ? "movsw" : "movsl");
+    break;
   case 0xA7:
     opcode1 = (prefix[2] == 0x66 ? "cmpsw" : "cmpsl");
     break;
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index b61cfc9..44b78ff 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -107,7 +107,7 @@
     const OatFile* oat_file_;
   };
 
-  explicit OatSymbolizer(const OatFile* oat_file, const std::string& output_name) :
+  OatSymbolizer(const OatFile* oat_file, const std::string& output_name) :
       oat_file_(oat_file), builder_(nullptr),
       output_name_(output_name.empty() ? "symbolized.oat" : output_name) {
   }
@@ -346,7 +346,7 @@
 
 class OatDumper {
  public:
-  explicit OatDumper(const OatFile& oat_file, const OatDumperOptions& options)
+  OatDumper(const OatFile& oat_file, const OatDumperOptions& options)
     : oat_file_(oat_file),
       oat_dex_files_(oat_file.GetOatDexFiles()),
       options_(options),
@@ -1453,8 +1453,8 @@
 
 class ImageDumper {
  public:
-  explicit ImageDumper(std::ostream* os, gc::space::ImageSpace& image_space,
-                       const ImageHeader& image_header, OatDumperOptions* oat_dumper_options)
+  ImageDumper(std::ostream* os, gc::space::ImageSpace& image_space,
+              const ImageHeader& image_header, OatDumperOptions* oat_dumper_options)
       : os_(os),
         vios_(os),
         indent1_(&vios_),
@@ -1618,9 +1618,7 @@
           const size_t pointer_size =
               InstructionSetPointerSize(oat_dumper_->GetOatInstructionSet());
           DumpArtMethodVisitor visitor(this);
-          methods_section.VisitPackedArtMethods(&visitor,
-                                                image_space->Begin(),
-                                                ArtMethod::ObjectSize(pointer_size));
+          methods_section.VisitPackedArtMethods(&visitor, image_space->Begin(), pointer_size);
         }
       }
       // Dump the large objects separately.
@@ -1642,13 +1640,19 @@
     const auto& intern_section = image_header_.GetImageSection(
         ImageHeader::kSectionInternedStrings);
     stats_.header_bytes = header_bytes;
-    size_t alignment_bytes = RoundUp(header_bytes, kObjectAlignment) - header_bytes;
-    stats_.alignment_bytes += alignment_bytes;
+    stats_.alignment_bytes += RoundUp(header_bytes, kObjectAlignment) - header_bytes;
+    // Add padding between the field and method section.
+    // (Field section is 4-byte aligned, method section is 8-byte aligned on 64-bit targets.)
+    stats_.alignment_bytes +=
+        method_section.Offset() - (field_section.Offset() + field_section.Size());
+    // Add padding between the method section and the intern table.
+    // (Method section is 4-byte aligned on 32-bit targets, intern table is 8-byte aligned.)
+    stats_.alignment_bytes +=
+        intern_section.Offset() - (method_section.Offset() + method_section.Size());
     stats_.alignment_bytes += bitmap_section.Offset() - image_header_.GetImageSize();
     stats_.bitmap_bytes += bitmap_section.Size();
     stats_.art_field_bytes += field_section.Size();
-    // RoundUp to 8 bytes to match the intern table alignment expectation.
-    stats_.art_method_bytes += RoundUp(method_section.Size(), sizeof(uint64_t));
+    stats_.art_method_bytes += method_section.Size();
     stats_.interned_strings_bytes += intern_section.Size();
     stats_.Dump(os, indent_os);
     os << "\n";
@@ -1966,7 +1970,7 @@
                                 method_access_flags);
 
       size_t total_size = dex_instruction_bytes + gc_map_bytes + pc_mapping_table_bytes +
-          vmap_table_bytes + quick_oat_code_size + ArtMethod::ObjectSize(image_pointer_size);
+          vmap_table_bytes + quick_oat_code_size + ArtMethod::Size(image_pointer_size);
 
       double expansion =
       static_cast<double>(quick_oat_code_size) / static_cast<double>(dex_instruction_bytes);
@@ -2019,7 +2023,7 @@
     std::vector<double> method_outlier_expansion;
     std::vector<std::pair<std::string, size_t>> oat_dex_file_sizes;
 
-    explicit Stats()
+    Stats()
         : oat_file_bytes(0),
           file_bytes(0),
           header_bytes(0),
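
The revised padding accounting simply measures the gap between consecutive image sections: for adjacent sections A then B, alignment bytes = B.Offset() - (A.Offset() + A.Size()). For example (numbers invented), a field section at offset 0x1000 with size 0x234 followed by a method section at offset 0x1238 contributes 0x1238 - 0x1234 = 4 bytes of padding; the old RoundUp of the method section size then becomes unnecessary, because the gap before the intern table is counted explicitly.
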
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index 6cd391f..d601035 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -480,9 +480,8 @@
 void PatchOat::PatchArtMethods(const ImageHeader* image_header) {
   const auto& section = image_header->GetMethodsSection();
   const size_t pointer_size = InstructionSetPointerSize(isa_);
-  const size_t method_size = ArtMethod::ObjectSize(pointer_size);
   PatchOatArtMethodVisitor visitor(this);
-  section.VisitPackedArtMethods(&visitor, heap_->Begin(), method_size);
+  section.VisitPackedArtMethods(&visitor, heap_->Begin(), pointer_size);
 }
 
 class FixupRootVisitor : public RootVisitor {
diff --git a/runtime/arch/arm64/instruction_set_features_arm64.h b/runtime/arch/arm64/instruction_set_features_arm64.h
index e59ff58..805131f 100644
--- a/runtime/arch/arm64/instruction_set_features_arm64.h
+++ b/runtime/arch/arm64/instruction_set_features_arm64.h
@@ -83,9 +83,7 @@
                                  std::string* error_msg) const OVERRIDE;
 
  private:
-  explicit Arm64InstructionSetFeatures(bool smp,
-                                       bool needs_a53_835769_fix,
-                                       bool needs_a53_843419_fix)
+  Arm64InstructionSetFeatures(bool smp, bool needs_a53_835769_fix, bool needs_a53_843419_fix)
       : InstructionSetFeatures(smp),
         fix_cortex_a53_835769_(needs_a53_835769_fix),
         fix_cortex_a53_843419_(needs_a53_843419_fix) {
diff --git a/runtime/art_method-inl.h b/runtime/art_method-inl.h
index bb3c72c..40bb9e1 100644
--- a/runtime/art_method-inl.h
+++ b/runtime/art_method-inl.h
@@ -63,6 +63,15 @@
   declaring_class_ = GcRoot<mirror::Class>(new_declaring_class);
 }
 
+inline bool ArtMethod::CASDeclaringClass(mirror::Class* expected_class,
+                                         mirror::Class* desired_class) {
+  GcRoot<mirror::Class> expected_root(expected_class);
+  GcRoot<mirror::Class> desired_root(desired_class);
+  return reinterpret_cast<Atomic<GcRoot<mirror::Class>>*>(&declaring_class_)->
+      CompareExchangeStrongSequentiallyConsistent(
+          expected_root, desired_root);
+}
+
 inline uint32_t ArtMethod::GetAccessFlags() {
   DCHECK(IsRuntimeMethod() || GetDeclaringClass()->IsIdxLoaded() ||
          GetDeclaringClass()->IsErroneous());
@@ -497,7 +506,7 @@
 
 inline void ArtMethod::CopyFrom(const ArtMethod* src, size_t image_pointer_size) {
   memcpy(reinterpret_cast<void*>(this), reinterpret_cast<const void*>(src),
-         ObjectSize(image_pointer_size));
+         Size(image_pointer_size));
   declaring_class_ = GcRoot<mirror::Class>(const_cast<ArtMethod*>(src)->GetDeclaringClass());
   dex_cache_resolved_methods_ = GcRoot<mirror::PointerArray>(
       const_cast<ArtMethod*>(src)->GetDexCacheResolvedMethods());
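
CASDeclaringClass makes the declaring-class root updatable with a single atomic compare-and-swap, which matters when several threads may race to install the same forwarded value (the callers sit outside this hunk). The shape of the operation in portable C++, for illustration only, since ART uses its own Atomic<GcRoot<...>> wrapper rather than std::atomic:

    #include <atomic>

    // Illustrative stand-in for GcRoot<mirror::Class>: one word holding a
    // (possibly compressed) class reference.
    struct ClassRoot { void* ref; };

    bool CasRoot(std::atomic<ClassRoot>* root, ClassRoot expected, ClassRoot desired) {
      // Strong, sequentially consistent CAS, mirroring
      // CompareExchangeStrongSequentiallyConsistent above.
      return root->compare_exchange_strong(expected, desired, std::memory_order_seq_cst);
    }
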
diff --git a/runtime/art_method.h b/runtime/art_method.h
index 90352b7..1afd056 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -67,6 +67,9 @@
   void SetDeclaringClass(mirror::Class *new_declaring_class)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  bool CASDeclaringClass(mirror::Class* expected_class, mirror::Class* desired_class)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   static MemberOffset DeclaringClassOffset() {
     return MemberOffset(OFFSETOF_MEMBER(ArtMethod, declaring_class_));
   }
@@ -504,12 +507,19 @@
   bool EqualParameters(Handle<mirror::ObjectArray<mirror::Class>> params)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  // Size of an instance of this object.
-  static size_t ObjectSize(size_t pointer_size) {
+  // Size of an instance of this native class.
+  static size_t Size(size_t pointer_size) {
     return RoundUp(OFFSETOF_MEMBER(ArtMethod, ptr_sized_fields_), pointer_size) +
         (sizeof(PtrSizedFields) / sizeof(void*)) * pointer_size;
   }
 
+  // Alignment of an instance of this native class.
+  static size_t Alignment(size_t pointer_size) {
+    // The ArtMethod alignment is the same as the image pointer size. This differs from
+    // alignof(ArtMethod) if cross-compiling with pointer_size != sizeof(void*).
+    return pointer_size;
+  }
+
   void CopyFrom(const ArtMethod* src, size_t image_pointer_size)
       SHARED_REQUIRES(Locks::mutator_lock_);
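
To make the Size and Alignment arithmetic concrete: Size(p) rounds the offset of ptr_sized_fields_ up to p and then adds one p-sized slot per pointer member, so the same class has different native sizes in 32-bit and 64-bit images, and Alignment(p) = p keeps a cross-compiled 64-bit image at 8-byte method boundaries even when dex2oat itself runs as a 32-bit process. A checkable restatement with made-up layout numbers:

    #include <cstddef>

    constexpr std::size_t RoundUp(std::size_t x, std::size_t n) { return (x + n - 1) / n * n; }

    // Hypothetical values, purely for the arithmetic:
    constexpr std::size_t kPtrFieldsOffset = 20;  // offsetof(ArtMethod, ptr_sized_fields_)
    constexpr std::size_t kNumPtrFields = 3;      // pointers inside PtrSizedFields

    constexpr std::size_t MethodSize(std::size_t pointer_size) {
      return RoundUp(kPtrFieldsOffset, pointer_size) + kNumPtrFields * pointer_size;
    }
    static_assert(MethodSize(4) == 32, "32-bit image layout");
    static_assert(MethodSize(8) == 48, "64-bit image layout");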
 
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index c4b36ee..05c66f0 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -170,7 +170,7 @@
 
 class MemMapArena FINAL : public Arena {
  public:
-  explicit MemMapArena(size_t size, bool low_4gb);
+  MemMapArena(size_t size, bool low_4gb);
   virtual ~MemMapArena();
   void Release() OVERRIDE;
 
diff --git a/runtime/base/arena_containers.h b/runtime/base/arena_containers.h
index d6c4a54..a7aafdf 100644
--- a/runtime/base/arena_containers.h
+++ b/runtime/base/arena_containers.h
@@ -134,7 +134,7 @@
     typedef ArenaAllocatorAdapter<U> other;
   };
 
-  explicit ArenaAllocatorAdapter(ArenaAllocator* arena_allocator, ArenaAllocKind kind)
+  ArenaAllocatorAdapter(ArenaAllocator* arena_allocator, ArenaAllocKind kind)
       : ArenaAllocatorAdapterKind(kind),
         arena_allocator_(arena_allocator) {
   }
diff --git a/runtime/base/bit_utils.h b/runtime/base/bit_utils.h
index 6f45dc8..1da6750 100644
--- a/runtime/base/bit_utils.h
+++ b/runtime/base/bit_utils.h
@@ -158,6 +158,9 @@
 #define DCHECK_ALIGNED(value, alignment) \
   DCHECK(::art::IsAligned<alignment>(value)) << reinterpret_cast<const void*>(value)
 
+#define CHECK_ALIGNED_PARAM(value, alignment) \
+  CHECK(::art::IsAlignedParam(value, alignment)) << reinterpret_cast<const void*>(value)
+
 #define DCHECK_ALIGNED_PARAM(value, alignment) \
   DCHECK(::art::IsAlignedParam(value, alignment)) << reinterpret_cast<const void*>(value)
 
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 2801fb7..848c904 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -437,7 +437,7 @@
 // (Signal) or all at once (Broadcast).
 class ConditionVariable {
  public:
-  explicit ConditionVariable(const char* name, Mutex& mutex);
+  ConditionVariable(const char* name, Mutex& mutex);
   ~ConditionVariable();
 
   void Broadcast(Thread* self);
@@ -475,7 +475,7 @@
 // upon destruction.
 class SCOPED_CAPABILITY MutexLock {
  public:
-  explicit MutexLock(Thread* self, Mutex& mu) ACQUIRE(mu) : self_(self), mu_(mu) {
+  MutexLock(Thread* self, Mutex& mu) ACQUIRE(mu) : self_(self), mu_(mu) {
     mu_.ExclusiveLock(self_);
   }
 
@@ -495,7 +495,7 @@
 // construction and releases it upon destruction.
 class SCOPED_CAPABILITY ReaderMutexLock {
  public:
-  explicit ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) :
+  ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) :
       self_(self), mu_(mu) {
     mu_.SharedLock(self_);
   }
@@ -517,7 +517,7 @@
 // construction and releases it upon destruction.
 class SCOPED_CAPABILITY WriterMutexLock {
  public:
-  explicit WriterMutexLock(Thread* self, ReaderWriterMutex& mu) EXCLUSIVE_LOCK_FUNCTION(mu) :
+  WriterMutexLock(Thread* self, ReaderWriterMutex& mu) EXCLUSIVE_LOCK_FUNCTION(mu) :
       self_(self), mu_(mu) {
     mu_.ExclusiveLock(self_);
   }
diff --git a/runtime/base/timing_logger.h b/runtime/base/timing_logger.h
index e10cd24..a5344db 100644
--- a/runtime/base/timing_logger.h
+++ b/runtime/base/timing_logger.h
@@ -131,7 +131,7 @@
     friend class TimingLogger;
   };
 
-  explicit TimingLogger(const char* name, bool precise, bool verbose);
+  TimingLogger(const char* name, bool precise, bool verbose);
   ~TimingLogger();
   // Verify that all open timings have related closed timings.
   void Verify();
@@ -156,7 +156,7 @@
   // starts and ends.
   class ScopedTiming {
    public:
-    explicit ScopedTiming(const char* label, TimingLogger* logger) : logger_(logger) {
+    ScopedTiming(const char* label, TimingLogger* logger) : logger_(logger) {
       logger_->StartTiming(label);
     }
     ~ScopedTiming() {
diff --git a/runtime/base/unix_file/fd_file.h b/runtime/base/unix_file/fd_file.h
index d51fbd6..f47368b 100644
--- a/runtime/base/unix_file/fd_file.h
+++ b/runtime/base/unix_file/fd_file.h
@@ -35,8 +35,8 @@
   FdFile();
   // Creates an FdFile using the given file descriptor. Takes ownership of the
   // file descriptor. (Use DisableAutoClose to retain ownership.)
-  explicit FdFile(int fd, bool checkUsage);
-  explicit FdFile(int fd, const std::string& path, bool checkUsage);
+  FdFile(int fd, bool checkUsage);
+  FdFile(int fd, const std::string& path, bool checkUsage);
 
   // Destroys an FdFile, closing the file descriptor if Close hasn't already
   // been called. (If you care about the return value of Close, call it
diff --git a/runtime/check_jni.cc b/runtime/check_jni.cc
index 38bc818..4172b89 100644
--- a/runtime/check_jni.cc
+++ b/runtime/check_jni.cc
@@ -130,7 +130,7 @@
 
 class ScopedCheck {
  public:
-  explicit ScopedCheck(int flags, const char* functionName, bool has_method = true)
+  ScopedCheck(int flags, const char* functionName, bool has_method = true)
       : function_name_(functionName), flags_(flags), indent_(0), has_method_(has_method) {
   }
 
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 0886e32..c179c64 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -37,6 +37,7 @@
 #include "base/unix_file/fd_file.h"
 #include "base/value_object.h"
 #include "class_linker-inl.h"
+#include "class_table-inl.h"
 #include "compiler_callbacks.h"
 #include "debugger.h"
 #include "dex_file-inl.h"
@@ -582,6 +583,7 @@
 
   // Setup the ClassLoader, verifying the object_size_.
   class_root = FindSystemClass(self, "Ljava/lang/ClassLoader;");
+  class_root->SetClassLoaderClass();
   CHECK_EQ(class_root->GetObjectSize(), mirror::ClassLoader::InstanceSize());
   SetClassRoot(kJavaLangClassLoader, class_root);
 
@@ -1211,9 +1213,8 @@
   if (!runtime->IsAotCompiler() && runtime->GetInstrumentation()->InterpretOnly()) {
     const ImageHeader& header = space->GetImageHeader();
     const ImageSection& methods = header.GetMethodsSection();
-    const size_t art_method_size = ArtMethod::ObjectSize(image_pointer_size_);
     SetInterpreterEntrypointArtMethodVisitor visitor(image_pointer_size_);
-    methods.VisitPackedArtMethods(&visitor, space->Begin(), art_method_size);
+    methods.VisitPackedArtMethods(&visitor, space->Begin(), image_pointer_size_);
   }
 
   // reinit class_roots_
@@ -1273,15 +1274,10 @@
     // Moving concurrent:
     // Need to make sure to not copy ArtMethods without doing read barriers since the roots are
     // marked concurrently and we don't hold the classlinker_classes_lock_ when we do the copy.
-    boot_class_table_.VisitRoots(visitor, flags);
+    boot_class_table_.VisitRoots(buffered_visitor);
     for (GcRoot<mirror::ClassLoader>& root : class_loaders_) {
       // May be null for boot ClassLoader.
       root.VisitRoot(visitor, RootInfo(kRootVMInternal));
-      ClassTable* const class_table = root.Read()->GetClassTable();
-      if (class_table != nullptr) {
-        // May be null if we have no classes.
-        class_table->VisitRoots(visitor, flags);
-      }
     }
   } else if ((flags & kVisitRootFlagNewRoots) != 0) {
     for (auto& root : new_class_roots_) {
@@ -2297,9 +2293,11 @@
   if (length == 0) {
     return nullptr;
   }
-  auto* ret = new(Runtime::Current()->GetLinearAlloc()->Alloc(
-      self, LengthPrefixedArray<ArtField>::ComputeSize(length))) LengthPrefixedArray<ArtField>(
-          length);
+  // If the ArtField alignment changes, review all uses of LengthPrefixedArray<ArtField>.
+  static_assert(alignof(ArtField) == 4, "ArtField alignment is expected to be 4.");
+  size_t storage_size = LengthPrefixedArray<ArtField>::ComputeSize(length);
+  void* array_storage = Runtime::Current()->GetLinearAlloc()->Alloc(self, storage_size);
+  auto* ret = new(array_storage) LengthPrefixedArray<ArtField>(length);
   CHECK(ret != nullptr);
   std::uninitialized_fill_n(&ret->At(0), length, ArtField());
   return ret;
@@ -2309,13 +2307,15 @@
   if (length == 0) {
     return nullptr;
   }
-  const size_t method_size = ArtMethod::ObjectSize(image_pointer_size_);
-  auto* ret = new (Runtime::Current()->GetLinearAlloc()->Alloc(
-      self, LengthPrefixedArray<ArtMethod>::ComputeSize(length, method_size)))
-          LengthPrefixedArray<ArtMethod>(length);
+  const size_t method_alignment = ArtMethod::Alignment(image_pointer_size_);
+  const size_t method_size = ArtMethod::Size(image_pointer_size_);
+  const size_t storage_size =
+      LengthPrefixedArray<ArtMethod>::ComputeSize(length, method_size, method_alignment);
+  void* array_storage = Runtime::Current()->GetLinearAlloc()->Alloc(self, storage_size);
+  auto* ret = new (array_storage) LengthPrefixedArray<ArtMethod>(length);
   CHECK(ret != nullptr);
   for (size_t i = 0; i < length; ++i) {
-    new(reinterpret_cast<void*>(&ret->At(i, method_size))) ArtMethod;
+    new(reinterpret_cast<void*>(&ret->At(i, method_size, method_alignment))) ArtMethod;
   }
   return ret;
 }
@@ -2810,6 +2810,10 @@
   }
   VerifyObject(klass);
   class_table->InsertWithHash(klass, hash);
+  if (class_loader != nullptr) {
+    // This is necessary because the card must be dirtied for remembered sets.
+    Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(class_loader);
+  }
   if (log_new_class_table_roots_) {
     new_class_roots_.push_back(GcRoot<mirror::Class>(klass));
   }
@@ -4375,6 +4379,11 @@
     klass->SetFinalizable();
   }
 
+  // Inherit the class loader flag from the super class.
+  if (super->IsClassLoaderClass()) {
+    klass->SetClassLoaderClass();
+  }
+
   // Inherit reference flags (if any) from the superclass.
   int reference_flags = (super->GetAccessFlags() & kAccReferenceFlagsMask);
   if (reference_flags != 0) {
@@ -4683,7 +4692,8 @@
   const bool have_interfaces = interfaces.Get() != nullptr;
   const size_t num_interfaces =
       have_interfaces ? interfaces->GetLength() : klass->NumDirectInterfaces();
-  const size_t method_size = ArtMethod::ObjectSize(image_pointer_size_);
+  const size_t method_alignment = ArtMethod::Alignment(image_pointer_size_);
+  const size_t method_size = ArtMethod::Size(image_pointer_size_);
   if (num_interfaces == 0) {
     if (super_ifcount == 0) {
       // Class implements no interfaces.
@@ -4908,7 +4918,7 @@
         // matter which direction we go.  We walk it backward anyway.)
         for (k = input_array_length - 1; k >= 0; --k) {
           ArtMethod* vtable_method = input_virtual_methods != nullptr ?
-              &input_virtual_methods->At(k, method_size) :
+              &input_virtual_methods->At(k, method_size, method_alignment) :
               input_vtable_array->GetElementPtrSize<ArtMethod*>(k, image_pointer_size_);
           ArtMethod* vtable_method_for_name_comparison =
               vtable_method->GetInterfaceMethodIfProxy(image_pointer_size_);
@@ -4969,10 +4979,14 @@
     // where GCs could attempt to mark stale pointers due to memcpy. And since we overwrite the
     // realloced memory with out->CopyFrom, we are guaranteed to have objects in the to space since
     // CopyFrom has internal read barriers.
-    const size_t old_size = old_virtuals != nullptr ?
-        LengthPrefixedArray<ArtMethod>::ComputeSize(old_method_count, method_size) : 0u;
+    const size_t old_size = old_virtuals != nullptr
+        ? LengthPrefixedArray<ArtMethod>::ComputeSize(old_method_count,
+                                                      method_size,
+                                                      method_alignment)
+        : 0u;
     const size_t new_size = LengthPrefixedArray<ArtMethod>::ComputeSize(new_method_count,
-                                                                        method_size);
+                                                                        method_size,
+                                                                        method_alignment);
     auto* virtuals = reinterpret_cast<LengthPrefixedArray<ArtMethod>*>(
         runtime->GetLinearAlloc()->Realloc(self, old_virtuals, old_size, new_size));
     if (UNLIKELY(virtuals == nullptr)) {
@@ -4983,7 +4997,7 @@
     ScopedArenaUnorderedMap<ArtMethod*, ArtMethod*> move_table(allocator.Adapter());
     if (virtuals != old_virtuals) {
       // Maps from heap allocated miranda method to linear alloc miranda method.
-      StrideIterator<ArtMethod> out = virtuals->Begin(method_size);
+      StrideIterator<ArtMethod> out = virtuals->Begin(method_size, method_alignment);
       // Copy over the old methods + miranda methods.
       for (auto& m : klass->GetVirtualMethods(image_pointer_size_)) {
         move_table.emplace(&m, &*out);
@@ -4993,7 +5007,7 @@
         ++out;
       }
     }
-    StrideIterator<ArtMethod> out(virtuals->Begin(method_size) + old_method_count);
+    StrideIterator<ArtMethod> out(virtuals->Begin(method_size, method_alignment) + old_method_count);
     // Copy over miranda methods before copying vtable since CopyOf may cause thread suspension and
     // we want the roots of the miranda methods to get visited.
     for (ArtMethod* mir_method : miranda_methods) {
@@ -5016,7 +5030,7 @@
       self->AssertPendingOOMException();
       return false;
     }
-    out = StrideIterator<ArtMethod>(virtuals->Begin(method_size) + old_method_count);
+    out = virtuals->Begin(method_size, method_alignment) + old_method_count;
     size_t vtable_pos = old_vtable_count;
     for (size_t i = old_method_count; i < new_method_count; ++i) {
       // Leave the declaring class alone as type indices are relative to it
@@ -5887,8 +5901,10 @@
 }
 
 ArtMethod* ClassLinker::CreateRuntimeMethod() {
-  const size_t method_size = ArtMethod::ObjectSize(image_pointer_size_);
-  ArtMethod* method = &AllocArtMethodArray(Thread::Current(), 1)->At(0, method_size);
+  const size_t method_alignment = ArtMethod::Alignment(image_pointer_size_);
+  const size_t method_size = ArtMethod::Size(image_pointer_size_);
+  LengthPrefixedArray<ArtMethod>* method_array = AllocArtMethodArray(Thread::Current(), 1);
+  ArtMethod* method = &method_array->At(0, method_size, method_alignment);
   CHECK(method != nullptr);
   method->SetDexMethodIndex(DexFile::kDexNoIndex);
   CHECK(method->IsRuntimeMethod());
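
Since the method size can now differ from the method alignment, the length-prefixed array helpers take both. A plausible reading of the new signatures, consistent with these call sites: ComputeSize rounds the length header up to the element alignment before adding length * element_size, and At(i, size, alignment) indexes by byte stride from that aligned base (which works because ArtMethod::Size is already a multiple of the alignment). A sketch under those assumptions, not the real LengthPrefixedArray:

    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t RoundUp(std::size_t x, std::size_t n) { return (x + n - 1) / n * n; }

    struct Header { uint32_t length; };

    std::size_t ComputeSize(std::size_t length, std::size_t elem_size, std::size_t alignment) {
      return RoundUp(sizeof(Header), alignment) + length * elem_size;
    }

    void* At(void* storage, std::size_t i, std::size_t elem_size, std::size_t alignment) {
      uint8_t* base = static_cast<uint8_t*>(storage) + RoundUp(sizeof(Header), alignment);
      return base + i * elem_size;  // Stride by the target's method size.
    }
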
diff --git a/runtime/class_table-inl.h b/runtime/class_table-inl.h
new file mode 100644
index 0000000..dc60a2c
--- /dev/null
+++ b/runtime/class_table-inl.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_CLASS_TABLE_INL_H_
+#define ART_RUNTIME_CLASS_TABLE_INL_H_
+
+#include "class_table.h"
+
+namespace art {
+
+template<class Visitor>
+void ClassTable::VisitRoots(Visitor& visitor) {
+  for (ClassSet& class_set : classes_) {
+    for (GcRoot<mirror::Class>& root : class_set) {
+      visitor.VisitRoot(root.AddressWithoutBarrier());
+    }
+  }
+}
+
+template<class Visitor>
+void ClassTable::VisitRoots(const Visitor& visitor) {
+  for (ClassSet& class_set : classes_) {
+    for (GcRoot<mirror::Class>& root : class_set) {
+      visitor.VisitRoot(root.AddressWithoutBarrier());
+    }
+  }
+}
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_CLASS_TABLE_INL_H_
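
The templated VisitRoots lets any object with a compatible VisitRoot member be passed in, buffered or not, without a virtual-dispatch round trip per root. A hypothetical visitor, generic over whatever pointer type AddressWithoutBarrier returns:

    #include <cstddef>

    // Hypothetical root counter; real visitors would read or update the
    // compressed reference behind the pointer.
    struct CountingVisitor {
      std::size_t count = 0;
      template <typename RootPtr>
      void VisitRoot(RootPtr /* root */) { ++count; }
    };

    // Usage sketch: CountingVisitor v; table.VisitRoots(v);
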
diff --git a/runtime/class_table.cc b/runtime/class_table.cc
index c245d4e..fc8e6c4 100644
--- a/runtime/class_table.cc
+++ b/runtime/class_table.cc
@@ -61,16 +61,6 @@
   return existing;
 }
 
-void ClassTable::VisitRoots(RootVisitor* visitor, VisitRootFlags flags ATTRIBUTE_UNUSED) {
-  BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(
-      visitor, RootInfo(kRootStickyClass));
-  for (ClassSet& class_set : classes_) {
-    for (GcRoot<mirror::Class>& root : class_set) {
-      buffered_visitor.VisitRoot(root);
-    }
-  }
-}
-
 bool ClassTable::Visit(ClassVisitor* visitor) {
   for (ClassSet& class_set : classes_) {
     for (GcRoot<mirror::Class>& root : class_set) {
diff --git a/runtime/class_table.h b/runtime/class_table.h
index 4182954..6b18d90 100644
--- a/runtime/class_table.h
+++ b/runtime/class_table.h
@@ -67,8 +67,15 @@
   mirror::Class* UpdateClass(const char* descriptor, mirror::Class* new_klass, size_t hash)
       REQUIRES(Locks::classlinker_classes_lock_) SHARED_REQUIRES(Locks::mutator_lock_);
 
-  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags)
-      REQUIRES(Locks::classlinker_classes_lock_) SHARED_REQUIRES(Locks::mutator_lock_);
+  // NO_THREAD_SAFETY_ANALYSIS because object marking may require the heap bitmap lock.
+  template<class Visitor>
+  void VisitRoots(Visitor& visitor)
+      SHARED_REQUIRES(Locks::classlinker_classes_lock_, Locks::mutator_lock_)
+      NO_THREAD_SAFETY_ANALYSIS;
+  template<class Visitor>
+  void VisitRoots(const Visitor& visitor)
+      SHARED_REQUIRES(Locks::classlinker_classes_lock_, Locks::mutator_lock_)
+      NO_THREAD_SAFETY_ANALYSIS;
 
   // Return false if the callback told us to exit.
   bool Visit(ClassVisitor* visitor)
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 8e60814..dfb7f63 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -3588,10 +3588,10 @@
 
   // Find the dex_pc values that correspond to the current line, for line-based single-stepping.
   struct DebugCallbackContext {
-    explicit DebugCallbackContext(SingleStepControl* single_step_control_cb,
-                                  int32_t line_number_cb, const DexFile::CodeItem* code_item)
-      : single_step_control_(single_step_control_cb), line_number_(line_number_cb),
-        code_item_(code_item), last_pc_valid(false), last_pc(0) {
+    DebugCallbackContext(SingleStepControl* single_step_control_cb,
+                         int32_t line_number_cb, const DexFile::CodeItem* code_item)
+        : single_step_control_(single_step_control_cb), line_number_(line_number_cb),
+          code_item_(code_item), last_pc_valid(false), last_pc(0) {
     }
 
     static bool Callback(void* raw_context, uint32_t address, uint32_t line_number_cb) {
diff --git a/runtime/entrypoints/entrypoint_utils.cc b/runtime/entrypoints/entrypoint_utils.cc
index eaf26bc..eaf33f6 100644
--- a/runtime/entrypoints/entrypoint_utils.cc
+++ b/runtime/entrypoints/entrypoint_utils.cc
@@ -298,7 +298,7 @@
           interface_method->GetArtMethod(), sizeof(void*));
       auto* virtual_methods = proxy_class->GetVirtualMethodsPtr();
       size_t num_virtuals = proxy_class->NumVirtualMethods();
-      size_t method_size = ArtMethod::ObjectSize(sizeof(void*));
+      size_t method_size = ArtMethod::Size(sizeof(void*));
       int throws_index = (reinterpret_cast<uintptr_t>(proxy_method) -
           reinterpret_cast<uintptr_t>(virtual_methods)) / method_size;
       CHECK_LT(throws_index, static_cast<int>(num_virtuals));
diff --git a/runtime/fault_handler.cc b/runtime/fault_handler.cc
index 47f9b1b..c3a9627 100644
--- a/runtime/fault_handler.cc
+++ b/runtime/fault_handler.cc
@@ -331,7 +331,7 @@
   // If we don't have a potential method, we're outta here.
   VLOG(signals) << "potential method: " << method_obj;
   // TODO: Check linear alloc and image.
-  DCHECK_ALIGNED(ArtMethod::ObjectSize(sizeof(void*)), sizeof(void*))
+  DCHECK_ALIGNED(ArtMethod::Size(sizeof(void*)), sizeof(void*))
       << "ArtMethod is not pointer aligned";
   if (method_obj == nullptr || !IsAligned<sizeof(void*)>(method_obj)) {
     VLOG(signals) << "no method";
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index b9e8925..b5ab3d9 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -54,8 +54,7 @@
 
 class ModUnionAddToCardBitmapVisitor {
  public:
-  explicit ModUnionAddToCardBitmapVisitor(ModUnionTable::CardBitmap* bitmap,
-                                          CardTable* card_table)
+  ModUnionAddToCardBitmapVisitor(ModUnionTable::CardBitmap* bitmap, CardTable* card_table)
       : bitmap_(bitmap), card_table_(card_table) {
   }
 
@@ -175,9 +174,9 @@
 
 class AddToReferenceArrayVisitor {
  public:
-  explicit AddToReferenceArrayVisitor(ModUnionTableReferenceCache* mod_union_table,
-                                      std::vector<mirror::HeapReference<Object>*>* references)
-    : mod_union_table_(mod_union_table), references_(references) {
+  AddToReferenceArrayVisitor(ModUnionTableReferenceCache* mod_union_table,
+                             std::vector<mirror::HeapReference<Object>*>* references)
+      : mod_union_table_(mod_union_table), references_(references) {
   }
 
   // Extra parameters are required since we use this same visitor signature for checking objects.
@@ -211,10 +210,10 @@
 
 class ModUnionReferenceVisitor {
  public:
-  explicit ModUnionReferenceVisitor(ModUnionTableReferenceCache* const mod_union_table,
-                                    std::vector<mirror::HeapReference<Object>*>* references)
-    : mod_union_table_(mod_union_table),
-      references_(references) {
+  ModUnionReferenceVisitor(ModUnionTableReferenceCache* const mod_union_table,
+                           std::vector<mirror::HeapReference<Object>*>* references)
+      : mod_union_table_(mod_union_table),
+        references_(references) {
   }
 
   void operator()(Object* obj) const
@@ -231,10 +230,10 @@
 
 class CheckReferenceVisitor {
  public:
-  explicit CheckReferenceVisitor(ModUnionTableReferenceCache* mod_union_table,
-                                 const std::set<const Object*>& references)
-    : mod_union_table_(mod_union_table),
-      references_(references) {
+  CheckReferenceVisitor(ModUnionTableReferenceCache* mod_union_table,
+                        const std::set<const Object*>& references)
+      : mod_union_table_(mod_union_table),
+        references_(references) {
   }
 
   // Extra parameters are required since we use this same visitor signature for checking objects.
@@ -277,8 +276,8 @@
 
 class ModUnionCheckReferences {
  public:
-  explicit ModUnionCheckReferences(ModUnionTableReferenceCache* mod_union_table,
-                                   const std::set<const Object*>& references)
+  ModUnionCheckReferences(ModUnionTableReferenceCache* mod_union_table,
+                          const std::set<const Object*>& references)
       REQUIRES(Locks::heap_bitmap_lock_)
       : mod_union_table_(mod_union_table), references_(references) {
   }
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index ec689f8..220c06e 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -181,7 +181,7 @@
 // Used to switch the thread roots of a thread from from-space refs to to-space refs.
 class ThreadFlipVisitor : public Closure {
  public:
-  explicit ThreadFlipVisitor(ConcurrentCopying* concurrent_copying, bool use_tlab)
+  ThreadFlipVisitor(ConcurrentCopying* concurrent_copying, bool use_tlab)
       : concurrent_copying_(concurrent_copying), use_tlab_(use_tlab) {
   }
 
@@ -229,7 +229,7 @@
     CHECK(thread == self);
     Locks::mutator_lock_->AssertExclusiveHeld(self);
     cc->region_space_->SetFromSpace(cc->rb_table_, cc->force_evacuate_all_);
-    cc->SwapStacks(self);
+    cc->SwapStacks();
     if (ConcurrentCopying::kEnableFromSpaceAccountingCheck) {
       cc->RecordLiveStackFreezeSize(self);
       cc->from_space_num_objects_at_first_pause_ = cc->region_space_->GetObjectsAllocated();
@@ -275,8 +275,8 @@
   }
 }
 
-void ConcurrentCopying::SwapStacks(Thread* self) {
-  heap_->SwapStacks(self);
+void ConcurrentCopying::SwapStacks() {
+  heap_->SwapStacks();
 }
 
 void ConcurrentCopying::RecordLiveStackFreezeSize(Thread* self) {
@@ -817,8 +817,8 @@
 
 class RevokeThreadLocalMarkStackCheckpoint : public Closure {
  public:
-  explicit RevokeThreadLocalMarkStackCheckpoint(ConcurrentCopying* concurrent_copying,
-                                                bool disable_weak_ref_access)
+  RevokeThreadLocalMarkStackCheckpoint(ConcurrentCopying* concurrent_copying,
+                                       bool disable_weak_ref_access)
       : concurrent_copying_(concurrent_copying),
         disable_weak_ref_access_(disable_weak_ref_access) {
   }
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index a4fd71c..f382448 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -172,7 +172,7 @@
   mirror::Object* GetFwdPtr(mirror::Object* from_ref)
       SHARED_REQUIRES(Locks::mutator_lock_);
   void FlipThreadRoots() REQUIRES(!Locks::mutator_lock_);
-  void SwapStacks(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_);
+  void SwapStacks() SHARED_REQUIRES(Locks::mutator_lock_);
   void RecordLiveStackFreezeSize(Thread* self);
   void ComputeUnevacFromSpaceLiveRatio();
   void LogFromSpaceRefHolder(mirror::Object* obj, MemberOffset offset)
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index 4b2c588..94ffe6e 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -191,7 +191,7 @@
     heap_->RevokeAllThreadLocalAllocationStacks(self);
   }
   t.NewTiming("SwapStacks");
-  heap_->SwapStacks(self);
+  heap_->SwapStacks();
   {
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
     MarkRoots();
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 7f2c204..b0a8a5b 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -190,7 +190,7 @@
   {
     TimingLogger::ScopedTiming t2("SwapStacks", GetTimings());
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    heap_->SwapStacks(self);
+    heap_->SwapStacks();
     live_stack_freeze_size_ = heap_->GetLiveStack()->Size();
     // Need to revoke all the thread local allocation stacks since we just swapped the allocation
     // stacks and don't want anybody to allocate into the live stack.
@@ -648,8 +648,8 @@
  protected:
   class MarkObjectParallelVisitor {
    public:
-    ALWAYS_INLINE explicit MarkObjectParallelVisitor(MarkStackTask<kUseFinger>* chunk_task,
-                                                     MarkSweep* mark_sweep)
+    ALWAYS_INLINE MarkObjectParallelVisitor(MarkStackTask<kUseFinger>* chunk_task,
+                                            MarkSweep* mark_sweep)
         : chunk_task_(chunk_task), mark_sweep_(mark_sweep) {}
 
     void operator()(mirror::Object* obj, MemberOffset offset, bool /* static */) const
@@ -1058,8 +1058,8 @@
 
 class CheckpointMarkThreadRoots : public Closure, public RootVisitor {
  public:
-  explicit CheckpointMarkThreadRoots(MarkSweep* mark_sweep,
-                                     bool revoke_ros_alloc_thread_local_buffers_at_checkpoint)
+  CheckpointMarkThreadRoots(MarkSweep* mark_sweep,
+                            bool revoke_ros_alloc_thread_local_buffers_at_checkpoint)
       : mark_sweep_(mark_sweep),
         revoke_ros_alloc_thread_local_buffers_at_checkpoint_(
             revoke_ros_alloc_thread_local_buffers_at_checkpoint) {
@@ -1276,7 +1276,7 @@
   explicit MarkVisitor(MarkSweep* const mark_sweep) ALWAYS_INLINE : mark_sweep_(mark_sweep) {
   }
 
-  void operator()(mirror::Object* obj, MemberOffset offset, bool /* is_static */) const
+  void operator()(mirror::Object* obj, MemberOffset offset, bool is_static ATTRIBUTE_UNUSED) const
       ALWAYS_INLINE SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(Locks::heap_bitmap_lock_) {
     if (kCheckLocks) {
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 606be63..8bd1dc7 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -54,7 +54,7 @@
 
 class MarkSweep : public GarbageCollector {
  public:
-  explicit MarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
+  MarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
 
   ~MarkSweep() {}
 
diff --git a/runtime/gc/collector/partial_mark_sweep.h b/runtime/gc/collector/partial_mark_sweep.h
index 7b69bce..e9b4f6f 100644
--- a/runtime/gc/collector/partial_mark_sweep.h
+++ b/runtime/gc/collector/partial_mark_sweep.h
@@ -30,7 +30,7 @@
     return kGcTypePartial;
   }
 
-  explicit PartialMarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
+  PartialMarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
   ~PartialMarkSweep() {}
 
  protected:
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 63def24..fc2a801 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -226,7 +226,7 @@
     TimingLogger::ScopedTiming t2("RevokeAllThreadLocalAllocationStacks", GetTimings());
     heap_->RevokeAllThreadLocalAllocationStacks(self_);
   }
-  heap_->SwapStacks(self_);
+  heap_->SwapStacks();
   {
     WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
     MarkRoots();
@@ -614,7 +614,9 @@
   for (size_t i = 0; i < count; ++i) {
     auto* root = roots[i];
     auto ref = StackReference<mirror::Object>::FromMirrorPtr(*root);
-    MarkObject(&ref);
+    // The root can be in the to-space since we may visit the declaring class of an ArtMethod
+    // multiple times if it is on the call stack.
+    MarkObjectIfNotInToSpace(&ref);
     if (*root != ref.AsMirrorPtr()) {
       *root = ref.AsMirrorPtr();
     }
@@ -624,7 +626,7 @@
 void SemiSpace::VisitRoots(mirror::CompressedReference<mirror::Object>** roots, size_t count,
                            const RootInfo& info ATTRIBUTE_UNUSED) {
   for (size_t i = 0; i < count; ++i) {
-    MarkObject(roots[i]);
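+    // As above, the root may already point into the to-space.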
+    MarkObjectIfNotInToSpace(roots[i]);
   }
 }
 
diff --git a/runtime/gc/collector/sticky_mark_sweep.h b/runtime/gc/collector/sticky_mark_sweep.h
index e8e70de..e8f0672 100644
--- a/runtime/gc/collector/sticky_mark_sweep.h
+++ b/runtime/gc/collector/sticky_mark_sweep.h
@@ -30,7 +30,7 @@
     return kGcTypeSticky;
   }
 
-  explicit StickyMarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
+  StickyMarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix = "");
   ~StickyMarkSweep() {}
 
  protected:
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index cb750eb..efa065b 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -39,8 +39,10 @@
 namespace gc {
 
 template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor>
-inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Class* klass,
-                                                      size_t byte_count, AllocatorType allocator,
+inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
+                                                      mirror::Class* klass,
+                                                      size_t byte_count,
+                                                      AllocatorType allocator,
                                                       const PreFenceVisitor& pre_fence_visitor) {
   if (kIsDebugBuild) {
     CheckPreconditionsForAllocObject(klass, byte_count);
@@ -65,7 +67,6 @@
     // non moving space). This can happen if there is significant virtual address space
     // fragmentation.
   }
-  AllocationTimer alloc_timer(this, &obj);
   // bytes allocated for the (individual) object.
   size_t bytes_allocated;
   size_t usable_size;
@@ -209,7 +210,8 @@
 }
 
 template <bool kInstrumented, typename PreFenceVisitor>
-inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class** klass,
+inline mirror::Object* Heap::AllocLargeObject(Thread* self,
+                                              mirror::Class** klass,
                                               size_t byte_count,
                                               const PreFenceVisitor& pre_fence_visitor) {
   // Save and restore the class in case it moves.
@@ -221,11 +223,14 @@
 }
 
 template <const bool kInstrumented, const bool kGrow>
-inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator_type,
-                                           size_t alloc_size, size_t* bytes_allocated,
+inline mirror::Object* Heap::TryToAllocate(Thread* self,
+                                           AllocatorType allocator_type,
+                                           size_t alloc_size,
+                                           size_t* bytes_allocated,
                                            size_t* usable_size,
                                            size_t* bytes_tl_bulk_allocated) {
-  if (allocator_type != kAllocatorTypeTLAB && allocator_type != kAllocatorTypeRegionTLAB &&
+  if (allocator_type != kAllocatorTypeTLAB &&
+      allocator_type != kAllocatorTypeRegionTLAB &&
       allocator_type != kAllocatorTypeRosAlloc &&
       UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
     return nullptr;
@@ -380,21 +385,6 @@
   return ret;
 }
 
-inline Heap::AllocationTimer::AllocationTimer(Heap* heap, mirror::Object** allocated_obj_ptr)
-    : heap_(heap), allocated_obj_ptr_(allocated_obj_ptr),
-      allocation_start_time_(kMeasureAllocationTime ? NanoTime() / kTimeAdjust : 0u) { }
-
-inline Heap::AllocationTimer::~AllocationTimer() {
-  if (kMeasureAllocationTime) {
-    mirror::Object* allocated_obj = *allocated_obj_ptr_;
-    // Only if the allocation succeeded, record the time.
-    if (allocated_obj != nullptr) {
-      uint64_t allocation_end_time = NanoTime() / kTimeAdjust;
-      heap_->total_allocation_time_.FetchAndAddSequentiallyConsistent(allocation_end_time - allocation_start_time_);
-    }
-  }
-}
-
 inline bool Heap::ShouldAllocLargeObject(mirror::Class* c, size_t byte_count) const {
   // We need to have a zygote space or else our newly allocated large object can end up in the
   // Zygote resulting in it being prematurely freed.
@@ -423,7 +413,8 @@
   return false;
 }
 
-inline void Heap::CheckConcurrentGC(Thread* self, size_t new_num_bytes_allocated,
+inline void Heap::CheckConcurrentGC(Thread* self,
+                                    size_t new_num_bytes_allocated,
                                     mirror::Object** obj) {
   if (UNLIKELY(new_num_bytes_allocated >= concurrent_start_bytes_)) {
     RequestConcurrentGCAndSaveObject(self, false, obj);
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index e9d9065..e56351f 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -113,18 +113,34 @@
 // timeout on how long we wait for finalizers to run. b/21544853
 static constexpr uint64_t kNativeAllocationFinalizeTimeout = MsToNs(250u);
 
-Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
-           double target_utilization, double foreground_heap_growth_multiplier,
-           size_t capacity, size_t non_moving_space_capacity, const std::string& image_file_name,
-           const InstructionSet image_instruction_set, CollectorType foreground_collector_type,
+Heap::Heap(size_t initial_size,
+           size_t growth_limit,
+           size_t min_free,
+           size_t max_free,
+           double target_utilization,
+           double foreground_heap_growth_multiplier,
+           size_t capacity,
+           size_t non_moving_space_capacity,
+           const std::string& image_file_name,
+           const InstructionSet image_instruction_set,
+           CollectorType foreground_collector_type,
            CollectorType background_collector_type,
-           space::LargeObjectSpaceType large_object_space_type, size_t large_object_threshold,
-           size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
-           size_t long_pause_log_threshold, size_t long_gc_log_threshold,
-           bool ignore_max_footprint, bool use_tlab,
-           bool verify_pre_gc_heap, bool verify_pre_sweeping_heap, bool verify_post_gc_heap,
-           bool verify_pre_gc_rosalloc, bool verify_pre_sweeping_rosalloc,
-           bool verify_post_gc_rosalloc, bool gc_stress_mode,
+           space::LargeObjectSpaceType large_object_space_type,
+           size_t large_object_threshold,
+           size_t parallel_gc_threads,
+           size_t conc_gc_threads,
+           bool low_memory_mode,
+           size_t long_pause_log_threshold,
+           size_t long_gc_log_threshold,
+           bool ignore_max_footprint,
+           bool use_tlab,
+           bool verify_pre_gc_heap,
+           bool verify_pre_sweeping_heap,
+           bool verify_post_gc_heap,
+           bool verify_pre_gc_rosalloc,
+           bool verify_pre_sweeping_rosalloc,
+           bool verify_post_gc_rosalloc,
+           bool gc_stress_mode,
            bool use_homogeneous_space_compaction_for_oom,
            uint64_t min_interval_homogeneous_space_compaction_by_oom)
     : non_moving_space_(nullptr),
@@ -189,7 +205,6 @@
       target_utilization_(target_utilization),
       foreground_heap_growth_multiplier_(foreground_heap_growth_multiplier),
       total_wait_time_(0),
-      total_allocation_time_(0),
       verify_object_mode_(kVerifyObjectModeDisabled),
       disable_moving_gc_count_(0),
       is_running_on_memory_tool_(Runtime::Current()->IsRunningOnMemoryTool()),
@@ -214,7 +229,8 @@
       alloc_tracking_enabled_(false),
       backtrace_lock_(nullptr),
       seen_backtrace_count_(0u),
-      unique_backtrace_count_(0u) {
+      unique_backtrace_count_(0u),
+      gc_disabled_for_shutdown_(false) {
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
   }
@@ -526,8 +542,10 @@
   }
 }
 
-MemMap* Heap::MapAnonymousPreferredAddress(const char* name, uint8_t* request_begin,
-                                           size_t capacity, std::string* out_error_str) {
+MemMap* Heap::MapAnonymousPreferredAddress(const char* name,
+                                           uint8_t* request_begin,
+                                           size_t capacity,
+                                           std::string* out_error_str) {
   while (true) {
     MemMap* map = MemMap::MapAnonymous(name, request_begin, capacity,
                                        PROT_READ | PROT_WRITE, true, false, out_error_str);
@@ -543,9 +561,12 @@
   return foreground_collector_type_ == type || background_collector_type_ == type;
 }
 
-space::MallocSpace* Heap::CreateMallocSpaceFromMemMap(MemMap* mem_map, size_t initial_size,
-                                                      size_t growth_limit, size_t capacity,
-                                                      const char* name, bool can_move_objects) {
+space::MallocSpace* Heap::CreateMallocSpaceFromMemMap(MemMap* mem_map,
+                                                      size_t initial_size,
+                                                      size_t growth_limit,
+                                                      size_t capacity,
+                                                      const char* name,
+                                                      bool can_move_objects) {
   space::MallocSpace* malloc_space = nullptr;
   if (kUseRosAlloc) {
     // Create rosalloc space.
@@ -960,8 +981,6 @@
     total_paused_time += collector->GetTotalPausedTimeNs();
     collector->DumpPerformanceInfo(os);
   }
-  uint64_t allocation_time =
-      static_cast<uint64_t>(total_allocation_time_.LoadRelaxed()) * kTimeAdjust;
   if (total_duration != 0) {
     const double total_seconds = static_cast<double>(total_duration / 1000) / 1000000.0;
     os << "Total time spent in GC: " << PrettyDuration(total_duration) << "\n";
@@ -979,11 +998,6 @@
   os << "Free memory until OOME " << PrettySize(GetFreeMemoryUntilOOME()) << "\n";
   os << "Total memory " << PrettySize(GetTotalMemory()) << "\n";
   os << "Max memory " << PrettySize(GetMaxMemory()) << "\n";
-  if (kMeasureAllocationTime) {
-    os << "Total time spent allocating: " << PrettyDuration(allocation_time) << "\n";
-    os << "Mean allocation time: " << PrettyDuration(allocation_time / total_objects_allocated)
-       << "\n";
-  }
   if (HasZygoteSpace()) {
     os << "Zygote space size " << PrettySize(zygote_space_->Size()) << "\n";
   }
@@ -1016,7 +1030,6 @@
   for (auto& collector : garbage_collectors_) {
     collector->ResetMeasurements();
   }
-  total_allocation_time_.StoreRelaxed(0);
   total_bytes_freed_ever_ = 0;
   total_objects_freed_ever_ = 0;
   total_wait_time_ = 0;
@@ -1494,8 +1507,10 @@
   return nullptr;
 }
 
-mirror::Object* Heap::AllocateInternalWithGc(Thread* self, AllocatorType allocator,
-                                             size_t alloc_size, size_t* bytes_allocated,
+mirror::Object* Heap::AllocateInternalWithGc(Thread* self,
+                                             AllocatorType allocator,
+                                             size_t alloc_size,
+                                             size_t* bytes_allocated,
                                              size_t* usable_size,
                                              size_t* bytes_tl_bulk_allocated,
                                              mirror::Class** klass) {
@@ -1694,10 +1709,12 @@
 
 class InstanceCounter {
  public:
-  InstanceCounter(const std::vector<mirror::Class*>& classes, bool use_is_assignable_from, uint64_t* counts)
+  InstanceCounter(const std::vector<mirror::Class*>& classes,
+                  bool use_is_assignable_from,
+                  uint64_t* counts)
       SHARED_REQUIRES(Locks::mutator_lock_)
-      : classes_(classes), use_is_assignable_from_(use_is_assignable_from), counts_(counts) {
-  }
+      : classes_(classes), use_is_assignable_from_(use_is_assignable_from), counts_(counts) {}
+
   static void Callback(mirror::Object* obj, void* arg)
       SHARED_REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
     InstanceCounter* instance_counter = reinterpret_cast<InstanceCounter*>(arg);
@@ -1753,7 +1770,8 @@
   DISALLOW_COPY_AND_ASSIGN(InstanceCollector);
 };
 
-void Heap::GetInstances(mirror::Class* c, int32_t max_count,
+void Heap::GetInstances(mirror::Class* c,
+                        int32_t max_count,
                         std::vector<mirror::Object*>& instances) {
   InstanceCollector collector(c, max_count, instances);
   VisitObjects(&InstanceCollector::Callback, &collector);
@@ -1761,7 +1779,8 @@
 
 class ReferringObjectsFinder {
  public:
-  ReferringObjectsFinder(mirror::Object* object, int32_t max_count,
+  ReferringObjectsFinder(mirror::Object* object,
+                         int32_t max_count,
                          std::vector<mirror::Object*>& referring_objects)
       SHARED_REQUIRES(Locks::mutator_lock_)
       : object_(object), max_count_(max_count), referring_objects_(referring_objects) {
@@ -2081,8 +2100,7 @@
 // Special compacting collector which uses sub-optimal bin packing to reduce zygote space size.
 class ZygoteCompactingCollector FINAL : public collector::SemiSpace {
  public:
-  explicit ZygoteCompactingCollector(gc::Heap* heap,
-                                     bool is_running_on_memory_tool)
+  ZygoteCompactingCollector(gc::Heap* heap, bool is_running_on_memory_tool)
       : SemiSpace(heap, false, "zygote collector"),
         bin_live_bitmap_(nullptr),
         bin_mark_bitmap_(nullptr),
@@ -2135,10 +2153,9 @@
     }
   }
 
-  virtual bool ShouldSweepSpace(space::ContinuousSpace* space) const {
+  virtual bool ShouldSweepSpace(space::ContinuousSpace* space ATTRIBUTE_UNUSED) const {
     // Don't sweep any spaces since we probably blasted the internal accounting of the free list
     // allocator.
-    UNUSED(space);
     return false;
   }
 
@@ -2380,7 +2397,8 @@
   }
 }
 
-collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCause gc_cause,
+collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
+                                               GcCause gc_cause,
                                                bool clear_soft_references) {
   Thread* self = Thread::Current();
   Runtime* runtime = Runtime::Current();
@@ -2417,6 +2435,9 @@
       LOG(WARNING) << "Skipping GC due to disable moving GC count " << disable_moving_gc_count_;
       return collector::kGcTypeNone;
     }
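+    // GC is permanently disabled while the runtime shuts down (see DisableGCForShutdown).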
+    if (gc_disabled_for_shutdown_) {
+      return collector::kGcTypeNone;
+    }
     collector_type_running_ = collector_type_;
   }
   if (gc_cause == kGcCauseForAlloc && runtime->HasStatsEnabled()) {
@@ -2608,7 +2629,7 @@
 // Verify a reference from an object.
 class VerifyReferenceVisitor : public SingleRootVisitor {
  public:
-  explicit VerifyReferenceVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
+  VerifyReferenceVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
       SHARED_REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_)
       : heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {}
 
@@ -2758,9 +2779,8 @@
 // Verify all references within an object, for use with HeapBitmap::Visit.
 class VerifyObjectVisitor {
  public:
-  explicit VerifyObjectVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
-      : heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {
-  }
+  VerifyObjectVisitor(Heap* heap, Atomic<size_t>* fail_count, bool verify_referent)
+      : heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {}
 
   void operator()(mirror::Object* obj)
       SHARED_REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_) {
@@ -2980,8 +3000,7 @@
   return !visitor.Failed();
 }
 
-void Heap::SwapStacks(Thread* self) {
-  UNUSED(self);
+void Heap::SwapStacks() {
   if (kUseThreadLocalAllocationStack) {
     live_stack_->AssertAllZero();
   }
@@ -3034,7 +3053,9 @@
   return it->second;
 }
 
-void Heap::ProcessCards(TimingLogger* timings, bool use_rem_sets, bool process_alloc_space_cards,
+void Heap::ProcessCards(TimingLogger* timings,
+                        bool use_rem_sets,
+                        bool process_alloc_space_cards,
                         bool clear_alloc_space_cards) {
   TimingLogger::ScopedTiming t(__FUNCTION__, timings);
   // Clear cards and keep track of cards cleared in the mod-union table.
@@ -3094,11 +3115,11 @@
   if (verify_missing_card_marks_) {
     TimingLogger::ScopedTiming t2("(Paused)PreGcVerifyMissingCardMarks", timings);
     ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-    SwapStacks(self);
+    SwapStacks();
     // Sort the live stack so that we can quickly binary search it later.
     CHECK(VerifyMissingCardMarks()) << "Pre " << gc->GetName()
                                     << " missing card mark verification failed\n" << DumpSpaces();
-    SwapStacks(self);
+    SwapStacks();
   }
   if (verify_mod_union_table_) {
     TimingLogger::ScopedTiming t2("(Paused)PreGcVerifyModUnionTables", timings);
@@ -3119,8 +3140,7 @@
   }
 }
 
-void Heap::PrePauseRosAllocVerification(collector::GarbageCollector* gc) {
-  UNUSED(gc);
+void Heap::PrePauseRosAllocVerification(collector::GarbageCollector* gc ATTRIBUTE_UNUSED) {
   // TODO: Add a new runtime option for this?
   if (verify_pre_gc_rosalloc_) {
     RosAllocVerification(current_gc_iteration_.GetTimings(), "PreGcRosAllocVerification");
@@ -3428,8 +3448,8 @@
 
 class Heap::ConcurrentGCTask : public HeapTask {
  public:
-  explicit ConcurrentGCTask(uint64_t target_time, bool force_full)
-    : HeapTask(target_time), force_full_(force_full) { }
+  ConcurrentGCTask(uint64_t target_time, bool force_full)
+      : HeapTask(target_time), force_full_(force_full) {}
   virtual void Run(Thread* self) OVERRIDE {
     gc::Heap* heap = Runtime::Current()->GetHeap();
     heap->ConcurrentGC(self, force_full_);
@@ -3486,7 +3506,8 @@
 
 class Heap::CollectorTransitionTask : public HeapTask {
  public:
-  explicit CollectorTransitionTask(uint64_t target_time) : HeapTask(target_time) { }
+  explicit CollectorTransitionTask(uint64_t target_time) : HeapTask(target_time) {}
+
   virtual void Run(Thread* self) OVERRIDE {
     gc::Heap* heap = Runtime::Current()->GetHeap();
     heap->DoPendingCollectorTransition();
@@ -3830,5 +3851,12 @@
   }
 }
 
+void Heap::DisableGCForShutdown() {
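+  // Only legal once the runtime has begun shutting down; daemon threads may still be allocating.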
+  Thread* const self = Thread::Current();
+  CHECK(Runtime::Current()->IsShuttingDown(self));
+  MutexLock mu(self, *gc_complete_lock_);
+  gc_disabled_for_shutdown_ = true;
+}
+
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 09c18b8..d94f109 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -96,11 +96,7 @@
 class AgeCardVisitor {
  public:
   uint8_t operator()(uint8_t card) const {
-    if (card == accounting::CardTable::kCardDirty) {
-      return card - 1;
-    } else {
-      return 0;
-    }
+    return (card == accounting::CardTable::kCardDirty) ? card - 1 : 0;
   }
 };
 
@@ -130,7 +126,6 @@
 class Heap {
  public:
-  // If true, measure the total allocation time.
-  static constexpr bool kMeasureAllocationTime = false;
   static constexpr size_t kDefaultStartingSize = kPageSize;
   static constexpr size_t kDefaultInitialSize = 2 * MB;
   static constexpr size_t kDefaultMaximumSize = 256 * MB;
@@ -165,28 +160,44 @@
   // Create a heap with the requested sizes. The possible empty
   // image_file_names names specify Spaces to load based on
   // ImageWriter output.
-  explicit Heap(size_t initial_size, size_t growth_limit, size_t min_free,
-                size_t max_free, double target_utilization,
-                double foreground_heap_growth_multiplier, size_t capacity,
-                size_t non_moving_space_capacity,
-                const std::string& original_image_file_name,
-                InstructionSet image_instruction_set,
-                CollectorType foreground_collector_type, CollectorType background_collector_type,
-                space::LargeObjectSpaceType large_object_space_type, size_t large_object_threshold,
-                size_t parallel_gc_threads, size_t conc_gc_threads, bool low_memory_mode,
-                size_t long_pause_threshold, size_t long_gc_threshold,
-                bool ignore_max_footprint, bool use_tlab,
-                bool verify_pre_gc_heap, bool verify_pre_sweeping_heap, bool verify_post_gc_heap,
-                bool verify_pre_gc_rosalloc, bool verify_pre_sweeping_rosalloc,
-                bool verify_post_gc_rosalloc, bool gc_stress_mode,
-                bool use_homogeneous_space_compaction,
-                uint64_t min_interval_homogeneous_space_compaction_by_oom);
+  Heap(size_t initial_size,
+       size_t growth_limit,
+       size_t min_free,
+       size_t max_free,
+       double target_utilization,
+       double foreground_heap_growth_multiplier,
+       size_t capacity,
+       size_t non_moving_space_capacity,
+       const std::string& original_image_file_name,
+       InstructionSet image_instruction_set,
+       CollectorType foreground_collector_type,
+       CollectorType background_collector_type,
+       space::LargeObjectSpaceType large_object_space_type,
+       size_t large_object_threshold,
+       size_t parallel_gc_threads,
+       size_t conc_gc_threads,
+       bool low_memory_mode,
+       size_t long_pause_threshold,
+       size_t long_gc_threshold,
+       bool ignore_max_footprint,
+       bool use_tlab,
+       bool verify_pre_gc_heap,
+       bool verify_pre_sweeping_heap,
+       bool verify_post_gc_heap,
+       bool verify_pre_gc_rosalloc,
+       bool verify_pre_sweeping_rosalloc,
+       bool verify_post_gc_rosalloc,
+       bool gc_stress_mode,
+       bool use_homogeneous_space_compaction,
+       uint64_t min_interval_homogeneous_space_compaction_by_oom);
 
   ~Heap();
 
   // Allocates and initializes storage for an object instance.
   template <bool kInstrumented, typename PreFenceVisitor>
-  mirror::Object* AllocObject(Thread* self, mirror::Class* klass, size_t num_bytes,
+  mirror::Object* AllocObject(Thread* self,
+                              mirror::Class* klass,
+                              size_t num_bytes,
                               const PreFenceVisitor& pre_fence_visitor)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !*backtrace_lock_,
@@ -196,7 +207,9 @@
   }
 
   template <bool kInstrumented, typename PreFenceVisitor>
-  mirror::Object* AllocNonMovableObject(Thread* self, mirror::Class* klass, size_t num_bytes,
+  mirror::Object* AllocNonMovableObject(Thread* self,
+                                        mirror::Class* klass,
+                                        size_t num_bytes,
                                         const PreFenceVisitor& pre_fence_visitor)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !*backtrace_lock_,
@@ -206,9 +219,11 @@
   }
 
   template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor>
-  ALWAYS_INLINE mirror::Object* AllocObjectWithAllocator(
-      Thread* self, mirror::Class* klass, size_t byte_count, AllocatorType allocator,
-      const PreFenceVisitor& pre_fence_visitor)
+  ALWAYS_INLINE mirror::Object* AllocObjectWithAllocator(Thread* self,
+                                                         mirror::Class* klass,
+                                                         size_t byte_count,
+                                                         AllocatorType allocator,
+                                                         const PreFenceVisitor& pre_fence_visitor)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !*backtrace_lock_,
                !Roles::uninterruptible_);
@@ -263,8 +278,7 @@
   // A weaker test than IsLiveObject or VerifyObject that doesn't require the heap lock,
   // and doesn't abort on error, allowing the caller to report more
   // meaningful diagnostics.
-  bool IsValidObjectAddress(const mirror::Object* obj) const
-      SHARED_REQUIRES(Locks::mutator_lock_);
+  bool IsValidObjectAddress(const mirror::Object* obj) const SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Faster alternative to IsHeapAddress since finding if an object is in the large object space is
   // very slow.
@@ -273,8 +287,10 @@
 
   // Returns true if 'obj' is a live heap object, false otherwise (including for invalid addresses).
   // Requires the heap lock to be held.
-  bool IsLiveObjectLocked(mirror::Object* obj, bool search_allocation_stack = true,
-                          bool search_live_stack = true, bool sorted = false)
+  bool IsLiveObjectLocked(mirror::Object* obj,
+                          bool search_allocation_stack = true,
+                          bool search_live_stack = true,
+                          bool sorted = false)
       SHARED_REQUIRES(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
   // Returns true if there is any chance that the object (obj) will move.
@@ -298,7 +314,8 @@
 
   // Implements VMDebug.countInstancesOfClass and JDWP VM_InstanceCount.
   // The boolean decides whether to use IsAssignableFrom or == when comparing classes.
-  void CountInstances(const std::vector<mirror::Class*>& classes, bool use_is_assignable_from,
+  void CountInstances(const std::vector<mirror::Class*>& classes,
+                      bool use_is_assignable_from,
                       uint64_t* counts)
       REQUIRES(!Locks::heap_bitmap_lock_, !*gc_complete_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
@@ -307,7 +324,8 @@
       REQUIRES(!Locks::heap_bitmap_lock_, !*gc_complete_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
   // Implements JDWP OR_ReferringObjects.
-  void GetReferringObjects(mirror::Object* o, int32_t max_count,
+  void GetReferringObjects(mirror::Object* o,
+                           int32_t max_count,
                            std::vector<mirror::Object*>& referring_objects)
       REQUIRES(!Locks::heap_bitmap_lock_, !*gc_complete_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
@@ -347,8 +365,7 @@
 
   // Blocks the caller until the garbage collector becomes idle and returns the type of GC we
   // waited for.
-  collector::GcType WaitForGcToComplete(GcCause cause, Thread* self)
-      REQUIRES(!*gc_complete_lock_);
+  collector::GcType WaitForGcToComplete(GcCause cause, Thread* self) REQUIRES(!*gc_complete_lock_);
 
   // Update the heap's process state to a new value, may cause compaction to occur.
   void UpdateProcessState(ProcessState process_state)
@@ -405,14 +422,17 @@
 
   // Must be called if a field of an Object in the heap changes, and before any GC safe-point.
   // The call is not needed if null is stored in the field.
-  ALWAYS_INLINE void WriteBarrierField(const mirror::Object* dst, MemberOffset /*offset*/,
-                                       const mirror::Object* /*new_value*/) {
+  ALWAYS_INLINE void WriteBarrierField(const mirror::Object* dst,
+                                       MemberOffset offset ATTRIBUTE_UNUSED,
+                                       const mirror::Object* new_value ATTRIBUTE_UNUSED) {
     card_table_->MarkCard(dst);
   }
 
   // Write barrier for array operations that update many field positions
-  ALWAYS_INLINE void WriteBarrierArray(const mirror::Object* dst, int /*start_offset*/,
-                                       size_t /*length TODO: element_count or byte_count?*/) {
+  ALWAYS_INLINE void WriteBarrierArray(const mirror::Object* dst,
+                                       int start_offset ATTRIBUTE_UNUSED,
+                                       // TODO: element_count or byte_count?
+                                       size_t length ATTRIBUTE_UNUSED) {
     card_table_->MarkCard(dst);
   }
 
@@ -436,7 +456,8 @@
   }
 
   // Returns the number of objects currently allocated.
-  size_t GetObjectsAllocated() const REQUIRES(!Locks::heap_bitmap_lock_);
+  size_t GetObjectsAllocated() const
+      REQUIRES(!Locks::heap_bitmap_lock_);
 
   // Returns the total number of objects allocated since the heap was created.
   uint64_t GetObjectsAllocatedEver() const;
@@ -540,11 +561,13 @@
                       accounting::SpaceBitmap<kObjectAlignment>* bitmap2,
                       accounting::SpaceBitmap<kLargeObjectAlignment>* large_objects,
                       accounting::ObjectStack* stack)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
 
   // Mark the specified allocation stack as live.
   void MarkAllocStackAsLive(accounting::ObjectStack* stack)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(Locks::heap_bitmap_lock_);
 
   // Unbind any bound bitmaps.
   void UnBindBitmaps() REQUIRES(Locks::heap_bitmap_lock_);
@@ -722,6 +745,8 @@
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!Locks::alloc_tracker_lock_);
 
+  void DisableGCForShutdown() REQUIRES(!*gc_complete_lock_);
+
  private:
   class ConcurrentGCTask;
   class CollectorTransitionTask;
@@ -758,15 +783,20 @@
         allocator_type != kAllocatorTypeTLAB;
   }
   static bool IsMovingGc(CollectorType collector_type) {
-    return collector_type == kCollectorTypeSS || collector_type == kCollectorTypeGSS ||
-        collector_type == kCollectorTypeCC || collector_type == kCollectorTypeMC ||
+    return
+        collector_type == kCollectorTypeSS ||
+        collector_type == kCollectorTypeGSS ||
+        collector_type == kCollectorTypeCC ||
+        collector_type == kCollectorTypeMC ||
         collector_type == kCollectorTypeHomogeneousSpaceCompact;
   }
   bool ShouldAllocLargeObject(mirror::Class* c, size_t byte_count) const
       SHARED_REQUIRES(Locks::mutator_lock_);
-  ALWAYS_INLINE void CheckConcurrentGC(Thread* self, size_t new_num_bytes_allocated,
+  ALWAYS_INLINE void CheckConcurrentGC(Thread* self,
+                                       size_t new_num_bytes_allocated,
                                        mirror::Object** obj)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!*pending_task_lock_, !*gc_complete_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!*pending_task_lock_, !*gc_complete_lock_);
 
   accounting::ObjectStack* GetMarkStack() {
     return mark_stack_.get();
@@ -774,22 +804,29 @@
 
   // We don't force this to be inlined since it is a slow path.
   template <bool kInstrumented, typename PreFenceVisitor>
-  mirror::Object* AllocLargeObject(Thread* self, mirror::Class** klass, size_t byte_count,
+  mirror::Object* AllocLargeObject(Thread* self,
+                                   mirror::Class** klass,
+                                   size_t byte_count,
                                    const PreFenceVisitor& pre_fence_visitor)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !*backtrace_lock_);
 
   // Handles Allocate()'s slow allocation path with GC involved after
   // an initial allocation attempt failed.
-  mirror::Object* AllocateInternalWithGc(Thread* self, AllocatorType allocator, size_t num_bytes,
-                                         size_t* bytes_allocated, size_t* usable_size,
+  mirror::Object* AllocateInternalWithGc(Thread* self,
+                                         AllocatorType allocator,
+                                         size_t num_bytes,
+                                         size_t* bytes_allocated,
+                                         size_t* usable_size,
                                          size_t* bytes_tl_bulk_allocated,
                                          mirror::Class** klass)
       REQUIRES(!Locks::thread_suspend_count_lock_, !*gc_complete_lock_, !*pending_task_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Allocate into a specific space.
-  mirror::Object* AllocateInto(Thread* self, space::AllocSpace* space, mirror::Class* c,
+  mirror::Object* AllocateInto(Thread* self,
+                               space::AllocSpace* space,
+                               mirror::Class* c,
                                size_t bytes)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
@@ -800,8 +837,10 @@
   // Try to allocate a number of bytes, this function never does any GCs. Needs to be inlined so
   // that the switch statement is constant optimized in the entrypoints.
   template <const bool kInstrumented, const bool kGrow>
-  ALWAYS_INLINE mirror::Object* TryToAllocate(Thread* self, AllocatorType allocator_type,
-                                              size_t alloc_size, size_t* bytes_allocated,
+  ALWAYS_INLINE mirror::Object* TryToAllocate(Thread* self,
+                                              AllocatorType allocator_type,
+                                              size_t alloc_size,
+                                              size_t* bytes_allocated,
                                               size_t* usable_size,
                                               size_t* bytes_tl_bulk_allocated)
       SHARED_REQUIRES(Locks::mutator_lock_);
@@ -828,12 +867,14 @@
       REQUIRES(!*pending_task_lock_);
 
   void RequestConcurrentGCAndSaveObject(Thread* self, bool force_full, mirror::Object** obj)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!*pending_task_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!*pending_task_lock_);
   bool IsGCRequestPending() const;
 
   // Sometimes CollectGarbageInternal decides to run a different Gc than you requested. Returns
   // which type of Gc was actually ran.
-  collector::GcType CollectGarbageInternal(collector::GcType gc_plan, GcCause gc_cause,
+  collector::GcType CollectGarbageInternal(collector::GcType gc_plan,
+                                           GcCause gc_cause,
                                            bool clear_soft_references)
       REQUIRES(!*gc_complete_lock_, !Locks::heap_bitmap_lock_, !Locks::thread_suspend_count_lock_,
                !*pending_task_lock_);
@@ -862,13 +903,18 @@
   HomogeneousSpaceCompactResult PerformHomogeneousSpaceCompact() REQUIRES(!*gc_complete_lock_);
 
   // Create the main free list malloc space, either a RosAlloc space or DlMalloc space.
-  void CreateMainMallocSpace(MemMap* mem_map, size_t initial_size, size_t growth_limit,
+  void CreateMainMallocSpace(MemMap* mem_map,
+                             size_t initial_size,
+                             size_t growth_limit,
                              size_t capacity);
 
   // Create a malloc space based on a mem map. Does not set the space as default.
-  space::MallocSpace* CreateMallocSpaceFromMemMap(MemMap* mem_map, size_t initial_size,
-                                                  size_t growth_limit, size_t capacity,
-                                                  const char* name, bool can_move_objects);
+  space::MallocSpace* CreateMallocSpaceFromMemMap(MemMap* mem_map,
+                                                  size_t initial_size,
+                                                  size_t growth_limit,
+                                                  size_t capacity,
+                                                  const char* name,
+                                                  bool can_move_objects);
 
   // Given the current contents of the alloc space, increase the allowed heap footprint to match
   // the target utilization ratio.  This should only be called immediately after a full garbage
@@ -883,21 +929,26 @@
       SHARED_REQUIRES(Locks::heap_bitmap_lock_);
 
   // Swap the allocation stack with the live stack.
-  void SwapStacks(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_);
+  void SwapStacks() SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Clear cards and update the mod union table. When process_alloc_space_cards is true,
   // if clear_alloc_space_cards is true, then we clear cards instead of ageing them. We do
   // not process the alloc space if process_alloc_space_cards is false.
-  void ProcessCards(TimingLogger* timings, bool use_rem_sets, bool process_alloc_space_cards,
+  void ProcessCards(TimingLogger* timings,
+                    bool use_rem_sets,
+                    bool process_alloc_space_cards,
                     bool clear_alloc_space_cards);
 
   // Push an object onto the allocation stack.
   void PushOnAllocationStack(Thread* self, mirror::Object** obj)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
   void PushOnAllocationStackWithInternalGC(Thread* self, mirror::Object** obj)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
   void PushOnThreadLocalAllocationStackWithInternalGC(Thread* thread, mirror::Object** obj)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
 
   void ClearConcurrentGCRequest();
   void ClearPendingTrim(Thread* self) REQUIRES(!*pending_task_lock_);
@@ -1163,9 +1214,6 @@
   // Total time which mutators are paused or waiting for GC to complete.
   uint64_t total_wait_time_;
 
-  // Total number of objects allocated in microseconds.
-  AtomicInteger total_allocation_time_;
-
   // The current state of heap verification, may be enabled or disabled.
   VerifyObjectMode verify_object_mode_;
 
@@ -1247,6 +1295,10 @@
   // Stack trace hashes that we already saw,
   std::unordered_set<uint64_t> seen_backtraces_ GUARDED_BY(backtrace_lock_);
 
+  // We disable GC when we are shutting down the runtime in case there are daemon threads still
+  // allocating.
+  bool gc_disabled_for_shutdown_ GUARDED_BY(gc_complete_lock_);
+
   friend class CollectorTransitionTask;
   friend class collector::GarbageCollector;
   friend class collector::MarkCompact;
@@ -1257,21 +1309,8 @@
   friend class VerifyReferenceCardVisitor;
   friend class VerifyReferenceVisitor;
   friend class VerifyObjectVisitor;
-  friend class ScopedHeapFill;
   friend class space::SpaceTest;
 
-  class AllocationTimer {
-   public:
-    ALWAYS_INLINE AllocationTimer(Heap* heap, mirror::Object** allocated_obj_ptr);
-    ALWAYS_INLINE ~AllocationTimer();
-   private:
-    Heap* const heap_;
-    mirror::Object** allocated_obj_ptr_;
-    const uint64_t allocation_start_time_;
-
-    DISALLOW_IMPLICIT_CONSTRUCTORS(AllocationTimer);
-  };
-
   DISALLOW_IMPLICIT_CONSTRUCTORS(Heap);
 };
 
diff --git a/runtime/gc/space/memory_tool_malloc_space.h b/runtime/gc/space/memory_tool_malloc_space.h
index fe39e05..a5dbad9 100644
--- a/runtime/gc/space/memory_tool_malloc_space.h
+++ b/runtime/gc/space/memory_tool_malloc_space.h
@@ -55,7 +55,7 @@
   size_t MaxBytesBulkAllocatedFor(size_t num_bytes) OVERRIDE;
 
   template <typename... Params>
-  explicit MemoryToolMallocSpace(MemMap* mem_map, size_t initial_size, Params... params);
+  MemoryToolMallocSpace(MemMap* mem_map, size_t initial_size, Params... params);
   virtual ~MemoryToolMallocSpace() {}
 
  private:
diff --git a/runtime/gc/task_processor_test.cc b/runtime/gc/task_processor_test.cc
index f06f68d..2c44da2 100644
--- a/runtime/gc/task_processor_test.cc
+++ b/runtime/gc/task_processor_test.cc
@@ -102,7 +102,7 @@
 
 class TestOrderTask : public HeapTask {
  public:
-  explicit TestOrderTask(uint64_t expected_time, size_t expected_counter, size_t* counter)
+  TestOrderTask(uint64_t expected_time, size_t expected_counter, size_t* counter)
      : HeapTask(expected_time), expected_counter_(expected_counter), counter_(counter) {
   }
   virtual void Run(Thread* thread) OVERRIDE {
diff --git a/runtime/handle_scope.h b/runtime/handle_scope.h
index 5ed8ef0..e617348 100644
--- a/runtime/handle_scope.h
+++ b/runtime/handle_scope.h
@@ -106,7 +106,7 @@
   }
 
   // Semi-hidden constructor. Construction expected by generated code and StackHandleScope.
-  explicit HandleScope(HandleScope* link, uint32_t num_references) :
+  HandleScope(HandleScope* link, uint32_t num_references) :
       link_(link), number_of_references_(num_references) {
   }
 
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index e67ea3f..713797f 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -883,6 +883,7 @@
                      gc::EqAllocRecordTypesPtr<gc::AllocRecordStackTraceElement>> frames_;
   std::unordered_map<const mirror::Object*, const gc::AllocRecordStackTrace*> allocation_records_;
 
+  friend class GcRootVisitor;
   DISALLOW_COPY_AND_ASSIGN(Hprof);
 };
 
@@ -1023,12 +1024,47 @@
   ++objects_in_segment_;
 }
 
+// Use for visiting the GcRoots held live by ArtFields, ArtMethods, and ClassLoaders.
+class GcRootVisitor {
+ public:
+  explicit GcRootVisitor(Hprof* hprof) : hprof_(hprof) {}
+
+  void operator()(mirror::Object* obj ATTRIBUTE_UNUSED,
+                  MemberOffset offset ATTRIBUTE_UNUSED,
+                  bool is_static ATTRIBUTE_UNUSED) const {}
+
+  // Note that these don't have read barriers. That's OK, since the GC is guaranteed not to be
+  // running during the hprof dumping process.
+  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+      SHARED_REQUIRES(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      SHARED_REQUIRES(Locks::mutator_lock_) {
+    mirror::Object* obj = root->AsMirrorPtr();
+    // The two cases are either classes or dex cache arrays. If it is a dex cache array, then use
+    // VM internal. Otherwise the object is a declaring class of an ArtField or ArtMethod or a
+    // class from a ClassLoader.
+    hprof_->VisitRoot(obj, RootInfo(obj->IsClass() ? kRootStickyClass : kRootVMInternal));
+  }
+
+ private:
+  Hprof* const hprof_;
+};
+
 void Hprof::DumpHeapObject(mirror::Object* obj) {
   // Ignore classes that are retired.
   if (obj->IsClass() && obj->AsClass()->IsRetired()) {
     return;
   }
 
+  GcRootVisitor visitor(this);
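+  // Report the native GcRoots (declaring classes and dex cache arrays) that this object holds
+  // live; they are not ordinary instance fields, so they would otherwise be missed.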
+  obj->VisitReferences<true>(visitor, VoidFunctor());
+
   gc::Heap* const heap = Runtime::Current()->GetHeap();
   const gc::space::ContinuousSpace* const space = heap->FindContinuousSpaceFromObject(obj, true);
   HprofHeapId heap_type = HPROF_HEAP_APP;
diff --git a/runtime/image.cc b/runtime/image.cc
index ba1e58b..2586959 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -24,7 +24,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '1', '8', '\0' };
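+// Version bumped: packed ArtMethod size and alignment in the image now depend on pointer size.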
+const uint8_t ImageHeader::kImageVersion[] = { '0', '1', '9', '\0' };
 
 ImageHeader::ImageHeader(uint32_t image_begin,
                          uint32_t image_size,
@@ -153,19 +153,21 @@
     for (size_t i = 0; i < array->Length(); ++i) {
       visitor->Visit(&array->At(i, sizeof(ArtField)));
     }
-    pos += array->ComputeSize(array->Length(), sizeof(ArtField));
+    pos += array->ComputeSize(array->Length());
   }
 }
 
 void ImageSection::VisitPackedArtMethods(ArtMethodVisitor* visitor,
                                          uint8_t* base,
-                                         size_t method_size) const {
+                                         size_t pointer_size) const {
+  const size_t method_alignment = ArtMethod::Alignment(pointer_size);
+  const size_t method_size = ArtMethod::Size(pointer_size);
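+  // The packed methods are traversed with a pointer-size-dependent stride and start alignment.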
   for (size_t pos = 0; pos < Size(); ) {
     auto* array = reinterpret_cast<LengthPrefixedArray<ArtMethod>*>(base + Offset() + pos);
     for (size_t i = 0; i < array->Length(); ++i) {
-      visitor->Visit(&array->At(i, method_size));
+      visitor->Visit(&array->At(i, method_size, method_alignment));
     }
-    pos += array->ComputeSize(array->Length(), method_size);
+    pos += array->ComputeSize(array->Length(), method_size, method_alignment);
   }
 }
 
diff --git a/runtime/image.h b/runtime/image.h
index eb26f7f..1a0d8fd 100644
--- a/runtime/image.h
+++ b/runtime/image.h
@@ -65,7 +65,7 @@
   }
 
   // Visit ArtMethods in the section starting at base.
-  void VisitPackedArtMethods(ArtMethodVisitor* visitor, uint8_t* base, size_t method_size) const;
+  void VisitPackedArtMethods(ArtMethodVisitor* visitor, uint8_t* base, size_t pointer_size) const;
 
   // Visit ArtFields in the section starting at base.
   void VisitPackedArtFields(ArtFieldVisitor* visitor, uint8_t* base) const;
diff --git a/runtime/indirect_reference_table.h b/runtime/indirect_reference_table.h
index 798b48c..c398555 100644
--- a/runtime/indirect_reference_table.h
+++ b/runtime/indirect_reference_table.h
@@ -227,8 +227,7 @@
 
 class IrtIterator {
  public:
-  explicit IrtIterator(IrtEntry* table, size_t i, size_t capacity)
-      SHARED_REQUIRES(Locks::mutator_lock_)
+  IrtIterator(IrtEntry* table, size_t i, size_t capacity) SHARED_REQUIRES(Locks::mutator_lock_)
       : table_(table), i_(i), capacity_(capacity) {
   }
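
These explicit removals are a style cleanup, not a behavior fix: on a constructor that takes more than one argument, explicit has no effect on ordinary calls and only forbids copy-list-initialization. A standalone illustration:

  struct Plain  { Plain(int /*a*/, int /*b*/) {} };
  struct Marked { explicit Marked(int /*a*/, int /*b*/) {} };

  Plain MakePlain() {
    return {1, 2};  // OK: copy-list-initialization can use the two-arg ctor.
  }

  // Marked MakeMarked() { return {1, 2}; }  // Error: the ctor is explicit.

  int main() {
    MakePlain();
    return 0;
  }
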
 
diff --git a/runtime/jit/jit_instrumentation.cc b/runtime/jit/jit_instrumentation.cc
index 1e56cdc..258c29d 100644
--- a/runtime/jit/jit_instrumentation.cc
+++ b/runtime/jit/jit_instrumentation.cc
@@ -26,7 +26,7 @@
 
 class JitCompileTask : public Task {
  public:
-  explicit JitCompileTask(ArtMethod* method, JitInstrumentationCache* cache)
+  JitCompileTask(ArtMethod* method, JitInstrumentationCache* cache)
       : method_(method), cache_(cache) {
   }
 
diff --git a/runtime/length_prefixed_array.h b/runtime/length_prefixed_array.h
index 2b2e8d3..d9bc656 100644
--- a/runtime/length_prefixed_array.h
+++ b/runtime/length_prefixed_array.h
@@ -21,6 +21,8 @@
 
 #include "linear_alloc.h"
 #include "stride_iterator.h"
+#include "base/bit_utils.h"
+#include "base/casts.h"
 #include "base/iteration_range.h"
 
 namespace art {
@@ -28,29 +30,35 @@
 template<typename T>
 class LengthPrefixedArray {
  public:
-  explicit LengthPrefixedArray(uint64_t length) : length_(length) {}
+  explicit LengthPrefixedArray(size_t length)
+      : length_(dchecked_integral_cast<uint32_t>(length)) {}
 
-  T& At(size_t index, size_t element_size = sizeof(T)) {
+  T& At(size_t index, size_t element_size = sizeof(T), size_t alignment = alignof(T)) {
     DCHECK_LT(index, length_);
-    return *reinterpret_cast<T*>(&data_[0] + index * element_size);
+    return AtUnchecked(index, element_size, alignment);
   }
 
-  StrideIterator<T> Begin(size_t element_size = sizeof(T)) {
-    return StrideIterator<T>(reinterpret_cast<T*>(&data_[0]), element_size);
+  StrideIterator<T> Begin(size_t element_size = sizeof(T), size_t alignment = alignof(T)) {
+    return StrideIterator<T>(&AtUnchecked(0, element_size, alignment), element_size);
   }
 
-  StrideIterator<T> End(size_t element_size = sizeof(T)) {
-    return StrideIterator<T>(reinterpret_cast<T*>(&data_[0] + element_size * length_),
-                             element_size);
+  StrideIterator<T> End(size_t element_size = sizeof(T), size_t alignment = alignof(T)) {
+    return StrideIterator<T>(&AtUnchecked(length_, element_size, alignment), element_size);
   }
 
-  static size_t OffsetOfElement(size_t index, size_t element_size = sizeof(T)) {
-    return offsetof(LengthPrefixedArray<T>, data_) + index * element_size;
+  static size_t OffsetOfElement(size_t index,
+                                size_t element_size = sizeof(T),
+                                size_t alignment = alignof(T)) {
+    DCHECK_ALIGNED_PARAM(element_size, alignment);
+    return RoundUp(offsetof(LengthPrefixedArray<T>, data), alignment) + index * element_size;
   }
 
-  // Alignment is the caller's responsibility.
-  static size_t ComputeSize(size_t num_elements, size_t element_size = sizeof(T)) {
-    return OffsetOfElement(num_elements, element_size);
+  static size_t ComputeSize(size_t num_elements,
+                            size_t element_size = sizeof(T),
+                            size_t alignment = alignof(T)) {
+    size_t result = OffsetOfElement(num_elements, element_size, alignment);
+    DCHECK_ALIGNED_PARAM(result, alignment);
+    return result;
   }
 
   uint64_t Length() const {
@@ -58,21 +66,26 @@
   }
 
   // Updates the length but does not reallocate storage.
-  void SetLength(uint64_t length) {
-    length_ = length;
+  void SetLength(size_t length) {
+    length_ = dchecked_integral_cast<uint32_t>(length);
   }
 
  private:
-  uint64_t length_;  // 64 bits for 8 byte alignment of data_.
-  uint8_t data_[0];
+  T& AtUnchecked(size_t index, size_t element_size, size_t alignment) {
+    return *reinterpret_cast<T*>(
+        reinterpret_cast<uintptr_t>(this) + OffsetOfElement(index, element_size, alignment));
+  }
+
+  uint32_t length_;
+  uint8_t data[0];
 };
 
 // Returns empty iteration range if the array is null.
 template<typename T>
 IterationRange<StrideIterator<T>> MakeIterationRangeFromLengthPrefixedArray(
-    LengthPrefixedArray<T>* arr, size_t element_size) {
+    LengthPrefixedArray<T>* arr, size_t element_size = sizeof(T), size_t alignment = alignof(T)) {
   return arr != nullptr ?
-      MakeIterationRange(arr->Begin(element_size), arr->End(element_size)) :
+      MakeIterationRange(arr->Begin(element_size, alignment), arr->End(element_size, alignment)) :
       MakeEmptyIterationRange(StrideIterator<T>(nullptr, 0));
 }
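
The new arithmetic is the core of the change: element offsets are now RoundUp(header size, alignment) + index * element_size instead of relying on the old 64-bit length_ field to align the payload (which only worked for 8-byte strides). A self-contained sketch of the same computation, with RoundUp reimplemented locally (ART's lives in base/bit_utils.h):

  #include <cassert>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  // Same rounding ART's RoundUp performs, valid for power-of-two alignments.
  constexpr size_t RoundUp(size_t x, size_t alignment) {
    return (x + alignment - 1) & ~(alignment - 1);
  }

  // Offset of element |index| in a length-prefixed array whose payload follows
  // a 4-byte length field, for a runtime element size and alignment.
  size_t OffsetOfElement(size_t index, size_t element_size, size_t alignment) {
    assert(element_size % alignment == 0);  // DCHECK_ALIGNED_PARAM equivalent.
    return RoundUp(sizeof(uint32_t), alignment) + index * element_size;
  }

  int main() {
    // With 8-byte aligned, 24-byte elements, element 0 starts at offset 8, not 4.
    std::printf("%zu\n", OffsetOfElement(0, 24, 8));  // 8
    std::printf("%zu\n", OffsetOfElement(2, 24, 8));  // 56
  }
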
 
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 245f8b8..5d0d204 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -197,7 +197,7 @@
   size_t ForwardingAddress() const;
 
   // Construct a lock word for inflation to use a Monitor.
-  explicit LockWord(Monitor* mon, uint32_t rb_state);
+  LockWord(Monitor* mon, uint32_t rb_state);
 
   // Return the hash code stored in the lock word, must be kHashCode state.
   int32_t GetHashCode() const;
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 887e204..ac9cb09 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -92,14 +92,18 @@
   CheckPointerSize(pointer_size);
   auto* methods = GetDirectMethodsPtrUnchecked();
   DCHECK(methods != nullptr);
-  return &methods->At(i, ArtMethod::ObjectSize(pointer_size));
+  return &methods->At(i,
+                      ArtMethod::Size(pointer_size),
+                      ArtMethod::Alignment(pointer_size));
 }
 
 inline ArtMethod* Class::GetDirectMethod(size_t i, size_t pointer_size) {
   CheckPointerSize(pointer_size);
   auto* methods = GetDirectMethodsPtr();
   DCHECK(methods != nullptr);
-  return &methods->At(i, ArtMethod::ObjectSize(pointer_size));
+  return &methods->At(i,
+                      ArtMethod::Size(pointer_size),
+                      ArtMethod::Alignment(pointer_size));
 }
 
 template<VerifyObjectFlags kVerifyFlags>
@@ -133,7 +137,9 @@
   CheckPointerSize(pointer_size);
   LengthPrefixedArray<ArtMethod>* methods = GetVirtualMethodsPtrUnchecked();
   DCHECK(methods != nullptr);
-  return &methods->At(i, ArtMethod::ObjectSize(pointer_size));
+  return &methods->At(i,
+                      ArtMethod::Size(pointer_size),
+                      ArtMethod::Alignment(pointer_size));
 }
 
 inline PointerArray* Class::GetVTable() {
@@ -837,29 +843,31 @@
 inline IterationRange<StrideIterator<ArtMethod>> Class::GetDirectMethods(size_t pointer_size) {
   CheckPointerSize(pointer_size);
   return MakeIterationRangeFromLengthPrefixedArray(GetDirectMethodsPtrUnchecked(),
-                                                   ArtMethod::ObjectSize(pointer_size));
+                                                   ArtMethod::Size(pointer_size),
+                                                   ArtMethod::Alignment(pointer_size));
 }
 
 inline IterationRange<StrideIterator<ArtMethod>> Class::GetVirtualMethods(size_t pointer_size) {
   CheckPointerSize(pointer_size);
   return MakeIterationRangeFromLengthPrefixedArray(GetVirtualMethodsPtrUnchecked(),
-                                                   ArtMethod::ObjectSize(pointer_size));
+                                                   ArtMethod::Size(pointer_size),
+                                                   ArtMethod::Alignment(pointer_size));
 }
 
 inline IterationRange<StrideIterator<ArtField>> Class::GetIFields() {
-  return MakeIterationRangeFromLengthPrefixedArray(GetIFieldsPtr(), sizeof(ArtField));
+  return MakeIterationRangeFromLengthPrefixedArray(GetIFieldsPtr());
 }
 
 inline IterationRange<StrideIterator<ArtField>> Class::GetSFields() {
-  return MakeIterationRangeFromLengthPrefixedArray(GetSFieldsPtr(), sizeof(ArtField));
+  return MakeIterationRangeFromLengthPrefixedArray(GetSFieldsPtr());
 }
 
 inline IterationRange<StrideIterator<ArtField>> Class::GetIFieldsUnchecked() {
-  return MakeIterationRangeFromLengthPrefixedArray(GetIFieldsPtrUnchecked(), sizeof(ArtField));
+  return MakeIterationRangeFromLengthPrefixedArray(GetIFieldsPtrUnchecked());
 }
 
 inline IterationRange<StrideIterator<ArtField>> Class::GetSFieldsUnchecked() {
-  return MakeIterationRangeFromLengthPrefixedArray(GetSFieldsPtrUnchecked(), sizeof(ArtField));
+  return MakeIterationRangeFromLengthPrefixedArray(GetSFieldsPtrUnchecked());
 }
 
 inline MemberOffset Class::EmbeddedImTableOffset(size_t pointer_size) {
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 6af90bb..f20cc6e 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -855,9 +855,9 @@
 // The pre-fence visitor for Class::CopyOf().
 class CopyClassVisitor {
  public:
-  explicit CopyClassVisitor(Thread* self, Handle<mirror::Class>* orig, size_t new_length,
-                            size_t copy_bytes, ArtMethod* const (&imt)[mirror::Class::kImtSize],
-                            size_t pointer_size)
+  CopyClassVisitor(Thread* self, Handle<mirror::Class>* orig, size_t new_length,
+                   size_t copy_bytes, ArtMethod* const (&imt)[mirror::Class::kImtSize],
+                   size_t pointer_size)
       : self_(self), orig_(orig), new_length_(new_length),
         copy_bytes_(copy_bytes), imt_(imt), pointer_size_(pointer_size) {
   }
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 513ab37..dc60a38 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -236,6 +236,15 @@
     SetAccessFlags(flags | kAccClassIsStringClass);
   }
 
+  ALWAYS_INLINE bool IsClassLoaderClass() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return (GetField32(AccessFlagsOffset()) & kAccClassIsClassLoaderClass) != 0;
+  }
+
+  ALWAYS_INLINE void SetClassLoaderClass() SHARED_REQUIRES(Locks::mutator_lock_) {
+    uint32_t flags = GetField32(OFFSET_OF_OBJECT_MEMBER(Class, access_flags_));
+    SetAccessFlags(flags | kAccClassIsClassLoaderClass);
+  }
+
   // Returns true if the class is abstract.
   ALWAYS_INLINE bool IsAbstract() SHARED_REQUIRES(Locks::mutator_lock_) {
     return (GetAccessFlags() & kAccAbstract) != 0;
diff --git a/runtime/mirror/class_loader-inl.h b/runtime/mirror/class_loader-inl.h
new file mode 100644
index 0000000..35f3664
--- /dev/null
+++ b/runtime/mirror/class_loader-inl.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_MIRROR_CLASS_LOADER_INL_H_
+#define ART_RUNTIME_MIRROR_CLASS_LOADER_INL_H_
+
+#include "class_loader.h"
+
+#include "base/mutex-inl.h"
+#include "class_table-inl.h"
+
+namespace art {
+namespace mirror {
+
+template <const bool kVisitClass, VerifyObjectFlags kVerifyFlags, typename Visitor>
+inline void ClassLoader::VisitReferences(mirror::Class* klass, const Visitor& visitor) {
+  // Visit instance fields first.
+  VisitInstanceFieldsReferences<kVisitClass>(klass, visitor);
+  // Visit classes loaded after.
+  ReaderMutexLock mu(Thread::Current(), *Locks::classlinker_classes_lock_);
+  ClassTable* const class_table = GetClassTable();
+  if (class_table != nullptr) {
+    class_table->VisitRoots(visitor);
+  }
+}
+
+}  // namespace mirror
+}  // namespace art
+
+#endif  // ART_RUNTIME_MIRROR_CLASS_LOADER_INL_H_
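
The visiting order here matters: instance fields come first, and then every class in the loader's ClassTable is reported as if it were a reference held by the loader, which is what lets the GC treat loaded classes as reachable from their ClassLoader. A standalone analog of that "fields, then side table" shape (illustrative types, no locking):

  #include <iostream>
  #include <vector>

  struct Ref { const char* name; };

  struct Loader {
    Ref parent{"parent-loader"};                        // An ordinary instance field.
    std::vector<Ref> class_table = {{"Foo"}, {"Bar"}};  // Classes this loader defined.

    template <typename Visitor>
    void VisitReferences(const Visitor& visitor) {
      visitor(parent);                    // Instance fields first...
      for (const Ref& r : class_table) {  // ...then every class in the table.
        visitor(r);
      }
    }
  };

  int main() {
    Loader l;
    l.VisitReferences([](const Ref& r) { std::cout << r.name << "\n"; });
  }
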
diff --git a/runtime/mirror/class_loader.h b/runtime/mirror/class_loader.h
index 940aaa6..21c652a 100644
--- a/runtime/mirror/class_loader.h
+++ b/runtime/mirror/class_loader.h
@@ -26,6 +26,8 @@
 
 namespace mirror {
 
+class Class;
+
 // C++ mirror of java.lang.ClassLoader
 class MANAGED ClassLoader : public Object {
  public:
@@ -44,6 +46,12 @@
     SetField64<false>(OFFSET_OF_OBJECT_MEMBER(ClassLoader, class_table_),
                       reinterpret_cast<uint64_t>(class_table));
   }
+  // Visit instance fields of the class loader as well as its associated classes.
+  // Null class loader is handled by ClassLinker::VisitClassRoots.
+  template <const bool kVisitClass, VerifyObjectFlags kVerifyFlags, typename Visitor>
+  void VisitReferences(mirror::Class* klass, const Visitor& visitor)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!Locks::classlinker_classes_lock_);
 
  private:
   // Field order required by test "ValidateFieldOrderOfJavaCppUnionClasses".
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index c5610b5..7b1660b 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -25,6 +25,7 @@
 #include "array-inl.h"
 #include "class.h"
 #include "class_linker.h"
+#include "class_loader-inl.h"
 #include "lock_word-inl.h"
 #include "monitor.h"
 #include "object_array-inl.h"
@@ -997,6 +998,18 @@
   klass->VisitFieldsReferences<kVisitClass, true>(0, visitor);
 }
 
+template<VerifyObjectFlags kVerifyFlags>
+inline bool Object::IsClassLoader() {
+  return GetClass<kVerifyFlags>()->IsClassLoaderClass();
+}
+
+template<VerifyObjectFlags kVerifyFlags>
+inline mirror::ClassLoader* Object::AsClassLoader() {
+  DCHECK(IsClassLoader<kVerifyFlags>());
+  return down_cast<mirror::ClassLoader*>(this);
+}
+
 template <const bool kVisitClass, VerifyObjectFlags kVerifyFlags, typename Visitor,
     typename JavaLangRefVisitor>
 inline void Object::VisitReferences(const Visitor& visitor,
@@ -1010,6 +1023,9 @@
     } else if (kVisitClass) {
       visitor(this, ClassOffset(), false);
     }
+  } else if (klass->IsClassLoaderClass()) {
+    mirror::ClassLoader* class_loader = AsClassLoader<kVerifyFlags>();
+    class_loader->VisitReferences<kVisitClass, kVerifyFlags>(klass, visitor);
   } else {
     DCHECK(!klass->IsVariableSize());
     VisitInstanceFieldsReferences<kVisitClass>(klass, visitor);
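
AsClassLoader above follows the usual mirror downcast pattern: a debug-only kind check, then a static downcast (down_cast is essentially a static_cast guarded by such a check). A standalone sketch:

  #include <cassert>

  struct Class { bool is_class_loader_class; };
  struct Object {
    Class* klass;
    bool IsClassLoader() const { return klass->is_class_loader_class; }
  };
  struct ClassLoader : Object {};

  // Assert the runtime kind in debug builds, then downcast.
  ClassLoader* AsClassLoader(Object* obj) {
    assert(obj->IsClassLoader());
    return static_cast<ClassLoader*>(obj);
  }

  int main() {
    Class loader_class{true};
    ClassLoader cl;
    cl.klass = &loader_class;
    Object* as_object = &cl;
    return AsClassLoader(as_object) == &cl ? 0 : 1;
  }
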
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 87fb5ba..df680b5 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -107,7 +107,7 @@
 // An allocation pre-fence visitor that copies the object.
 class CopyObjectVisitor {
  public:
-  explicit CopyObjectVisitor(Thread* self, Handle<Object>* orig, size_t num_bytes)
+  CopyObjectVisitor(Thread* self, Handle<Object>* orig, size_t num_bytes)
       : self_(self), orig_(orig), num_bytes_(num_bytes) {
   }
 
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index eea9f37..4967a14 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -37,6 +37,7 @@
 
 class Array;
 class Class;
+class ClassLoader;
 class FinalizerReference;
 template<class T> class ObjectArray;
 template<class T> class PrimitiveArray;
@@ -156,6 +157,11 @@
   template<class T, VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   ObjectArray<T>* AsObjectArray() SHARED_REQUIRES(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool IsClassLoader() SHARED_REQUIRES(Locks::mutator_lock_);
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  ClassLoader* AsClassLoader() SHARED_REQUIRES(Locks::mutator_lock_);
+
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
            ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   bool IsArrayInstance() SHARED_REQUIRES(Locks::mutator_lock_);
diff --git a/runtime/modifiers.h b/runtime/modifiers.h
index 8586dd1..8b363a6 100644
--- a/runtime/modifiers.h
+++ b/runtime/modifiers.h
@@ -55,6 +55,9 @@
 // Special runtime-only flags.
 // Note: if only kAccClassIsReference is set, we have a soft reference.
 
+// class is ClassLoader or one of its subclasses
+static constexpr uint32_t kAccClassIsClassLoaderClass   = 0x10000000;
+
 // class/ancestor overrides finalize()
 static constexpr uint32_t kAccClassIsFinalizable        = 0x80000000;
 // class is a soft/weak/phantom ref
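
Caching the kind in a spare access-flags bit makes the check a single load and mask, cheap enough for the hot VisitReferences path. A self-contained sketch using the constant introduced above:

  #include <cstdint>
  #include <cstdio>

  // Runtime-only bit in the upper half of access_flags_, as defined above.
  constexpr uint32_t kAccClassIsClassLoaderClass = 0x10000000;

  bool IsClassLoaderClass(uint32_t access_flags) {
    return (access_flags & kAccClassIsClassLoaderClass) != 0;
  }

  int main() {
    uint32_t flags = 0x0001;               // Some pre-existing flags.
    flags |= kAccClassIsClassLoaderClass;  // What SetClassLoaderClass does.
    std::printf("%d\n", IsClassLoaderClass(flags));  // 1
  }
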
diff --git a/runtime/monitor.h b/runtime/monitor.h
index 3ca8954..346e866 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -151,10 +151,10 @@
 #endif
 
  private:
-  explicit Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
-        SHARED_REQUIRES(Locks::mutator_lock_);
-  explicit Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code,
-                   MonitorId id) SHARED_REQUIRES(Locks::mutator_lock_);
+  Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+  Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_code, MonitorId id)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Install the monitor into its object, may fail if another thread installs a different monitor
   // first.
diff --git a/runtime/monitor_test.cc b/runtime/monitor_test.cc
index 1be637c..e1173bb 100644
--- a/runtime/monitor_test.cc
+++ b/runtime/monitor_test.cc
@@ -106,8 +106,7 @@
 
 class CreateTask : public Task {
  public:
-  explicit CreateTask(MonitorTest* monitor_test, uint64_t initial_sleep, int64_t millis,
-                      bool expected) :
+  CreateTask(MonitorTest* monitor_test, uint64_t initial_sleep, int64_t millis, bool expected) :
       monitor_test_(monitor_test), initial_sleep_(initial_sleep), millis_(millis),
       expected_(expected) {}
 
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 1ca98e5..c337e91 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -208,7 +208,7 @@
     }
   }
   if (kIsDebugBuild) {
-    for (ArtField& field : MakeIterationRangeFromLengthPrefixedArray(fields, sizeof(ArtField))) {
+    for (ArtField& field : MakeIterationRangeFromLengthPrefixedArray(fields)) {
       CHECK_NE(field.GetName(), name->ToModifiedUtf8());
     }
   }
diff --git a/runtime/quick/inline_method_analyser.h b/runtime/quick/inline_method_analyser.h
index 75ff27f..dd3703c 100644
--- a/runtime/quick/inline_method_analyser.h
+++ b/runtime/quick/inline_method_analyser.h
@@ -57,6 +57,7 @@
   kIntrinsicReferenceGetReferent,
   kIntrinsicCharAt,
   kIntrinsicCompareTo,
+  kIntrinsicEquals,
   kIntrinsicGetCharsNoCheck,
   kIntrinsicIsEmptyOrLength,
   kIntrinsicIndexOf,
diff --git a/runtime/reflection.cc b/runtime/reflection.cc
index 100d199..2fe1e64 100644
--- a/runtime/reflection.cc
+++ b/runtime/reflection.cc
@@ -36,7 +36,7 @@
 
 class ArgArray {
  public:
-  explicit ArgArray(const char* shorty, uint32_t shorty_len)
+  ArgArray(const char* shorty, uint32_t shorty_len)
       : shorty_(shorty), shorty_len_(shorty_len), num_bytes_(0) {
     size_t num_slots = shorty_len + 1;  // +1 in case of receiver.
     if (LIKELY((num_slots * 2) < kSmallArgArraySize)) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 74e3f11..a33e150 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1808,7 +1808,7 @@
 template<bool kTransactionActive>
 class BuildInternalStackTraceVisitor : public StackVisitor {
  public:
-  explicit BuildInternalStackTraceVisitor(Thread* self, Thread* thread, int skip_depth)
+  BuildInternalStackTraceVisitor(Thread* self, Thread* thread, int skip_depth)
       : StackVisitor(thread, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
         self_(self),
         skip_depth_(skip_depth),
@@ -2419,6 +2419,7 @@
 
   void VisitShadowFrame(ShadowFrame* shadow_frame) SHARED_REQUIRES(Locks::mutator_lock_) {
     ArtMethod* m = shadow_frame->GetMethod();
+    VisitDeclaringClass(m);
     DCHECK(m != nullptr);
     size_t num_regs = shadow_frame->NumberOfVRegs();
     if (m->IsNative() || shadow_frame->HasReferenceArray()) {
@@ -2459,10 +2460,25 @@
   }
 
  private:
+  // Visiting the declaring class is necessary so that we don't unload the class of a method that
+  // is executing. We need to ensure that the code stays mapped.
+  void VisitDeclaringClass(ArtMethod* method) SHARED_REQUIRES(Locks::mutator_lock_) {
+    mirror::Class* klass = method->GetDeclaringClassNoBarrier();
+    // klass can be null for runtime methods.
+    if (klass != nullptr) {
+      mirror::Object* new_ref = klass;
+      visitor_(&new_ref, -1, this);
+      if (new_ref != klass) {
+        method->CASDeclaringClass(klass, new_ref->AsClass());
+      }
+    }
+  }
+
   void VisitQuickFrame() SHARED_REQUIRES(Locks::mutator_lock_) {
-    auto* cur_quick_frame = GetCurrentQuickFrame();
+    ArtMethod** cur_quick_frame = GetCurrentQuickFrame();
     DCHECK(cur_quick_frame != nullptr);
-    auto* m = *cur_quick_frame;
+    ArtMethod* m = *cur_quick_frame;
+    VisitDeclaringClass(m);
 
     // Process register map (which native and runtime methods don't have)
     if (!m->IsNative() && !m->IsRuntimeMethod() && !m->IsProxyMethod()) {
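
The CAS at the end of VisitDeclaringClass handles a moving collector: if the visitor reports a new location for the class, the declaring-class slot is swapped only if nobody else updated it first. The same "visit, then CAS if moved" shape with std::atomic (illustrative; ART's CASDeclaringClass wraps its own atomics):

  #include <atomic>
  #include <cstdio>

  struct Class { int id; };

  // Visit a GC root: here the "collector" simply reports a new location.
  Class* VisitRoot(Class* /*old_ref*/, Class* moved_to) { return moved_to; }

  void VisitDeclaringClass(std::atomic<Class*>& slot, Class* moved_to) {
    Class* klass = slot.load(std::memory_order_relaxed);
    if (klass == nullptr) {
      return;  // Runtime methods have no declaring class.
    }
    Class* new_ref = VisitRoot(klass, moved_to);
    if (new_ref != klass) {
      // CAS so we don't stomp a concurrent update of the same slot.
      slot.compare_exchange_strong(klass, new_ref);
    }
  }

  int main() {
    Class a{1}, b{2};
    std::atomic<Class*> declaring_class{&a};
    VisitDeclaringClass(declaring_class, &b);
    std::printf("%d\n", declaring_class.load()->id);  // 2
  }
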
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 62d1e84..d449f42 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -78,6 +78,13 @@
     Runtime::Current()->DetachCurrentThread();
   }
   WaitForOtherNonDaemonThreadsToExit();
+  // Disable GC and wait for GC to complete in case there are still daemon threads doing
+  // allocations.
+  gc::Heap* const heap = Runtime::Current()->GetHeap();
+  heap->DisableGCForShutdown();
+  // In case a GC is in progress, wait for it to finish.
+  heap->WaitForGcToComplete(gc::kGcCauseBackground, Thread::Current());
+
   // TODO: there's an unaddressed race here where a thread may attach during shutdown, see
   //       Thread::Init.
   SuspendAllDaemonThreads();
diff --git a/runtime/thread_pool.h b/runtime/thread_pool.h
index 1ca0a21..a2338d6 100644
--- a/runtime/thread_pool.h
+++ b/runtime/thread_pool.h
@@ -91,7 +91,7 @@
   // after running it, it is the caller's responsibility.
   void AddTask(Thread* self, Task* task) REQUIRES(!task_queue_lock_);
 
-  explicit ThreadPool(const char* name, size_t num_threads);
+  ThreadPool(const char* name, size_t num_threads);
   virtual ~ThreadPool();
 
   // Wait for all tasks currently on queue to get completed.
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 20512f9..db3f2fe 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -30,6 +30,7 @@
 #include "base/stl_util.h"
 #include "base/unix_file/fd_file.h"
 #include "dex_file-inl.h"
+#include "dex_instruction.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
 #include "mirror/object-inl.h"
@@ -1452,4 +1453,372 @@
   return PrettyDescriptor(Primitive::Descriptor(type));
 }
 
+static void DumpMethodCFGImpl(const DexFile* dex_file,
+                              uint32_t dex_method_idx,
+                              const DexFile::CodeItem* code_item,
+                              std::ostream& os) {
+  os << "digraph {\n";
+  os << "  # /* " << PrettyMethod(dex_method_idx, *dex_file, true) << " */\n";
+
+  std::set<uint32_t> dex_pc_is_branch_target;
+  {
+    // Go and populate.
+    const Instruction* inst = Instruction::At(code_item->insns_);
+    for (uint32_t dex_pc = 0;
+         dex_pc < code_item->insns_size_in_code_units_;
+         dex_pc += inst->SizeInCodeUnits(), inst = inst->Next()) {
+      if (inst->IsBranch()) {
+        dex_pc_is_branch_target.insert(dex_pc + inst->GetTargetOffset());
+      } else if (inst->IsSwitch()) {
+        const uint16_t* insns = code_item->insns_ + dex_pc;
+        int32_t switch_offset = insns[1] | ((int32_t) insns[2]) << 16;
+        const uint16_t* switch_insns = insns + switch_offset;
+        uint32_t switch_count = switch_insns[1];
+        int32_t targets_offset;
+        if ((*insns & 0xff) == Instruction::PACKED_SWITCH) {
+          /* 0=sig, 1=count, 2/3=firstKey */
+          targets_offset = 4;
+        } else {
+          /* 0=sig, 1=count, 2..count*2 = keys */
+          targets_offset = 2 + 2 * switch_count;
+        }
+        for (uint32_t targ = 0; targ < switch_count; targ++) {
+          int32_t offset = (int32_t) switch_insns[targets_offset + targ * 2] |
+              (int32_t) (switch_insns[targets_offset + targ * 2 + 1] << 16);
+          dex_pc_is_branch_target.insert(dex_pc + offset);
+        }
+      }
+    }
+  }
+
+  // Create nodes for "basic blocks."
+  std::map<uint32_t, uint32_t> dex_pc_to_node_id;  // This only has entries for block starts.
+  std::map<uint32_t, uint32_t> dex_pc_to_incl_id;  // This has entries for all dex pcs.
+
+  {
+    const Instruction* inst = Instruction::At(code_item->insns_);
+    bool first_in_block = true;
+    bool force_new_block = false;
+    for (uint32_t dex_pc = 0; dex_pc < code_item->insns_size_in_code_units_;
+        dex_pc += inst->SizeInCodeUnits(), inst = inst->Next()) {
+      if (dex_pc == 0 ||
+          (dex_pc_is_branch_target.find(dex_pc) != dex_pc_is_branch_target.end()) ||
+          force_new_block) {
+        uint32_t id = dex_pc_to_node_id.size();
+        if (id > 0) {
+          // End last node.
+          os << "}\"];\n";
+        }
+        // Start next node.
+        os << "  node" << id << " [shape=record,label=\"{";
+        dex_pc_to_node_id.insert(std::make_pair(dex_pc, id));
+        first_in_block = true;
+        force_new_block = false;
+      }
+
+      // Register instruction.
+      dex_pc_to_incl_id.insert(std::make_pair(dex_pc, dex_pc_to_node_id.size() - 1));
+
+      // Print instruction.
+      if (!first_in_block) {
+        os << " | ";
+      } else {
+        first_in_block = false;
+      }
+
+      // Dump the instruction. Need to escape '"', '<', '>', '{' and '}'.
+      os << "<" << "p" << dex_pc << ">";
+      os << " 0x" << std::hex << dex_pc << std::dec << ": ";
+      std::string inst_str = inst->DumpString(dex_file);
+      // It's OK to start at zero; instruction dumps don't start with characters we need to
+      // escape.
+      size_t cur_start = 0;
+      while (cur_start != std::string::npos) {
+        size_t next_escape = inst_str.find_first_of("\"{}<>", cur_start + 1);
+        if (next_escape == std::string::npos) {
+          os << inst_str.substr(cur_start, inst_str.size() - cur_start);
+          break;
+        } else {
+          os << inst_str.substr(cur_start, next_escape - cur_start);
+          // Escape all necessary characters.
+          while (next_escape < inst_str.size()) {
+            char c = inst_str.at(next_escape);
+            if (c == '"' || c == '{' || c == '}' || c == '<' || c == '>') {
+              os << '\\' << c;
+            } else {
+              break;
+            }
+            next_escape++;
+          }
+          if (next_escape >= inst_str.size()) {
+            next_escape = std::string::npos;
+          }
+          cur_start = next_escape;
+        }
+      }
+
+      // Force a new block for some fall-throughs and some instructions that terminate the "local"
+      // control flow.
+      force_new_block = inst->IsSwitch() || inst->IsBasicBlockEnd();
+    }
+    // Close last node.
+    if (dex_pc_to_node_id.size() > 0) {
+      os << "}\"];\n";
+    }
+  }
+
+  // Create edges between them.
+  {
+    std::ostringstream regular_edges;
+    std::ostringstream taken_edges;
+    std::ostringstream exception_edges;
+
+    // Common set of exception edges.
+    std::set<uint32_t> exception_targets;
+
+    // These blocks (given by the first dex pc) need exception per dex-pc handling in a second
+    // pass. In the first pass we try and see whether we can use a common set of edges.
+    std::set<uint32_t> blocks_with_detailed_exceptions;
+
+    {
+      uint32_t last_node_id = std::numeric_limits<uint32_t>::max();
+      uint32_t old_dex_pc = 0;
+      uint32_t block_start_dex_pc = std::numeric_limits<uint32_t>::max();
+      const Instruction* inst = Instruction::At(code_item->insns_);
+      for (uint32_t dex_pc = 0;
+          dex_pc < code_item->insns_size_in_code_units_;
+          old_dex_pc = dex_pc, dex_pc += inst->SizeInCodeUnits(), inst = inst->Next()) {
+        {
+          auto it = dex_pc_to_node_id.find(dex_pc);
+          if (it != dex_pc_to_node_id.end()) {
+            if (!exception_targets.empty()) {
+              // It seems the last block had common exception handlers. Add the exception edges now.
+              uint32_t node_id = dex_pc_to_node_id.find(block_start_dex_pc)->second;
+              for (uint32_t handler_pc : exception_targets) {
+                auto node_id_it = dex_pc_to_incl_id.find(handler_pc);
+                if (node_id_it != dex_pc_to_incl_id.end()) {
+                  exception_edges << "  node" << node_id
+                      << " -> node" << node_id_it->second << ":p" << handler_pc
+                      << ";\n";
+                }
+              }
+              exception_targets.clear();
+            }
+
+            block_start_dex_pc = dex_pc;
+
+            // Seems to be a fall-through; connect to last_node_id. There may be spurious edges
+            // for things like switch data.
+            uint32_t old_last = last_node_id;
+            last_node_id = it->second;
+            if (old_last != std::numeric_limits<uint32_t>::max()) {
+              regular_edges << "  node" << old_last << ":p" << old_dex_pc
+                  << " -> node" << last_node_id << ":p" << dex_pc
+                  << ";\n";
+            }
+          }
+
+          // Look at the exceptions of the first entry.
+          CatchHandlerIterator catch_it(*code_item, dex_pc);
+          for (; catch_it.HasNext(); catch_it.Next()) {
+            exception_targets.insert(catch_it.GetHandlerAddress());
+          }
+        }
+
+        // Handle instruction.
+
+        // Branch: something with at most two targets.
+        if (inst->IsBranch()) {
+          const int32_t offset = inst->GetTargetOffset();
+          const bool conditional = !inst->IsUnconditional();
+
+          auto target_it = dex_pc_to_node_id.find(dex_pc + offset);
+          if (target_it != dex_pc_to_node_id.end()) {
+            taken_edges << "  node" << last_node_id << ":p" << dex_pc
+                << " -> node" << target_it->second << ":p" << (dex_pc + offset)
+                << ";\n";
+          }
+          if (!conditional) {
+            // No fall-through.
+            last_node_id = std::numeric_limits<uint32_t>::max();
+          }
+        } else if (inst->IsSwitch()) {
+          // TODO: Iterate through all switch targets.
+          const uint16_t* insns = code_item->insns_ + dex_pc;
+          /* make sure the start of the switch is in range */
+          int32_t switch_offset = insns[1] | ((int32_t) insns[2]) << 16;
+          /* offset to switch table is a relative branch-style offset */
+          const uint16_t* switch_insns = insns + switch_offset;
+          uint32_t switch_count = switch_insns[1];
+          int32_t targets_offset;
+          if ((*insns & 0xff) == Instruction::PACKED_SWITCH) {
+            /* 0=sig, 1=count, 2/3=firstKey */
+            targets_offset = 4;
+          } else {
+            /* 0=sig, 1=count, 2..count*2 = keys */
+            targets_offset = 2 + 2 * switch_count;
+          }
+          /* make sure the end of the switch is in range */
+          /* verify each switch target */
+          for (uint32_t targ = 0; targ < switch_count; targ++) {
+            int32_t offset = (int32_t) switch_insns[targets_offset + targ * 2] |
+                (int32_t) (switch_insns[targets_offset + targ * 2 + 1] << 16);
+            int32_t abs_offset = dex_pc + offset;
+            auto target_it = dex_pc_to_node_id.find(abs_offset);
+            if (target_it != dex_pc_to_node_id.end()) {
+              // TODO: value label.
+              taken_edges << "  node" << last_node_id << ":p" << dex_pc
+                  << " -> node" << target_it->second << ":p" << (abs_offset)
+                  << ";\n";
+            }
+          }
+        }
+
+        // Exception edges. If this is not the first instruction in the block, check whether its
+        // handlers match the block's common set; if they don't, defer the block to the detailed
+        // second pass.
+        if (block_start_dex_pc != dex_pc) {
+          std::set<uint32_t> current_handler_pcs;
+          CatchHandlerIterator catch_it(*code_item, dex_pc);
+          for (; catch_it.HasNext(); catch_it.Next()) {
+            current_handler_pcs.insert(catch_it.GetHandlerAddress());
+          }
+          if (current_handler_pcs != exception_targets) {
+            exception_targets.clear();  // Clear so we don't do something at the end.
+            blocks_with_detailed_exceptions.insert(block_start_dex_pc);
+          }
+        }
+
+        if (inst->IsReturn() ||
+            (inst->Opcode() == Instruction::THROW) ||
+            (inst->IsBranch() && inst->IsUnconditional())) {
+          // No fall-through.
+          last_node_id = std::numeric_limits<uint32_t>::max();
+        }
+      }
+      // Finish up the last block, if it had common exceptions.
+      if (!exception_targets.empty()) {
+        // It seems the last block had common exception handlers. Add the exception edges now.
+        uint32_t node_id = dex_pc_to_node_id.find(block_start_dex_pc)->second;
+        for (uint32_t handler_pc : exception_targets) {
+          auto node_id_it = dex_pc_to_incl_id.find(handler_pc);
+          if (node_id_it != dex_pc_to_incl_id.end()) {
+            exception_edges << "  node" << node_id
+                << " -> node" << node_id_it->second << ":p" << handler_pc
+                << ";\n";
+          }
+        }
+        exception_targets.clear();
+      }
+    }
+
+    // Second pass for detailed exception blocks.
+    // TODO
+    for (uint32_t dex_pc : blocks_with_detailed_exceptions) {
+      const Instruction* inst = Instruction::At(&code_item->insns_[dex_pc]);
+      uint32_t this_node_id = dex_pc_to_incl_id.find(dex_pc)->second;
+      for (;;) {
+        CatchHandlerIterator catch_it(*code_item, dex_pc);
+        if (catch_it.HasNext()) {
+          std::set<uint32_t> handled_targets;
+          for (; catch_it.HasNext(); catch_it.Next()) {
+            uint32_t handler_pc = catch_it.GetHandlerAddress();
+            auto it = handled_targets.find(handler_pc);
+            if (it == handled_targets.end()) {
+              auto node_id_it = dex_pc_to_incl_id.find(handler_pc);
+              if (node_id_it != dex_pc_to_incl_id.end()) {
+                exception_edges << "  node" << this_node_id << ":p" << dex_pc
+                    << " -> node" << node_id_it->second << ":p" << handler_pc
+                    << ";\n";
+              }
+
+              // Mark as done.
+              handled_targets.insert(handler_pc);
+            }
+          }
+        }
+        if (inst->IsBasicBlockEnd()) {
+          break;
+        }
+
+        // Loop update. Done in the body so we can break out late when the next instruction is a
+        // branch target and thus belongs to another block.
+        dex_pc += inst->SizeInCodeUnits();
+        if (dex_pc >= code_item->insns_size_in_code_units_) {
+          break;
+        }
+        if (dex_pc_to_node_id.find(dex_pc) != dex_pc_to_node_id.end()) {
+          break;
+        }
+        inst = inst->Next();
+      }
+    }
+
+    // Write out the sub-graphs to make edges styled.
+    os << "\n";
+    os << "  subgraph regular_edges {\n";
+    os << "    edge [color=\"#000000\",weight=.3,len=3];\n\n";
+    os << "    " << regular_edges.str() << "\n";
+    os << "  }\n\n";
+
+    os << "  subgraph taken_edges {\n";
+    os << "    edge [color=\"#00FF00\",weight=.3,len=3];\n\n";
+    os << "    " << taken_edges.str() << "\n";
+    os << "  }\n\n";
+
+    os << "  subgraph exception_edges {\n";
+    os << "    edge [color=\"#FF0000\",weight=.3,len=3];\n\n";
+    os << "    " << exception_edges.str() << "\n";
+    os << "  }\n\n";
+  }
+
+  os << "}\n";
+}
+
+void DumpMethodCFG(ArtMethod* method, std::ostream& os) {
+  const DexFile* dex_file = method->GetDexFile();
+  const DexFile::CodeItem* code_item = dex_file->GetCodeItem(method->GetCodeItemOffset());
+
+  DumpMethodCFGImpl(dex_file, method->GetDexMethodIndex(), code_item, os);
+}
+
+void DumpMethodCFG(const DexFile* dex_file, uint32_t dex_method_idx, std::ostream& os) {
+  // This is painful: we need to find the code item. That means finding the class, and then
+  // iterating the table.
+  if (dex_method_idx >= dex_file->NumMethodIds()) {
+    os << "Could not find method-idx.";
+    return;
+  }
+  const DexFile::MethodId& method_id = dex_file->GetMethodId(dex_method_idx);
+
+  const DexFile::ClassDef* class_def = dex_file->FindClassDef(method_id.class_idx_);
+  if (class_def == nullptr) {
+    os << "Could not find class-def.";
+    return;
+  }
+
+  const uint8_t* class_data = dex_file->GetClassData(*class_def);
+  if (class_data == nullptr) {
+    os << "No class data.";
+    return;
+  }
+
+  ClassDataItemIterator it(*dex_file, class_data);
+  // Skip fields
+  while (it.HasNextStaticField() || it.HasNextInstanceField()) {
+    it.Next();
+  }
+
+  // Find method, and dump it.
+  while (it.HasNextDirectMethod() || it.HasNextVirtualMethod()) {
+    uint32_t method_idx = it.GetMemberIndex();
+    if (method_idx == dex_method_idx) {
+      DumpMethodCFGImpl(dex_file, dex_method_idx, it.GetMethodCodeItem(), os);
+      return;
+    }
+    it.Next();
+  }
+
+  // Otherwise complain.
+  os << "Something went wrong, didn't find the method in the class data.";
+}
+
 }  // namespace art
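
The output is a Graphviz digraph: each basic block becomes a record-shaped node with one port per dex pc, and the three edge kinds are styled via the subgraphs written at the end. A hedged usage sketch against the declarations added to runtime/utils.h (the file name here is arbitrary, and the snippet presumes ART's headers):

  #include <cstdint>
  #include <fstream>

  // DumpMethodCFG is declared in runtime/utils.h.
  void WriteCfg(const art::DexFile* dex_file, uint32_t dex_method_idx) {
    std::ofstream out("method.dot");
    art::DumpMethodCFG(dex_file, dex_method_idx, out);
  }

  // Render offline with Graphviz:
  //   dot -Tpng method.dot -o method.png
  // Black edges are fall-throughs, green are taken branches, and red are
  // exception edges, matching the subgraph styles emitted above.
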
diff --git a/runtime/utils.h b/runtime/utils.h
index 4fa5f5a..d1be51a 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -324,6 +324,9 @@
   return pointer_size == 4 || pointer_size == 8;
 }
 
+void DumpMethodCFG(ArtMethod* method, std::ostream& os) SHARED_REQUIRES(Locks::mutator_lock_);
+void DumpMethodCFG(const DexFile* dex_file, uint32_t dex_method_idx, std::ostream& os);
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_UTILS_H_
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 0181e5b..7c6add1 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -53,6 +53,9 @@
 static constexpr bool gDebugVerify = false;
 // TODO: Add a constant to method_verifier to turn on verbose logging?
 
+// On VLOG(verifier), should we dump the whole state when we run into a hard failure?
+static constexpr bool kDumpRegLinesOnHardFailureIfVLOG = true;
+
 void PcToRegisterLineTable::Init(RegisterTrackingMode mode, InstructionFlags* flags,
                                  uint32_t insns_size, uint16_t registers_size,
                                  MethodVerifier* verifier) {
@@ -107,7 +110,7 @@
 }
 
 static void SafelyMarkAllRegistersAsConflicts(MethodVerifier* verifier, RegisterLine* reg_line) {
-  if (verifier->IsConstructor()) {
+  if (verifier->IsInstanceConstructor()) {
     // Before we mark all regs as conflicts, check that we don't have an uninitialized this.
     reg_line->CheckConstructorReturn(verifier);
   }
@@ -638,6 +641,12 @@
         Runtime::Current()->GetCompilerCallbacks()->ClassRejected(ref);
       }
       have_pending_hard_failure_ = true;
+      if (VLOG_IS_ON(verifier) && kDumpRegLinesOnHardFailureIfVLOG) {
+        ScopedObjectAccess soa(Thread::Current());
+        std::ostringstream oss;
+        Dump(oss);
+        LOG(ERROR) << oss.str();
+      }
       break;
     }
   }
@@ -1319,7 +1328,7 @@
   ScopedIndentation indent1(vios);
   const Instruction* inst = Instruction::At(code_item_->insns_);
   for (size_t dex_pc = 0; dex_pc < code_item_->insns_size_in_code_units_;
-      dex_pc += inst->SizeInCodeUnits()) {
+      dex_pc += inst->SizeInCodeUnits(), inst = inst->Next()) {
     RegisterLine* reg_line = reg_table_.GetLine(dex_pc);
     if (reg_line != nullptr) {
       vios->Stream() << reg_line->Dump(this) << "\n";
@@ -1331,7 +1340,6 @@
       vios->Stream() << inst->DumpHex(5) << " ";
     }
     vios->Stream() << inst->DumpString(dex_file_) << "\n";
-    inst = inst->Next();
   }
 }
 
@@ -1373,9 +1381,15 @@
     // argument as uninitialized. This restricts field access until the superclass constructor is
     // called.
     const RegType& declaring_class = GetDeclaringClass();
-    if (IsConstructor() && !declaring_class.IsJavaLangObject()) {
-      reg_line->SetRegisterType(this, arg_start + cur_arg,
-                                reg_types_.UninitializedThisArgument(declaring_class));
+    if (IsConstructor()) {
+      if (declaring_class.IsJavaLangObject()) {
+        // "this" is implicitly initialized.
+        reg_line->SetThisInitialized();
+        reg_line->SetRegisterType(this, arg_start + cur_arg, declaring_class);
+      } else {
+        reg_line->SetRegisterType(this, arg_start + cur_arg,
+                                  reg_types_.UninitializedThisArgument(declaring_class));
+      }
     } else {
       reg_line->SetRegisterType(this, arg_start + cur_arg, declaring_class);
     }
@@ -1698,16 +1712,6 @@
   std::unique_ptr<RegisterLine> branch_line;
   std::unique_ptr<RegisterLine> fallthrough_line;
 
-  /*
-   * If we are in a constructor, and we currently have an UninitializedThis type
-   * in a register somewhere, we need to make sure it isn't overwritten.
-   */
-  bool track_uninitialized_this = false;
-  size_t uninitialized_this_loc = 0;
-  if (IsConstructor()) {
-    track_uninitialized_this = work_line_->GetUninitializedThisLoc(this, &uninitialized_this_loc);
-  }
-
   switch (inst->Opcode()) {
     case Instruction::NOP:
       /*
@@ -1785,14 +1789,14 @@
       break;
     }
     case Instruction::RETURN_VOID:
-      if (!IsConstructor() || work_line_->CheckConstructorReturn(this)) {
+      if (!IsInstanceConstructor() || work_line_->CheckConstructorReturn(this)) {
         if (!GetMethodReturnType().IsConflict()) {
           Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "return-void not expected";
         }
       }
       break;
     case Instruction::RETURN:
-      if (!IsConstructor() || work_line_->CheckConstructorReturn(this)) {
+      if (!IsInstanceConstructor() || work_line_->CheckConstructorReturn(this)) {
         /* check the method signature */
         const RegType& return_type = GetMethodReturnType();
         if (!return_type.IsCategory1Types()) {
@@ -1817,7 +1821,7 @@
       }
       break;
     case Instruction::RETURN_WIDE:
-      if (!IsConstructor() || work_line_->CheckConstructorReturn(this)) {
+      if (!IsInstanceConstructor() || work_line_->CheckConstructorReturn(this)) {
         /* check the method signature */
         const RegType& return_type = GetMethodReturnType();
         if (!return_type.IsCategory2Types()) {
@@ -1833,7 +1837,7 @@
       }
       break;
     case Instruction::RETURN_OBJECT:
-      if (!IsConstructor() || work_line_->CheckConstructorReturn(this)) {
+      if (!IsInstanceConstructor() || work_line_->CheckConstructorReturn(this)) {
         const RegType& return_type = GetMethodReturnType();
         if (!return_type.IsReferenceTypes()) {
           Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "return-object not expected";
@@ -3003,20 +3007,6 @@
      */
   }  // end - switch (dec_insn.opcode)
 
-  /*
-   * If we are in a constructor, and we had an UninitializedThis type
-   * in a register somewhere, we need to make sure it wasn't overwritten.
-   */
-  if (track_uninitialized_this) {
-    bool was_invoke_direct = (inst->Opcode() == Instruction::INVOKE_DIRECT ||
-                              inst->Opcode() == Instruction::INVOKE_DIRECT_RANGE);
-    if (work_line_->WasUninitializedThisOverwritten(this, uninitialized_this_loc,
-                                                    was_invoke_direct)) {
-      Fail(VERIFY_ERROR_BAD_CLASS_HARD)
-          << "Constructor failed to initialize this object";
-    }
-  }
-
   if (have_pending_hard_failure_) {
     if (Runtime::Current()->IsAotCompiler()) {
       /* When AOT compiling, check that the last failure is a hard failure */
@@ -3956,7 +3946,24 @@
     if (array_type.IsZero()) {
       // Null array type; this code path will fail at runtime.
       // Still check that the given value matches the instruction's type.
-      work_line_->VerifyRegisterType(this, inst->VRegA_23x(), insn_type);
+      // Note: this is, as usual, complicated by the fact that the instruction isn't fully typed
+      //       and fits multiple register types.
+      const RegType* modified_reg_type = &insn_type;
+      if ((modified_reg_type == &reg_types_.Integer()) ||
+          (modified_reg_type == &reg_types_.LongLo())) {
+        // May be integer or float | long or double. Overwrite insn_type accordingly.
+        const RegType& value_type = work_line_->GetRegisterType(this, inst->VRegA_23x());
+        if (modified_reg_type == &reg_types_.Integer()) {
+          if (&value_type == &reg_types_.Float()) {
+            modified_reg_type = &value_type;
+          }
+        } else {
+          if (&value_type == &reg_types_.DoubleLo()) {
+            modified_reg_type = &value_type;
+          }
+        }
+      }
+      work_line_->VerifyRegisterType(this, inst->VRegA_23x(), *modified_reg_type);
     } else if (!array_type.IsArrayTypes()) {
       Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "not array type " << array_type << " with aput";
     } else {
@@ -4378,6 +4385,10 @@
       const Instruction* ret_inst = Instruction::At(code_item_->insns_ + next_insn);
       Instruction::Code opcode = ret_inst->Opcode();
       if (opcode == Instruction::RETURN_VOID || opcode == Instruction::RETURN_VOID_NO_BARRIER) {
+        // Explicitly copy the this-initialized flag from the merge-line, as we didn't copy its
+        // state. Must be done before SafelyMarkAllRegistersAsConflicts as that will do the
+        // super-constructor-call checking.
+        target_line->CopyThisInitialized(*merge_line);
         SafelyMarkAllRegistersAsConflicts(this, target_line);
       } else {
         target_line->CopyFromLine(merge_line);
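
The aput refinement above exists because aput and aput-wide each carry one nominal type that covers two value types (int|float and long|double); on a known-null array the verifier can only type-check against the value actually in the register. A standalone model of the rule:

  #include <cstdio>

  enum class RegType { kInteger, kFloat, kLongLo, kDoubleLo, kRef };

  // Mirrors the refinement: aput's nominal Integer also covers Float, and
  // aput-wide's nominal LongLo also covers DoubleLo.
  RegType RefineAputType(RegType insn_type, RegType value_type) {
    if (insn_type == RegType::kInteger && value_type == RegType::kFloat) {
      return RegType::kFloat;
    }
    if (insn_type == RegType::kLongLo && value_type == RegType::kDoubleLo) {
      return RegType::kDoubleLo;
    }
    return insn_type;
  }

  int main() {
    std::printf("%d\n", RefineAputType(RegType::kInteger, RegType::kFloat) == RegType::kFloat);
  }
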
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index 3b59bba..21f8543 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -283,6 +283,10 @@
     return (method_access_flags_ & kAccStatic) != 0;
   }
 
+  bool IsInstanceConstructor() const {
+    return IsConstructor() && !IsStatic();
+  }
+
   SafeMap<uint32_t, std::set<uint32_t>>& GetStringInitPcRegMap() {
     return string_init_pc_reg_map_;
   }
diff --git a/runtime/verifier/register_line.cc b/runtime/verifier/register_line.cc
index 2838681..f286a45 100644
--- a/runtime/verifier/register_line.cc
+++ b/runtime/verifier/register_line.cc
@@ -18,66 +18,30 @@
 
 #include "base/stringprintf.h"
 #include "dex_instruction-inl.h"
-#include "method_verifier.h"
+#include "method_verifier-inl.h"
 #include "register_line-inl.h"
 #include "reg_type-inl.h"
 
 namespace art {
 namespace verifier {
 
-bool RegisterLine::WasUninitializedThisOverwritten(MethodVerifier* verifier,
-                                                   size_t this_loc,
-                                                   bool was_invoke_direct) const {
-  DCHECK(verifier->IsConstructor());
-
-  // Is the UnintializedThis type still there?
-  if (GetRegisterType(verifier, this_loc).IsUninitializedThisReference() ||
-      GetRegisterType(verifier, this_loc).IsUnresolvedAndUninitializedThisReference()) {
-    return false;
-  }
-
-  // If there is an initialized reference here now, did we just perform an invoke-direct? Note that
-  // this is the correct approach for dex bytecode: results of invoke-direct are stored in the
-  // result register. Overwriting "this_loc" can only be done by a constructor call.
-  if (GetRegisterType(verifier, this_loc).IsReferenceTypes() && was_invoke_direct) {
-    return false;
-    // Otherwise we could have just copied a different initialized reference to this location.
-  }
-
-  // The UnintializedThis in the register is gone, so check to see if it's somewhere else now.
-  for (size_t i = 0; i < num_regs_; i++) {
-    if (GetRegisterType(verifier, i).IsUninitializedThisReference() ||
-        GetRegisterType(verifier, i).IsUnresolvedAndUninitializedThisReference()) {
-      // We found it somewhere else...
-      return false;
-    }
-  }
-
-  // The UninitializedThis is gone from the original register, and now we can't find it.
-  return true;
-}
-
-bool RegisterLine::GetUninitializedThisLoc(MethodVerifier* verifier, size_t* vreg) const {
-  for (size_t i = 0; i < num_regs_; i++) {
-    if (GetRegisterType(verifier, i).IsUninitializedThisReference() ||
-        GetRegisterType(verifier, i).IsUnresolvedAndUninitializedThisReference()) {
-      *vreg = i;
-      return true;
-    }
-  }
-  return false;
-}
-
 bool RegisterLine::CheckConstructorReturn(MethodVerifier* verifier) const {
-  for (size_t i = 0; i < num_regs_; i++) {
-    if (GetRegisterType(verifier, i).IsUninitializedThisReference() ||
-        GetRegisterType(verifier, i).IsUnresolvedAndUninitializedThisReference()) {
-      verifier->Fail(VERIFY_ERROR_BAD_CLASS_SOFT)
-          << "Constructor returning without calling superclass constructor";
-      return false;
+  if (kIsDebugBuild && this_initialized_) {
+    // Ensure that there is no UninitializedThisReference type anymore if this_initialized_ is true.
+    for (size_t i = 0; i < num_regs_; i++) {
+      const RegType& type = GetRegisterType(verifier, i);
+      CHECK(!type.IsUninitializedThisReference() &&
+            !type.IsUnresolvedAndUninitializedThisReference())
+          << i << ": " << type.IsUninitializedThisReference() << " in "
+          << PrettyMethod(verifier->GetMethodReference().dex_method_index,
+                          *verifier->GetMethodReference().dex_file);
     }
   }
-  return true;
+  if (!this_initialized_) {
+    verifier->Fail(VERIFY_ERROR_BAD_CLASS_HARD)
+        << "Constructor returning without calling superclass constructor";
+  }
+  return this_initialized_;
 }
 
 const RegType& RegisterLine::GetInvocationThis(MethodVerifier* verifier, const Instruction* inst,
@@ -148,6 +112,11 @@
       }
     }
   }
+  // Is this initializing "this"?
+  if (uninit_type.IsUninitializedThisReference() ||
+      uninit_type.IsUnresolvedAndUninitializedThisReference()) {
+    this_initialized_ = true;
+  }
   DCHECK_GT(changed, 0u);
 }
 
@@ -432,6 +401,11 @@
       }
     }
   }
+  // Check whether "this" was initialized in both paths.
+  if (this_initialized_ && !incoming_line->this_initialized_) {
+    this_initialized_ = false;
+    changed = true;
+  }
   return changed;
 }
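
this_initialized_ is ordinary dataflow state: it is set when the uninitialized-this register gets constructed, and merged with logical AND at join points, so it survives only if every incoming path called a superclass constructor. A minimal model of the merge rule:

  #include <cstdio>

  // Merge of the this_initialized_ bit at a join point: true only if "this"
  // was initialized on every incoming path; report whether the line changed.
  bool MergeThisInitialized(bool& current, bool incoming) {
    if (current && !incoming) {
      current = false;
      return true;  // Changed; the join must be revisited.
    }
    return false;
  }

  int main() {
    bool line = true;
    std::printf("%d %d\n", MergeThisInitialized(line, false), line);  // 1 0
  }
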
 
diff --git a/runtime/verifier/register_line.h b/runtime/verifier/register_line.h
index 4fb3a2c..f61e51f 100644
--- a/runtime/verifier/register_line.h
+++ b/runtime/verifier/register_line.h
@@ -114,6 +114,7 @@
     memcpy(&line_, &src->line_, num_regs_ * sizeof(uint16_t));
     monitors_ = src->monitors_;
     reg_to_lock_depths_ = src->reg_to_lock_depths_;
+    this_initialized_ = src->this_initialized_;
   }
 
   std::string Dump(MethodVerifier* verifier) const SHARED_REQUIRES(Locks::mutator_lock_);
@@ -149,6 +150,14 @@
   void MarkAllRegistersAsConflictsExcept(MethodVerifier* verifier, uint32_t vsrc);
   void MarkAllRegistersAsConflictsExceptWide(MethodVerifier* verifier, uint32_t vsrc);
 
+  void SetThisInitialized() {
+    this_initialized_ = true;
+  }
+
+  void CopyThisInitialized(const RegisterLine& src) {
+    this_initialized_ = src.this_initialized_;
+  }
+
   /*
    * Check constraints on constructor return. Specifically, make sure that the "this" argument got
    * initialized.
@@ -158,18 +167,6 @@
    */
   bool CheckConstructorReturn(MethodVerifier* verifier) const;
 
-  /*
-   * Check if an UninitializedThis at the specified location has been overwritten before
-   * being correctly initialized.
-   */
-  bool WasUninitializedThisOverwritten(MethodVerifier* verifier, size_t this_loc,
-                                       bool was_invoke_direct) const;
-
-  /*
-   * Get the first location of an UninitializedThis type, or return kInvalidVreg if there are none.
-   */
-  bool GetUninitializedThisLoc(MethodVerifier* verifier, size_t* vreg) const;
-
   // Compare two register lines. Returns 0 if they match.
   // Using this for a sort is unwise, since the value can change based on machine endianness.
   int CompareLine(const RegisterLine* line2) const {
@@ -354,7 +351,7 @@
   }
 
   RegisterLine(size_t num_regs, MethodVerifier* verifier)
-      : num_regs_(num_regs) {
+      : num_regs_(num_regs), this_initialized_(false) {
     memset(&line_, 0, num_regs_ * sizeof(uint16_t));
     SetResultTypeToUnknown(verifier);
   }
@@ -372,6 +369,9 @@
   // monitor-enter on v5 and then on v6, we expect the monitor-exit to be on v6 then on v5.
   AllocationTrackingSafeMap<uint32_t, uint32_t, kAllocatorTagVerifier> reg_to_lock_depths_;
 
+  // Whether "this" initialization (a constructor supercall) has happened.
+  bool this_initialized_;
+
   // An array of RegType Ids associated with each dex register.
   uint16_t line_[0];
 
diff --git a/test/004-ThreadStress/src/Main.java b/test/004-ThreadStress/src/Main.java
index 6e7d5b6..7acd950 100644
--- a/test/004-ThreadStress/src/Main.java
+++ b/test/004-ThreadStress/src/Main.java
@@ -32,6 +32,7 @@
 //
 // ThreadStress command line parameters:
 //    -n X ............ number of threads
+//    -d X ............ number of daemon threads
 //    -o X ............ number of overall operations
 //    -t X ............ number of operations per thread
 //    --dumpmap ....... print the frequency map
@@ -301,6 +302,7 @@
 
     public static void parseAndRun(String[] args) throws Exception {
         int numberOfThreads = -1;
+        int numberOfDaemons = -1;
         int totalOperations = -1;
         int operationsPerThread = -1;
         Object lock = new Object();
@@ -312,6 +314,9 @@
                 if (args[i].equals("-n")) {
                     i++;
                     numberOfThreads = Integer.parseInt(args[i]);
+                } else if (args[i].equals("-d")) {
+                    i++;
+                    numberOfDaemons = Integer.parseInt(args[i]);
                 } else if (args[i].equals("-o")) {
                     i++;
                     totalOperations = Integer.parseInt(args[i]);
@@ -338,6 +343,10 @@
             numberOfThreads = 5;
         }
 
+        if (numberOfDaemons == -1) {
+            numberOfDaemons = 3;
+        }
+
         if (totalOperations == -1) {
             totalOperations = 1000;
         }
@@ -355,14 +364,16 @@
             System.out.println(frequencyMap);
         }
 
-        runTest(numberOfThreads, operationsPerThread, lock, frequencyMap);
+        runTest(numberOfThreads, numberOfDaemons, operationsPerThread, lock, frequencyMap);
     }
 
-    public static void runTest(final int numberOfThreads, final int operationsPerThread,
-                               final Object lock, Map<Operation, Double> frequencyMap)
-                                   throws Exception {
-        // Each thread is going to do operationsPerThread
-        // operations. The distribution of operations is determined by
+    public static void runTest(final int numberOfThreads, final int numberOfDaemons,
+                               final int operationsPerThread, final Object lock,
+                               Map<Operation, Double> frequencyMap) throws Exception {
+        // Each normal thread is going to do operationsPerThread
+        // operations. Each daemon thread will loop over all
+        // the operations and will not stop.
+        // The distribution of operations is determined by
         // the Operation.frequency values. We fill out an Operation[]
         // for each thread with the operations it is to perform. The
         // Operation[] is shuffled so that there is more random
@@ -371,7 +382,9 @@
         // Fill in the Operation[] array for each thread by laying
         // down references to operation according to their desired
         // frequency.
-        final Main[] threadStresses = new Main[numberOfThreads];
+        // The first numberOfThreads elements are normal threads, the last
+        // numberOfDaemons elements are daemon threads.
+        final Main[] threadStresses = new Main[numberOfThreads + numberOfDaemons];
         for (int t = 0; t < threadStresses.length; t++) {
             Operation[] operations = new Operation[operationsPerThread];
             int o = 0;
@@ -388,9 +401,10 @@
                     }
                 }
             }
-            // Randomize the oepration order
+            // Randomize the operation order
             Collections.shuffle(Arrays.asList(operations));
-            threadStresses[t] = new Main(lock, t, operations);
+            threadStresses[t] = t < numberOfThreads ? new Main(lock, t, operations) :
+                                                      new Daemon(lock, t, operations);
         }
 
         // Enable to dump operation counts per thread to make sure its
@@ -459,6 +473,14 @@
             notifier.start();
         }
 
+        // Create and start the daemon threads.
+        for (int r = 0; r < numberOfDaemons; r++) {
+            Main daemon = threadStresses[numberOfThreads + r];
+            Thread t = new Thread(daemon, "Daemon thread " + daemon.id);
+            t.setDaemon(true);
+            t.start();
+        }
+
         for (int r = 0; r < runners.length; r++) {
             runners[r].start();
         }
@@ -467,9 +489,9 @@
         }
     }
 
-    private final Operation[] operations;
+    protected final Operation[] operations;
     private final Object lock;
-    private final int id;
+    protected final int id;
 
     private int nextOperation;
 
@@ -503,4 +525,36 @@
         }
     }
 
+    private static class Daemon extends Main {
+        private Daemon(Object lock, int id, Operation[] operations) {
+            super(lock, id, operations);
+        }
+
+        public void run() {
+            try {
+                if (DEBUG) {
+                    System.out.println("Starting ThreadStress Daemon " + id);
+                }
+                int i = 0;
+                while (true) {
+                    Operation operation = operations[i];
+                    if (DEBUG) {
+                        System.out.println("ThreadStress Daemon " + id
+                                           + " operation " + i
+                                           + " is " + operation);
+                    }
+                    operation.perform();
+                    i = (i + 1) % operations.length;
+                }
+            } catch (OutOfMemoryError e) {
+                // Catch OutOfMemoryErrors since these can cause the test to fail if they print
+                // the stack trace after "Finishing worker".
+            } finally {
+                if (DEBUG) {
+                    System.out.println("Finishing ThreadStress Daemon for " + id);
+                }
+            }
+        }
+    }
+
 }
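
Note on the Daemon loop above: the daemon threads never leave their while (true)
loop, yet the test still terminates, because the VM does not wait for daemon
threads at shutdown. A minimal standalone illustration of that semantics (a
sketch for context, not part of the patch):

    class DaemonExitDemo {
        public static void main(String[] args) throws Exception {
            Thread d = new Thread(new Runnable() {
                public void run() {
                    while (true) { }  // spins forever, like Daemon.run() above
                }
            });
            d.setDaemon(true);  // the VM may exit while this thread is still running
            d.start();
            Thread.sleep(100);  // give the daemon a chance to start
            // main() returns here; with only daemon threads left, the VM shuts down.
        }
    }
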
diff --git a/test/494-checker-instanceof-tests/src/Main.java b/test/494-checker-instanceof-tests/src/Main.java
index bff9c72..2eac6c9 100644
--- a/test/494-checker-instanceof-tests/src/Main.java
+++ b/test/494-checker-instanceof-tests/src/Main.java
@@ -129,6 +129,26 @@
     return $inline$interfaceTypeTest(finalUnrelatedField);
   }
 
+  // Check that we remove the LoadClass instruction from the graph.
+  /// CHECK-START: boolean Main.knownTestWithLoadedClass() register (after)
+  /// CHECK-NOT: LoadClass
+  public static boolean knownTestWithLoadedClass() {
+    return new String() instanceof String;
+  }
+
+  // Check that we do not remove the LoadClass instruction from the graph.
+  /// CHECK-START: boolean Main.knownTestWithUnloadedClass() register (after)
+  /// CHECK: <<Const:i\d+>> IntConstant 0
+  /// CHECK:                LoadClass
+  /// CHECK:                Return [<<Const>>]
+  public static boolean knownTestWithUnloadedClass() {
+    return $inline$returnMain() instanceof String;
+  }
+
+  public static Object $inline$returnMain() {
+    return new Main();
+  }
+
   public static void expect(boolean expected, boolean actual) {
     if (expected != actual) {
       throw new Error("Unexpected result");
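
The new checker tests rely on the fold being exact: new String() instanceof String
is always true, and String is provably loaded at that point (it was just
instantiated), so the LoadClass can be removed; $inline$returnMain() instanceof
String folds to false after inlining, but the LoadClass for String must stay.
The calls in main are outside this hunk; presumably (an assumption) they are
exercised like the existing tests:

    expect(true, knownTestWithLoadedClass());
    expect(false, knownTestWithUnloadedClass());
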
diff --git a/test/495-checker-checkcast-tests/src/Main.java b/test/495-checker-checkcast-tests/src/Main.java
index aa6d5a7..4b2bf09 100644
--- a/test/495-checker-checkcast-tests/src/Main.java
+++ b/test/495-checker-checkcast-tests/src/Main.java
@@ -112,6 +112,33 @@
     return $inline$interfaceTypeTest(finalUnrelatedField);
   }
 
+  /// CHECK-START: java.lang.String Main.knownTestWithLoadedClass() register (after)
+  /// CHECK-NOT: LoadClass
+  public static String knownTestWithLoadedClass() {
+    return (String)$inline$getString();
+  }
+
+  /// CHECK-START: Itf Main.knownTestWithUnloadedClass() register (after)
+  /// CHECK: LoadClass
+  public static Itf knownTestWithUnloadedClass() {
+    return (Itf)$inline$getString();
+  }
+
+  public static Object $inline$getString() {
+    return new String();
+  }
+
+  public static Object $inline$getMain() {
+    return new Main();
+  }
+
+  /// CHECK-START: void Main.nonNullBoundType() register (after)
+  /// CHECK-NOT: NullCheck
+  public static void nonNullBoundType() {
+    Main main = (Main)$inline$getMain();
+    main.getClass();
+  }
+
   public static void main(String[] args) {
     classTypeTestNull();
     try {
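
The nonNullBoundType() expectation follows from type propagation through the
cast: after inlining, the cast's input is a fresh allocation, which is never
null, and the bound type the cast produces keeps that property, so the implicit
null check before getClass() is redundant. A rough sketch of that reasoning (my
reading of the checker directives, not stated in the patch):

    Object o = new Main();  // NewInstance: provably non-null
    Main m = (Main) o;      // cast succeeds; the bound type stays non-null
    m.getClass();           // hence no NullCheck in the graph after optimization
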
diff --git a/test/526-long-regalloc/expected.txt b/test/526-long-regalloc/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/526-long-regalloc/expected.txt
diff --git a/test/526-long-regalloc/info.txt b/test/526-long-regalloc/info.txt
new file mode 100644
index 0000000..a5ce1bc
--- /dev/null
+++ b/test/526-long-regalloc/info.txt
@@ -0,0 +1,2 @@
+Regression test for the optimizing compiler, which used to trip when allocating
+a register pair under certain circumstances.
diff --git a/test/526-long-regalloc/src/Main.java b/test/526-long-regalloc/src/Main.java
new file mode 100644
index 0000000..e8b3096
--- /dev/null
+++ b/test/526-long-regalloc/src/Main.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+class Main {
+  public static void main(String[] args) {
+    foo();
+  }
+
+  public static void foo() {
+    int a = myField1; // esi
+    int b = myField2; // edi
+    $noinline$bar(); // forces a and b to be allocated to callee-save registers
+    int c = myField3; // ecx
+    int e = myField4; // ebx
+    int f = myField5; // edx
+    long d = a == 42 ? myLongField1 : 42L; // Will call AllocateBlockedReg -> edx/ebx
+
+    // At this point, the register allocator used to be in a bogus state, where the low
+    // part of the interval was in the active set, but not the high part.
+
+    long i = myLongField1; // Will call TrySplitNonPairOrUnalignedPairIntervalAt -> Failing DCHECK
+
+    // Use esi and edi first so that d does not get allocated to them.
+    myField2 = a;
+    myField3 = b;
+
+    // The following sequence of instructions makes the AllocateBlockedReg call
+    // for allocating the d variable misbehave: allocating the low interval would split
+    // both the low and the high interval at the fixed use; therefore the allocation of the
+    // high interval would not see the register use, and would assume the interval can just
+    // be spilled and not be put in the active set, even though it is holding a register.
+    myField1 = (int)d; // stack use
+    myLongField3 = (long) myField2; // edx fixed use
+    myLongField2 = d; // register use
+
+    // Ensure the HInstructions mapping to i, c, e, and f have live ranges.
+    myLongField1 = i;
+    myField4 = c;
+    myField5 = e;
+    myField6 = f;
+  }
+
+  public static long $noinline$bar() {
+    if (doThrow) throw new Error();
+    return 42;
+  }
+
+  public static boolean doThrow = false;
+
+  public static int myField1 = 0;
+  public static int myField2 = 0;
+  public static int myField3 = 0;
+  public static int myField4 = 0;
+  public static int myField5 = 0;
+  public static int myField6 = 0;
+  public static long myLongField1 = 0L;
+  public static long myLongField2 = 0L;
+  public static long myLongField3 = 0L;
+}
diff --git a/test/800-smali/expected.txt b/test/800-smali/expected.txt
index 884f280..dd37cdb 100644
--- a/test/800-smali/expected.txt
+++ b/test/800-smali/expected.txt
@@ -38,4 +38,7 @@
 b/22411633 (5)
 b/22777307
 b/22881413
+b/20843113
+b/23201502 (float)
+b/23201502 (double)
 Done!
diff --git a/test/800-smali/smali/b_20843113.smali b/test/800-smali/smali/b_20843113.smali
new file mode 100644
index 0000000..ab3dc41
--- /dev/null
+++ b/test/800-smali/smali/b_20843113.smali
@@ -0,0 +1,34 @@
+.class public LB20843113;
+.super Ljava/lang/Object;
+
+
+.method public constructor <init>(I)V
+.registers 2
+
+:Label1
+       # An instruction that may throw, so as to pass UninitializedThis to the handler
+       div-int v1, v1, v1
+
+       # Call the super-constructor
+       invoke-direct {v0}, Ljava/lang/Object;-><init>()V
+
+       # Return normally.
+       return-void
+
+:Label2
+
+
+:Handler
+       move-exception v0                    # Overwrite the (last) "this" register. This should be
+                                            # allowed as we will terminate abnormally below.
+
+       throw v0                             # Terminate abnormally
+
+.catchall {:Label1 .. :Label2} :Handler
+.end method
+
+# Just a dummy.
+.method public static run()V
+.registers 1
+       return-void
+.end method
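
This shape cannot be produced from Java source: javac emits the super() call
before any user statement, so 'this' is always initialized by the time an
exception handler can run, which is why the test has to be written in smali.
The nearest Java constructor (hypothetical, for contrast) behaves differently:

    class Contrast {
        Contrast(int i) {
            // The implicit super() has already run by this point, so an
            // exception thrown here never observes an uninitialized 'this'.
            int x = i / i;  // may throw ArithmeticException, like div-int above
        }
    }
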
diff --git a/test/800-smali/smali/b_23201502.smali b/test/800-smali/smali/b_23201502.smali
new file mode 100644
index 0000000..d958938
--- /dev/null
+++ b/test/800-smali/smali/b_23201502.smali
@@ -0,0 +1,23 @@
+.class public LB23201502;
+
+.super Ljava/lang/Object;
+
+.method public static runFloat()V
+   .registers 3
+   const v0, 0             # Null array.
+   const v1, 0             # 0 index into array.
+   const v2, 0             # 0 value, will be turned into float.
+   int-to-float v2, v2     # Definitely make v2 float.
+   aput v2, v0, v1         # Put into null array.
+   return-void
+.end method
+
+.method public static runDouble()V
+   .registers 4
+   const v0, 0             # Null array.
+   const v1, 0             # 0 index into array.
+   const v2, 0             # 0 value, will be turned into double.
+   int-to-double v2, v2    # Definitely make v2+v3 double.
+   aput-wide v2, v0, v1    # Put into null array.
+   return-void
+.end method
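
Both methods store a typed zero through a null array reference; per the test
cases added to Main.java below, the expected outcome is a NullPointerException
at runtime rather than a verifier rejection or a crash. An illustrative Java
equivalent (the test exists as smali precisely because a Java compiler would
type these registers unambiguously):

    static void runFloatEquivalent() {
        float[] a = null;
        a[0] = 0.0f;  // must throw NullPointerException
    }

    static void runDoubleEquivalent() {
        double[] a = null;
        a[0] = 0.0;   // must throw NullPointerException
    }
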
diff --git a/test/800-smali/src/Main.java b/test/800-smali/src/Main.java
index e1ac749..b481a1d 100644
--- a/test/800-smali/src/Main.java
+++ b/test/800-smali/src/Main.java
@@ -122,6 +122,11 @@
         testCases.add(new TestCase("b/22777307", "B22777307", "run", null, new InstantiationError(),
                 null));
         testCases.add(new TestCase("b/22881413", "B22881413", "run", null, null, null));
+        testCases.add(new TestCase("b/20843113", "B20843113", "run", null, null, null));
+        testCases.add(new TestCase("b/23201502 (float)", "B23201502", "runFloat", null,
+                new NullPointerException(), null));
+        testCases.add(new TestCase("b/23201502 (double)", "B23201502", "runDouble", null,
+                new NullPointerException(), null));
     }
 
     public void runTests() {
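
Each TestCase pairs a smali class and method with an expected exception (null
for a normal return). The runTests() body is outside this hunk; a plausible
sketch of how one case could be checked via java.lang.reflect, with all names
assumed rather than taken from the file:

    // Hypothetical harness logic, not the actual runTests() implementation.
    Method m = Class.forName(testClassName).getMethod(testMethodName);
    try {
        m.invoke(null);
        if (expectedException != null) {
            throw new Error("Expected " + expectedException.getClass().getName());
        }
    } catch (InvocationTargetException e) {
        if (expectedException == null
                || !expectedException.getClass().equals(e.getCause().getClass())) {
            throw new Error("Unexpected exception " + e.getCause());
        }
    }
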