Revert "Revert "Enable store elimination for singleton objects.""

This reverts commit 55d02cf056f993aeafebd54e7b7c68c7a48507c9, and
makes the following change:
Currently we leverage loop side effects to decide whether heap values are
killed by the loop. Stores need to be kept if heap values may be killed
by loops and the corresponding loads cannot be eliminated. A similar
thing needs to be done for each predecessor when we merge predecessor
heap values. To do that, the HInstanceFieldSet instruction itself is put
into the heap value array instead of the stored value. The store
instruction may be added to possibly_removed_stores_ first, but can later
be removed from possibly_removed_stores_ when it's found that the store
needs to be kept due to merging/loop side effects.
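
For example, a minimal Java sketch mirroring the new test23 below (obj is
assumed to be a non-escaping, non-finalizable singleton):

    TestClass obj = new TestClass();
    obj.i = 3;      // Stays in possibly_removed_stores_ and gets eliminated:
                    // each branch below overwrites the field.
    if (b) {
      obj.i += 1;   // First added to possibly_removed_stores_, later removed
    } else {        // from it (i.e. kept): the merge after the branches kills
      obj.i += 2;   // the heap value, so the load below cannot be eliminated.
    }
    return obj.i;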

Change-Id: I4f7bb1960f7b47240873e00ff1adac46fc102a02
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 2b60a51..5da7214 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1104,7 +1104,11 @@
   // access because the verifier was unable to?
   const DexFile* dex_file = cu_->dex_file;
   CompilerDriver* driver = cu_->compiler_driver;
-  if (driver->CanAccessInstantiableTypeWithoutChecks(cu_->method_idx, *dex_file, type_idx)) {
+  bool finalizable;
+  if (driver->CanAccessInstantiableTypeWithoutChecks(cu_->method_idx,
+                                                     *dex_file,
+                                                     type_idx,
+                                                     &finalizable)) {
     bool is_type_initialized;
     bool use_direct_type_ptr;
     uintptr_t direct_type_ptr;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index bf3a865..e42a737 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1208,7 +1208,8 @@
 
 bool CompilerDriver::CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_idx,
                                                             const DexFile& dex_file,
-                                                            uint32_t type_idx) {
+                                                            uint32_t type_idx,
+                                                            bool* finalizable) {
   ScopedObjectAccess soa(Thread::Current());
   mirror::DexCache* dex_cache = Runtime::Current()->GetClassLinker()->FindDexCache(
       soa.Self(), dex_file, false);
@@ -1216,8 +1217,11 @@
   mirror::Class* resolved_class = dex_cache->GetResolvedType(type_idx);
   if (resolved_class == nullptr) {
     stats_->TypeNeedsAccessCheck();
+    // Be conservative.
+    *finalizable = true;
     return false;  // Unknown class needs access checks.
   }
+  *finalizable = resolved_class->IsFinalizable();
   const DexFile::MethodId& method_id = dex_file.GetMethodId(referrer_idx);
   mirror::Class* referrer_class = dex_cache->GetResolvedType(method_id.class_idx_);
   if (referrer_class == nullptr) {
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 5683b03..dae785b 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -211,8 +211,11 @@
       REQUIRES(!Locks::mutator_lock_);
 
   // Are runtime access and instantiable checks necessary in the code?
-  bool CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_idx, const DexFile& dex_file,
-                                              uint32_t type_idx)
+  // out_is_finalizable is set to whether the type is finalizable.
+  bool CanAccessInstantiableTypeWithoutChecks(uint32_t referrer_idx,
+                                              const DexFile& dex_file,
+                                              uint32_t type_idx,
+                                              bool* out_is_finalizable)
       REQUIRES(!Locks::mutator_lock_);
 
   bool CanEmbedTypeInCode(const DexFile& dex_file, uint32_t type_idx,
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 167c35d..3257de1 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -1449,7 +1449,8 @@
                                         uint32_t* args,
                                         uint32_t register_index) {
   HInstruction* length = graph_->GetIntConstant(number_of_vreg_arguments, dex_pc);
-  QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+  bool finalizable;
+  QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index, &finalizable)
       ? kQuickAllocArrayWithAccessCheck
       : kQuickAllocArray;
   HInstruction* object = new (arena_) HNewArray(length,
@@ -1629,9 +1630,9 @@
   }
 }
 
-bool HGraphBuilder::NeedsAccessCheck(uint32_t type_index) const {
+bool HGraphBuilder::NeedsAccessCheck(uint32_t type_index, bool* finalizable) const {
   return !compiler_driver_->CanAccessInstantiableTypeWithoutChecks(
-      dex_compilation_unit_->GetDexMethodIndex(), *dex_file_, type_index);
+      dex_compilation_unit_->GetDexMethodIndex(), *dex_file_, type_index, finalizable);
 }
 
 void HGraphBuilder::BuildSwitchJumpTable(const SwitchTable& table,
@@ -2508,7 +2509,9 @@
         current_block_->AddInstruction(fake_string);
         UpdateLocal(register_index, fake_string, dex_pc);
       } else {
-        QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+        bool finalizable;
+        bool can_throw = NeedsAccessCheck(type_index, &finalizable);
+        QuickEntrypointEnum entrypoint = can_throw
             ? kQuickAllocObjectWithAccessCheck
             : kQuickAllocObject;
 
@@ -2517,6 +2520,8 @@
             dex_pc,
             type_index,
             *dex_compilation_unit_->GetDexFile(),
+            can_throw,
+            finalizable,
             entrypoint));
         UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction(), dex_pc);
       }
@@ -2526,7 +2531,8 @@
     case Instruction::NEW_ARRAY: {
       uint16_t type_index = instruction.VRegC_22c();
       HInstruction* length = LoadLocal(instruction.VRegB_22c(), Primitive::kPrimInt, dex_pc);
-      QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index)
+      bool finalizable;
+      QuickEntrypointEnum entrypoint = NeedsAccessCheck(type_index, &finalizable)
           ? kQuickAllocArrayWithAccessCheck
           : kQuickAllocArray;
       current_block_->AddInstruction(new (arena_) HNewArray(length,
diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h
index 9eaa4b6..f857ef0 100644
--- a/compiler/optimizing/builder.h
+++ b/compiler/optimizing/builder.h
@@ -138,7 +138,10 @@
   HInstruction* LoadLocal(uint32_t register_index, Primitive::Type type, uint32_t dex_pc) const;
   void PotentiallyAddSuspendCheck(HBasicBlock* target, uint32_t dex_pc);
   void InitializeParameters(uint16_t number_of_parameters);
-  bool NeedsAccessCheck(uint32_t type_index) const;
+
+  // Returns whether the current method needs an access check for the type.
+  // The output parameter finalizable is set to whether the type is finalizable.
+  bool NeedsAccessCheck(uint32_t type_index, /*out*/bool* finalizable) const;
 
   template<typename T>
   void Unop_12x(const Instruction& instruction, Primitive::Type type, uint32_t dex_pc);
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 6fbb682..068d5db 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -119,19 +119,10 @@
       : ref_info_(ref_info),
         offset_(offset),
         index_(index),
-        declaring_class_def_index_(declaring_class_def_index),
-        may_become_unknown_(true) {
+        declaring_class_def_index_(declaring_class_def_index) {
     DCHECK(ref_info != nullptr);
     DCHECK((offset == kInvalidFieldOffset && index != nullptr) ||
            (offset != kInvalidFieldOffset && index == nullptr));
-
-    if (ref_info->IsSingletonAndNotReturned()) {
-      // We try to track stores to singletons that aren't returned to eliminate the stores
-      // since values in singleton's fields cannot be killed due to aliasing. Those values
-      // can still be killed due to merging values since we don't build phi for merging heap
-      // values. SetMayBecomeUnknown(true) may be called later once such merge becomes possible.
-      may_become_unknown_ = false;
-    }
   }
 
   ReferenceInfo* GetReferenceInfo() const { return ref_info_; }
@@ -148,21 +139,11 @@
     return index_ != nullptr;
   }
 
-  // Returns true if this heap location's value may become unknown after it's
-  // set to a value, due to merge of values, or killed due to aliasing.
-  bool MayBecomeUnknown() const {
-    return may_become_unknown_;
-  }
-  void SetMayBecomeUnknown(bool val) {
-    may_become_unknown_ = val;
-  }
-
  private:
   ReferenceInfo* const ref_info_;      // reference for instance/static field or array access.
   const size_t offset_;                // offset of static/instance field.
   HInstruction* const index_;          // index of an array element.
   const int16_t declaring_class_def_index_;  // declaring class's def's dex index.
-  bool may_become_unknown_;            // value may become kUnknownHeapValue.
 
   DISALLOW_COPY_AND_ASSIGN(HeapLocation);
 };
@@ -381,26 +362,13 @@
     return heap_locations_[heap_location_idx];
   }
 
-  void VisitFieldAccess(HInstruction* field_access,
-                        HInstruction* ref,
-                        const FieldInfo& field_info,
-                        bool is_store) {
+  void VisitFieldAccess(HInstruction* ref, const FieldInfo& field_info) {
     if (field_info.IsVolatile()) {
       has_volatile_ = true;
     }
     const uint16_t declaring_class_def_index = field_info.GetDeclaringClassDefIndex();
     const size_t offset = field_info.GetFieldOffset().SizeValue();
-    HeapLocation* location = GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index);
-    // A store of a value may be eliminated if all future loads for that value can be eliminated.
-    // For a value that's stored into a singleton field, the value will not be killed due
-    // to aliasing. However if the value is set in a block that doesn't post dominate the definition,
-    // the value may be killed due to merging later. Before we have post dominating info, we check
-    // if the store is in the same block as the definition just to be conservative.
-    if (is_store &&
-        location->GetReferenceInfo()->IsSingletonAndNotReturned() &&
-        field_access->GetBlock() != ref->GetBlock()) {
-      location->SetMayBecomeUnknown(true);
-    }
+    GetOrCreateHeapLocation(ref, offset, nullptr, declaring_class_def_index);
   }
 
   void VisitArrayAccess(HInstruction* array, HInstruction* index) {
@@ -409,20 +377,20 @@
   }
 
   void VisitInstanceFieldGet(HInstanceFieldGet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), false);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
   }
 
   void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), true);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
     has_heap_stores_ = true;
   }
 
   void VisitStaticFieldGet(HStaticFieldGet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), false);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
   }
 
   void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE {
-    VisitFieldAccess(instruction, instruction->InputAt(0), instruction->GetFieldInfo(), true);
+    VisitFieldAccess(instruction->InputAt(0), instruction->GetFieldInfo());
     has_heap_stores_ = true;
   }
 
@@ -464,9 +432,14 @@
 };
 
 // An unknown heap value. Loads with such a value in the heap location cannot be eliminated.
+// A heap location can be set to kUnknownHeapValue when:
+// - initially, before any value is set.
+// - killed due to aliasing, merging, invocation, or loop side effects.
 static HInstruction* const kUnknownHeapValue =
     reinterpret_cast<HInstruction*>(static_cast<uintptr_t>(-1));
+
 // Default heap value after an allocation.
+// A heap location can be set to that value right after an allocation.
 static HInstruction* const kDefaultHeapValue =
     reinterpret_cast<HInstruction*>(static_cast<uintptr_t>(-2));
 
@@ -484,29 +457,17 @@
                                                     kUnknownHeapValue,
                                                     graph->GetArena()->Adapter(kArenaAllocLSE)),
                          graph->GetArena()->Adapter(kArenaAllocLSE)),
-        removed_instructions_(graph->GetArena()->Adapter(kArenaAllocLSE)),
-        substitute_instructions_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        removed_loads_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        substitute_instructions_for_loads_(graph->GetArena()->Adapter(kArenaAllocLSE)),
+        possibly_removed_stores_(graph->GetArena()->Adapter(kArenaAllocLSE)),
         singleton_new_instances_(graph->GetArena()->Adapter(kArenaAllocLSE)) {
   }
 
   void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
-    int block_id = block->GetBlockId();
-    ArenaVector<HInstruction*>& heap_values = heap_values_for_[block_id];
+    // Populate the heap_values array for this block.
     // TODO: try to reuse the heap_values array from one predecessor if possible.
     if (block->IsLoopHeader()) {
-      // We do a single pass in reverse post order. For loops, use the side effects as a hint
-      // to see if the heap values should be killed.
-      if (side_effects_.GetLoopEffects(block).DoesAnyWrite()) {
-        // Leave all values as kUnknownHeapValue.
-      } else {
-        // Inherit the values from pre-header.
-        HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
-        ArenaVector<HInstruction*>& pre_header_heap_values =
-            heap_values_for_[pre_header->GetBlockId()];
-        for (size_t i = 0; i < heap_values.size(); i++) {
-          heap_values[i] = pre_header_heap_values[i];
-        }
-      }
+      HandleLoopSideEffects(block);
     } else {
       MergePredecessorValues(block);
     }
@@ -515,23 +476,34 @@
 
   // Remove recorded instructions that should be eliminated.
   void RemoveInstructions() {
-    size_t size = removed_instructions_.size();
-    DCHECK_EQ(size, substitute_instructions_.size());
+    size_t size = removed_loads_.size();
+    DCHECK_EQ(size, substitute_instructions_for_loads_.size());
     for (size_t i = 0; i < size; i++) {
-      HInstruction* instruction = removed_instructions_[i];
-      DCHECK(instruction != nullptr);
-      HInstruction* substitute = substitute_instructions_[i];
-      if (substitute != nullptr) {
-        // Keep tracing substitute till one that's not removed.
-        HInstruction* sub_sub = FindSubstitute(substitute);
-        while (sub_sub != substitute) {
-          substitute = sub_sub;
-          sub_sub = FindSubstitute(substitute);
-        }
-        instruction->ReplaceWith(substitute);
+      HInstruction* load = removed_loads_[i];
+      DCHECK(load != nullptr);
+      DCHECK(load->IsInstanceFieldGet() ||
+             load->IsStaticFieldGet() ||
+             load->IsArrayGet());
+      HInstruction* substitute = substitute_instructions_for_loads_[i];
+      DCHECK(substitute != nullptr);
+      // Keep tracing the substitute until we reach one that's not removed.
+      HInstruction* sub_sub = FindSubstitute(substitute);
+      while (sub_sub != substitute) {
+        substitute = sub_sub;
+        sub_sub = FindSubstitute(substitute);
       }
-      instruction->GetBlock()->RemoveInstruction(instruction);
+      load->ReplaceWith(substitute);
+      load->GetBlock()->RemoveInstruction(load);
     }
+
+    // At this point, stores in possibly_removed_stores_ can be safely removed.
+    size = possibly_removed_stores_.size();
+    for (size_t i = 0; i < size; i++) {
+      HInstruction* store = possibly_removed_stores_[i];
+      DCHECK(store->IsInstanceFieldSet() || store->IsStaticFieldSet() || store->IsArraySet());
+      store->GetBlock()->RemoveInstruction(store);
+    }
+
     // TODO: remove unnecessary allocations.
     // Eliminate instructions in singleton_new_instances_ that:
     // - don't have uses,
@@ -541,6 +513,52 @@
   }
 
  private:
+  // If heap_values[index] is an instance field store, we need to keep the store.
+  // This is necessary if a heap value is killed due to merging, or due to loop
+  // side effects (which are essentially merging as well), since a later load
+  // from the location won't be eliminated.
+  void KeepIfIsStore(HInstruction* heap_value) {
+    if (heap_value == kDefaultHeapValue ||
+        heap_value == kUnknownHeapValue ||
+        !heap_value->IsInstanceFieldSet()) {
+      return;
+    }
+    auto idx = std::find(possibly_removed_stores_.begin(),
+        possibly_removed_stores_.end(), heap_value);
+    if (idx != possibly_removed_stores_.end()) {
+      // Make sure the store is kept.
+      possibly_removed_stores_.erase(idx);
+    }
+  }
+
+  void HandleLoopSideEffects(HBasicBlock* block) {
+    DCHECK(block->IsLoopHeader());
+    int block_id = block->GetBlockId();
+    ArenaVector<HInstruction*>& heap_values = heap_values_for_[block_id];
+    HBasicBlock* pre_header = block->GetLoopInformation()->GetPreHeader();
+    ArenaVector<HInstruction*>& pre_header_heap_values =
+        heap_values_for_[pre_header->GetBlockId()];
+    // We do a single pass in reverse post order. For loops, use the side effects as a hint
+    // to see if the heap values should be killed.
+    if (side_effects_.GetLoopEffects(block).DoesAnyWrite()) {
+      for (size_t i = 0; i < pre_header_heap_values.size(); i++) {
+        // The heap value is killed by loop side effects; we need to keep the last store.
+        KeepIfIsStore(pre_header_heap_values[i]);
+      }
+      if (kIsDebugBuild) {
+        // heap_values should all still be the kUnknownHeapValue they were initialized with.
+        for (size_t i = 0; i < heap_values.size(); i++) {
+          DCHECK_EQ(heap_values[i], kUnknownHeapValue);
+        }
+      }
+    } else {
+      // Inherit the values from pre-header.
+      for (size_t i = 0; i < heap_values.size(); i++) {
+        heap_values[i] = pre_header_heap_values[i];
+      }
+    }
+  }
+
   void MergePredecessorValues(HBasicBlock* block) {
     const ArenaVector<HBasicBlock*>& predecessors = block->GetPredecessors();
     if (predecessors.size() == 0) {
@@ -548,16 +566,25 @@
     }
     ArenaVector<HInstruction*>& heap_values = heap_values_for_[block->GetBlockId()];
     for (size_t i = 0; i < heap_values.size(); i++) {
-      HInstruction* value = heap_values_for_[predecessors[0]->GetBlockId()][i];
-      if (value != kUnknownHeapValue) {
+      HInstruction* pred0_value = heap_values_for_[predecessors[0]->GetBlockId()][i];
+      heap_values[i] = pred0_value;
+      if (pred0_value != kUnknownHeapValue) {
         for (size_t j = 1; j < predecessors.size(); j++) {
-          if (heap_values_for_[predecessors[j]->GetBlockId()][i] != value) {
-            value = kUnknownHeapValue;
+          HInstruction* pred_value = heap_values_for_[predecessors[j]->GetBlockId()][i];
+          if (pred_value != pred0_value) {
+            heap_values[i] = kUnknownHeapValue;
             break;
           }
         }
       }
-      heap_values[i] = value;
+
+      if (heap_values[i] == kUnknownHeapValue) {
+        // Keep the last store in each predecessor since future loads from this location cannot be eliminated.
+        for (size_t j = 0; j < predecessors.size(); j++) {
+          ArenaVector<HInstruction*>& pred_values = heap_values_for_[predecessors[j]->GetBlockId()];
+          KeepIfIsStore(pred_values[i]);
+        }
+      }
     }
   }
 
@@ -616,21 +643,30 @@
     HInstruction* heap_value = heap_values[idx];
     if (heap_value == kDefaultHeapValue) {
       HInstruction* constant = GetDefaultValue(instruction->GetType());
-      removed_instructions_.push_back(instruction);
-      substitute_instructions_.push_back(constant);
+      removed_loads_.push_back(instruction);
+      substitute_instructions_for_loads_.push_back(constant);
       heap_values[idx] = constant;
       return;
     }
+    if (heap_value != kUnknownHeapValue && heap_value->IsInstanceFieldSet()) {
+      HInstruction* store = heap_value;
+      // This load must be from a singleton, since it loads from the same field
+      // into which a possibly removed store put the value, and such stores target singletons.
+      DCHECK(ref_info->IsSingleton());
+      // Get the real heap value of the store.
+      heap_value = store->InputAt(1);
+    }
     if ((heap_value != kUnknownHeapValue) &&
         // Keep the load due to possible I/F, J/D array aliasing.
         // See b/22538329 for details.
         (heap_value->GetType() == instruction->GetType())) {
-      removed_instructions_.push_back(instruction);
-      substitute_instructions_.push_back(heap_value);
+      removed_loads_.push_back(instruction);
+      substitute_instructions_for_loads_.push_back(heap_value);
       TryRemovingNullCheck(instruction);
       return;
     }
 
+    // Load isn't eliminated.
     if (heap_value == kUnknownHeapValue) {
       // Put the load as the value into the HeapLocation.
       // This acts like GVN but with better aliasing analysis.
@@ -662,51 +698,65 @@
     ArenaVector<HInstruction*>& heap_values =
         heap_values_for_[instruction->GetBlock()->GetBlockId()];
     HInstruction* heap_value = heap_values[idx];
-    bool redundant_store = false;
+    bool same_value = false;
+    bool possibly_redundant = false;
     if (Equal(heap_value, value)) {
       // Store into the heap location with the same value.
-      redundant_store = true;
+      same_value = true;
     } else if (index != nullptr) {
       // For array element, don't eliminate stores since it can be easily aliased
       // with non-constant index.
     } else if (!heap_location_collector_.MayDeoptimize() &&
-               ref_info->IsSingletonAndNotReturned() &&
-               !heap_location_collector_.GetHeapLocation(idx)->MayBecomeUnknown()) {
-      // Store into a field of a singleton that's not returned. And that value cannot be
-      // killed due to merge. It's redundant since future loads will get the value
-      // set by this instruction.
-      Primitive::Type type = Primitive::kPrimVoid;
-      if (instruction->IsInstanceFieldSet()) {
-        type = instruction->AsInstanceFieldSet()->GetFieldInfo().GetFieldType();
-      } else if (instruction->IsStaticFieldSet()) {
-        type = instruction->AsStaticFieldSet()->GetFieldInfo().GetFieldType();
+               ref_info->IsSingletonAndNotReturned()) {
+      // Store into a field of a singleton that's not returned. The value cannot be
+      // killed due to aliasing/invocation, so the store can be redundant: future loads
+      // can directly get the value set by this instruction. However, the value can
+      // still be killed due to merging or loop side effects; such stores are removed
+      // from possibly_removed_stores_ (i.e., kept) once that is detected.
+      possibly_redundant = true;
+      HNewInstance* new_instance = ref_info->GetReference()->AsNewInstance();
+      DCHECK(new_instance != nullptr);
+      if (new_instance->IsFinalizable()) {
+        // Finalizable objects escape globally. Need to keep the store.
+        possibly_redundant = false;
       } else {
-        DCHECK(false) << "Must be an instance/static field set instruction.";
+        HLoopInformation* loop_info = instruction->GetBlock()->GetLoopInformation();
+        if (loop_info != nullptr) {
+          // instruction is a store in the loop, so the loop must do writes.
+          DCHECK(side_effects_.GetLoopEffects(loop_info->GetHeader()).DoesAnyWrite());
+
+          if (loop_info->IsLoopInvariant(original_ref, false)) {
+            DCHECK(original_ref->GetBlock()->Dominates(loop_info->GetPreHeader()));
+            // Keep the store since its value may be needed at the loop header.
+            possibly_redundant = false;
+          } else {
+            // The singleton is created inside the loop. The value stored into it
+            // isn't needed at the loop header. This is true for outer loops as well.
+          }
+        }
       }
-      if (value->GetType() != type) {
-        // I/F, J/D aliasing should not happen for fields.
-        DCHECK(Primitive::IsIntegralType(value->GetType()));
-        DCHECK(!Primitive::Is64BitType(value->GetType()));
-        DCHECK(Primitive::IsIntegralType(type));
-        DCHECK(!Primitive::Is64BitType(type));
-        // Keep the store since the corresponding load isn't eliminated due to different types.
-        // TODO: handle the different int types so that we can eliminate this store.
-        redundant_store = false;
-      } else {
-        redundant_store = true;
-      }
-      // TODO: eliminate the store if the singleton object is not finalizable.
-      redundant_store = false;
     }
-    if (redundant_store) {
-      removed_instructions_.push_back(instruction);
-      substitute_instructions_.push_back(nullptr);
-      TryRemovingNullCheck(instruction);
+    if (same_value || possibly_redundant) {
+      possibly_removed_stores_.push_back(instruction);
+      // Same-value/singleton-field store shouldn't have a null check.
+      DCHECK(!ref->InputAt(0)->IsNullCheck());
     }
 
-    heap_values[idx] = value;
+    if (!same_value) {
+      if (possibly_redundant) {
+        DCHECK(instruction->IsInstanceFieldSet());
+        // Put the store as the heap value. If the value is later loaded from
+        // the heap, this store isn't really redundant.
+        heap_values[idx] = instruction;
+      } else {
+        heap_values[idx] = value;
+      }
+    }
     // This store may kill values in other heap locations due to aliasing.
     for (size_t i = 0; i < heap_values.size(); i++) {
+      if (i == idx) {
+        continue;
+      }
       if (heap_values[i] == value) {
         // Same value should be kept even if aliasing happens.
         continue;
@@ -834,9 +884,10 @@
       return;
     }
     if (!heap_location_collector_.MayDeoptimize() &&
-        ref_info->IsSingletonAndNotReturned()) {
-      // The allocation might be eliminated.
-      singleton_new_instances_.push_back(new_instance);
+        ref_info->IsSingletonAndNotReturned() &&
+        !new_instance->IsFinalizable() &&
+        !new_instance->CanThrow()) {
+      // TODO: add new_instance to singleton_new_instances_ and enable allocation elimination.
     }
     ArenaVector<HInstruction*>& heap_values =
         heap_values_for_[new_instance->GetBlock()->GetBlockId()];
@@ -854,10 +905,10 @@
   // Find an instruction's substitute if it should be removed.
   // Return the same instruction if it should not be removed.
   HInstruction* FindSubstitute(HInstruction* instruction) {
-    size_t size = removed_instructions_.size();
+    size_t size = removed_loads_.size();
     for (size_t i = 0; i < size; i++) {
-      if (removed_instructions_[i] == instruction) {
-        return substitute_instructions_[i];
+      if (removed_loads_[i] == instruction) {
+        return substitute_instructions_for_loads_[i];
       }
     }
     return instruction;
@@ -871,8 +922,13 @@
 
   // We record the instructions that should be eliminated but may be
   // used by heap locations. They'll be removed in the end.
-  ArenaVector<HInstruction*> removed_instructions_;
-  ArenaVector<HInstruction*> substitute_instructions_;
+  ArenaVector<HInstruction*> removed_loads_;
+  ArenaVector<HInstruction*> substitute_instructions_for_loads_;
+
+  // Stores in this list may later be removed from the list when it's found
+  // that the store must be kept (i.e., cannot be eliminated after all).
+  ArenaVector<HInstruction*> possibly_removed_stores_;
+
   ArenaVector<HInstruction*> singleton_new_instances_;
 
   DISALLOW_COPY_AND_ASSIGN(LSEVisitor);
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 1da2a1d..e3c810e 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -3643,10 +3643,14 @@
                uint32_t dex_pc,
                uint16_t type_index,
                const DexFile& dex_file,
+               bool can_throw,
+               bool finalizable,
                QuickEntrypointEnum entrypoint)
       : HExpression(Primitive::kPrimNot, SideEffects::CanTriggerGC(), dex_pc),
         type_index_(type_index),
         dex_file_(dex_file),
+        can_throw_(can_throw),
+        finalizable_(finalizable),
         entrypoint_(entrypoint) {
     SetRawInputAt(0, current_method);
   }
@@ -3656,11 +3660,13 @@
 
   // Calls runtime so needs an environment.
   bool NeedsEnvironment() const OVERRIDE { return true; }
-  // It may throw when called on:
-  //   - interfaces
-  //   - abstract/innaccessible/unknown classes
-  // TODO: optimize when possible.
-  bool CanThrow() const OVERRIDE { return true; }
+
+  // It may throw when called on a type that's not instantiable/accessible.
+  // It can also throw an OOME.
+  // TODO: distinguish between the two cases so we can for example allow allocation elimination.
+  bool CanThrow() const OVERRIDE { return can_throw_ || true; }
+
+  bool IsFinalizable() const { return finalizable_; }
 
   bool CanBeNull() const OVERRIDE { return false; }
 
@@ -3671,6 +3677,8 @@
  private:
   const uint16_t type_index_;
   const DexFile& dex_file_;
+  const bool can_throw_;
+  const bool finalizable_;
   const QuickEntrypointEnum entrypoint_;
 
   DISALLOW_COPY_AND_ASSIGN(HNewInstance);
diff --git a/test/530-checker-lse/src/Main.java b/test/530-checker-lse/src/Main.java
index c766aaa..13c4722 100644
--- a/test/530-checker-lse/src/Main.java
+++ b/test/530-checker-lse/src/Main.java
@@ -22,7 +22,7 @@
     return radius * radius * Math.PI;
   }
   private double radius;
-};
+}
 
 class TestClass {
   TestClass() {
@@ -35,17 +35,31 @@
   int j;
   volatile int k;
   TestClass next;
+  String str;
   static int si;
-};
+}
 
 class SubTestClass extends TestClass {
   int k;
-};
+}
 
 class TestClass2 {
   int i;
   int j;
-};
+}
+
+class Finalizable {
+  static boolean sVisited = false;
+  static final int VALUE = 0xbeef;
+  int i;
+
+  protected void finalize() {
+    if (i != VALUE) {
+      System.out.println("Where is the beef?");
+    }
+    sVisited = true;
+  }
+}
 
 public class Main {
 
@@ -56,7 +70,7 @@
 
   /// CHECK-START: double Main.calcCircleArea(double) load_store_elimination (after)
   /// CHECK: NewInstance
-  /// CHECK: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldSet
   /// CHECK-NOT: InstanceFieldGet
 
   static double calcCircleArea(double radius) {
@@ -117,7 +131,7 @@
   /// CHECK: InstanceFieldGet
   /// CHECK: InstanceFieldSet
   /// CHECK: NewInstance
-  /// CHECK: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldSet
   /// CHECK-NOT: InstanceFieldGet
 
   // A new allocation shouldn't alias with pre-existing values.
@@ -223,7 +237,7 @@
 
   /// CHECK-START: int Main.test8() load_store_elimination (after)
   /// CHECK: NewInstance
-  /// CHECK: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldSet
   /// CHECK: InvokeVirtual
   /// CHECK-NOT: NullCheck
   /// CHECK-NOT: InstanceFieldGet
@@ -381,8 +395,8 @@
 
   /// CHECK-START: int Main.test16() load_store_elimination (after)
   /// CHECK: NewInstance
-  /// CHECK-NOT: StaticFieldSet
-  /// CHECK-NOT: StaticFieldGet
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldGet
 
   // Test inlined constructor.
   static int test16() {
@@ -398,8 +412,8 @@
   /// CHECK-START: int Main.test17() load_store_elimination (after)
   /// CHECK: <<Const0:i\d+>> IntConstant 0
   /// CHECK: NewInstance
-  /// CHECK-NOT: StaticFieldSet
-  /// CHECK-NOT: StaticFieldGet
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldGet
   /// CHECK: Return [<<Const0>>]
 
   // Test getting default value.
@@ -455,6 +469,148 @@
     return obj;
   }
 
+  /// CHECK-START: void Main.test21() load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: StaticFieldSet
+  /// CHECK: StaticFieldGet
+
+  /// CHECK-START: void Main.test21() load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: StaticFieldSet
+  /// CHECK: InstanceFieldGet
+
+  // Loop side effects can kill heap values; stores need to be kept in that case.
+  static void test21() {
+    TestClass obj = new TestClass();
+    obj.str = "abc";
+    for (int i = 0; i < 2; i++) {
+      // Generate some loop side effect that does write.
+      obj.si = 1;
+    }
+    System.out.print(obj.str.substring(0, 0));
+  }
+
+  /// CHECK-START: int Main.test22() load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldGet
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldGet
+  /// CHECK: InstanceFieldGet
+
+  /// CHECK-START: int Main.test22() load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: NewInstance
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldGet
+  /// CHECK: NewInstance
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK: InstanceFieldGet
+  /// CHECK-NOT: InstanceFieldGet
+
+  // Loop side effects only affect stores into singletons that dominate the loop header.
+  static int test22() {
+    int sum = 0;
+    TestClass obj1 = new TestClass();
+    obj1.i = 2;       // This store can't be eliminated since it can be killed by loop side effects.
+    for (int i = 0; i < 2; i++) {
+      TestClass obj2 = new TestClass();
+      obj2.i = 3;    // This store can be eliminated since the singleton is inside the loop.
+      sum += obj2.i;
+    }
+    TestClass obj3 = new TestClass();
+    obj3.i = 5;      // This store can be eliminated since the singleton is created after the loop.
+    sum += obj1.i + obj3.i;
+    return sum;
+  }
+
+  /// CHECK-START: int Main.test23(boolean) load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldGet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldGet
+  /// CHECK: Return
+  /// CHECK: InstanceFieldGet
+  /// CHECK: InstanceFieldSet
+
+  /// CHECK-START: int Main.test23(boolean) load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK-NOT: InstanceFieldSet
+  /// CHECK-NOT: InstanceFieldGet
+  /// CHECK: InstanceFieldSet
+  /// CHECK: InstanceFieldGet
+  /// CHECK: Return
+  /// CHECK-NOT: InstanceFieldGet
+  /// CHECK: InstanceFieldSet
+
+  // Test store elimination on merging.
+  static int test23(boolean b) {
+    TestClass obj = new TestClass();
+    obj.i = 3;      // This store can be eliminated since the value flows into each branch.
+    if (b) {
+      obj.i += 1;   // This store cannot be eliminated due to the merge later.
+    } else {
+      obj.i += 2;   // This store cannot be eliminated due to the merge later.
+    }
+    return obj.i;
+  }
+
+  /// CHECK-START: void Main.testFinalizable() load_store_elimination (before)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+
+  /// CHECK-START: void Main.testFinalizable() load_store_elimination (after)
+  /// CHECK: NewInstance
+  /// CHECK: InstanceFieldSet
+
+  // Allocations and stores into finalizable objects cannot be eliminated.
+  static void testFinalizable() {
+    Finalizable finalizable = new Finalizable();
+    finalizable.i = Finalizable.VALUE;
+  }
+
+  static java.lang.ref.WeakReference<Object> getWeakReference() {
+    return new java.lang.ref.WeakReference<>(new Object());
+  }
+
+  static void testFinalizableByForcingGc() {
+    testFinalizable();
+    java.lang.ref.WeakReference<Object> reference = getWeakReference();
+
+    Runtime runtime = Runtime.getRuntime();
+    for (int i = 0; i < 20; ++i) {
+      runtime.gc();
+      System.runFinalization();
+      try {
+        Thread.sleep(1);
+      } catch (InterruptedException e) {
+        throw new AssertionError(e);
+      }
+
+      // Check to see if the weak reference has been garbage collected.
+      if (reference.get() == null) {
+        // A little bit more sleep time to make sure.
+        try {
+          Thread.sleep(100);
+        } catch (InterruptedException e) {
+          throw new AssertionError(e);
+        }
+        if (!Finalizable.sVisited) {
+          System.out.println("finalize() not called.");
+        }
+        return;
+      }
+    }
+    System.out.println("testFinalizableByForcingGc() failed to force gc.");
+  }
+
   public static void assertIntEquals(int expected, int result) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);
@@ -508,5 +664,10 @@
     float[] fa2 = { 1.8f };
     assertFloatEquals(test19(fa1, fa2), 1.8f);
     assertFloatEquals(test20().i, 0);
+    test21();
+    assertIntEquals(test22(), 13);
+    assertIntEquals(test23(true), 4);
+    assertIntEquals(test23(false), 5);
+    testFinalizableByForcingGc();
   }
 }