Merge "Run Java tests with the optimizing compiler."
diff --git a/build/Android.cpplint.mk b/build/Android.cpplint.mk
index adb87cb..1ecad21 100644
--- a/build/Android.cpplint.mk
+++ b/build/Android.cpplint.mk
@@ -16,7 +16,7 @@
 
 ART_CPPLINT := art/tools/cpplint.py
 ART_CPPLINT_FILTER := --filter=-whitespace/line_length,-build/include,-readability/function,-readability/streams,-readability/todo,-runtime/references,-runtime/sizeof,-runtime/threadsafe_fn,-runtime/printf
-ART_CPPLINT_SRC := $(shell find art -name *.h -o -name *$(ART_CPP_EXTENSION) | grep -v art/compiler/llvm/generated/)
+ART_CPPLINT_SRC := $(shell find art -name "*.h" -o -name "*$(ART_CPP_EXTENSION)" | grep -v art/compiler/llvm/generated/)
 
 # "mm cpplint-art" to verify we aren't regressing
 .PHONY: cpplint-art
diff --git a/compiler/dex/bb_optimizations.h b/compiler/dex/bb_optimizations.h
index fb482bf..6d500a5 100644
--- a/compiler/dex/bb_optimizations.h
+++ b/compiler/dex/bb_optimizations.h
@@ -59,6 +59,34 @@
 };
 
 /**
+ * @class CallInlining
+ * @brief Perform method inlining pass.
+ */
+class CallInlining : public Pass {
+ public:
+  CallInlining() : Pass("CallInlining") {
+  }
+
+  bool Gate(const CompilationUnit* cUnit) const {
+    return cUnit->mir_graph->InlineCallsGate();
+  }
+
+  void Start(CompilationUnit* cUnit) const {
+    cUnit->mir_graph->InlineCallsStart();
+  }
+
+  bool WalkBasicBlocks(CompilationUnit* cUnit, BasicBlock* bb) const {
+    cUnit->mir_graph->InlineCalls(bb);
+    // No need to repeat, so just return false.
+    return false;
+  }
+
+  void End(CompilationUnit* cUnit) const {
+    cUnit->mir_graph->InlineCallsEnd();
+  }
+};
+
+/**
  * @class CodeLayout
  * @brief Perform the code layout pass.
  */
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index 4485b15..5a26064 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -52,6 +52,7 @@
   // (1 << kMatch) |
   // (1 << kPromoteCompilerTemps) |
   // (1 << kSuppressExceptionEdges) |
+  // (1 << kSuppressMethodInlining) |
   0;
 
 static uint32_t kCompilerDebugFlags = 0 |     // Enable debug/testing modes
diff --git a/compiler/dex/frontend.h b/compiler/dex/frontend.h
index 37c85b1..f714ecd 100644
--- a/compiler/dex/frontend.h
+++ b/compiler/dex/frontend.h
@@ -53,6 +53,7 @@
   kPromoteCompilerTemps,
   kBranchFusing,
   kSuppressExceptionEdges,
+  kSuppressMethodInlining,
 };
 
 // Force code generation paths for testing.
diff --git a/compiler/dex/local_value_numbering.cc b/compiler/dex/local_value_numbering.cc
index 61c6767..45167a8 100644
--- a/compiler/dex/local_value_numbering.cc
+++ b/compiler/dex/local_value_numbering.cc
@@ -196,8 +196,10 @@
       // Intentional fall-through.
     case Instruction::INVOKE_STATIC:
     case Instruction::INVOKE_STATIC_RANGE:
-      AdvanceGlobalMemory();
-      MakeArgsAliasing(mir);
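+      // Invokes that were inlined no longer clobber memory; skip them here.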
+      if ((mir->optimization_flags & MIR_INLINED) == 0) {
+        AdvanceGlobalMemory();
+        MakeArgsAliasing(mir);
+      }
       break;
 
     case Instruction::MOVE_RESULT:
@@ -213,13 +215,17 @@
     case Instruction::CONST_STRING_JUMBO:
     case Instruction::CONST_CLASS:
     case Instruction::NEW_ARRAY:
-      // 1 result, treat as unique each time, use result s_reg - will be unique.
-      res = MarkNonAliasingNonNull(mir);
+      if ((mir->optimization_flags & MIR_INLINED) == 0) {
+        // 1 result, treat as unique each time, use result s_reg - will be unique.
+        res = MarkNonAliasingNonNull(mir);
+      }
       break;
     case Instruction::MOVE_RESULT_WIDE:
-      // 1 wide result, treat as unique each time, use result s_reg - will be unique.
-      res = GetOperandValueWide(mir->ssa_rep->defs[0]);
-      SetOperandValueWide(mir->ssa_rep->defs[0], res);
+      if ((mir->optimization_flags & MIR_INLINED) == 0) {
+        // 1 wide result, treat as unique each time, use result s_reg - will be unique.
+        res = GetOperandValueWide(mir->ssa_rep->defs[0]);
+        SetOperandValueWide(mir->ssa_rep->defs[0], res);
+      }
       break;
 
     case kMirOpPhi:
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 1c0205d..36f1be7 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -1001,7 +1001,7 @@
         static_cast<int>(kNumPackedOpcodes)) {
       int flags = Instruction::FlagsOf(mir->dalvikInsn.opcode);
 
-      if (flags & Instruction::kInvoke) {
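+      // An invoke that was inlined away no longer makes the method a non-leaf.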
+      if ((flags & Instruction::kInvoke) != 0 && (mir->optimization_flags & MIR_INLINED) == 0) {
         attributes_ &= ~METHOD_IS_LEAF;
       }
     }
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 036dd84..fd25798 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -522,6 +522,8 @@
     return method_lowering_infos_.GetRawStorage()[mir->meta.method_lowering_info];
   }
 
+  void ComputeInlineIFieldLoweringInfo(uint16_t field_idx, MIR* invoke, MIR* iget_or_iput);
+
   void InitRegLocations();
 
   void RemapRegLocations();
@@ -811,6 +813,11 @@
   BasicBlock* NextDominatedBlock(BasicBlock* bb);
   bool LayoutBlocks(BasicBlock* bb);
 
+  bool InlineCallsGate();
+  void InlineCallsStart();
+  void InlineCalls(BasicBlock* bb);
+  void InlineCallsEnd();
+
   /**
    * @brief Perform the initial preparation for the Method Uses.
    */
diff --git a/compiler/dex/mir_method_info.cc b/compiler/dex/mir_method_info.cc
index 4580e76..2c33ef1 100644
--- a/compiler/dex/mir_method_info.cc
+++ b/compiler/dex/mir_method_info.cc
@@ -75,10 +75,14 @@
     int fast_path_flags = compiler_driver->IsFastInvoke(
         soa, dex_cache, class_loader, mUnit, referrer_class.get(), resolved_method, &invoke_type,
         &target_method, devirt_target, &it->direct_code_, &it->direct_method_);
-    uint16_t other_flags = it->flags_ & ~kFlagFastPath & ~(kInvokeTypeMask << kBitSharpTypeBegin);
+    bool needs_clinit =
+        compiler_driver->NeedsClassInitialization(referrer_class.get(), resolved_method);
+    uint16_t other_flags = it->flags_ &
+        ~(kFlagFastPath | kFlagNeedsClassInitialization | (kInvokeTypeMask << kBitSharpTypeBegin));
     it->flags_ = other_flags |
         (fast_path_flags != 0 ? kFlagFastPath : 0u) |
-        (static_cast<uint16_t>(invoke_type) << kBitSharpTypeBegin);
+        (static_cast<uint16_t>(invoke_type) << kBitSharpTypeBegin) |
+        (needs_clinit ? kFlagNeedsClassInitialization : 0u);
     it->target_dex_file_ = target_method.dex_file;
     it->target_method_idx_ = target_method.dex_method_index;
     it->stats_flags_ = fast_path_flags;
diff --git a/compiler/dex/mir_method_info.h b/compiler/dex/mir_method_info.h
index f927f1d..efe92f3 100644
--- a/compiler/dex/mir_method_info.h
+++ b/compiler/dex/mir_method_info.h
@@ -123,6 +123,10 @@
     return (flags_ & kFlagFastPath) != 0u;
   }
 
+  bool NeedsClassInitialization() const {
+    return (flags_ & kFlagNeedsClassInitialization) != 0u;
+  }
+
   InvokeType GetInvokeType() const {
     return static_cast<InvokeType>((flags_ >> kBitInvokeTypeBegin) & kInvokeTypeMask);
   }
@@ -158,10 +162,12 @@
     kBitInvokeTypeEnd = kBitInvokeTypeBegin + 3,  // 3 bits for invoke type.
     kBitSharpTypeBegin,
     kBitSharpTypeEnd = kBitSharpTypeBegin + 3,  // 3 bits for sharp type.
-    kMethodLoweringInfoEnd = kBitSharpTypeEnd
+    kBitNeedsClassInitialization = kBitSharpTypeEnd,
+    kMethodLoweringInfoEnd
   };
   COMPILE_ASSERT(kMethodLoweringInfoEnd <= 16, too_many_flags);
   static constexpr uint16_t kFlagFastPath = 1u << kBitFastPath;
+  static constexpr uint16_t kFlagNeedsClassInitialization = 1u << kBitNeedsClassInitialization;
   static constexpr uint16_t kInvokeTypeMask = 7u;
   COMPILE_ASSERT((1u << (kBitInvokeTypeEnd - kBitInvokeTypeBegin)) - 1u == kInvokeTypeMask,
                  assert_invoke_type_bits_ok);
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index 333126b..45c8d87 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -17,6 +17,8 @@
 #include "compiler_internals.h"
 #include "local_value_numbering.h"
 #include "dataflow_iterator-inl.h"
+#include "dex/quick/dex_file_method_inliner.h"
+#include "dex/quick/dex_file_to_method_inliner_map.h"
 
 namespace art {
 
@@ -1113,6 +1115,97 @@
   temp_scoped_alloc_.reset();
 }
 
+void MIRGraph::ComputeInlineIFieldLoweringInfo(uint16_t field_idx, MIR* invoke, MIR* iget_or_iput) {
+  uint32_t method_index = invoke->meta.method_lowering_info;
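+  // Each INVOKE inlines at most one IGET/IPUT, so the resolved field info can be
+  // cached by the method lowering info index (see InlineCallsStart()).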
+  if (temp_bit_vector_->IsBitSet(method_index)) {
+    iget_or_iput->meta.ifield_lowering_info = temp_insn_data_[method_index];
+    DCHECK_EQ(field_idx, GetIFieldLoweringInfo(iget_or_iput).FieldIndex());
+    return;
+  }
+
+  const MirMethodLoweringInfo& method_info = GetMethodLoweringInfo(invoke);
+  MethodReference target = method_info.GetTargetMethod();
+  DexCompilationUnit inlined_unit(
+      cu_, cu_->class_loader, cu_->class_linker, *target.dex_file,
+      nullptr /* code_item not used */, 0u /* class_def_idx not used */, target.dex_method_index,
+      0u /* access_flags not used */, nullptr /* verified_method not used */);
+  MirIFieldLoweringInfo inlined_field_info(field_idx);
+  MirIFieldLoweringInfo::Resolve(cu_->compiler_driver, &inlined_unit, &inlined_field_info, 1u);
+  DCHECK(inlined_field_info.IsResolved());
+
+  uint32_t field_info_index = ifield_lowering_infos_.Size();
+  ifield_lowering_infos_.Insert(inlined_field_info);
+  temp_bit_vector_->SetBit(method_index);
+  temp_insn_data_[method_index] = field_info_index;
+  iget_or_iput->meta.ifield_lowering_info = field_info_index;
+}
+
+bool MIRGraph::InlineCallsGate() {
+  if ((cu_->disable_opt & (1 << kSuppressMethodInlining)) != 0 ||
+      method_lowering_infos_.Size() == 0u) {
+    return false;
+  }
+  if (cu_->compiler_driver->GetMethodInlinerMap() == nullptr) {
+    // This isn't the Quick compiler.
+    return false;
+  }
+  return true;
+}
+
+void MIRGraph::InlineCallsStart() {
+  // Prepare for inlining getters/setters. Since we're inlining at most 1 IGET/IPUT from
+  // each INVOKE, we can index the data by the MIR::meta::method_lowering_info index.
+
+  DCHECK(temp_scoped_alloc_.get() == nullptr);
+  temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
+  temp_bit_vector_size_ = method_lowering_infos_.Size();
+  temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapMisc);
+  temp_bit_vector_->ClearAllBits();
+  temp_insn_data_ = static_cast<uint16_t*>(temp_scoped_alloc_->Alloc(
+      temp_bit_vector_size_ * sizeof(*temp_insn_data_), kArenaAllocGrowableArray));
+}
+
+void MIRGraph::InlineCalls(BasicBlock* bb) {
+  if (bb->block_type != kDalvikByteCode) {
+    return;
+  }
+  for (MIR* mir = bb->first_mir_insn; mir != NULL; mir = mir->next) {
+    if (!(Instruction::FlagsOf(mir->dalvikInsn.opcode) & Instruction::kInvoke)) {
+      continue;
+    }
+    const MirMethodLoweringInfo& method_info = GetMethodLoweringInfo(mir);
+    if (!method_info.FastPath()) {
+      continue;
+    }
+    InvokeType sharp_type = method_info.GetSharpType();
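+    // Inline only direct calls and static calls that don't need a class
+    // initialization check.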
+    if ((sharp_type != kDirect) &&
+        (sharp_type != kStatic || method_info.NeedsClassInitialization())) {
+      continue;
+    }
+    DCHECK(cu_->compiler_driver->GetMethodInlinerMap() != nullptr);
+    MethodReference target = method_info.GetTargetMethod();
+    if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(target.dex_file)
+            ->GenInline(this, bb, mir, target.dex_method_index)) {
+      if (cu_->verbose) {
+        LOG(INFO) << "In \"" << PrettyMethod(cu_->method_idx, *cu_->dex_file)
+            << "\" @0x" << std::hex << mir->offset
+            << " inlined " << method_info.GetInvokeType() << " (" << sharp_type << ") call to \""
+            << PrettyMethod(target.dex_method_index, *target.dex_file) << "\"";
+      }
+    }
+  }
+}
+
+void MIRGraph::InlineCallsEnd() {
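+  // Free the temporary structures allocated in InlineCallsStart().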
+  DCHECK(temp_insn_data_ != nullptr);
+  temp_insn_data_ = nullptr;
+  DCHECK(temp_bit_vector_ != nullptr);
+  temp_bit_vector_ = nullptr;
+  DCHECK(temp_scoped_alloc_.get() != nullptr);
+  temp_scoped_alloc_.reset();
+}
+
 void MIRGraph::DumpCheckStats() {
   Checkstats* stats =
       static_cast<Checkstats*>(arena_->Alloc(sizeof(Checkstats), kArenaAllocDFInfo));
diff --git a/compiler/dex/pass_driver.cc b/compiler/dex/pass_driver.cc
index f195aff..999ed2a 100644
--- a/compiler/dex/pass_driver.cc
+++ b/compiler/dex/pass_driver.cc
@@ -92,6 +92,7 @@
 static const Pass* const gPasses[] = {
   GetPassInstance<CacheFieldLoweringInfo>(),
   GetPassInstance<CacheMethodLoweringInfo>(),
+  GetPassInstance<CallInlining>(),
   GetPassInstance<CodeLayout>(),
   GetPassInstance<SSATransformation>(),
   GetPassInstance<ConstantPropagation>(),
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 8b02a42..882a3bb 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -225,6 +225,9 @@
     case kOpBlx:
       opcode = kThumbBlxR;
       break;
+    case kOpBx:
+      opcode = kThumbBx;
+      break;
     default:
       LOG(FATAL) << "Bad opcode " << op;
   }
diff --git a/compiler/dex/quick/dex_file_method_inliner.cc b/compiler/dex/quick/dex_file_method_inliner.cc
index e50ba24..53e26c7 100644
--- a/compiler/dex/quick/dex_file_method_inliner.cc
+++ b/compiler/dex/quick/dex_file_method_inliner.cc
@@ -21,6 +21,7 @@
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "base/mutex-inl.h"
+#include "dex/frontend.h"
 #include "thread.h"
 #include "thread-inl.h"
 #include "dex/mir_graph.h"
@@ -31,6 +32,23 @@
 
 namespace art {
 
+namespace {  // anonymous namespace
+
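+// Allocate a MIR that replaces the invoke (and the move-result that follows it,
+// if any), inheriting the invoke's offset and their combined width.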
+MIR* AllocReplacementMIR(MIRGraph* mir_graph, MIR* invoke, MIR* move_return) {
+  ArenaAllocator* arena = mir_graph->GetArena();
+  MIR* insn = static_cast<MIR*>(arena->Alloc(sizeof(MIR), kArenaAllocMIR));
+  insn->offset = invoke->offset;
+  insn->width = invoke->width;
+  insn->optimization_flags = MIR_CALLEE;
+  if (move_return != nullptr) {
+    DCHECK_EQ(move_return->offset, invoke->offset + invoke->width);
+    insn->width += move_return->width;
+  }
+  return insn;
+}
+
+}  // anonymous namespace
+
 const uint32_t DexFileMethodInliner::kIndexUnresolved;
 const char* const DexFileMethodInliner::kClassCacheNames[] = {
     "Z",                       // kClassCacheBoolean
@@ -348,6 +366,51 @@
   return backend->SpecialMIR2LIR(special);
 }
 
+bool DexFileMethodInliner::GenInline(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                                     uint32_t method_idx) {
+  InlineMethod method;
+  {
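+    // Copy the InlineMethod data while holding the lock; the rewriting below
+    // runs without it.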
+    ReaderMutexLock mu(Thread::Current(), lock_);
+    auto it = inline_methods_.find(method_idx);
+    if (it == inline_methods_.end() || (it->second.flags & kInlineSpecial) == 0) {
+      return false;
+    }
+    method = it->second;
+  }
+
+  MIR* move_result = nullptr;
+  bool result = true;
+  switch (method.opcode) {
+    case kInlineOpNop:
+      break;
+    case kInlineOpNonWideConst:
+      move_result = mir_graph->FindMoveResult(bb, invoke);
+      result = GenInlineConst(mir_graph, bb, invoke, move_result, method);
+      break;
+    case kInlineOpReturnArg:
+      move_result = mir_graph->FindMoveResult(bb, invoke);
+      result = GenInlineReturnArg(mir_graph, bb, invoke, move_result, method);
+      break;
+    case kInlineOpIGet:
+      move_result = mir_graph->FindMoveResult(bb, invoke);
+      result = GenInlineIGet(mir_graph, bb, invoke, move_result, method, method_idx);
+      break;
+    case kInlineOpIPut:
+      result = GenInlineIPut(mir_graph, bb, invoke, method, method_idx);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected inline op: " << method.opcode;
+  }
+  if (result) {
+    invoke->optimization_flags |= MIR_INLINED;
+    if (move_result != nullptr) {
+      move_result->optimization_flags |= MIR_INLINED;
+      move_result->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpNop);
+    }
+  }
+  return result;
+}
+
 uint32_t DexFileMethodInliner::FindClassIndex(const DexFile* dex_file, IndexCache* cache,
                                               ClassCacheIndex index) {
   uint32_t* class_index = &cache->class_indexes[index];
@@ -484,4 +547,149 @@
   }
 }
 
+bool DexFileMethodInliner::GenInlineConst(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                                          MIR* move_result, const InlineMethod& method) {
+  if (move_result == nullptr) {
+    // Result is unused.
+    return true;
+  }
+
+  // Check the opcode; for MOVE_RESULT_OBJECT, also check that the constant is null.
+  DCHECK(move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT ||
+         (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_OBJECT &&
+             method.d.data == 0u));
+
+  // Insert the CONST instruction.
+  MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
+  insn->dalvikInsn.opcode = Instruction::CONST;
+  insn->dalvikInsn.vA = move_result->dalvikInsn.vA;
+  insn->dalvikInsn.vB = method.d.data;
+  mir_graph->InsertMIRAfter(bb, move_result, insn);
+  return true;
+}
+
+bool DexFileMethodInliner::GenInlineReturnArg(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                                              MIR* move_result, const InlineMethod& method) {
+  if (move_result == nullptr) {
+    // Result is unused.
+    return true;
+  }
+
+  // Select opcode and argument.
+  const InlineReturnArgData& data = method.d.return_data;
+  Instruction::Code opcode = Instruction::MOVE_FROM16;
+  if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_OBJECT) {
+    DCHECK_EQ(data.is_object, 1u);
+    opcode = Instruction::MOVE_OBJECT_FROM16;
+  } else if (move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT_WIDE) {
+    DCHECK_EQ(data.is_wide, 1u);
+    opcode = Instruction::MOVE_WIDE_FROM16;
+  } else {
+    DCHECK(move_result->dalvikInsn.opcode == Instruction::MOVE_RESULT);
+    DCHECK_EQ(data.is_wide, 0u);
+    DCHECK_EQ(data.is_object, 0u);
+  }
+  DCHECK_LT(data.is_wide ? data.arg + 1u : data.arg, invoke->dalvikInsn.vA);
+  int arg;
+  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k35c) {
+    arg = invoke->dalvikInsn.arg[data.arg];  // Non-range invoke.
+  } else {
+    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k3rc);
+    arg = invoke->dalvikInsn.vC + data.arg;  // Range invoke.
+  }
+
+  // Insert the move instruction.
+  MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
+  insn->dalvikInsn.opcode = opcode;
+  insn->dalvikInsn.vA = move_result->dalvikInsn.vA;
+  insn->dalvikInsn.vB = arg;
+  mir_graph->InsertMIRAfter(bb, move_result, insn);
+  return true;
+}
+
+bool DexFileMethodInliner::GenInlineIGet(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                                         MIR* move_result, const InlineMethod& method,
+                                         uint32_t method_idx) {
+  CompilationUnit* cu = mir_graph->GetCurrentDexCompilationUnit()->GetCompilationUnit();
+  if (cu->enable_debug & (1 << kDebugSlowFieldPath)) {
+    return false;
+  }
+
+  const InlineIGetIPutData& data = method.d.ifield_data;
+  if (invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
+      invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE ||
+      data.object_arg != 0) {
+    // TODO: Implement inlining of IGET on non-"this" registers (needs correct stack trace for NPE).
+    return false;
+  }
+
+  if (move_result == nullptr) {
+    // Result is unused. If volatile, we still need to emit the IGET but we have no destination.
+    return !data.is_volatile;
+  }
+
+  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IGET + data.op_variant);
+  DCHECK_EQ(InlineMethodAnalyser::IGetVariant(opcode), data.op_variant);
+
+  MIR* insn = AllocReplacementMIR(mir_graph, invoke, move_result);
+  insn->width += insn->offset - invoke->offset;
+  insn->offset = invoke->offset;
+  insn->dalvikInsn.opcode = opcode;
+  insn->dalvikInsn.vA = move_result->dalvikInsn.vA;
+  DCHECK_LT(data.object_arg, invoke->dalvikInsn.vA);
+  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
+    insn->dalvikInsn.vB = invoke->dalvikInsn.vC + data.object_arg;
+  } else {
+    DCHECK_EQ(Instruction::FormatOf(invoke->dalvikInsn.opcode), Instruction::k35c);
+    insn->dalvikInsn.vB = invoke->dalvikInsn.arg[data.object_arg];
+  }
+  mir_graph->ComputeInlineIFieldLoweringInfo(data.field_idx, invoke, insn);
+
+  DCHECK(mir_graph->GetIFieldLoweringInfo(insn).IsResolved());
+  DCHECK(mir_graph->GetIFieldLoweringInfo(insn).FastGet());
+  DCHECK_EQ(data.field_offset, mir_graph->GetIFieldLoweringInfo(insn).FieldOffset().Uint32Value());
+  DCHECK_EQ(data.is_volatile, mir_graph->GetIFieldLoweringInfo(insn).IsVolatile() ? 1u : 0u);
+
+  mir_graph->InsertMIRAfter(bb, move_result, insn);
+  return true;
+}
+
+bool DexFileMethodInliner::GenInlineIPut(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                                         const InlineMethod& method, uint32_t method_idx) {
+  CompilationUnit* cu = mir_graph->GetCurrentDexCompilationUnit()->GetCompilationUnit();
+  if (cu->enable_debug & (1 << kDebugSlowFieldPath)) {
+    return false;
+  }
+
+  const InlineIGetIPutData& data = method.d.ifield_data;
+  if (invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC ||
+      invoke->dalvikInsn.opcode == Instruction::INVOKE_STATIC_RANGE ||
+      data.object_arg != 0) {
+    // TODO: Implement inlining of IPUT on non-"this" registers (needs correct stack trace for NPE).
+    return false;
+  }
+
+  Instruction::Code opcode = static_cast<Instruction::Code>(Instruction::IPUT + data.op_variant);
+  DCHECK_EQ(InlineMethodAnalyser::IPutVariant(opcode), data.op_variant);
+
+  MIR* insn = AllocReplacementMIR(mir_graph, invoke, nullptr);
+  insn->dalvikInsn.opcode = opcode;
+  if (Instruction::FormatOf(invoke->dalvikInsn.opcode) == Instruction::k3rc) {
+    insn->dalvikInsn.vA = invoke->dalvikInsn.vC + data.src_arg;
+    insn->dalvikInsn.vB = invoke->dalvikInsn.vC + data.object_arg;
+  } else {
+    insn->dalvikInsn.vA = invoke->dalvikInsn.arg[data.src_arg];
+    insn->dalvikInsn.vB = invoke->dalvikInsn.arg[data.object_arg];
+  }
+  mir_graph->ComputeInlineIFieldLoweringInfo(data.field_idx, invoke, insn);
+
+  DCHECK(mir_graph->GetIFieldLoweringInfo(insn).IsResolved());
+  DCHECK(mir_graph->GetIFieldLoweringInfo(insn).FastPut());
+  DCHECK_EQ(data.field_offset, mir_graph->GetIFieldLoweringInfo(insn).FieldOffset().Uint32Value());
+  DCHECK_EQ(data.is_volatile, mir_graph->GetIFieldLoweringInfo(insn).IsVolatile() ? 1u : 0u);
+
+  mir_graph->InsertMIRAfter(bb, invoke, insn);
+  return true;
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/dex_file_method_inliner.h b/compiler/dex/quick/dex_file_method_inliner.h
index a6d4cab..b4e190a 100644
--- a/compiler/dex/quick/dex_file_method_inliner.h
+++ b/compiler/dex/quick/dex_file_method_inliner.h
@@ -31,7 +31,10 @@
 class MethodVerifier;
 }  // namespace verifier
 
+struct BasicBlock;
 struct CallInfo;
+struct MIR;
+class MIRGraph;
 class Mir2Lir;
 
 /**
@@ -79,7 +82,13 @@
     /**
      * Generate code for a special function.
      */
-    bool GenSpecial(Mir2Lir* backend, uint32_t method_idx);
+    bool GenSpecial(Mir2Lir* backend, uint32_t method_idx) LOCKS_EXCLUDED(lock_);
+
+    /**
+     * Try to inline an invoke.
+     */
+    bool GenInline(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke, uint32_t method_idx)
+        LOCKS_EXCLUDED(lock_);
 
     /**
      * To avoid multiple lookups of a class by its descriptor, we cache its
@@ -286,6 +295,15 @@
 
     bool AddInlineMethod(int32_t method_idx, const InlineMethod& method) LOCKS_EXCLUDED(lock_);
 
+    static bool GenInlineConst(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                               MIR* move_result, const InlineMethod& method);
+    static bool GenInlineReturnArg(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                                   MIR* move_result, const InlineMethod& method);
+    static bool GenInlineIGet(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                              MIR* move_result, const InlineMethod& method, uint32_t method_idx);
+    static bool GenInlineIPut(MIRGraph* mir_graph, BasicBlock* bb, MIR* invoke,
+                              const InlineMethod& method, uint32_t method_idx);
+
     ReaderWriterMutex lock_;
     /*
      * Maps method indexes (for the particular DexFile) to Intrinsic definitions.
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 58db984..71cc0d9 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -16,6 +16,7 @@
 
 #include "dex/compiler_ir.h"
 #include "dex/compiler_internals.h"
+#include "dex/quick/arm/arm_lir.h"
 #include "dex/quick/mir_to_lir-inl.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
@@ -627,7 +628,9 @@
     ThreadOffset func_offset(-1);
     int v1 = lab->operands[2];
     int v2 = lab->operands[3];
-    bool target_x86 = (cu_->instruction_set == kX86);
+    const bool target_x86 = cu_->instruction_set == kX86;
+    const bool target_arm = cu_->instruction_set == kArm || cu_->instruction_set == kThumb2;
+    const bool target_mips = cu_->instruction_set == kMips;
     switch (lab->operands[0]) {
       case kThrowNullPointer:
         func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowNullPointer);
@@ -685,21 +688,40 @@
         func_offset =
           QUICK_ENTRYPOINT_OFFSET(pThrowNoSuchMethod);
         break;
-      case kThrowStackOverflow:
+      case kThrowStackOverflow: {
         func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
         // Restore stack alignment
+        int r_tgt = 0;
+        const int spill_size = (num_core_spills_ + num_fp_spills_) * 4;
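+        // Each target unwinds its frame before entering the helper with a plain
+        // jump (UseLink == false) rather than a call.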
         if (target_x86) {
-          OpRegImm(kOpAdd, TargetReg(kSp), frame_size_);
+          // - 4 to leave the return address on the stack.
+          OpRegImm(kOpAdd, TargetReg(kSp), frame_size_ - 4);
+          ClobberCallerSave();
+        } else if (target_arm) {
+          r_tgt = r12;
+          LoadWordDisp(TargetReg(kSp), spill_size - 4, TargetReg(kLr));
+          OpRegImm(kOpAdd, TargetReg(kSp), spill_size);
+          ClobberCallerSave();
+          LoadWordDisp(rARM_SELF, func_offset.Int32Value(), r_tgt);
         } else {
-          OpRegImm(kOpAdd, TargetReg(kSp), (num_core_spills_ + num_fp_spills_) * 4);
+          DCHECK(target_mips);
+          DCHECK_EQ(num_fp_spills_, 0);  // FP spills currently don't happen on mips.
+          // LR is offset 0 since we push in reverse order.
+          LoadWordDisp(TargetReg(kSp), 0, TargetReg(kLr));
+          OpRegImm(kOpAdd, TargetReg(kSp), spill_size);
+          ClobberCallerSave();
+          r_tgt = CallHelperSetup(func_offset);  // Doesn't clobber LR.
+          DCHECK_NE(r_tgt, TargetReg(kLr));
         }
-        break;
+        CallHelper(r_tgt, func_offset, false /* MarkSafepointPC */, false /* UseLink */);
+        continue;
+      }
       default:
         LOG(FATAL) << "Unexpected throw kind: " << lab->operands[0];
     }
     ClobberCallerSave();
     int r_tgt = CallHelperSetup(func_offset);
-    CallHelper(r_tgt, func_offset, true /* MarkSafepointPC */);
+    CallHelper(r_tgt, func_offset, true /* MarkSafepointPC */, true /* UseLink */);
   }
 }
 
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 92c13ce..55d50ae 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -62,7 +62,7 @@
 
 /*
  * To save scheduling time, helper calls are broken into two parts: generation of
- * the helper target address, and the actuall call to the helper.  Because x86
+ * the helper target address, and the actual call to the helper.  Because x86
  * has a memory call operation, part 1 is a NOP for x86.  For other targets,
  * load arguments between the two parts.
  */
@@ -71,12 +71,13 @@
 }
 
 /* NOTE: if r_tgt is a temp, it will be freed following use */
-LIR* Mir2Lir::CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc) {
+LIR* Mir2Lir::CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc, bool use_link) {
   LIR* call_inst;
+  OpKind op = use_link ? kOpBlx : kOpBx;
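+  // kOpBx enters the helper with a plain jump instead of a branch-with-link,
+  // e.g. for kThrowStackOverflow where the frame has already been unwound.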
   if (cu_->instruction_set == kX86) {
-    call_inst = OpThreadMem(kOpBlx, helper_offset);
+    call_inst = OpThreadMem(op, helper_offset);
   } else {
-    call_inst = OpReg(kOpBlx, r_tgt);
+    call_inst = OpReg(op, r_tgt);
     FreeTemp(r_tgt);
   }
   if (safepoint_pc) {
@@ -1423,6 +1424,16 @@
 }
 
 void Mir2Lir::GenInvoke(CallInfo* info) {
+  if ((info->opt_flags & MIR_INLINED) != 0) {
+    // Already inlined but we may still need the null check.
+    if (info->type != kStatic &&
+        ((cu_->disable_opt & (1 << kNullCheckElimination)) != 0 ||
+         (info->opt_flags & MIR_IGNORE_NULL_CHECK) == 0)) {
+      RegLocation rl_obj = LoadValue(info->args[0], kCoreReg);
+      GenImmedCheck(kCondEq, rl_obj.reg.GetReg(), 0, kThrowNullPointer);
+    }
+    return;
+  }
   DCHECK(cu_->compiler_driver->GetMethodInlinerMap() != nullptr);
   if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file)
       ->GenIntrinsic(this, info)) {
diff --git a/compiler/dex/quick/mips/mips_lir.h b/compiler/dex/quick/mips/mips_lir.h
index 59f442c..77ae337 100644
--- a/compiler/dex/quick/mips/mips_lir.h
+++ b/compiler/dex/quick/mips/mips_lir.h
@@ -138,7 +138,6 @@
 #define r_FRESULT1 r_F1
 
 // Regs not used for Mips.
-#define rMIPS_LR INVALID_REG
 #define rMIPS_PC INVALID_REG
 
 enum MipsResourceEncodingPos {
@@ -268,6 +267,7 @@
 #define rMIPS_RET1 r_RESULT1
 #define rMIPS_INVOKE_TGT r_T9
 #define rMIPS_COUNT INVALID_REG
+#define rMIPS_LR r_RA
 
 // RegisterLocation templates return values (r_V0, or r_V0/r_V1).
 const RegLocation mips_loc_c_return
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 538c292..39994e9 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -346,15 +346,17 @@
       break;
 
     case Instruction::MOVE_RESULT_WIDE:
-      if (opt_flags & MIR_INLINED)
+      if ((opt_flags & MIR_INLINED) != 0) {
         break;  // Nop - combined w/ previous invoke.
+      }
       StoreValueWide(rl_dest, GetReturnWide(rl_dest.fp));
       break;
 
     case Instruction::MOVE_RESULT:
     case Instruction::MOVE_RESULT_OBJECT:
-      if (opt_flags & MIR_INLINED)
+      if ((opt_flags & MIR_INLINED) != 0) {
         break;  // Nop - combined w/ previous invoke.
+      }
       StoreValue(rl_dest, GetReturn(rl_dest.fp));
       break;
 
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 42d7f59..5a1f6cd 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -602,7 +602,7 @@
 
     // Shared by all targets - implemented in gen_invoke.cc.
     int CallHelperSetup(ThreadOffset helper_offset);
-    LIR* CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc);
+    LIR* CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc, bool use_link = true);
     void CallRuntimeHelperImm(ThreadOffset helper_offset, int arg0, bool safepoint_pc);
     void CallRuntimeHelperReg(ThreadOffset helper_offset, int arg0, bool safepoint_pc);
     void CallRuntimeHelperRegLocation(ThreadOffset helper_offset, RegLocation arg0,
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index f6c8a00..9cafcee 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -357,6 +357,7 @@
   { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP,               { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
   { kX86JmpR,  kJmp,  IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xFF, 0,    0, 4, 0, 0 }, "JmpR",  "!0r" },
   { kX86Jecxz8, kJmp, NO_OPERAND   | IS_BRANCH | NEEDS_FIXUP | REG_USEC,    { 0,             0, 0xE3, 0,    0, 0, 0, 0 }, "Jecxz", "!0t" },
+  { kX86JmpT,  kJmp,  IS_UNARY_OP  | IS_BRANCH | IS_LOAD,                   { THREAD_PREFIX, 0, 0xFF, 0,    0, 4, 0, 0 }, "JmpT",  "fs:[!0d]" },
   { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xE8, 0,    0, 0, 0, 0 }, "CallR", "!0r" },
   { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD | REG_USE0,        { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
   { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD | REG_USE01,       { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
@@ -499,6 +500,8 @@
         return 2;  // opcode + rel8
       } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
+      } else if (lir->opcode == kX86JmpT) {
+        return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
       } else {
         DCHECK(lir->opcode == kX86JmpR);
         return 2;  // opcode + modrm
@@ -1328,7 +1331,13 @@
         EmitRegRegCond(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
       case kJmp:  // lir operands - 0: rel
-        EmitJmp(entry, lir->operands[0]);
+        if (entry->opcode == kX86JmpT) {
+          // This works since the instruction format for jmp and call is basically the same and
+          // EmitCallThread loads opcode info.
+          EmitCallThread(entry, lir->operands[0]);
+        } else {
+          EmitJmp(entry, lir->operands[0]);
+        }
         break;
       case kJcc:  // lir operands - 0: rel, 1: CC, target assigned
         EmitJcc(entry, lir->operands[0], lir->operands[1]);
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 577f216..72fc922 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -198,15 +198,15 @@
   LockTemp(rX86_ARG2);
 
   /* Build frame, return address already on stack */
+  // TODO: 64 bit.
   stack_decrement_ = OpRegImm(kOpSub, rX86_SP, frame_size_ - 4);
 
   /*
    * We can safely skip the stack overflow check if we're
    * a leaf *and* our frame size < fudge factor.
    */
-  bool skip_overflow_check = (mir_graph_->MethodIsLeaf() &&
-                (static_cast<size_t>(frame_size_) <
-                Thread::kStackOverflowReservedBytes));
+  const bool skip_overflow_check = (mir_graph_->MethodIsLeaf() &&
+      (static_cast<size_t>(frame_size_) < Thread::kStackOverflowReservedBytes));
   NewLIR0(kPseudoMethodEntry);
   /* Spill core callee saves */
   SpillCoreRegs();
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index d5d6b0e..bd82bf6 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -472,6 +472,7 @@
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
     case kOpBlx: opcode = kX86CallT;  break;
+    case kOpBx: opcode = kX86JmpT;  break;
     default:
       LOG(FATAL) << "Bad opcode: " << op;
       break;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 9fb0044..abe1b3d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -397,6 +397,8 @@
   kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
   kX86JmpR,             // jmp reg; lir operands - 0: reg
   kX86Jecxz8,           // jecxz rel8; jump relative if ECX is zero.
+  kX86JmpT,             // jmp fs:[disp]; fs: is equal to Thread::Current(); lir operands - 0: disp
+
   kX86CallR,            // call reg; lir operands - 0: reg
   kX86CallM,            // call [base + disp]; lir operands - 0: base, 1: disp
   kX86CallA,            // call [base + index * scale + disp]
diff --git a/compiler/driver/compiler_driver-inl.h b/compiler/driver/compiler_driver-inl.h
index 664f809..d9f2a3a 100644
--- a/compiler/driver/compiler_driver-inl.h
+++ b/compiler/driver/compiler_driver-inl.h
@@ -289,6 +289,16 @@
   return stats_flags;
 }
 
+inline bool CompilerDriver::NeedsClassInitialization(mirror::Class* referrer_class,
+                                                     mirror::ArtMethod* resolved_method) {
+  if (!resolved_method->IsStatic()) {
+    return false;
+  }
+  mirror::Class* methods_class = resolved_method->GetDeclaringClass();
+  // NOTE: Unlike in IsFastStaticField(), we don't check CanAssumeTypeIsPresentInDexCache() here.
+  return methods_class != referrer_class && !methods_class->IsInitialized();
+}
+
 }  // namespace art
 
 #endif  // ART_COMPILER_DRIVER_COMPILER_DRIVER_INL_H_
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index d88b2aa..256aa46 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -283,6 +283,10 @@
       uintptr_t* direct_code, uintptr_t* direct_method)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Does invocation of the resolved method need class initialization?
+  bool NeedsClassInitialization(mirror::Class* referrer_class, mirror::ArtMethod* resolved_method)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   void ProcessedInstanceField(bool resolved);
   void ProcessedStaticField(bool resolved, bool local);
   void ProcessedInvoke(InvokeType invoke_type, int flags);
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index b8b4a3b..498deba 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -122,7 +122,7 @@
 
 void HBasicBlock::AddInstruction(HInstruction* instruction) {
   DCHECK(instruction->GetBlock() == nullptr);
-  DCHECK(instruction->GetId() == -1);
+  DCHECK_EQ(instruction->GetId(), -1);
   instruction->SetBlock(this);
   instruction->SetId(GetGraph()->GetNextInstructionId());
   if (first_instruction_ == nullptr) {
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 1153634..cca7d03 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -50,6 +50,7 @@
 	gc/accounting/gc_allocator.cc \
 	gc/accounting/heap_bitmap.cc \
 	gc/accounting/mod_union_table.cc \
+	gc/accounting/remembered_set.cc \
 	gc/accounting/space_bitmap.cc \
 	gc/collector/garbage_collector.cc \
 	gc/collector/immune_region.cc \
diff --git a/runtime/gc/accounting/remembered_set.cc b/runtime/gc/accounting/remembered_set.cc
new file mode 100644
index 0000000..e6508dc
--- /dev/null
+++ b/runtime/gc/accounting/remembered_set.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "remembered_set.h"
+
+#include "base/stl_util.h"
+#include "card_table-inl.h"
+#include "heap_bitmap.h"
+#include "gc/collector/mark_sweep.h"
+#include "gc/collector/mark_sweep-inl.h"
+#include "gc/collector/semi_space.h"
+#include "gc/heap.h"
+#include "gc/space/space.h"
+#include "mirror/art_field-inl.h"
+#include "mirror/object-inl.h"
+#include "mirror/class-inl.h"
+#include "mirror/object_array-inl.h"
+#include "space_bitmap-inl.h"
+#include "thread.h"
+#include "UniquePtr.h"
+
+namespace art {
+namespace gc {
+namespace accounting {
+
+class RememberedSetCardVisitor {
+ public:
+  explicit RememberedSetCardVisitor(RememberedSet::CardSet* const dirty_cards)
+      : dirty_cards_(dirty_cards) {}
+
+  void operator()(byte* card, byte expected_value, byte new_value) const {
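+    // Called by ModifyCardsAtomic() for each card it modifies; collect the
+    // cards that were dirty before aging.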
+    if (expected_value == CardTable::kCardDirty) {
+      dirty_cards_->insert(card);
+    }
+  }
+
+ private:
+  RememberedSet::CardSet* const dirty_cards_;
+};
+
+void RememberedSet::ClearCards() {
+  CardTable* card_table = GetHeap()->GetCardTable();
+  RememberedSetCardVisitor card_visitor(&dirty_cards_);
+  // Clear dirty cards in the space and insert them into the dirty card set.
+  card_table->ModifyCardsAtomic(space_->Begin(), space_->End(), AgeCardVisitor(), card_visitor);
+}
+
+class RememberedSetReferenceVisitor {
+ public:
+  RememberedSetReferenceVisitor(MarkObjectCallback* callback, space::ContinuousSpace* target_space,
+                                bool* const contains_reference_to_target_space, void* arg)
+      : callback_(callback), target_space_(target_space), arg_(arg),
+        contains_reference_to_target_space_(contains_reference_to_target_space) {}
+
+  void operator()(mirror::Object* obj, mirror::Object* ref,
+                  const MemberOffset& offset, bool /* is_static */) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
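+    // Mark/forward references into the target space, record that the card still
+    // refers to it, and update the field if the referenced object moved.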
+    if (ref != nullptr) {
+      if (target_space_->HasAddress(ref)) {
+        *contains_reference_to_target_space_ = true;
+        mirror::Object* new_ref = callback_(ref, arg_);
+        DCHECK(!target_space_->HasAddress(new_ref));
+        if (new_ref != ref) {
+          obj->SetFieldObjectWithoutWriteBarrier<false>(offset, new_ref, false);
+        }
+      }
+    }
+  }
+
+ private:
+  MarkObjectCallback* const callback_;
+  space::ContinuousSpace* const target_space_;
+  void* const arg_;
+  bool* const contains_reference_to_target_space_;
+};
+
+class RememberedSetObjectVisitor {
+ public:
+  RememberedSetObjectVisitor(MarkObjectCallback* callback, space::ContinuousSpace* target_space,
+                             bool* const contains_reference_to_target_space, void* arg)
+      : callback_(callback), target_space_(target_space), arg_(arg),
+        contains_reference_to_target_space_(contains_reference_to_target_space) {}
+
+  void operator()(mirror::Object* obj) const EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    DCHECK(obj != NULL);
+    RememberedSetReferenceVisitor ref_visitor(callback_, target_space_,
+                                              contains_reference_to_target_space_, arg_);
+    collector::MarkSweep::VisitObjectReferences(obj, ref_visitor, true);
+  }
+
+ private:
+  MarkObjectCallback* const callback_;
+  space::ContinuousSpace* const target_space_;
+  void* const arg_;
+  bool* const contains_reference_to_target_space_;
+};
+
+void RememberedSet::UpdateAndMarkReferences(MarkObjectCallback* callback,
+                                            space::ContinuousSpace* target_space, void* arg) {
+  CardTable* card_table = heap_->GetCardTable();
+  bool contains_reference_to_target_space = false;
+  RememberedSetObjectVisitor obj_visitor(callback, target_space,
+                                         &contains_reference_to_target_space, arg);
+  SpaceBitmap* bitmap = space_->GetLiveBitmap();
+  CardSet remove_card_set;
+  for (byte* const card_addr : dirty_cards_) {
+    contains_reference_to_target_space = false;
+    uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card_addr));
+    DCHECK(space_->HasAddress(reinterpret_cast<mirror::Object*>(start)));
+    bitmap->VisitMarkedRange(start, start + CardTable::kCardSize, obj_visitor);
+    if (!contains_reference_to_target_space) {
+      // It was in the dirty card set, but it didn't actually contain
+      // a reference to the target space. So, remove it from the dirty
+      // card set so we won't have to scan it again (unless it gets
+      // dirty again).
+      remove_card_set.insert(card_addr);
+    }
+  }
+
+  // Remove the cards that didn't contain a reference to the target
+  // space from the dirty card set.
+  for (byte* const card_addr : remove_card_set) {
+    DCHECK(dirty_cards_.find(card_addr) != dirty_cards_.end());
+    dirty_cards_.erase(card_addr);
+  }
+}
+
+void RememberedSet::Dump(std::ostream& os) {
+  CardTable* card_table = heap_->GetCardTable();
+  os << "RememberedSet dirty cards: [";
+  for (const byte* card_addr : dirty_cards_) {
+    auto start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card_addr));
+    auto end = start + CardTable::kCardSize;
+    os << reinterpret_cast<void*>(start) << "-" << reinterpret_cast<void*>(end) << "\n";
+  }
+  os << "]";
+}
+
+void RememberedSet::AssertAllDirtyCardsAreWithinSpace() const {
+  CardTable* card_table = heap_->GetCardTable();
+  for (const byte* card_addr : dirty_cards_) {
+    auto start = reinterpret_cast<byte*>(card_table->AddrFromCard(card_addr));
+    auto end = start + CardTable::kCardSize;
+    DCHECK(space_->Begin() <= start && end <= space_->End());
+  }
+}
+
+}  // namespace accounting
+}  // namespace gc
+}  // namespace art
diff --git a/runtime/gc/accounting/remembered_set.h b/runtime/gc/accounting/remembered_set.h
new file mode 100644
index 0000000..92feeb1
--- /dev/null
+++ b/runtime/gc/accounting/remembered_set.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_ACCOUNTING_REMEMBERED_SET_H_
+#define ART_RUNTIME_GC_ACCOUNTING_REMEMBERED_SET_H_
+
+#include "gc_allocator.h"
+#include "globals.h"
+#include "object_callbacks.h"
+#include "safe_map.h"
+
+#include <set>
+#include <vector>
+
+namespace art {
+namespace gc {
+
+namespace collector {
+  class MarkSweep;
+}  // namespace collector
+namespace space {
+  class ContinuousSpace;
+}  // namespace space
+
+class Heap;
+
+namespace accounting {
+
+// The remembered set keeps track of cards that may contain references
+// from the free list spaces to the bump pointer spaces.
+class RememberedSet {
+ public:
+  typedef std::set<byte*, std::less<byte*>, GcAllocator<byte*> > CardSet;
+
+  explicit RememberedSet(const std::string& name, Heap* heap, space::ContinuousSpace* space)
+      : name_(name), heap_(heap), space_(space) {}
+
+  // Clear dirty cards and add them to the dirty card set.
+  void ClearCards();
+
+  // Mark through all references to the target space.
+  void UpdateAndMarkReferences(MarkObjectCallback* callback,
+                               space::ContinuousSpace* target_space, void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  void Dump(std::ostream& os);
+
+  space::ContinuousSpace* GetSpace() {
+    return space_;
+  }
+  Heap* GetHeap() const {
+    return heap_;
+  }
+  const std::string& GetName() const {
+    return name_;
+  }
+  void AssertAllDirtyCardsAreWithinSpace() const;
+
+ private:
+  const std::string name_;
+  Heap* const heap_;
+  space::ContinuousSpace* const space_;
+
+  CardSet dirty_cards_;
+};
+
+}  // namespace accounting
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_ACCOUNTING_REMEMBERED_SET_H_
diff --git a/runtime/gc/collector/immune_region.cc b/runtime/gc/collector/immune_region.cc
index 9e65384..70a6213 100644
--- a/runtime/gc/collector/immune_region.cc
+++ b/runtime/gc/collector/immune_region.cc
@@ -56,9 +56,14 @@
 }
 
 bool ImmuneRegion::ContainsSpace(const space::ContinuousSpace* space) const {
-  return
+  bool contains =
       begin_ <= reinterpret_cast<mirror::Object*>(space->Begin()) &&
       end_ >= reinterpret_cast<mirror::Object*>(space->Limit());
+  if (kIsDebugBuild && contains) {
+    // A bump pointer space should not be in the immune region.
+    DCHECK(space->GetType() != space::kSpaceTypeBumpPointerSpace);
+  }
+  return contains;
 }
 
 }  // namespace collector
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 4f3ad32..fe5a75f 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -201,7 +201,7 @@
     Thread* self = Thread::Current();
     CHECK(!Locks::mutator_lock_->IsExclusiveHeld(self));
     // Process dirty cards and add dirty cards to mod union tables, also ages cards.
-    heap_->ProcessCards(timings_);
+    heap_->ProcessCards(timings_, false);
     // The checkpoint root marking is required to avoid a race condition which occurs if the
     // following happens during a reference write:
     // 1. mutator dirties the card (write barrier)
@@ -241,7 +241,7 @@
   FindDefaultMarkBitmap();
 
   // Process dirty cards and add dirty cards to mod union tables.
-  heap_->ProcessCards(timings_);
+  heap_->ProcessCards(timings_, false);
 
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 23b155c..5b9c397 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -27,6 +27,7 @@
 #include "base/timing_logger.h"
 #include "gc/accounting/heap_bitmap.h"
 #include "gc/accounting/mod_union_table.h"
+#include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/heap.h"
 #include "gc/space/bump_pointer_space.h"
@@ -182,7 +183,7 @@
   // Assume the cleared space is already empty.
   BindBitmaps();
   // Process dirty cards and add dirty cards to mod-union tables.
-  heap_->ProcessCards(timings_);
+  heap_->ProcessCards(timings_, kUseRememberedSet && generational_);
   // Clear the whole card table since we can not get any additional dirty cards during the
   // paused GC. This saves memory but only works for pause the world collectors.
   timings_.NewSplit("ClearCardTable");
@@ -214,13 +215,29 @@
                                      "UpdateAndMarkImageModUnionTable",
                                      &timings_);
         table->UpdateAndMarkReferences(MarkObjectCallback, this);
+      } else if (heap_->FindRememberedSetFromSpace(space) != nullptr) {
+        DCHECK(kUseRememberedSet);
+        // If a bump pointer space only collection, the non-moving
+        // space is added to the immune space. The non-moving space
+        // doesn't have a mod union table, but has a remembered
+        // set. Its dirty cards will be scanned later in
+        // MarkReachableObjects().
+        DCHECK(generational_ && !whole_heap_collection_ &&
+               (space == heap_->GetNonMovingSpace() || space == heap_->GetPrimaryFreeListSpace()))
+            << "Space " << space->GetName() << " "
+            << "generational_=" << generational_ << " "
+            << "whole_heap_collection_=" << whole_heap_collection_ << " ";
       } else {
+        DCHECK(!kUseRememberedSet);
         // If a bump pointer space only collection, the non-moving
         // space is added to the immune space. But the non-moving
         // space doesn't have a mod union table. Instead, its live
         // bitmap will be scanned later in MarkReachableObjects().
         DCHECK(generational_ && !whole_heap_collection_ &&
-               (space == heap_->GetNonMovingSpace() || space == heap_->GetPrimaryFreeListSpace()));
+               (space == heap_->GetNonMovingSpace() || space == heap_->GetPrimaryFreeListSpace()))
+            << "Space " << space->GetName() << " "
+            << "generational_=" << generational_ << " "
+            << "whole_heap_collection_=" << whole_heap_collection_ << " ";
       }
     }
   }
@@ -240,6 +257,42 @@
   SemiSpace* const semi_space_;
 };
 
+// Used to verify that there are no references to the from-space.
+class SemiSpaceVerifyNoFromSpaceReferencesVisitor {
+ public:
+  explicit SemiSpaceVerifyNoFromSpaceReferencesVisitor(space::ContinuousMemMapAllocSpace* from_space) :
+      from_space_(from_space) {}
+
+  void operator()(Object* obj, Object* ref, const MemberOffset& offset, bool /* is_static */)
+      const ALWAYS_INLINE {
+    if (from_space_->HasAddress(ref)) {
+      Runtime::Current()->GetHeap()->DumpObject(LOG(INFO), obj);
+    }
+    DCHECK(!from_space_->HasAddress(ref));
+  }
+ private:
+  space::ContinuousMemMapAllocSpace* from_space_;
+};
+
+void SemiSpace::VerifyNoFromSpaceReferences(Object* obj) {
+  DCHECK(obj != NULL);
+  DCHECK(!from_space_->HasAddress(obj)) << "Scanning object " << obj << " in from space";
+  SemiSpaceVerifyNoFromSpaceReferencesVisitor visitor(from_space_);
+  MarkSweep::VisitObjectReferences(obj, visitor, kMovingClasses);
+}
+
+class SemiSpaceVerifyNoFromSpaceReferencesObjectVisitor {
+ public:
+  explicit SemiSpaceVerifyNoFromSpaceReferencesObjectVisitor(SemiSpace* ss) : semi_space_(ss) {}
+  void operator()(Object* obj) const
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) {
+    DCHECK(obj != nullptr);
+    semi_space_->VerifyNoFromSpaceReferences(obj);
+  }
+ private:
+  SemiSpace* const semi_space_;
+};
+
 void SemiSpace::MarkReachableObjects() {
   timings_.StartSplit("MarkStackAsLive");
   accounting::ObjectStack* live_stack = heap_->GetLiveStack();
@@ -250,18 +303,36 @@
   for (auto& space : heap_->GetContinuousSpaces()) {
     // If the space is immune and has no mod union table (the
     // non-moving space when the bump pointer space only collection is
-    // enabled,) then we need to scan its live bitmap as roots
+    // enabled,) then we need to scan its live bitmap or dirty cards as roots
     // (including the objects on the live stack which have just marked
     // in the live bitmap above in MarkAllocStackAsLive().)
     if (immune_region_.ContainsSpace(space) &&
         heap_->FindModUnionTableFromSpace(space) == nullptr) {
       DCHECK(generational_ && !whole_heap_collection_ &&
              (space == GetHeap()->GetNonMovingSpace() || space == GetHeap()->GetPrimaryFreeListSpace()));
-      accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
-      SemiSpaceScanObjectVisitor visitor(this);
-      live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
-                                    reinterpret_cast<uintptr_t>(space->End()),
-                                    visitor);
+      accounting::RememberedSet* rem_set = heap_->FindRememberedSetFromSpace(space);
+      if (kUseRememberedSet) {
+        DCHECK(rem_set != nullptr);
+        rem_set->UpdateAndMarkReferences(MarkObjectCallback, from_space_, this);
+        if (kIsDebugBuild) {
+          // Verify that there are no from-space references that
+          // remain in the space, that is, the remembered set (and the
+          // card table) didn't miss any from-space references in the
+          // space.
+          accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+          SemiSpaceVerifyNoFromSpaceReferencesObjectVisitor visitor(this);
+          live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
+                                        reinterpret_cast<uintptr_t>(space->End()),
+                                        visitor);
+        }
+      } else {
+        DCHECK(rem_set == nullptr);
+        accounting::SpaceBitmap* live_bitmap = space->GetLiveBitmap();
+        SemiSpaceScanObjectVisitor visitor(this);
+        live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(space->Begin()),
+                                      reinterpret_cast<uintptr_t>(space->End()),
+                                      visitor);
+      }
     }
   }
 
@@ -447,6 +518,10 @@
     } else {
       GetHeap()->num_bytes_allocated_.FetchAndAdd(bytes_promoted);
       bytes_promoted_ += bytes_promoted;
+      // Dirty the card at the destination as it may contain
+      // references (including the class pointer) to the bump pointer
+      // space.
+      GetHeap()->WriteBarrierEveryFieldOf(forward_address);
       // Handle the bitmaps marking.
       accounting::SpaceBitmap* live_bitmap = promo_dest_space->GetLiveBitmap();
       DCHECK(live_bitmap != nullptr);
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index be7ec05..08bfbc4 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -63,6 +63,9 @@
 
 class SemiSpace : public GarbageCollector {
  public:
+  // If true, use remembered sets in the generational mode.
+  static constexpr bool kUseRememberedSet = true;
+
   explicit SemiSpace(Heap* heap, bool generational = false,
                      const std::string& name_prefix = "");
 
@@ -100,6 +103,9 @@
   void ScanObject(mirror::Object* obj)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
 
+  void VerifyNoFromSpaceReferences(mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
+
   // Marks the root set at the start of a garbage collection.
   void MarkRoots()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_);
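
VerifyNoFromSpaceReferences backs the kIsDebugBuild check in MarkReachableObjects: once the remembered set has been processed, no live object in the space may still reference the from-space. A standalone sketch of that invariant, using simplified stand-ins rather than mirror::Object:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Obj { std::vector<Obj*> fields; };  // stand-in for a heap object

    struct AddressRange {
      uintptr_t begin, end;
      bool Contains(const Obj* p) const {
        uintptr_t a = reinterpret_cast<uintptr_t>(p);
        return a >= begin && a < end;
      }
    };

    // The invariant the debug pass checks: every reference field of a
    // live object must point outside the from-space after marking.
    void VerifyNoFromSpaceReferences(const Obj& obj, const AddressRange& from_space) {
      for (const Obj* ref : obj.fields) {
        assert(ref == nullptr || !from_space.Contains(ref));
      }
    }
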
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 533e5df..6cc44c9 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -20,6 +20,8 @@
 #include "heap.h"
 
 #include "debugger.h"
+#include "gc/accounting/card_table-inl.h"
+#include "gc/collector/semi_space.h"
 #include "gc/space/bump_pointer_space-inl.h"
 #include "gc/space/dlmalloc_space-inl.h"
 #include "gc/space/large_object_space.h"
@@ -75,6 +77,18 @@
     obj->SetBrooksPointer(obj);
     obj->AssertSelfBrooksPointer();
   }
+  if (collector::SemiSpace::kUseRememberedSet && UNLIKELY(allocator == kAllocatorTypeNonMoving)) {
+    // (Note: this if statement will be constant-folded away for the
+    // fast-path quick entry points.) Because SetClass() has no write
+    // barrier, a non-moving space allocation needs an explicit write
+    // barrier: under the GSS collector with the remembered set
+    // enabled, the class pointer may point into the bump pointer
+    // space, making it a rare "old-to-young" reference. We don't need
+    // this for the kAllocatorTypeRosAlloc/DlMalloc cases because we
+    // don't directly allocate into the main alloc space (besides
+    // promotions) under the SS/GSS collector.
+    WriteBarrierField(obj, mirror::Object::ClassOffset(), klass);
+  }
   pre_fence_visitor(obj, usable_size);
   if (kIsDebugBuild && Runtime::Current()->IsStarted()) {
     CHECK_LE(obj->SizeOf(), usable_size);
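
WriteBarrierField here is the ordinary reference write barrier applied by hand, since SetClass() skips it on this path. In card-table terms it dirties the byte covering the object's address; a simplified standalone model (the 128-byte card size and 0x70 dirty value mirror ART's card table constants, the rest is illustrative):

    #include <cstdint>

    static constexpr uintptr_t kCardSize = 128;   // ART's card granularity
    static constexpr uint8_t kCardDirty = 0x70;   // ART's dirty-card marker

    // One byte per card; writing a reference into obj dirties obj's
    // card so the remembered set later rescans it for old-to-young
    // references.
    struct CardTableModel {
      uint8_t* biased_begin;  // biased so addr / kCardSize indexes addr's card

      void MarkCard(const void* obj) {
        biased_begin[reinterpret_cast<uintptr_t>(obj) / kCardSize] = kCardDirty;
      }
    };
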
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 45904ff..e8ee62f 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -33,6 +33,7 @@
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/mod_union_table.h"
 #include "gc/accounting/mod_union_table-inl.h"
+#include "gc/accounting/remembered_set.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "gc/collector/mark_sweep-inl.h"
 #include "gc/collector/partial_mark_sweep.h"
@@ -161,7 +162,8 @@
   } else {
     if (kMovingCollector) {
       // We are the zygote, use bump pointer allocation + semi space collector.
-      desired_collector_type_ = kCollectorTypeSS;
+      bool generational = post_zygote_collector_type_ == kCollectorTypeGSS;
+      desired_collector_type_ = generational ? kCollectorTypeGSS : kCollectorTypeSS;
     } else {
       desired_collector_type_ = post_zygote_collector_type_;
     }
@@ -279,6 +281,13 @@
   CHECK(mod_union_table != nullptr) << "Failed to create image mod-union table";
   AddModUnionTable(mod_union_table);
 
+  if (collector::SemiSpace::kUseRememberedSet) {
+    accounting::RememberedSet* non_moving_space_rem_set =
+        new accounting::RememberedSet("Non-moving space remembered set", this, non_moving_space_);
+    CHECK(non_moving_space_rem_set != nullptr) << "Failed to create non-moving space remembered set";
+    AddRememberedSet(non_moving_space_rem_set);
+  }
+
   // TODO: Count objects in the image space here.
   num_bytes_allocated_ = 0;
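
remembered_set.h itself is outside this hunk; the interface implied by the call sites in this change is roughly the following (a reconstruction from usage, not the actual header; the callback typedef in particular is an assumed shape):

    // Reconstructed from the call sites in this change; see
    // runtime/gc/accounting/remembered_set.h for the real declaration.
    typedef mirror::Object* MarkObjectCallback(mirror::Object* obj, void* arg);

    class RememberedSet {
     public:
      RememberedSet(const std::string& name, Heap* heap, space::Space* space);
      // Visit references on dirty cards of this set's space, marking
      // any that point into target_space (the from-space).
      void UpdateAndMarkReferences(MarkObjectCallback* callback,
                                   space::ContinuousSpace* target_space, void* arg);
      void ClearCards();                               // Heap::ProcessCards()
      void AssertAllDirtyCardsAreWithinSpace() const;  // zygote-fork sanity check
      space::Space* GetSpace();                        // Heap::AddRememberedSet()
      const std::string& GetName() const;
      void Dump(std::ostream& os);
    };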
 
@@ -1469,7 +1478,7 @@
 // Special compacting collector which uses sub-optimal bin packing to reduce zygote space size.
 class ZygoteCompactingCollector FINAL : public collector::SemiSpace {
  public:
-  explicit ZygoteCompactingCollector(gc::Heap* heap) : SemiSpace(heap, "zygote collector"),
+  explicit ZygoteCompactingCollector(gc::Heap* heap) : SemiSpace(heap, false, "zygote collector"),
       bin_live_bitmap_(nullptr), bin_mark_bitmap_(nullptr) {
   }
 
@@ -1618,6 +1627,16 @@
   // Remove the old space before creating the zygote space since creating the zygote space sets
   // the old alloc space's bitmaps to nullptr.
   RemoveSpace(old_alloc_space);
+  if (collector::SemiSpace::kUseRememberedSet) {
+    // Sanity check: all dirty cards must lie within the space's bounds.
+    FindRememberedSetFromSpace(old_alloc_space)->AssertAllDirtyCardsAreWithinSpace();
+    // Remove the remembered set for what is now the zygote space (the
+    // old non-moving space). Now that objects have been compacted into
+    // the zygote space, the data in its remembered set is no longer
+    // needed; the zygote space will instead have a mod-union table
+    // from this point on.
+    RemoveRememberedSet(old_alloc_space);
+  }
   space::ZygoteSpace* zygote_space = old_alloc_space->CreateZygoteSpace("alloc space",
                                                                         low_memory_mode_,
                                                                         &main_space_);
@@ -1640,6 +1659,13 @@
       new accounting::ModUnionTableCardCache("zygote space mod-union table", this, zygote_space);
   CHECK(mod_union_table != nullptr) << "Failed to create zygote space mod-union table";
   AddModUnionTable(mod_union_table);
+  if (collector::SemiSpace::kUseRememberedSet) {
+    // Add a new remembered set for the new main space.
+    accounting::RememberedSet* main_space_rem_set =
+        new accounting::RememberedSet("Main space remembered set", this, main_space_);
+    CHECK(main_space_rem_set != nullptr) << "Failed to create main space remembered set";
+    AddRememberedSet(main_space_rem_set);
+  }
   // Can't use RosAlloc for non moving space due to thread local buffers.
   // TODO: Non limited space for non-movable objects?
   MemMap* mem_map = post_zygote_non_moving_space_mem_map_.release();
@@ -1650,6 +1676,15 @@
   CHECK(new_non_moving_space != nullptr) << "Failed to create new non-moving space";
   new_non_moving_space->SetFootprintLimit(new_non_moving_space->Capacity());
   non_moving_space_ = new_non_moving_space;
+  if (collector::SemiSpace::kUseRememberedSet) {
+    // Add a new remembered set for the post-zygote non-moving space.
+    accounting::RememberedSet* post_zygote_non_moving_space_rem_set =
+        new accounting::RememberedSet("Post-zygote non-moving space remembered set", this,
+                                      non_moving_space_);
+    CHECK(post_zygote_non_moving_space_rem_set != nullptr)
+        << "Failed to create post-zygote non-moving space remembered set";
+    AddRememberedSet(post_zygote_non_moving_space_rem_set);
+  }
 }
 
 void Heap::FlushAllocStack() {
@@ -2034,6 +2069,11 @@
       accounting::ModUnionTable* mod_union_table = table_pair.second;
       mod_union_table->Dump(LOG(ERROR) << mod_union_table->GetName() << ": ");
     }
+    // Dump remembered sets.
+    for (const auto& table_pair : remembered_sets_) {
+      accounting::RememberedSet* remembered_set = table_pair.second;
+      remembered_set->Dump(LOG(ERROR) << remembered_set->GetName() << ": ");
+    }
     DumpSpaces();
     return false;
   }
@@ -2185,15 +2225,29 @@
   return it->second;
 }
 
-void Heap::ProcessCards(TimingLogger& timings) {
+accounting::RememberedSet* Heap::FindRememberedSetFromSpace(space::Space* space) {
+  auto it = remembered_sets_.find(space);
+  if (it == remembered_sets_.end()) {
+    return nullptr;
+  }
+  return it->second;
+}
+
+void Heap::ProcessCards(TimingLogger& timings, bool use_rem_sets) {
   // Clear cards and keep track of cards cleared in the mod-union table.
   for (const auto& space : continuous_spaces_) {
     accounting::ModUnionTable* table = FindModUnionTableFromSpace(space);
+    accounting::RememberedSet* rem_set = FindRememberedSetFromSpace(space);
     if (table != nullptr) {
       const char* name = space->IsZygoteSpace() ? "ZygoteModUnionClearCards" :
           "ImageModUnionClearCards";
       TimingLogger::ScopedSplit split(name, &timings);
       table->ClearCards();
+    } else if (use_rem_sets && rem_set != nullptr) {
+      DCHECK(collector::SemiSpace::kUseRememberedSet && collector_type_ == kCollectorTypeGSS)
+          << static_cast<int>(collector_type_);
+      TimingLogger::ScopedSplit split("AllocSpaceRemSetClearCards", &timings);
+      rem_set->ClearCards();
     } else if (space->GetType() != space::kSpaceTypeBumpPointerSpace) {
       TimingLogger::ScopedSplit split("AllocSpaceClearCards", &timings);
       // No mod union table for the AllocSpace. Age the cards so that the GC knows that these cards
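
After this change, ProcessCards has a three-way policy: image and zygote spaces clear cards into their mod-union tables, GSS alloc spaces clear cards into their remembered sets, and the remaining malloc spaces merely age their cards. Aging keeps freshly dirtied cards distinguishable from older ones; a self-contained sketch of the rule (the values mirror ART's card table constants):

    #include <cstdint>

    static constexpr uint8_t kCardClean = 0x0;
    static constexpr uint8_t kCardDirty = 0x70;  // ART's dirty-card marker

    // Card aging: a card dirtied since the last GC drops to
    // kCardDirty - 1 ("aged"), anything older is cleaned. The GC can
    // then tell writes made during this cycle apart from earlier ones
    // without losing them.
    inline uint8_t AgeCard(uint8_t card) {
      return (card == kCardDirty) ? (kCardDirty - 1) : kCardClean;
    }
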
@@ -2694,5 +2748,22 @@
   CHECK_GE(byte_count, sizeof(mirror::Object));
 }
 
+void Heap::AddRememberedSet(accounting::RememberedSet* remembered_set) {
+  CHECK(remembered_set != nullptr);
+  space::Space* space = remembered_set->GetSpace();
+  CHECK(space != nullptr);
+  CHECK(remembered_sets_.find(space) == remembered_sets_.end());
+  remembered_sets_.Put(space, remembered_set);
+  CHECK(remembered_sets_.find(space) != remembered_sets_.end());
+}
+
+void Heap::RemoveRememberedSet(space::Space* space) {
+  CHECK(space != nullptr);
+  auto it = remembered_sets_.find(space);
+  CHECK(it != remembered_sets_.end());
+  remembered_sets_.erase(it);
+  CHECK(remembered_sets_.find(space) == remembered_sets_.end());
+}
+
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 1e0a596..de20a4e 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -56,6 +56,7 @@
   class HeapBitmap;
   class ModUnionTable;
   class ObjectSet;
+  class RememberedSet;
 }  // namespace accounting
 
 namespace collector {
@@ -541,6 +542,10 @@
   accounting::ModUnionTable* FindModUnionTableFromSpace(space::Space* space);
   void AddModUnionTable(accounting::ModUnionTable* mod_union_table);
 
+  accounting::RememberedSet* FindRememberedSetFromSpace(space::Space* space);
+  void AddRememberedSet(accounting::RememberedSet* remembered_set);
+  void RemoveRememberedSet(space::Space* space);
+
   bool IsCompilingBoot() const;
   bool HasImageSpace() const;
 
@@ -660,7 +665,7 @@
   void SwapStacks(Thread* self);
 
   // Clear cards and update the mod union table.
-  void ProcessCards(TimingLogger& timings);
+  void ProcessCards(TimingLogger& timings, bool use_rem_sets);
 
   // Signal the heap trim daemon that there is something to do, either a heap transition or heap
   // trim.
@@ -701,6 +706,9 @@
   // A mod-union table remembers all of the references from its space to other spaces.
   SafeMap<space::Space*, accounting::ModUnionTable*> mod_union_tables_;
 
+  // A remembered set remembers all of the references from its space to the target space.
+  SafeMap<space::Space*, accounting::RememberedSet*> remembered_sets_;
+
   // Keep the free list allocator mem map lying around when we transition to background so that we
   // don't have to worry about virtual address space fragmentation.
   UniquePtr<MemMap> allocator_mem_map_;
diff --git a/runtime/oat.cc b/runtime/oat.cc
index d04514f..f970789 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '1', '8', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '1', '9', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/test/018-stack-overflow/expected.txt b/test/018-stack-overflow/expected.txt
index 7797816..98b45b7 100644
--- a/test/018-stack-overflow/expected.txt
+++ b/test/018-stack-overflow/expected.txt
@@ -1,2 +1,3 @@
-caught SOE
+caught SOE in testSelfRecursion
+caught SOE in testMutualRecursion
 SOE test done
diff --git a/test/018-stack-overflow/src/Main.java b/test/018-stack-overflow/src/Main.java
index f79c269..41adabc 100644
--- a/test/018-stack-overflow/src/Main.java
+++ b/test/018-stack-overflow/src/Main.java
@@ -19,17 +19,46 @@
  */
 public class Main {
     public static void main(String args[]) {
+        testSelfRecursion();
+        testMutualRecursion();
+        System.out.println("SOE test done");
+    }
+
+    private static void testSelfRecursion() {
         try {
             stackOverflowTestSub(0.0, 0.0, 0.0);
         }
         catch (StackOverflowError soe) {
-            System.out.println("caught SOE");
+            System.out.println("caught SOE in testSelfRecursion");
         }
-        System.out.println("SOE test done");
     }
 
-    private static void stackOverflowTestSub(double pad1, double pad2,
-            double pad3) {
+    private static void stackOverflowTestSub(double pad1, double pad2, double pad3) {
         stackOverflowTestSub(pad1, pad2, pad3);
     }
+
+    private static void testMutualRecursion() {
+        try {
+            foo(0.0, 0.0, 0.0);
+        }
+        catch (StackOverflowError soe) {
+            System.out.println("caught SOE in testMutualRecursion");
+        }
+    }
+
+    private static void foo(double pad1, double pad2, double pad3) {
+        bar(pad1, pad2, pad3);
+    }
+
+    private static void bar(double pad1, double pad2, double pad3) {
+        baz(pad1, pad2, pad3);
+    }
+
+    private static void baz(double pad1, double pad2, double pad3) {
+        qux(pad1, pad2, pad3);
+    }
+
+    private static void qux(double pad1, double pad2, double pad3) {
+        foo(pad1, pad2, pad3);
+    }
 }