Merge "Enable core callee-save on x64."
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index bc9649f..0af70f9 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -236,7 +236,8 @@
     }
   }
 
-  SetupBlockedRegisters();
+  static constexpr bool kBaseline = true;
+  SetupBlockedRegisters(kBaseline);
 
   // Allocate all unallocated input locations.
   for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
@@ -547,8 +548,18 @@
 
   size_t environment_size = instruction->EnvironmentSize();
 
-  size_t register_mask = 0;
   size_t inlining_depth = 0;
+  uint32_t register_mask = locations->GetRegisterMask();
+  if (locations->OnlyCallsOnSlowPath()) {
+    // In case of a slow path, we currently record the location of caller-save
+    // registers as registers (instead of the stack slots they are saved to just
+    // before the slow-path call). The register mask therefore contains both
+    // callee-save and caller-save registers that hold objects; remove the
+    // caller-saves, since the slow-path call will overwrite them.
+    register_mask &= core_callee_save_mask_;
+  }
+  // The register mask must be a subset of callee-save registers.
+  DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
   stack_map_stream_.AddStackMapEntry(
       dex_pc, pc_info.native_pc, register_mask,
       locations->GetStackMask(), environment_size, inlining_depth);
@@ -684,20 +695,24 @@
   RegisterSet* register_set = locations->GetLiveRegisters();
   size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
-    if (register_set->ContainsCoreRegister(i)) {
-      // If the register holds an object, update the stack mask.
-      if (locations->RegisterContainsObject(i)) {
-        locations->SetStackBit(stack_offset / kVRegSize);
+    if (!IsCoreCalleeSaveRegister(i)) {
+      if (register_set->ContainsCoreRegister(i)) {
+        // If the register holds an object, update the stack mask.
+        if (locations->RegisterContainsObject(i)) {
+          locations->SetStackBit(stack_offset / kVRegSize);
+        }
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += SaveCoreRegister(stack_offset, i);
       }
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += SaveCoreRegister(stack_offset, i);
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
-    if (register_set->ContainsFloatingPointRegister(i)) {
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += SaveFloatingPointRegister(stack_offset, i);
+    if (!IsFloatingPointCalleeSaveRegister(i)) {
+      if (register_set->ContainsFloatingPointRegister(i)) {
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += SaveFloatingPointRegister(stack_offset, i);
+      }
     }
   }
 }
@@ -706,16 +721,20 @@
   RegisterSet* register_set = locations->GetLiveRegisters();
   size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
-    if (register_set->ContainsCoreRegister(i)) {
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += RestoreCoreRegister(stack_offset, i);
+    if (!IsCoreCalleeSaveRegister(i)) {
+      if (register_set->ContainsCoreRegister(i)) {
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += RestoreCoreRegister(stack_offset, i);
+      }
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
-    if (register_set->ContainsFloatingPointRegister(i)) {
-      DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
-      stack_offset += RestoreFloatingPointRegister(stack_offset, i);
+    if (!IsFloatingPointCalleeSaveRegister(i)) {
+      if (register_set->ContainsFloatingPointRegister(i)) {
+        DCHECK_LT(stack_offset, GetFrameSize() - FrameEntrySpillSize());
+        stack_offset += RestoreFloatingPointRegister(stack_offset, i);
+      }
     }
   }
 }
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index f66aed9..16080a4 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -127,7 +127,7 @@
 
   size_t GetNumberOfCoreRegisters() const { return number_of_core_registers_; }
   size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; }
-  virtual void SetupBlockedRegisters() const = 0;
+  virtual void SetupBlockedRegisters(bool is_baseline) const = 0;
 
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
@@ -151,6 +151,14 @@
   }
   virtual bool NeedsTwoRegisters(Primitive::Type type) const = 0;
 
+  bool IsCoreCalleeSaveRegister(int reg) const {
+    return (core_callee_save_mask_ & (1 << reg)) != 0;
+  }
+
+  bool IsFloatingPointCalleeSaveRegister(int reg) const {
+    return (fpu_callee_save_mask_ & (1 << reg)) != 0;
+  }
+
   void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc);
   bool CanMoveNullCheckToUser(HNullCheck* null_check);
   void MaybeRecordImplicitNullCheck(HInstruction* instruction);
@@ -203,11 +211,17 @@
     return type == Primitive::kPrimNot && !value->IsIntConstant();
   }
 
+  void AddAllocatedRegister(Location location) {
+    allocated_registers_.Add(location);
+  }
+
  protected:
   CodeGenerator(HGraph* graph,
                 size_t number_of_core_registers,
                 size_t number_of_fpu_registers,
                 size_t number_of_register_pairs,
+                uint32_t core_callee_save_mask,
+                uint32_t fpu_callee_save_mask,
                 const CompilerOptions& compiler_options)
       : frame_size_(kUninitializedFrameSize),
         core_spill_mask_(0),
@@ -218,6 +232,8 @@
         number_of_core_registers_(number_of_core_registers),
         number_of_fpu_registers_(number_of_fpu_registers),
         number_of_register_pairs_(number_of_register_pairs),
+        core_callee_save_mask_(core_callee_save_mask),
+        fpu_callee_save_mask_(fpu_callee_save_mask),
         graph_(graph),
         compiler_options_(compiler_options),
         pc_infos_(graph->GetArena(), 32),
@@ -243,6 +259,9 @@
   uint32_t core_spill_mask_;
   uint32_t first_register_slot_in_slow_path_;
 
+  // Registers that were allocated during linear scan.
+  RegisterSet allocated_registers_;
+
   // Arrays used when doing register allocation to know which
   // registers we can allocate. `SetupBlockedRegisters` updates the
   // arrays.
@@ -252,6 +271,8 @@
   size_t number_of_core_registers_;
   size_t number_of_fpu_registers_;
   size_t number_of_register_pairs_;
+  const uint32_t core_callee_save_mask_;
+  const uint32_t fpu_callee_save_mask_;
 
  private:
   void InitLocations(HInstruction* instruction);
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index c6a6974..bc8858b 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -390,7 +390,7 @@
                                    const ArmInstructionSetFeatures& isa_features,
                                    const CompilerOptions& compiler_options)
     : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters,
-                    kNumberOfRegisterPairs, compiler_options),
+                    kNumberOfRegisterPairs, 0, 0, compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -453,7 +453,7 @@
   return Location();
 }
 
-void CodeGeneratorARM::SetupBlockedRegisters() const {
+void CodeGeneratorARM::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
   // Don't allocate the dalvik style register pair passing.
   blocked_register_pairs_[R1_R2] = true;
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 0de6669..f3b1ff5 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -197,7 +197,7 @@
     return GetLabelOf(block)->Position();
   }
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
 
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 760d2be..21c1e9c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -567,6 +567,8 @@
                     kNumberOfAllocatableRegisters,
                     kNumberOfAllocatableFPRegisters,
                     kNumberOfAllocatableRegisterPairs,
+                    0,
+                    0,
                     compiler_options),
       block_labels_(nullptr),
       location_builder_(graph, this),
@@ -729,7 +731,7 @@
   __ Bind(&done);
 }
 
-void CodeGeneratorARM64::SetupBlockedRegisters() const {
+void CodeGeneratorARM64::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
   // Block reserved registers:
   //   ip0 (VIXL temporary)
   //   ip1 (VIXL temporary)
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 27c6fbd..d81e481 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -217,7 +217,7 @@
 
   // Register allocation.
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
   // AllocateFreeRegister() is only used when allocating registers locally
   // during CompileBaseline().
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 2d30412..9e26ddd 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -375,7 +375,7 @@
 
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
     : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters,
-                    kNumberOfRegisterPairs, compiler_options),
+                    kNumberOfRegisterPairs, 0, 0, compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -431,7 +431,7 @@
   return Location();
 }
 
-void CodeGeneratorX86::SetupBlockedRegisters() const {
+void CodeGeneratorX86::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) const {
   // Don't allocate the dalvik style register pair passing.
   blocked_register_pairs_[ECX_EDX] = true;
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index a9086f8..dcfeb2f 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -196,7 +196,7 @@
     return GetLabelOf(block)->Position();
   }
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
 
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index da83b76..285003d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -46,6 +46,7 @@
 static constexpr FloatRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
+static constexpr Register kCoreCalleeSaves[] = { RBX, RBP, R12, R13, R14, R15 };
 
 static constexpr int kC2ConditionMask = 0x400;
 
@@ -416,17 +417,29 @@
   return kX86_64WordSize;
 }
 
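+// Fold the callee-save list into a bit mask (bit n set for physical register n),
+// the same encoding used by core_spill_mask_ and the stack map register mask.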
+static uint32_t ComputeCoreCalleeSaveMask() {
+  uint32_t mask = 0;
+  for (size_t i = 0, e = arraysize(kCoreCalleeSaves); i < e; ++i) {
+    mask |= (1 << kCoreCalleeSaves[i]);
+  }
+  return mask;
+}
+
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options)
-      : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfFloatRegisters, 0, compiler_options),
+      : CodeGenerator(graph,
+                      kNumberOfCpuRegisters,
+                      kNumberOfFloatRegisters,
+                      0,
+                      ComputeCoreCalleeSaveMask(),
+                      0,
+                      compiler_options),
         block_labels_(graph->GetArena(), 0),
         location_builder_(graph, this),
         instruction_visitor_(graph, this),
         move_resolver_(graph->GetArena(), this) {}
 
-size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
-  return kNumberOfPushedRegistersAtEntry * kX86_64WordSize;
-}
-
 InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph,
                                                                CodeGeneratorX86_64* codegen)
       : HGraphVisitor(graph),
@@ -459,21 +470,30 @@
   return Location();
 }
 
-void CodeGeneratorX86_64::SetupBlockedRegisters() const {
+size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
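+  // One word for each register unconditionally pushed at entry, plus one word
+  // per callee-save core register that linear scan actually allocated.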
+  uint32_t mask = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+  return kNumberOfPushedRegistersAtEntry * kX86_64WordSize
+      + __builtin_popcount(mask) * kX86_64WordSize;
+}
+
+void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const {
   // Stack register is always reserved.
   blocked_core_registers_[RSP] = true;
 
   // Block the register used as TMP.
   blocked_core_registers_[TMP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  blocked_core_registers_[RBX] = true;
-  blocked_core_registers_[RBP] = true;
-  blocked_core_registers_[R12] = true;
-  blocked_core_registers_[R13] = true;
-  blocked_core_registers_[R14] = true;
-  blocked_core_registers_[R15] = true;
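+  // Baseline does not save/restore callee-saves in the frame entry/exit (it
+  // never populates allocated_registers_), so it must not allocate them.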
+  if (is_baseline) {
+    for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+      blocked_core_registers_[kCoreCalleeSaves[i]] = true;
+    }
+  }
 
+  // TODO: We currently don't use Quick's FP callee-save registers.
   blocked_fpu_registers_[XMM12] = true;
   blocked_fpu_registers_[XMM13] = true;
   blocked_fpu_registers_[XMM14] = true;
@@ -484,6 +500,9 @@
   // Create a fake register to mimic Quick.
   static const int kFakeReturnRegister = 16;
   core_spill_mask_ |= (1 << kFakeReturnRegister);
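+  // Also record the allocated callee-saves, so the runtime can locate their
+  // spill slots when walking the stack.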
+  core_spill_mask_ |= (allocated_registers_.GetCoreRegisters() & core_callee_save_mask_);
 
   bool skip_overflow_check = IsLeafMethod()
       && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64);
@@ -494,10 +511,16 @@
         CpuRegister(RSP), -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86_64))));
     RecordPcInfo(nullptr, 0);
   }
+
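+  // Spill the allocated callee-saves in reverse order, so that the pops in
+  // GenerateFrameExit restore them in forward order.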
+  for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) {
+    if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
+      __ pushq(CpuRegister(kCoreCalleeSaves[i]));
+    }
+  }
 
-  // The return PC has already been pushed on the stack.
-  __ subq(CpuRegister(RSP),
-          Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+  __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
 
   if (!skip_overflow_check && !implicitStackOverflowChecks) {
     SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathX86_64();
@@ -512,8 +533,14 @@
 }
 
 void CodeGeneratorX86_64::GenerateFrameExit() {
-  __ addq(CpuRegister(RSP),
-          Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+  __ addq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
+
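+  // Restore callee-saves in the order opposite to the pushes at frame entry.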
+  for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
+    if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
+      __ popq(CpuRegister(kCoreCalleeSaves[i]));
+    }
+  }
 }
 
 void CodeGeneratorX86_64::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index ead771a..645fb17 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -218,7 +218,7 @@
 
   Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
 
-  void SetupBlockedRegisters() const OVERRIDE;
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE;
   Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
   void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
   void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index dda6c94..6bf8f77 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -431,6 +431,14 @@
     return __builtin_popcount(core_registers_) + __builtin_popcount(floating_point_registers_);
   }
 
+  uint32_t GetCoreRegisters() const {
+    return core_registers_;
+  }
+
+  uint32_t GetFloatingPointRegisters() const {
+    return floating_point_registers_;
+  }
+
  private:
   uint32_t core_registers_;
   uint32_t floating_point_registers_;
@@ -529,6 +537,10 @@
     register_mask_ |= (1 << reg_id);
   }
 
+  uint32_t GetRegisterMask() const {
+    return register_mask_;
+  }
+
   bool RegisterContainsObject(uint32_t reg_id) {
     return RegisterSet::Contains(register_mask_, reg_id);
   }
@@ -557,7 +569,14 @@
       return false;
     }
     Location input = inputs_.Get(input_index);
-    if (input.IsRegister() || input.IsFpuRegister() || input.IsPair()) {
+    if (input.IsRegister()
+        || input.IsFpuRegister()
+        || input.IsPair()
+        || input.IsStackSlot()
+        || input.IsDoubleStackSlot()) {
+      // For fixed locations, the register allocator requires inputs to die before
+      // the instruction, so that input moves use the location of the input just
+      // before that instruction (and not a location created by splitting moves).
       return false;
     }
     return true;
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index e120bc6..260076a 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -58,7 +58,8 @@
         reserved_out_slots_(0),
         maximum_number_of_live_core_registers_(0),
         maximum_number_of_live_fp_registers_(0) {
-  codegen->SetupBlockedRegisters();
+  static constexpr bool kIsBaseline = false;
+  codegen->SetupBlockedRegisters(kIsBaseline);
   physical_core_register_intervals_.SetSize(codegen->GetNumberOfCoreRegisters());
   physical_fp_register_intervals_.SetSize(codegen->GetNumberOfFloatingPointRegisters());
   // Always reserve for the current method and the graph's max out registers.
@@ -278,14 +279,19 @@
   if (locations->WillCall()) {
-    // Block all registers.
+    // Block all caller-save registers; callee-saves survive the call and may
+    // stay live across it.
     for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) {
-      BlockRegister(Location::RegisterLocation(i),
-                    position,
-                    position + 1);
+      if (!codegen_->IsCoreCalleeSaveRegister(i)) {
+        BlockRegister(Location::RegisterLocation(i),
+                      position,
+                      position + 1);
+      }
     }
     for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) {
-      BlockRegister(Location::FpuRegisterLocation(i),
-                    position,
-                    position + 1);
+      if (!codegen_->IsFloatingPointCalleeSaveRegister(i)) {
+        BlockRegister(Location::FpuRegisterLocation(i),
+                      position,
+                      position + 1);
+      }
     }
   }
 
@@ -627,6 +632,11 @@
     // (6) If the interval had a register allocated, add it to the list of active
     //     intervals.
     if (success) {
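+      // Remember the allocation; the code generator saves/restores exactly the
+      // callee-saves that end up in allocated_registers_.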
+      codegen_->AddAllocatedRegister(processing_core_registers_
+          ? Location::RegisterLocation(current->GetRegister())
+          : Location::FpuRegisterLocation(current->GetRegister()));
       active_.Add(current);
       if (current->HasHighInterval() && !current->GetHighInterval()->HasRegister()) {
         current->GetHighInterval()->SetRegister(GetHighForLowRegister(current->GetRegister()));
@@ -1357,9 +1365,13 @@
       switch (source.GetKind()) {
         case Location::kRegister: {
           locations->AddLiveRegister(source);
-          DCHECK_LE(locations->GetNumberOfLiveRegisters(),
-                    maximum_number_of_live_core_registers_ +
-                    maximum_number_of_live_fp_registers_);
+          if (kIsDebugBuild && locations->OnlyCallsOnSlowPath()) {
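+            // With callee-saves allocatable, a full call site may keep more
+            // registers live than the slow-path maximum accounts for.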
+            DCHECK_LE(locations->GetNumberOfLiveRegisters(),
+                      maximum_number_of_live_core_registers_ +
+                      maximum_number_of_live_fp_registers_);
+          }
           if (current->GetType() == Primitive::kPrimNot) {
             locations->SetRegisterBit(source.reg());
           }
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index e24fa1b..2fc251b 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -180,18 +180,20 @@
   void Init(CpuRegister base_in, int32_t disp) {
     if (disp == 0 && base_in.LowBits() != RBP) {
       SetModRM(0, base_in);
-      if (base_in.AsRegister() == RSP) {
+      if (base_in.LowBits() == RSP) {
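+        // R12 shares its low three bits (0b100) with RSP and also needs a SIB
+        // byte; comparing the full register number would miss it.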
         SetSIB(TIMES_1, CpuRegister(RSP), base_in);
       }
     } else if (disp >= -128 && disp <= 127) {
       SetModRM(1, base_in);
-      if (base_in.AsRegister() == RSP) {
+      if (base_in.LowBits() == RSP) {
         SetSIB(TIMES_1, CpuRegister(RSP), base_in);
       }
       SetDisp8(disp);
     } else {
       SetModRM(2, base_in);
-      if (base_in.AsRegister() == RSP) {
+      if (base_in.LowBits() == RSP) {
         SetSIB(TIMES_1, CpuRegister(RSP), base_in);
       }
       SetDisp32(disp);
diff --git a/runtime/check_reference_map_visitor.h b/runtime/check_reference_map_visitor.h
index 4fe3852..93062a7 100644
--- a/runtime/check_reference_map_visitor.h
+++ b/runtime/check_reference_map_visitor.h
@@ -82,7 +82,8 @@
           CHECK(stack_mask.LoadBit(dex_register_map.GetValue(reg) >> 2));
           break;
         case DexRegisterMap::kInRegister:
-          CHECK_NE(register_mask & dex_register_map.GetValue(reg), 0u);
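+          // The register mask is a bit vector indexed by register number.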
+          CHECK_NE(register_mask & (1 << dex_register_map.GetValue(reg)), 0u);
           break;
         case DexRegisterMap::kInFpuRegister:
           // In Fpu register, should not be a reference.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 78a8bf8..5690d51 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2137,6 +2137,7 @@
         uintptr_t native_pc_offset = m->NativeQuickPcOffset(GetCurrentQuickFramePc(), entry_point);
         StackMap map = m->GetStackMap(native_pc_offset);
         MemoryRegion mask = map.GetStackMask();
+        // Visit stack entries that hold pointers.
         for (size_t i = 0; i < mask.size_in_bits(); ++i) {
           if (mask.LoadBit(i)) {
             StackReference<mirror::Object>* ref_addr =
@@ -2151,6 +2152,18 @@
             }
           }
         }
+        // Visit callee-save registers that hold pointers.
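+        // Only callee-saves can appear here: caller-saves are stripped from the
+        // mask when the stack map is recorded (see CodeGenerator::RecordPcInfo).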
+        uint32_t register_mask = map.GetRegisterMask();
+        for (size_t i = 0; i < BitSizeOf<uint32_t>(); ++i) {
+          if (register_mask & (1 << i)) {
+            mirror::Object** ref_addr = reinterpret_cast<mirror::Object**>(GetGPRAddress(i));
+            if (*ref_addr != nullptr) {
+              visitor_(ref_addr, -1, this);
+            }
+          }
+        }
       } else {
         const uint8_t* native_gc_map = m->GetNativeGcMap(sizeof(void*));
         CHECK(native_gc_map != nullptr) << PrettyMethod(m);