Merge "Support callee save floating point registers on x64."
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 0af70f9..43fd8bb 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -140,6 +140,9 @@
                                      size_t maximum_number_of_live_core_registers,
                                      size_t maximum_number_of_live_fp_registers,
                                      size_t number_of_out_slots) {
+  core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+  DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
+  fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
   first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
 
   SetFrameSize(RoundUp(
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 16080a4..85d18c0 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -117,13 +117,13 @@
                         size_t maximum_number_of_live_core_registers,
                         size_t maximum_number_of_live_fp_registers,
                         size_t number_of_out_slots);
-  virtual size_t FrameEntrySpillSize() const = 0;
   int32_t GetStackSlot(HLocal* local) const;
   Location GetTemporaryLocation(HTemporary* temp) const;
 
   uint32_t GetFrameSize() const { return frame_size_; }
   void SetFrameSize(uint32_t size) { frame_size_ = size; }
   uint32_t GetCoreSpillMask() const { return core_spill_mask_; }
+  uint32_t GetFpuSpillMask() const { return fpu_spill_mask_; }
 
   size_t GetNumberOfCoreRegisters() const { return number_of_core_registers_; }
   size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; }
@@ -225,6 +225,7 @@
                 const CompilerOptions& compiler_options)
       : frame_size_(kUninitializedFrameSize),
         core_spill_mask_(0),
+        fpu_spill_mask_(0),
         first_register_slot_in_slow_path_(0),
         blocked_core_registers_(graph->GetArena()->AllocArray<bool>(number_of_core_registers)),
         blocked_fpu_registers_(graph->GetArena()->AllocArray<bool>(number_of_fpu_registers)),
@@ -254,9 +255,29 @@
 
   virtual ParallelMoveResolver* GetMoveResolver() = 0;
 
+  // Returns the location of the first spilled entry for floating point registers,
+  // relative to the stack pointer.
+  uint32_t GetFpuSpillStart() const {
+    DCHECK_NE(frame_size_, kUninitializedFrameSize);
+    return GetFrameSize() - FrameEntrySpillSize();
+  }
+
+  uint32_t GetFpuSpillSize() const {
+    return POPCOUNT(fpu_spill_mask_) * GetFloatingPointSpillSlotSize();
+  }
+
+  uint32_t GetCoreSpillSize() const {
+    return POPCOUNT(core_spill_mask_) * GetWordSize();
+  }
+
+  uint32_t FrameEntrySpillSize() const {
+    return GetFpuSpillSize() + GetCoreSpillSize();
+  }
+
   // Frame size required for this method.
   uint32_t frame_size_;
   uint32_t core_spill_mask_;
+  uint32_t fpu_spill_mask_;
   uint32_t first_register_slot_in_slow_path_;
 
   // Registers that were allocated during linear scan.
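
An illustrative reading of the new accessors (not part of the patch; the
concrete numbers below are assumptions: 8-byte word and FP spill slots on a
64-bit target, a 64-byte frame, two core and one FPU callee save allocated):

  //   GetCoreSpillSize()    = POPCOUNT(core_spill_mask_) * GetWordSize()                  = 2 * 8 = 16
  //   GetFpuSpillSize()     = POPCOUNT(fpu_spill_mask_) * GetFloatingPointSpillSlotSize() = 1 * 8 =  8
  //   FrameEntrySpillSize() = GetFpuSpillSize() + GetCoreSpillSize()                      = 24
  //   GetFpuSpillStart()    = GetFrameSize() - FrameEntrySpillSize()                      = 64 - 24 = 40
  // i.e. the FPU spill slots sit directly below the core spills at the top of the frame.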
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index bc8858b..f4e4f5a 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -42,7 +42,6 @@
   return ((location.low() & 1) == 0) && (location.low() + 1 == location.high());
 }
 
-static constexpr int kNumberOfPushedRegistersAtEntry = 1 + 2;  // LR, R6, R7
 static constexpr int kCurrentMethodStackOffset = 0;
 
 static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1, R2, R3 };
@@ -113,20 +112,6 @@
   DISALLOW_COPY_AND_ASSIGN(DivZeroCheckSlowPathARM);
 };
 
-class StackOverflowCheckSlowPathARM : public SlowPathCodeARM {
- public:
-  StackOverflowCheckSlowPathARM() {}
-
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    __ Bind(GetEntryLabel());
-    __ LoadFromOffset(kLoadWord, PC, TR,
-        QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pThrowStackOverflow).Int32Value());
-  }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(StackOverflowCheckSlowPathARM);
-};
-
 class SuspendCheckSlowPathARM : public SlowPathCodeARM {
  public:
   SuspendCheckSlowPathARM(HSuspendCheck* instruction, HBasicBlock* successor)
@@ -390,16 +375,19 @@
                                    const ArmInstructionSetFeatures& isa_features,
                                    const CompilerOptions& compiler_options)
     : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters,
-                    kNumberOfRegisterPairs, 0, 0, compiler_options),
+                    kNumberOfRegisterPairs, (1 << R6) | (1 << R7) | (1 << LR), 0, compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
       assembler_(true),
-      isa_features_(isa_features) {}
-
-size_t CodeGeneratorARM::FrameEntrySpillSize() const {
-  return kNumberOfPushedRegistersAtEntry * kArmWordSize;
+      isa_features_(isa_features) {
+  // We unconditionally allocate R6 and R7 to ensure we can do long operations
+  // with baseline.
+  AddAllocatedRegister(Location::RegisterLocation(R6));
+  AddAllocatedRegister(Location::RegisterLocation(R7));
+  // Save the link register to mimic Quick.
+  AddAllocatedRegister(Location::RegisterLocation(LR));
 }
 
 Location CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type) const {
@@ -516,32 +504,21 @@
 void CodeGeneratorARM::GenerateFrameEntry() {
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
+  DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
   if (!skip_overflow_check) {
-    if (GetCompilerOptions().GetImplicitStackOverflowChecks()) {
-      __ AddConstant(IP, SP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm)));
-      __ LoadFromOffset(kLoadWord, IP, IP, 0);
-      RecordPcInfo(nullptr, 0);
-    } else {
-      SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathARM();
-      AddSlowPath(slow_path);
-
-      __ LoadFromOffset(kLoadWord, IP, TR, Thread::StackEndOffset<kArmWordSize>().Int32Value());
-      __ cmp(SP, ShifterOperand(IP));
-      __ b(slow_path->GetEntryLabel(), CC);
-    }
+    __ AddConstant(IP, SP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm)));
+    __ LoadFromOffset(kLoadWord, IP, IP, 0);
+    RecordPcInfo(nullptr, 0);
   }
 
-  core_spill_mask_ |= (1 << LR | 1 << R6 | 1 << R7);
-  __ PushList(1 << LR | 1 << R6 | 1 << R7);
-
-  // The return PC has already been pushed on the stack.
-  __ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize));
+  __ PushList(core_spill_mask_);
+  __ AddConstant(SP, -(GetFrameSize() - FrameEntrySpillSize()));
   __ StoreToOffset(kStoreWord, R0, SP, 0);
 }
 
 void CodeGeneratorARM::GenerateFrameExit() {
-  __ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize);
-  __ PopList(1 << PC | 1 << R6 | 1 << R7);
+  __ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize());
+  __ PopList((core_spill_mask_ & (~(1 << LR))) | 1 << PC);
 }
 
 void CodeGeneratorARM::Bind(HBasicBlock* block) {
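
For context on the new ARM exit sequence (sketch, not part of the patch): the
stack slot that was pushed as LR on entry is popped into PC, so a single
PopList both restores the callee saves and returns. With the baseline masks
set up in the constructor above:

  //   core_spill_mask_                          = (1 << R6) | (1 << R7) | (1 << LR)
  //   (core_spill_mask_ & ~(1 << LR)) | 1 << PC = (1 << R6) | (1 << R7) | (1 << PC)   // pop list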
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index f3b1ff5..46accfd 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -179,8 +179,6 @@
     return kArmWordSize;
   }
 
-  size_t FrameEntrySpillSize() const OVERRIDE;
-
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 21c1e9c..1f561b7 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -434,21 +434,6 @@
   DISALLOW_COPY_AND_ASSIGN(NullCheckSlowPathARM64);
 };
 
-class StackOverflowCheckSlowPathARM64 : public SlowPathCodeARM64 {
- public:
-  StackOverflowCheckSlowPathARM64() {}
-
-  virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
-    __ Bind(GetEntryLabel());
-    arm64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pThrowStackOverflow), nullptr, 0);
-    CheckEntrypointTypes<kQuickThrowStackOverflow, void, void*>();
-  }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(StackOverflowCheckSlowPathARM64);
-};
-
 class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 {
  public:
   explicit SuspendCheckSlowPathARM64(HSuspendCheck* instruction,
@@ -567,13 +552,16 @@
                     kNumberOfAllocatableRegisters,
                     kNumberOfAllocatableFPRegisters,
                     kNumberOfAllocatableRegisterPairs,
-                    0,
+                    (1 << LR),
                     0,
                     compiler_options),
       block_labels_(nullptr),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
-      move_resolver_(graph->GetArena(), this) {}
+      move_resolver_(graph->GetArena(), this) {
+  // Save the link register (containing the return address) to mimic Quick.
+  AddAllocatedRegister(Location::RegisterLocation(LR));
+}
 
 #undef __
 #define __ GetVIXLAssembler()->
@@ -607,26 +595,15 @@
   if (do_overflow_check) {
     UseScratchRegisterScope temps(GetVIXLAssembler());
     Register temp = temps.AcquireX();
-    if (GetCompilerOptions().GetImplicitStackOverflowChecks()) {
-      __ Add(temp, sp, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
-      __ Ldr(wzr, MemOperand(temp, 0));
-      RecordPcInfo(nullptr, 0);
-    } else {
-      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathARM64();
-      AddSlowPath(slow_path);
-
-      __ Ldr(temp, MemOperand(tr, Thread::StackEndOffset<kArm64WordSize>().Int32Value()));
-      __ Cmp(sp, temp);
-      __ B(lo, slow_path->GetEntryLabel());
-    }
+    DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
+    __ Add(temp, sp, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
+    __ Ldr(wzr, MemOperand(temp, 0));
+    RecordPcInfo(nullptr, 0);
   }
 
-  CPURegList preserved_regs = GetFramePreservedRegisters();
   int frame_size = GetFrameSize();
-  core_spill_mask_ |= preserved_regs.list();
-
   __ Str(w0, MemOperand(sp, -frame_size, PreIndex));
-  __ PokeCPURegList(preserved_regs, frame_size - preserved_regs.TotalSizeInBytes());
+  __ PokeCPURegList(GetFramePreservedRegisters(), frame_size - FrameEntrySpillSize());
 
   // Stack layout:
   // sp[frame_size - 8]        : lr.
@@ -638,8 +615,7 @@
 
 void CodeGeneratorARM64::GenerateFrameExit() {
   int frame_size = GetFrameSize();
-  CPURegList preserved_regs = GetFramePreservedRegisters();
-  __ PeekCPURegList(preserved_regs, frame_size - preserved_regs.TotalSizeInBytes());
+  __ PeekCPURegList(GetFramePreservedRegisters(), frame_size - FrameEntrySpillSize());
   __ Drop(frame_size);
 }
 
@@ -690,10 +666,6 @@
   }
 }
 
-size_t CodeGeneratorARM64::FrameEntrySpillSize() const {
-  return GetFramePreservedRegistersSize();
-}
-
 Location CodeGeneratorARM64::GetStackLocation(HLoadLocal* load) const {
   Primitive::Type type = load->GetType();
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index d81e481..96013e5 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -178,9 +178,6 @@
         vixl::CPURegList(vixl::CPURegister::kRegister, vixl::kXRegSize, vixl::lr.Bit());
     return frame_preserved_regs;
   }
-  static int GetFramePreservedRegistersSize() {
-    return GetFramePreservedRegisters().TotalSizeInBytes();
-  }
 
   void Bind(HBasicBlock* block) OVERRIDE;
 
@@ -205,8 +202,6 @@
     return block_entry_label->location();
   }
 
-  size_t FrameEntrySpillSize() const OVERRIDE;
-
   HGraphVisitor* GetLocationBuilder() OVERRIDE { return &location_builder_; }
   HGraphVisitor* GetInstructionVisitor() OVERRIDE { return &instruction_visitor_; }
   Arm64Assembler* GetAssembler() OVERRIDE { return &assembler_; }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 9e26ddd..c0fdcaa 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -31,7 +31,6 @@
 
 namespace x86 {
 
-static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
 static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX, EBX };
@@ -44,6 +43,7 @@
 
 // Marker for places that can be updated once we don't follow the quick ABI.
 static constexpr bool kFollowsQuickABI = true;
+static constexpr int kFakeReturnRegister = Register(8);
 
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, XmmRegister> {
  public:
@@ -123,21 +123,6 @@
   DISALLOW_COPY_AND_ASSIGN(DivRemMinusOneSlowPathX86);
 };
 
-class StackOverflowCheckSlowPathX86 : public SlowPathCodeX86 {
- public:
-  StackOverflowCheckSlowPathX86() {}
-
-  virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    __ Bind(GetEntryLabel());
-    __ addl(ESP,
-            Immediate(codegen->GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
-    __ fs()->jmp(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pThrowStackOverflow)));
-  }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(StackOverflowCheckSlowPathX86);
-};
-
 class BoundsCheckSlowPathX86 : public SlowPathCodeX86 {
  public:
   BoundsCheckSlowPathX86(HBoundsCheck* instruction,
@@ -375,14 +360,13 @@
 
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
     : CodeGenerator(graph, kNumberOfCpuRegisters, kNumberOfXmmRegisters,
-                    kNumberOfRegisterPairs, 0, 0, compiler_options),
+                    kNumberOfRegisterPairs, (1 << kFakeReturnRegister), 0, compiler_options),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
-      move_resolver_(graph->GetArena(), this) {}
-
-size_t CodeGeneratorX86::FrameEntrySpillSize() const {
-  return kNumberOfPushedRegistersAtEntry * kX86WordSize;
+      move_resolver_(graph->GetArena(), this) {
+  // Use a fake return address register to mimic Quick.
+  AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
 Location CodeGeneratorX86::AllocateFreeRegister(Primitive::Type type) const {
@@ -464,35 +448,21 @@
         codegen_(codegen) {}
 
 void CodeGeneratorX86::GenerateFrameEntry() {
-  // Create a fake register to mimic Quick.
-  static const int kFakeReturnRegister = 8;
-  core_spill_mask_ |= (1 << kFakeReturnRegister);
-
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86);
-  bool implicitStackOverflowChecks = GetCompilerOptions().GetImplicitStackOverflowChecks();
+  DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
 
-  if (!skip_overflow_check && implicitStackOverflowChecks) {
+  if (!skip_overflow_check) {
     __ testl(EAX, Address(ESP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86))));
     RecordPcInfo(nullptr, 0);
   }
 
-  // The return PC has already been pushed on the stack.
-  __ subl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
-
-  if (!skip_overflow_check && !implicitStackOverflowChecks) {
-    SlowPathCodeX86* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathX86();
-    AddSlowPath(slow_path);
-
-    __ fs()->cmpl(ESP, Address::Absolute(Thread::StackEndOffset<kX86WordSize>()));
-    __ j(kLess, slow_path->GetEntryLabel());
-  }
-
+  __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
   __ movl(Address(ESP, kCurrentMethodStackOffset), EAX);
 }
 
 void CodeGeneratorX86::GenerateFrameExit() {
-  __ addl(ESP, Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86WordSize));
+  __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
 }
 
 void CodeGeneratorX86::Bind(HBasicBlock* block) {
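
On x86 the fake return-address register exists only to make the spill-mask
bookkeeping line up (sketch, not part of the patch): the call instruction has
already pushed the return PC, so counting one extra "register" in
core_spill_mask_ lets the shared FrameEntrySpillSize() cover that word:

  //   core_spill_mask_      = 1 << kFakeReturnRegister         (the only core callee save declared)
  //   FrameEntrySpillSize() = 1 * kX86WordSize = 4              (the already-pushed return PC)
  //   frame adjustment      = GetFrameSize() - FrameEntrySpillSize()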
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index dcfeb2f..73b647c 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -178,8 +178,6 @@
     return 2 * kX86WordSize;
   }
 
-  size_t FrameEntrySpillSize() const OVERRIDE;
-
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 3d99695..e60f8a5 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -37,7 +37,6 @@
 // Some x86_64 instructions require a register to be available as temp.
 static constexpr Register TMP = R11;
 
-static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
 static constexpr Register kRuntimeParameterCoreRegisters[] = { RDI, RSI, RDX };
@@ -46,7 +45,10 @@
 static constexpr FloatRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
-static constexpr Register kCoreCalleeSaves[] = { RBX, RBP, R12, R13, R14, R15 };
+static constexpr Register kFakeReturnRegister = Register(16);
+static constexpr Register kCoreCalleeSaves[] =
+    { RBX, RBP, R12, R13, R14, R15, kFakeReturnRegister };
+static constexpr FloatRegister kFpuCalleeSaves[] = { XMM12, XMM13, XMM14, XMM15 };
 
 static constexpr int kC2ConditionMask = 0x400;
 
@@ -128,22 +130,6 @@
   DISALLOW_COPY_AND_ASSIGN(DivRemMinusOneSlowPathX86_64);
 };
 
-class StackOverflowCheckSlowPathX86_64 : public SlowPathCodeX86_64 {
- public:
-  StackOverflowCheckSlowPathX86_64() {}
-
-  virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    __ Bind(GetEntryLabel());
-    __ addq(CpuRegister(RSP),
-            Immediate(codegen->GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
-    __ gs()->jmp(
-        Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pThrowStackOverflow), true));
-  }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(StackOverflowCheckSlowPathX86_64);
-};
-
 class SuspendCheckSlowPathX86_64 : public SlowPathCodeX86_64 {
  public:
   explicit SuspendCheckSlowPathX86_64(HSuspendCheck* instruction, HBasicBlock* successor)
@@ -417,26 +403,32 @@
   return kX86_64WordSize;
 }
 
-static uint32_t ComputeCoreCalleeSaveMask() {
+static uint32_t ComputeCalleeSaveMask(const int* registers, size_t length) {
   uint32_t mask = 0;
-  for (size_t i = 0, e = arraysize(kCoreCalleeSaves); i < e; ++i) {
-    mask |= (1 << kCoreCalleeSaves[i]);
+  for (size_t i = 0, e = length; i < e; ++i) {
+    mask |= (1 << registers[i]);
   }
   return mask;
 }
 
+static constexpr int kNumberOfCpuRegisterPairs = 0;
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options)
       : CodeGenerator(graph,
                       kNumberOfCpuRegisters,
                       kNumberOfFloatRegisters,
-                      0,
-                      ComputeCoreCalleeSaveMask(),
-                      0,
+                      kNumberOfCpuRegisterPairs,
+                      ComputeCalleeSaveMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
+                                            arraysize(kCoreCalleeSaves)),
+                      ComputeCalleeSaveMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
+                                            arraysize(kFpuCalleeSaves)),
                       compiler_options),
         block_labels_(graph->GetArena(), 0),
         location_builder_(graph, this),
         instruction_visitor_(graph, this),
-        move_resolver_(graph->GetArena(), this) {}
+        move_resolver_(graph->GetArena(), this) {
+  // Use a fake return address register to mimic Quick.
+  AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
+}
 
 InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph,
                                                                CodeGeneratorX86_64* codegen)
@@ -470,12 +462,6 @@
   return Location();
 }
 
-size_t CodeGeneratorX86_64::FrameEntrySpillSize() const {
-  uint32_t mask = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
-  return kNumberOfPushedRegistersAtEntry * kX86_64WordSize
-      + __builtin_popcount(mask) * kX86_64WordSize;
-}
-
 void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const {
   // Stack register is always reserved.
   blocked_core_registers_[RSP] = true;
@@ -487,57 +473,60 @@
     for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
       blocked_core_registers_[kCoreCalleeSaves[i]] = true;
     }
+    for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) {
+      blocked_fpu_registers_[kFpuCalleeSaves[i]] = true;
+    }
   }
-
-  // TODO: We currently don't use Quick's FP callee saved registers.
-  blocked_fpu_registers_[XMM12] = true;
-  blocked_fpu_registers_[XMM13] = true;
-  blocked_fpu_registers_[XMM14] = true;
-  blocked_fpu_registers_[XMM15] = true;
 }
 
 void CodeGeneratorX86_64::GenerateFrameEntry() {
-  // Create a fake register to mimic Quick.
-  static const int kFakeReturnRegister = 16;
-  core_spill_mask_ |= (1 << kFakeReturnRegister);
-  core_spill_mask_ |= (allocated_registers_.GetCoreRegisters() & core_callee_save_mask_);
-
   bool skip_overflow_check = IsLeafMethod()
       && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64);
-  bool implicitStackOverflowChecks = GetCompilerOptions().GetImplicitStackOverflowChecks();
+  DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
 
-  if (!skip_overflow_check && implicitStackOverflowChecks) {
+  if (!skip_overflow_check) {
     __ testq(CpuRegister(RAX), Address(
         CpuRegister(RSP), -static_cast<int32_t>(GetStackOverflowReservedBytes(kX86_64))));
     RecordPcInfo(nullptr, 0);
   }
 
   for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) {
-    if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
-      __ pushq(CpuRegister(kCoreCalleeSaves[i]));
+    Register reg = kCoreCalleeSaves[i];
+    if (allocated_registers_.ContainsCoreRegister(reg) && reg != kFakeReturnRegister) {
+      __ pushq(CpuRegister(reg));
     }
   }
 
-  __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
+  __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - GetCoreSpillSize()));
+  uint32_t xmm_spill_location = GetFpuSpillStart();
+  size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize();
 
-  if (!skip_overflow_check && !implicitStackOverflowChecks) {
-    SlowPathCodeX86_64* slow_path = new (GetGraph()->GetArena()) StackOverflowCheckSlowPathX86_64();
-    AddSlowPath(slow_path);
-
-    __ gs()->cmpq(CpuRegister(RSP),
-                  Address::Absolute(Thread::StackEndOffset<kX86_64WordSize>(), true));
-    __ j(kLess, slow_path->GetEntryLabel());
+  for (int i = arraysize(kFpuCalleeSaves) - 1; i >= 0; --i) {
+    if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) {
+      __ movsd(Address(CpuRegister(RSP), xmm_spill_location + (xmm_spill_slot_size * i)),
+               XmmRegister(kFpuCalleeSaves[i]));
+    }
   }
 
   __ movl(Address(CpuRegister(RSP), kCurrentMethodStackOffset), CpuRegister(RDI));
 }
 
 void CodeGeneratorX86_64::GenerateFrameExit() {
-  __ addq(CpuRegister(RSP), Immediate(GetFrameSize() - FrameEntrySpillSize()));
+  uint32_t xmm_spill_location = GetFpuSpillStart();
+  size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize();
+  for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) {
+    if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) {
+      __ movsd(XmmRegister(kFpuCalleeSaves[i]),
+               Address(CpuRegister(RSP), xmm_spill_location + (xmm_spill_slot_size * i)));
+    }
+  }
+
+  __ addq(CpuRegister(RSP), Immediate(GetFrameSize() - GetCoreSpillSize()));
 
   for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) {
-    if (allocated_registers_.ContainsCoreRegister(kCoreCalleeSaves[i])) {
-      __ popq(CpuRegister(kCoreCalleeSaves[i]));
+    Register reg = kCoreCalleeSaves[i];
+    if (allocated_registers_.ContainsCoreRegister(reg) && reg != kFakeReturnRegister) {
+      __ popq(CpuRegister(reg));
     }
   }
 }
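
A sketch of the resulting x86-64 frame, highest address first (illustrative,
not from the patch):

  //   [return address]              pushed by call; accounted for via kFakeReturnRegister
  //   [pushed core callee saves]    RBX, RBP, R12-R15, only those actually allocated
  //   [XMM callee-save slots]       movsd at GetFpuSpillStart() + i * spill slot size
  //   [spill slots / out args]
  //   [ArtMethod*]                  RDI stored at RSP + kCurrentMethodStackOffset
  //
  // The entry subq adjusts RSP by GetFrameSize() - GetCoreSpillSize(): the
  // pushes plus the return address already provided GetCoreSpillSize() bytes.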
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 645fb17..1ac2ab7 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -194,8 +194,6 @@
     return kX86_64WordSize;
   }
 
-  size_t FrameEntrySpillSize() const OVERRIDE;
-
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
     return &location_builder_;
   }
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 1e0d65a..5bca730 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -286,7 +286,7 @@
       ArrayRef<const uint8_t>(allocator.GetMemory()),
       codegen->GetFrameSize(),
       codegen->GetCoreSpillMask(),
-      0, /* FPR spill mask, unused */
+      codegen->GetFpuSpillMask(),
       ArrayRef<const uint8_t>(stack_map));
 }
 
@@ -313,7 +313,7 @@
                                                  ArrayRef<const uint8_t>(allocator.GetMemory()),
                                                  codegen->GetFrameSize(),
                                                  codegen->GetCoreSpillMask(),
-                                                 0, /* FPR spill mask, unused */
+                                                 codegen->GetFpuSpillMask(),
                                                  &src_mapping_table,
                                                  AlignVectorSize(mapping_table),
                                                  AlignVectorSize(vmap_table),