Implement register allocator for floating point registers.

Also:
- Fix incorrect emission of the REX prefix in the x86_64 assembler.
- Fix movaps code generation in the x86_64 assembler.

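Slow-path register save/restore now goes through per-kind hooks that return the
number of bytes they used, so core and floating point registers can be packed
into the same spill area. Condensed from the diff below (the declarations plus
the x86_64 floating point spill; the other backends are analogous):

  // code_generator.h: each hook stores/loads one register at `stack_index`
  // and returns the size it takes on the stack.
  virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
  virtual size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
  virtual size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id);
  virtual size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id);

  // code_generator_x86_64.cc: XMM registers are spilled with movsd.
  size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index,
                                                        uint32_t reg_id) {
    __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
    return kX86_64WordSize;
  }
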
Change-Id: Ib6dcf6e7c4a9c43368cfc46b02ba50f69ae69cbe
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 408e13e..d5cd490 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -473,8 +473,7 @@
       case Location::kRegister : {
         int id = location.reg();
         stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInRegister, id);
-        if (current->GetType() == Primitive::kPrimDouble
-            || current->GetType() == Primitive::kPrimLong) {
+        if (current->GetType() == Primitive::kPrimLong) {
           stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInRegister, id);
           ++i;
           DCHECK_LT(i, environment_size);
@@ -482,52 +481,55 @@
         break;
       }
 
+      case Location::kFpuRegister : {
+        int id = location.reg();
+        stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInFpuRegister, id);
+        if (current->GetType() == Primitive::kPrimDouble) {
+          stack_map_stream_.AddDexRegisterEntry(DexRegisterMap::kInFpuRegister, id);
+          ++i;
+          DCHECK_LT(i, environment_size);
+        }
+        break;
+      }
+
       default:
         LOG(FATAL) << "Unexpected kind " << location.GetKind();
     }
   }
 }
 
-size_t CodeGenerator::GetStackOffsetOfSavedRegister(size_t index) {
-  return first_register_slot_in_slow_path_ + index * GetWordSize();
-}
-
 void CodeGenerator::SaveLiveRegisters(LocationSummary* locations) {
   RegisterSet* register_set = locations->GetLiveRegisters();
-  uint32_t count = 0;
+  size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
     if (register_set->ContainsCoreRegister(i)) {
-      size_t stack_offset = GetStackOffsetOfSavedRegister(count);
-      ++count;
-      SaveCoreRegister(Location::StackSlot(stack_offset), i);
       // If the register holds an object, update the stack mask.
       if (locations->RegisterContainsObject(i)) {
         locations->SetStackBit(stack_offset / kVRegSize);
       }
+      stack_offset += SaveCoreRegister(stack_offset, i);
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
     if (register_set->ContainsFloatingPointRegister(i)) {
-      LOG(FATAL) << "Unimplemented";
+      stack_offset += SaveFloatingPointRegister(stack_offset, i);
     }
   }
 }
 
 void CodeGenerator::RestoreLiveRegisters(LocationSummary* locations) {
   RegisterSet* register_set = locations->GetLiveRegisters();
-  uint32_t count = 0;
+  size_t stack_offset = first_register_slot_in_slow_path_;
   for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
     if (register_set->ContainsCoreRegister(i)) {
-      size_t stack_offset = GetStackOffsetOfSavedRegister(count);
-      ++count;
-      RestoreCoreRegister(Location::StackSlot(stack_offset), i);
+      stack_offset += RestoreCoreRegister(stack_offset, i);
     }
   }
 
   for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
     if (register_set->ContainsFloatingPointRegister(i)) {
-      LOG(FATAL) << "Unimplemented";
+      stack_offset += RestoreFloatingPointRegister(stack_offset, i);
     }
   }
 }
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 7aaf991..220d745 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -110,8 +110,18 @@
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
   virtual InstructionSet GetInstructionSet() const = 0;
-  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) = 0;
-  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) = 0;
+  // Saves the register on the stack. Returns the size taken on the stack.
+  virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
+  // Restores the register from the stack. Returns the size taken on the stack.
+  virtual size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) = 0;
+  virtual size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+    LOG(FATAL) << "Unimplemented";
+    return 0u;
+  }
+  virtual size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+    LOG(FATAL) << "Unimplemented";
+    return 0u;
+  }
 
   void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc);
 
@@ -145,6 +155,7 @@
   void ClearSpillSlotsFromLoopPhisInStackMap(HSuspendCheck* suspend_check) const;
 
   bool* GetBlockedCoreRegisters() const { return blocked_core_registers_; }
+  bool* GetBlockedFloatingPointRegisters() const { return blocked_fpu_registers_; }
 
  protected:
   CodeGenerator(HGraph* graph,
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index ebb1d6a..24b7c2d 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -210,12 +210,14 @@
   stream << ArmManagedRegister::FromDRegister(DRegister(reg));
 }
 
-void CodeGeneratorARM::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ StoreToOffset(kStoreWord, static_cast<Register>(reg_id), SP, stack_location.GetStackIndex());
+size_t CodeGeneratorARM::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ StoreToOffset(kStoreWord, static_cast<Register>(reg_id), SP, stack_index);
+  return kArmWordSize;
 }
 
-void CodeGeneratorARM::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ LoadFromOffset(kLoadWord, static_cast<Register>(reg_id), SP, stack_location.GetStackIndex());
+size_t CodeGeneratorARM::RestoreCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ LoadFromOffset(kLoadWord, static_cast<Register>(reg_id), SP, stack_index);
+  return kArmWordSize;
 }
 
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
@@ -859,6 +861,26 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderARM::VisitFloatConstant(HFloatConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorARM::VisitFloatConstant(HFloatConstant* constant) {
+  // Will be generated at use site.
+}
+
+void LocationsBuilderARM::VisitDoubleConstant(HDoubleConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorARM::VisitDoubleConstant(HDoubleConstant* constant) {
+  // Will be generated at use site.
+}
+
 void LocationsBuilderARM::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 57b289c..1fe8a7e 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -142,8 +142,8 @@
   virtual void GenerateFrameExit() OVERRIDE;
   virtual void Bind(HBasicBlock* block) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
-  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
-  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
+  virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  virtual size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
 
   virtual size_t GetWordSize() const OVERRIDE {
     return kArmWordSize;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index cc2be82..2550518 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -182,12 +182,14 @@
   stream << X86ManagedRegister::FromXmmRegister(XmmRegister(reg));
 }
 
-void CodeGeneratorX86::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ movl(Address(ESP, stack_location.GetStackIndex()), static_cast<Register>(reg_id));
+size_t CodeGeneratorX86::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ movl(Address(ESP, stack_index), static_cast<Register>(reg_id));
+  return kX86WordSize;
 }
 
-void CodeGeneratorX86::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ movl(static_cast<Register>(reg_id), Address(ESP, stack_location.GetStackIndex()));
+size_t CodeGeneratorX86::RestoreCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ movl(static_cast<Register>(reg_id), Address(ESP, stack_index));
+  return kX86WordSize;
 }
 
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph)
@@ -795,6 +797,26 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderX86::VisitFloatConstant(HFloatConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorX86::VisitFloatConstant(HFloatConstant* constant) {
+  // Will be generated at use site.
+}
+
+void LocationsBuilderX86::VisitDoubleConstant(HDoubleConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorX86::VisitDoubleConstant(HDoubleConstant* constant) {
+  // Will be generated at use site.
+}
+
 void LocationsBuilderX86::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index db8b9ab..fff91d1 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -144,8 +144,8 @@
   virtual void GenerateFrameExit() OVERRIDE;
   virtual void Bind(HBasicBlock* block) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
-  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
-  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
+  virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  virtual size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
 
   virtual size_t GetWordSize() const OVERRIDE {
     return kX86WordSize;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 9df9d41..9e63f8b 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -191,12 +191,24 @@
   stream << X86_64ManagedRegister::FromXmmRegister(FloatRegister(reg));
 }
 
-void CodeGeneratorX86_64::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ movq(Address(CpuRegister(RSP), stack_location.GetStackIndex()), CpuRegister(reg_id));
+size_t CodeGeneratorX86_64::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ movq(Address(CpuRegister(RSP), stack_index), CpuRegister(reg_id));
+  return kX86_64WordSize;
 }
 
-void CodeGeneratorX86_64::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ movq(CpuRegister(reg_id), Address(CpuRegister(RSP), stack_location.GetStackIndex()));
+size_t CodeGeneratorX86_64::RestoreCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ movq(CpuRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  return kX86_64WordSize;
+}
+
+size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+  __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+  return kX86_64WordSize;
+}
+
+size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+  __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  return kX86_64WordSize;
 }
 
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph)
@@ -727,6 +739,26 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderX86_64::VisitFloatConstant(HFloatConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorX86_64::VisitFloatConstant(HFloatConstant* constant) {
+  // Will be generated at use site.
+}
+
+void LocationsBuilderX86_64::VisitDoubleConstant(HDoubleConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorX86_64::VisitDoubleConstant(HDoubleConstant* constant) {
+  // Will be generated at use site.
+}
+
 void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -995,7 +1027,7 @@
     case Primitive::kPrimDouble:
     case Primitive::kPrimFloat: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::Any());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     }
@@ -1032,21 +1064,12 @@
     }
 
     case Primitive::kPrimFloat: {
-      if (second.IsFpuRegister()) {
-        __ addss(first.As<XmmRegister>(), second.As<XmmRegister>());
-      } else {
-        __ addss(first.As<XmmRegister>(),
-                 Address(CpuRegister(RSP), second.GetStackIndex()));
-      }
+      __ addss(first.As<XmmRegister>(), second.As<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      if (second.IsFpuRegister()) {
-        __ addsd(first.As<XmmRegister>(), second.As<XmmRegister>());
-      } else {
-        __ addsd(first.As<XmmRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
-      }
+      __ addsd(first.As<XmmRegister>(), second.As<XmmRegister>());
       break;
     }
 
@@ -1482,10 +1505,30 @@
       break;
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
-      UNREACHABLE();
+    case Primitive::kPrimFloat: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
+      XmmRegister out = locations->Out().As<XmmRegister>();
+      if (index.IsConstant()) {
+        __ movss(out, Address(obj,
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset));
+      } else {
+        __ movss(out, Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset));
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
+      XmmRegister out = locations->Out().As<XmmRegister>();
+      if (index.IsConstant()) {
+        __ movsd(out, Address(obj,
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset));
+      } else {
+        __ movsd(out, Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset));
+      }
+      break;
+    }
+
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
@@ -1509,6 +1552,8 @@
     locations->SetInAt(2, Location::RequiresRegister());
     if (value_type == Primitive::kPrimLong) {
       locations->SetInAt(2, Location::RequiresRegister());
+    } else if (value_type == Primitive::kPrimFloat || value_type == Primitive::kPrimDouble) {
+      locations->SetInAt(2, Location::RequiresFpuRegister());
     } else {
       locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)));
     }
@@ -1581,6 +1626,7 @@
           __ movl(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
                   value.As<CpuRegister>());
         } else {
+          DCHECK(value.IsConstant()) << value;
           __ movl(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
@@ -1609,10 +1655,34 @@
       break;
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
-      UNREACHABLE();
+    case Primitive::kPrimFloat: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+        DCHECK(value.IsFpuRegister());
+        __ movss(Address(obj, offset), value.As<XmmRegister>());
+      } else {
+        DCHECK(value.IsFpuRegister());
+        __ movss(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
+                value.As<XmmRegister>());
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
+        DCHECK(value.IsFpuRegister());
+        __ movsd(Address(obj, offset), value.As<XmmRegister>());
+      } else {
+        DCHECK(value.IsFpuRegister());
+        __ movsd(Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset),
+                value.As<XmmRegister>());
+      }
+      break;
+    }
+
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
@@ -1746,6 +1816,9 @@
     if (destination.IsRegister()) {
       __ movl(destination.As<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else if (destination.IsFpuRegister()) {
+      __ movss(destination.As<XmmRegister>(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(destination.IsStackSlot());
       __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -1755,6 +1828,8 @@
     if (destination.IsRegister()) {
       __ movq(destination.As<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else if (destination.IsFpuRegister()) {
+      __ movsd(destination.As<XmmRegister>(), Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(destination.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -1767,6 +1842,7 @@
       if (destination.IsRegister()) {
         __ movl(destination.As<CpuRegister>(), imm);
       } else {
+        DCHECK(destination.IsStackSlot()) << destination;
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
       }
     } else if (constant->IsLongConstant()) {
@@ -1774,14 +1850,42 @@
       if (destination.IsRegister()) {
         __ movq(destination.As<CpuRegister>(), Immediate(value));
       } else {
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
         __ movq(CpuRegister(TMP), Immediate(value));
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
+    } else if (constant->IsFloatConstant()) {
+      Immediate imm(bit_cast<float, int32_t>(constant->AsFloatConstant()->GetValue()));
+      if (destination.IsFpuRegister()) {
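+        // There is no instruction to load an immediate directly into an XMM
+        // register, so materialize the bits in the core TMP register first.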
+        __ movl(CpuRegister(TMP), imm);
+        __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+      } else {
+        DCHECK(destination.IsStackSlot()) << destination;
+        __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
+      }
     } else {
-      LOG(FATAL) << "Unimplemented constant type";
+      DCHECK(constant->IsDoubleConstant()) << constant->DebugName();
+      Immediate imm(bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue()));
+      if (destination.IsFpuRegister()) {
+        __ movq(CpuRegister(TMP), imm);
+        __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+      } else {
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
+        __ movq(CpuRegister(TMP), imm);
+        __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+      }
     }
-  } else {
-    LOG(FATAL) << "Unimplemented";
+  } else if (source.IsFpuRegister()) {
+    if (destination.IsFpuRegister()) {
+      __ movaps(destination.As<XmmRegister>(), source.As<XmmRegister>());
+    } else if (destination.IsStackSlot()) {
+      __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
+               source.As<XmmRegister>());
+    } else {
+      DCHECK(destination.IsDoubleStackSlot());
+      __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
+               source.As<XmmRegister>());
+    }
   }
 }
 
@@ -1823,6 +1927,18 @@
           CpuRegister(ensure_scratch.GetRegister()));
 }
 
+void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
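+  // Swaps the 32-bit value in XMM register `reg` with the stack slot at `mem`,
+  // using the core TMP register as scratch.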
+  __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
+  __ movss(Address(CpuRegister(RSP), mem), reg);
+  __ movd(reg, CpuRegister(TMP));
+}
+
+void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
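+  // Swaps the 64-bit value in XMM register `reg` with the stack slot at `mem`,
+  // using the core TMP register as scratch.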
+  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
+  __ movsd(Address(CpuRegister(RSP), mem), reg);
+  __ movd(reg, CpuRegister(TMP));
+}
+
 void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
   MoveOperands* move = moves_.Get(index);
   Location source = move->GetSource();
@@ -1842,8 +1958,20 @@
     Exchange64(destination.As<CpuRegister>(), source.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
     Exchange64(destination.GetStackIndex(), source.GetStackIndex());
+  } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
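+    // Swap the two XMM registers by going through the core TMP register.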
+    __ movd(CpuRegister(TMP), source.As<XmmRegister>());
+    __ movaps(source.As<XmmRegister>(), destination.As<XmmRegister>());
+    __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+  } else if (source.IsFpuRegister() && destination.IsStackSlot()) {
+    Exchange32(source.As<XmmRegister>(), destination.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsFpuRegister()) {
+    Exchange32(destination.As<XmmRegister>(), source.GetStackIndex());
+  } else if (source.IsFpuRegister() && destination.IsDoubleStackSlot()) {
+    Exchange64(source.As<XmmRegister>(), destination.GetStackIndex());
+  } else if (source.IsDoubleStackSlot() && destination.IsFpuRegister()) {
+    Exchange64(destination.As<XmmRegister>(), source.GetStackIndex());
   } else {
-    LOG(FATAL) << "Unimplemented";
+    LOG(FATAL) << "Unimplemented swap between " << source << " and " << destination;
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 5ac0189..e04a8d8 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -80,8 +80,10 @@
 
  private:
   void Exchange32(CpuRegister reg, int mem);
+  void Exchange32(XmmRegister reg, int mem);
   void Exchange32(int mem1, int mem2);
   void Exchange64(CpuRegister reg, int mem);
+  void Exchange64(XmmRegister reg, int mem);
   void Exchange64(int mem1, int mem2);
 
   CodeGeneratorX86_64* const codegen_;
@@ -146,8 +148,10 @@
   virtual void GenerateFrameExit() OVERRIDE;
   virtual void Bind(HBasicBlock* block) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
-  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
-  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
+  virtual size_t SaveCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  virtual size_t RestoreCoreRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  virtual size_t SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
+  virtual size_t RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) OVERRIDE;
 
   virtual size_t GetWordSize() const OVERRIDE {
     return kX86_64WordSize;
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 459010d..4ed2156 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -120,13 +120,11 @@
     output_<< std::endl;
   }
 
-  void DumpLocation(Location location, Primitive::Type type) {
+  void DumpLocation(Location location) {
     if (location.IsRegister()) {
-      if (type == Primitive::kPrimDouble || type == Primitive::kPrimFloat) {
-        codegen_.DumpFloatingPointRegister(output_, location.reg());
-      } else {
-        codegen_.DumpCoreRegister(output_, location.reg());
-      }
+      codegen_.DumpCoreRegister(output_, location.reg());
+    } else if (location.IsFpuRegister()) {
+      codegen_.DumpFloatingPointRegister(output_, location.reg());
     } else if (location.IsConstant()) {
       output_ << "constant";
       HConstant* constant = location.GetConstant();
@@ -150,9 +148,9 @@
     output_ << " (";
     for (size_t i = 0, e = instruction->NumMoves(); i < e; ++i) {
       MoveOperands* move = instruction->MoveOperandsAt(i);
-      DumpLocation(move->GetSource(), Primitive::kPrimInt);
+      DumpLocation(move->GetSource());
       output_ << " -> ";
-      DumpLocation(move->GetDestination(), Primitive::kPrimInt);
+      DumpLocation(move->GetDestination());
       if (i + 1 != e) {
         output_ << ", ";
       }
@@ -183,13 +181,13 @@
       if (locations != nullptr) {
         output_ << " ( ";
         for (size_t i = 0; i < instruction->InputCount(); ++i) {
-          DumpLocation(locations->InAt(i), instruction->InputAt(i)->GetType());
+          DumpLocation(locations->InAt(i));
           output_ << " ";
         }
         output_ << ")";
         if (locations->Out().IsValid()) {
           output_ << " -> ";
-          DumpLocation(locations->Out(), instruction->GetType());
+          DumpLocation(locations->Out());
         }
       }
       output_ << " (liveness: " << instruction->GetLifetimePosition() << ")";
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index a219b97..0505510 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -363,6 +363,25 @@
   Add(&phis_, this, phi);
 }
 
+void HBasicBlock::InsertPhiAfter(HPhi* phi, HPhi* cursor) {
+  DCHECK_EQ(phi->GetId(), -1);
+  DCHECK_NE(cursor->GetId(), -1);
+  DCHECK_EQ(cursor->GetBlock(), this);
+  if (cursor->next_ == nullptr) {
+    cursor->next_ = phi;
+    phi->previous_ = cursor;
+    DCHECK(phi->next_ == nullptr);
+  } else {
+    phi->next_ = cursor->next_;
+    phi->previous_ = cursor;
+    cursor->next_ = phi;
+    phi->next_->previous_ = phi;
+  }
+  phi->SetBlock(this);
+  phi->SetId(GetGraph()->GetNextInstructionId());
+  UpdateInputsUsers(phi);
+}
+
 static void Remove(HInstructionList* instruction_list,
                    HBasicBlock* block,
                    HInstruction* instruction) {
@@ -531,6 +550,12 @@
   env_uses_ = nullptr;
 }
 
+void HInstruction::ReplaceInput(HInstruction* replacement, size_t index) {
+  InputAt(index)->RemoveUser(this, index);
+  SetRawInputAt(index, replacement);
+  replacement->AddUseAt(this, index);
+}
+
 size_t HInstruction::EnvironmentSize() const {
   return HasEnvironment() ? environment_->Size() : 0;
 }
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 3f29e53..7c933aa 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -399,6 +399,7 @@
   void ReplaceAndRemoveInstructionWith(HInstruction* initial,
                                        HInstruction* replacement);
   void AddPhi(HPhi* phi);
+  void InsertPhiAfter(HPhi* instruction, HPhi* cursor);
   void RemovePhi(HPhi* phi);
 
   bool IsLoopHeader() const {
@@ -503,7 +504,9 @@
   M(Temporary, Instruction)                                             \
   M(SuspendCheck, Instruction)                                          \
   M(Mul, BinaryOperation)                                               \
-  M(Neg, UnaryOperation)
+  M(Neg, UnaryOperation)                                                \
+  M(FloatConstant, Constant)                                            \
+  M(DoubleConstant, Constant)                                           \
 
 #define FOR_EACH_INSTRUCTION(M)                                         \
   FOR_EACH_CONCRETE_INSTRUCTION(M)                                      \
@@ -710,6 +713,7 @@
   void SetLocations(LocationSummary* locations) { locations_ = locations; }
 
   void ReplaceWith(HInstruction* instruction);
+  void ReplaceInput(HInstruction* replacement, size_t index);
 
   bool HasOnlyOneUse() const {
     return uses_ != nullptr && uses_->GetTail() == nullptr;
@@ -995,8 +999,8 @@
 
   virtual Primitive::Type GetType() const { return type_; }
 
- private:
-  const Primitive::Type type_;
+ protected:
+  Primitive::Type type_;
 };
 
 // Represents dex's RETURN_VOID opcode. A HReturnVoid is a control flow
@@ -1401,6 +1405,48 @@
   DISALLOW_COPY_AND_ASSIGN(HConstant);
 };
 
+class HFloatConstant : public HConstant {
+ public:
+  explicit HFloatConstant(float value) : HConstant(Primitive::kPrimFloat), value_(value) {}
+
+  float GetValue() const { return value_; }
+
+  virtual bool InstructionDataEquals(HInstruction* other) const {
+    return bit_cast<float, int32_t>(other->AsFloatConstant()->value_) ==
+        bit_cast<float, int32_t>(value_);
+  }
+
+  virtual size_t ComputeHashCode() const { return static_cast<size_t>(GetValue()); }
+
+  DECLARE_INSTRUCTION(FloatConstant);
+
+ private:
+  const float value_;
+
+  DISALLOW_COPY_AND_ASSIGN(HFloatConstant);
+};
+
+class HDoubleConstant : public HConstant {
+ public:
+  explicit HDoubleConstant(double value) : HConstant(Primitive::kPrimDouble), value_(value) {}
+
+  double GetValue() const { return value_; }
+
+  virtual bool InstructionDataEquals(HInstruction* other) const {
+    return bit_cast<double, int64_t>(other->AsDoubleConstant()->value_) ==
+        bit_cast<double, int64_t>(value_);
+  }
+
+  virtual size_t ComputeHashCode() const { return static_cast<size_t>(GetValue()); }
+
+  DECLARE_INSTRUCTION(DoubleConstant);
+
+ private:
+  const double value_;
+
+  DISALLOW_COPY_AND_ASSIGN(HDoubleConstant);
+};
+
 // Constants of the type int. Those can be from Dex instructions, or
 // synthesized (for example with the if-eqz instruction).
 class HIntConstant : public HConstant {
@@ -1794,6 +1840,7 @@
 
   virtual bool CanBeMoved() const { return true; }
   virtual bool InstructionDataEquals(HInstruction* other) const { return true; }
+  void SetType(Primitive::Type type) { type_ = type; }
 
   DECLARE_INSTRUCTION(ArrayGet);
 
@@ -1806,11 +1853,11 @@
   HArraySet(HInstruction* array,
             HInstruction* index,
             HInstruction* value,
-            Primitive::Type component_type,
+            Primitive::Type expected_component_type,
             uint32_t dex_pc)
       : HTemplateInstruction(SideEffects::ChangesSomething()),
         dex_pc_(dex_pc),
-        component_type_(component_type) {
+        expected_component_type_(expected_component_type) {
     SetRawInputAt(0, array);
     SetRawInputAt(1, index);
     SetRawInputAt(2, value);
@@ -1824,13 +1871,24 @@
 
   uint32_t GetDexPc() const { return dex_pc_; }
 
-  Primitive::Type GetComponentType() const { return component_type_; }
+  HInstruction* GetValue() const { return InputAt(2); }
+
+  Primitive::Type GetComponentType() const {
+    // The Dex format does not type floating point index operations. Since the
+    // `expected_component_type_` is set during building and can therefore not
+    // be trusted, we also check the value type: if it is a floating point type,
+    // we must use that type.
+    Primitive::Type value_type = GetValue()->GetType();
+    return ((value_type == Primitive::kPrimFloat) || (value_type == Primitive::kPrimDouble))
+        ? value_type
+        : expected_component_type_;
+  }
 
   DECLARE_INSTRUCTION(ArraySet);
 
  private:
   const uint32_t dex_pc_;
-  const Primitive::Type component_type_;
+  const Primitive::Type expected_component_type_;
 
   DISALLOW_COPY_AND_ASSIGN(HArraySet);
 };
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 719c069..3b51bfb 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -37,18 +37,21 @@
         handled_(allocator, 0),
         active_(allocator, 0),
         inactive_(allocator, 0),
-        physical_register_intervals_(allocator, codegen->GetNumberOfCoreRegisters()),
+        physical_core_register_intervals_(allocator, codegen->GetNumberOfCoreRegisters()),
+        physical_fp_register_intervals_(allocator, codegen->GetNumberOfFloatingPointRegisters()),
         temp_intervals_(allocator, 4),
         spill_slots_(allocator, kDefaultNumberOfSpillSlots),
         safepoints_(allocator, 0),
         processing_core_registers_(false),
         number_of_registers_(-1),
         registers_array_(nullptr),
-        blocked_registers_(codegen->GetBlockedCoreRegisters()),
+        blocked_core_registers_(codegen->GetBlockedCoreRegisters()),
+        blocked_fp_registers_(codegen->GetBlockedFloatingPointRegisters()),
         reserved_out_slots_(0),
         maximum_number_of_live_registers_(0) {
   codegen->SetupBlockedRegisters();
-  physical_register_intervals_.SetSize(codegen->GetNumberOfCoreRegisters());
+  physical_core_register_intervals_.SetSize(codegen->GetNumberOfCoreRegisters());
+  physical_fp_register_intervals_.SetSize(codegen->GetNumberOfFloatingPointRegisters());
   // Always reserve for the current method and the graph's max out registers.
   // TODO: compute it instead.
   reserved_out_slots_ = 1 + codegen->GetGraph()->GetMaximumNumberOfOutVRegs();
@@ -65,8 +68,10 @@
          it.Advance()) {
       HInstruction* current = it.Current();
       if (current->GetType() == Primitive::kPrimLong && instruction_set != kX86_64) return false;
-      if (current->GetType() == Primitive::kPrimFloat) return false;
-      if (current->GetType() == Primitive::kPrimDouble) return false;
+      if ((current->GetType() == Primitive::kPrimFloat || current->GetType() == Primitive::kPrimDouble)
+          && instruction_set != kX86_64) {
+        return false;
+      }
     }
   }
   return true;
@@ -93,14 +98,22 @@
 
 void RegisterAllocator::BlockRegister(Location location,
                                       size_t start,
-                                      size_t end,
-                                      Primitive::Type type) {
+                                      size_t end) {
   int reg = location.reg();
-  LiveInterval* interval = physical_register_intervals_.Get(reg);
+  DCHECK(location.IsRegister() || location.IsFpuRegister());
+  LiveInterval* interval = location.IsRegister()
+      ? physical_core_register_intervals_.Get(reg)
+      : physical_fp_register_intervals_.Get(reg);
+  Primitive::Type type = location.IsRegister()
+      ? Primitive::kPrimInt
+      : Primitive::kPrimDouble;
   if (interval == nullptr) {
     interval = LiveInterval::MakeFixedInterval(allocator_, reg, type);
-    physical_register_intervals_.Put(reg, interval);
-    inactive_.Add(interval);
+    if (location.IsRegister()) {
+      physical_core_register_intervals_.Put(reg, interval);
+    } else {
+      physical_fp_register_intervals_.Put(reg, interval);
+    }
   }
   DCHECK(interval->GetRegister() == reg);
   interval->AddRange(start, end);
@@ -123,8 +136,17 @@
   registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_);
   processing_core_registers_ = true;
   unhandled_ = &unhandled_core_intervals_;
+  for (size_t i = 0, e = physical_core_register_intervals_.Size(); i < e; ++i) {
+    LiveInterval* fixed = physical_core_register_intervals_.Get(i);
+    if (fixed != nullptr) {
+      inactive_.Add(fixed);
+    }
+  }
   LinearScan();
 
+  size_t saved_maximum_number_of_live_registers = maximum_number_of_live_registers_;
+  maximum_number_of_live_registers_ = 0;
+
   inactive_.Reset();
   active_.Reset();
   handled_.Reset();
@@ -133,9 +155,14 @@
   registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_);
   processing_core_registers_ = false;
   unhandled_ = &unhandled_fp_intervals_;
-  // TODO: Enable FP register allocation.
-  DCHECK(unhandled_->IsEmpty());
+  for (size_t i = 0, e = physical_fp_register_intervals_.Size(); i < e; ++i) {
+    LiveInterval* fixed = physical_fp_register_intervals_.Get(i);
+    if (fixed != nullptr) {
+      inactive_.Add(fixed);
+    }
+  }
   LinearScan();
+  maximum_number_of_live_registers_ += saved_maximum_number_of_live_registers;
 }
 
 void RegisterAllocator::ProcessInstruction(HInstruction* instruction) {
@@ -148,8 +175,9 @@
   for (size_t i = 0; i < locations->GetTempCount(); ++i) {
     Location temp = locations->GetTemp(i);
     if (temp.IsRegister()) {
-      BlockRegister(temp, position, position + 1, Primitive::kPrimInt);
+      BlockRegister(temp, position, position + 1);
     } else {
+      DCHECK(temp.IsUnallocated());
       LiveInterval* interval = LiveInterval::MakeTempInterval(allocator_, Primitive::kPrimInt);
       temp_intervals_.Add(interval);
       interval->AddRange(position, position + 1);
@@ -160,10 +188,6 @@
   bool core_register = (instruction->GetType() != Primitive::kPrimDouble)
       && (instruction->GetType() != Primitive::kPrimFloat);
 
-  GrowableArray<LiveInterval*>& unhandled = core_register
-      ? unhandled_core_intervals_
-      : unhandled_fp_intervals_;
-
   if (locations->CanCall()) {
     if (!instruction->IsSuspendCheck()) {
       codegen_->MarkNotLeaf();
@@ -180,7 +204,8 @@
       // maximum before updating locations.
       LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction);
       interval->AddRange(position, position + 1);
-      unhandled.Add(interval);
+      unhandled_core_intervals_.Add(interval);
+      unhandled_fp_intervals_.Add(interval);
     }
   }
 
@@ -189,21 +214,29 @@
     for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) {
       BlockRegister(Location::RegisterLocation(i),
                     position,
-                    position + 1,
-                    Primitive::kPrimInt);
+                    position + 1);
+    }
+    for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) {
+      BlockRegister(Location::FpuRegisterLocation(i),
+                    position,
+                    position + 1);
     }
   }
 
   for (size_t i = 0; i < instruction->InputCount(); ++i) {
     Location input = locations->InAt(i);
-    if (input.IsRegister()) {
-      BlockRegister(input, position, position + 1, instruction->InputAt(i)->GetType());
+    if (input.IsRegister() || input.IsFpuRegister()) {
+      BlockRegister(input, position, position + 1);
     }
   }
 
   LiveInterval* current = instruction->GetLiveInterval();
   if (current == nullptr) return;
 
+  GrowableArray<LiveInterval*>& unhandled = core_register
+      ? unhandled_core_intervals_
+      : unhandled_fp_intervals_;
+
   DCHECK(unhandled.IsEmpty() || current->StartsBeforeOrAt(unhandled.Peek()));
   // Some instructions define their output in fixed register/stack slot. We need
   // to ensure we know these locations before doing register allocation. For a
@@ -213,11 +246,11 @@
   //
   // The backwards walking ensures the ranges are ordered on increasing start positions.
   Location output = locations->Out();
-  if (output.IsRegister()) {
+  if (output.IsRegister() || output.IsFpuRegister()) {
     // Shift the interval's start by one to account for the blocked register.
     current->SetFrom(position + 1);
     current->SetRegister(output.reg());
-    BlockRegister(output, position, position + 1, instruction->GetType());
+    BlockRegister(output, position, position + 1);
   } else if (!locations->OutputOverlapsWithInputs()) {
     // Shift the interval's start by one to not interfere with the inputs.
     current->SetFrom(position + 1);
@@ -281,10 +314,19 @@
     }
   }
 
-  for (size_t i = 0, e = physical_register_intervals_.Size(); i < e; ++i) {
-    LiveInterval* fixed = physical_register_intervals_.Get(i);
-    if (fixed != nullptr && ShouldProcess(processing_core_registers_, fixed)) {
-      intervals.Add(fixed);
+  if (processing_core_registers_) {
+    for (size_t i = 0, e = physical_core_register_intervals_.Size(); i < e; ++i) {
+      LiveInterval* fixed = physical_core_register_intervals_.Get(i);
+      if (fixed != nullptr) {
+        intervals.Add(fixed);
+      }
+    }
+  } else {
+    for (size_t i = 0, e = physical_fp_register_intervals_.Size(); i < e; ++i) {
+      LiveInterval* fixed = physical_fp_register_intervals_.Get(i);
+      if (fixed != nullptr) {
+        intervals.Add(fixed);
+      }
     }
   }
 
@@ -377,10 +419,10 @@
   interval->Dump(stream);
   stream << ": ";
   if (interval->HasRegister()) {
-    if (processing_core_registers_) {
-      codegen_->DumpCoreRegister(stream, interval->GetRegister());
-    } else {
+    if (interval->IsFloatingPoint()) {
       codegen_->DumpFloatingPointRegister(stream, interval->GetRegister());
+    } else {
+      codegen_->DumpCoreRegister(stream, interval->GetRegister());
     }
   } else {
     stream << "spilled";
@@ -522,10 +564,9 @@
 }
 
 bool RegisterAllocator::IsBlocked(int reg) const {
-  // TODO: This only works for core registers and needs to be adjusted for
-  // floating point registers.
-  DCHECK(processing_core_registers_);
-  return blocked_registers_[reg];
+  return processing_core_registers_
+      ? blocked_core_registers_[reg]
+      : blocked_fp_registers_[reg];
 }
 
 // Find the register that is used the last, and spill the interval
@@ -727,7 +768,10 @@
 }
 
 static bool IsValidDestination(Location destination) {
-  return destination.IsRegister() || destination.IsStackSlot() || destination.IsDoubleStackSlot();
+  return destination.IsRegister()
+      || destination.IsFpuRegister()
+      || destination.IsStackSlot()
+      || destination.IsDoubleStackSlot();
 }
 
 void RegisterAllocator::AddInputMoveFor(HInstruction* user,
@@ -877,7 +921,9 @@
   if (current->HasSpillSlot() && current->HasRegister()) {
     // We spill eagerly, so move must be at definition.
     InsertMoveAfter(interval->GetDefinedBy(),
-                    Location::RegisterLocation(interval->GetRegister()),
+                    interval->IsFloatingPoint()
+                        ? Location::FpuRegisterLocation(interval->GetRegister())
+                        : Location::RegisterLocation(interval->GetRegister()),
                     interval->NeedsTwoSpillSlots()
                         ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot())
                         : Location::StackSlot(interval->GetParent()->GetSpillSlot()));
@@ -935,6 +981,10 @@
           }
           break;
         }
+        case Location::kFpuRegister: {
+          locations->AddLiveRegister(source);
+          break;
+        }
         case Location::kStackSlot:  // Fall-through
         case Location::kDoubleStackSlot:  // Fall-through
         case Location::kConstant: {
@@ -1098,6 +1148,7 @@
       current = at;
     }
     LocationSummary* locations = at->GetLocations();
+    DCHECK(temp->GetType() == Primitive::kPrimInt);
     locations->SetTempAt(
         temp_index++, Location::RegisterLocation(temp->GetRegister()));
   }
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index 0c3a9b3..b881539 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -94,7 +94,7 @@
   bool IsBlocked(int reg) const;
 
   // Update the interval for the register in `location` to cover [start, end).
-  void BlockRegister(Location location, size_t start, size_t end, Primitive::Type type);
+  void BlockRegister(Location location, size_t start, size_t end);
 
   // Allocate a spill slot for the given interval.
   void AllocateSpillSlotFor(LiveInterval* interval);
@@ -156,7 +156,8 @@
 
   // Fixed intervals for physical registers. Such intervals cover the positions
   // where an instruction requires a specific register.
-  GrowableArray<LiveInterval*> physical_register_intervals_;
+  GrowableArray<LiveInterval*> physical_core_register_intervals_;
+  GrowableArray<LiveInterval*> physical_fp_register_intervals_;
 
   // Intervals for temporaries. Such intervals cover the positions
   // where an instruction requires a temporary.
@@ -179,7 +180,8 @@
   size_t* registers_array_;
 
   // Blocked registers, as decided by the code generator.
-  bool* const blocked_registers_;
+  bool* const blocked_core_registers_;
+  bool* const blocked_fp_registers_;
 
   // Slots reserved for out arguments.
   size_t reserved_out_slots_;
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index be2c039..a0cc8a9 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -129,8 +129,112 @@
   }
 }
 
+/**
+ * Constants in the Dex format are not typed. So the builder types them as
+ * integers, but when building the SSA form, we might realize the constant
+ * is used for floating point operations. We create a floating-point equivalent
+ * constant to make the operations correctly typed.
+ */
+static HFloatConstant* GetFloatEquivalent(HIntConstant* constant) {
+  // We place the floating point constant next to this constant.
+  HFloatConstant* result = constant->GetNext()->AsFloatConstant();
+  if (result == nullptr) {
+    HGraph* graph = constant->GetBlock()->GetGraph();
+    ArenaAllocator* allocator = graph->GetArena();
+    result = new (allocator) HFloatConstant(bit_cast<int32_t, float>(constant->GetValue()));
+    constant->GetBlock()->InsertInstructionBefore(result, constant->GetNext());
+  } else {
+    // If there is already a constant with the expected type, we know it is
+    // the floating point equivalent of this constant.
+    DCHECK_EQ((bit_cast<float, int32_t>(result->GetValue())), constant->GetValue());
+  }
+  return result;
+}
+
+/**
+ * Wide constants in the Dex format are not typed. So the builder types them as
+ * longs, but when building the SSA form, we might realize the constant
+ * is used for floating point operations. We create a floating-point equivalent
+ * constant to make the operations correctly typed.
+ */
+static HDoubleConstant* GetDoubleEquivalent(HLongConstant* constant) {
+  // We place the floating point constant next to this constant.
+  HDoubleConstant* result = constant->GetNext()->AsDoubleConstant();
+  if (result == nullptr) {
+    HGraph* graph = constant->GetBlock()->GetGraph();
+    ArenaAllocator* allocator = graph->GetArena();
+    result = new (allocator) HDoubleConstant(bit_cast<int64_t, double>(constant->GetValue()));
+    constant->GetBlock()->InsertInstructionBefore(result, constant->GetNext());
+  } else {
+    // If there is already a constant with the expected type, we know it is
+    // the floating point equivalent of this constant.
+    DCHECK_EQ((bit_cast<double, int64_t>(result->GetValue())), constant->GetValue());
+  }
+  return result;
+}
+
+/**
+ * Because of the Dex format, the same phi may end up being used for both
+ * floating point and non floating point operations. Because
+ * we want the graph to be correctly typed (and thereafter avoid moves between
+ * floating point registers and core registers), we need to create a copy of the
+ * phi with a floating point type.
+ */
+static HPhi* GetFloatOrDoubleEquivalentOfPhi(HPhi* phi, Primitive::Type type) {
+  // We place the floating point phi next to this phi.
+  HInstruction* next = phi->GetNext();
+  if (next == nullptr
+      || (next->GetType() != Primitive::kPrimDouble && next->GetType() != Primitive::kPrimFloat)) {
+    ArenaAllocator* allocator = phi->GetBlock()->GetGraph()->GetArena();
+    HPhi* new_phi = new (allocator) HPhi(allocator, phi->GetRegNumber(), phi->InputCount(), type);
+    for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
+      // Copy the inputs. Note that the graph may not be correctly typed by doing this copy,
+      // but the type propagation phase will fix it.
+      new_phi->SetRawInputAt(i, phi->InputAt(i));
+    }
+    phi->GetBlock()->InsertPhiAfter(new_phi, phi);
+    return new_phi;
+  } else {
+    // If there is already a phi with the expected type, we know it is the floating
+    // point equivalent of this phi.
+    DCHECK_EQ(next->AsPhi()->GetRegNumber(), phi->GetRegNumber());
+    return next->AsPhi();
+  }
+}
+
+HInstruction* SsaBuilder::GetFloatOrDoubleEquivalent(HInstruction* user,
+                                                     HInstruction* value,
+                                                     Primitive::Type type) {
+  if (value->IsArrayGet()) {
+    // The verifier has checked that values in arrays cannot be used for both
+    // floating point and non-floating point operations. It is therefore safe to just
+    // change the type of the operation.
+    value->AsArrayGet()->SetType(type);
+    return value;
+  } else if (value->IsLongConstant()) {
+    return GetDoubleEquivalent(value->AsLongConstant());
+  } else if (value->IsIntConstant()) {
+    return GetFloatEquivalent(value->AsIntConstant());
+  } else if (value->IsPhi()) {
+    return GetFloatOrDoubleEquivalentOfPhi(value->AsPhi(), type);
+  } else {
+    // For other instructions, we assume the verifier has checked that the dex format is correctly
+    // typed and the value in a dex register will not be used for both floating point and
+    // non-floating point operations. So the only reason an instruction would want a floating
+    // point equivalent is for an unused phi that will be removed by the dead phi elimination phase.
+    DCHECK(user->IsPhi());
+    return value;
+  }
+}
+
 void SsaBuilder::VisitLoadLocal(HLoadLocal* load) {
-  load->ReplaceWith(current_locals_->Get(load->GetLocal()->GetRegNumber()));
+  HInstruction* value = current_locals_->Get(load->GetLocal()->GetRegNumber());
+  if (load->GetType() != value->GetType()
+      && (load->GetType() == Primitive::kPrimFloat || load->GetType() == Primitive::kPrimDouble)) {
+    // If the operation requests a specific type, we make sure its input is of that type.
+    value = GetFloatOrDoubleEquivalent(load, value, load->GetType());
+  }
+  load->ReplaceWith(value);
   load->GetBlock()->RemoveInstruction(load);
 }
 
diff --git a/compiler/optimizing/ssa_builder.h b/compiler/optimizing/ssa_builder.h
index 9d8c072..24f5ac5 100644
--- a/compiler/optimizing/ssa_builder.h
+++ b/compiler/optimizing/ssa_builder.h
@@ -52,6 +52,10 @@
   void VisitStoreLocal(HStoreLocal* store);
   void VisitInstruction(HInstruction* instruction);
 
+  static HInstruction* GetFloatOrDoubleEquivalent(HInstruction* user,
+                                                  HInstruction* instruction,
+                                                  Primitive::Type type);
+
  private:
   // Locals for the current block being visited.
   GrowableArray<HInstruction*>* current_locals_;
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index f0edc64..1e34670 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -319,7 +319,7 @@
       if (user->IsPhi()) {
         // If the phi has a register, try to use the same.
         Location phi_location = user->GetLiveInterval()->ToLocation();
-        if (phi_location.IsRegister() && free_until[phi_location.reg()] >= use_position) {
+        if (SameRegisterKind(phi_location) && free_until[phi_location.reg()] >= use_position) {
           return phi_location.reg();
         }
         const GrowableArray<HBasicBlock*>& predecessors = user->GetBlock()->GetPredecessors();
@@ -345,7 +345,7 @@
         // We use the user's lifetime position - 1 (and not `use_position`) because the
         // register is blocked at the beginning of the user.
         size_t position = user->GetLifetimePosition() - 1;
-        if (expected.IsRegister() && free_until[expected.reg()] >= position) {
+        if (SameRegisterKind(expected) && free_until[expected.reg()] >= position) {
           return expected.reg();
         }
       }
@@ -368,7 +368,7 @@
         // If the input dies at the end of the predecessor, we know its register can
         // be reused.
         Location input_location = input_interval.ToLocation();
-        if (input_location.IsRegister()) {
+        if (SameRegisterKind(input_location)) {
           return input_location.reg();
         }
       }
@@ -384,7 +384,7 @@
         // If the input dies at the start of this instruction, we know its register can
         // be reused.
         Location location = input_interval.ToLocation();
-        if (location.IsRegister()) {
+        if (SameRegisterKind(location)) {
           return location.reg();
         }
       }
@@ -393,13 +393,21 @@
   return kNoRegister;
 }
 
+bool LiveInterval::SameRegisterKind(Location other) const {
+  return IsFloatingPoint()
+      ? other.IsFpuRegister()
+      : other.IsRegister();
+}
+
 bool LiveInterval::NeedsTwoSpillSlots() const {
   return type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble;
 }
 
 Location LiveInterval::ToLocation() const {
   if (HasRegister()) {
-    return Location::RegisterLocation(GetRegister());
+    return IsFloatingPoint()
+        ? Location::FpuRegisterLocation(GetRegister())
+        : Location::RegisterLocation(GetRegister());
   } else {
     HInstruction* defined_by = GetParent()->GetDefinedBy();
     if (defined_by->IsConstant()) {
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index d3e1c0e..8ce5ce9 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -358,6 +358,10 @@
              || (location.GetPolicy() == Location::kSameAsFirstInput
                  && locations->InAt(0).GetPolicy() == Location::kRequiresRegister)) {
           return position;
+        } else if ((location.GetPolicy() == Location::kRequiresFpuRegister)
+                   || (location.GetPolicy() == Location::kSameAsFirstInput
+                       && locations->InAt(0).GetPolicy() == Location::kRequiresFpuRegister)) {
+          return position;
         }
       }
     }
@@ -368,7 +372,9 @@
       size_t use_position = use->GetPosition();
       if (use_position >= position && !use->GetIsEnvironment()) {
         Location location = use->GetUser()->GetLocations()->InAt(use->GetInputIndex());
-        if (location.IsUnallocated() && location.GetPolicy() == Location::kRequiresRegister) {
+        if (location.IsUnallocated()
+            && (location.GetPolicy() == Location::kRequiresRegister
+                || location.GetPolicy() == Location::kRequiresFpuRegister)) {
           // Return the lifetime just before the user, so that the interval has a register
           // when entering the user.
           return use->GetUser()->GetLifetimePosition() - 1;
@@ -502,6 +508,10 @@
   // slots for spilling.
   bool NeedsTwoSpillSlots() const;
 
+  bool IsFloatingPoint() const {
+    return type_ == Primitive::kPrimFloat || type_ == Primitive::kPrimDouble;
+  }
+
   // Converts the location of the interval to a `Location` object.
   Location ToLocation() const;
 
@@ -513,6 +523,9 @@
 
   bool IsTemp() const { return is_temp_; }
 
+  // Returns whether `other` and `this` share the same kind of register.
+  bool SameRegisterKind(Location other) const;
+
  private:
   ArenaAllocator* const allocator_;
 
diff --git a/compiler/optimizing/ssa_phi_elimination.cc b/compiler/optimizing/ssa_phi_elimination.cc
index e02a182..4eda0f3 100644
--- a/compiler/optimizing/ssa_phi_elimination.cc
+++ b/compiler/optimizing/ssa_phi_elimination.cc
@@ -24,18 +24,13 @@
     HBasicBlock* block = it.Current();
     for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
       HPhi* phi = it.Current()->AsPhi();
-      if (phi->HasEnvironmentUses()) {
-        // TODO: Do we want to keep that phi alive?
-        worklist_.Add(phi);
-        phi->SetLive();
-        continue;
-      }
       for (HUseIterator<HInstruction> it(phi->GetUses()); !it.Done(); it.Advance()) {
         HUseListNode<HInstruction>* current = it.Current();
         HInstruction* user = current->GetUser();
         if (!user->IsPhi()) {
           worklist_.Add(phi);
           phi->SetLive();
+          break;
         } else {
           phi->SetDead();
         }
@@ -76,6 +71,14 @@
             current->RemoveUser(user, user_node->GetIndex());
           }
         }
+        if (current->HasEnvironmentUses()) {
+          for (HUseIterator<HEnvironment> it(current->GetEnvUses()); !it.Done(); it.Advance()) {
+            HUseListNode<HEnvironment>* user_node = it.Current();
+            HEnvironment* user = user_node->GetUser();
+            user->SetRawEnvAt(user_node->GetIndex(), nullptr);
+            current->RemoveEnvironmentUser(user, user_node->GetIndex());
+          }
+        }
         block->RemovePhi(current->AsPhi());
       }
       current = next;
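One consequence of the two hunks above: a phi whose only consumers are environments is no longer kept artificially alive. Instead, when a dead phi is deleted, every environment slot that still points at it is nulled out first, so deoptimization and debug info never reference a removed instruction. A rough sketch of that clean-up step, built on hypothetical simplified stand-ins for HInstruction and HEnvironment rather than the real classes:

    #include <cstddef>
    #include <vector>

    struct Instruction;

    // Hypothetical environment: a flat array of vreg slots, as in deopt/debug info.
    struct Environment {
      std::vector<Instruction*> vregs;
      void SetRawEnvAt(size_t index, Instruction* value) { vregs[index] = value; }
    };

    struct EnvUse {
      Environment* user;
      size_t index;
    };

    struct Instruction {
      std::vector<EnvUse> env_uses;
    };

    // Null out every environment slot referring to the dead phi before it is
    // removed from its block, so no stale pointer survives.
    void ClearEnvironmentUsesOf(Instruction* dead_phi) {
      for (const EnvUse& use : dead_phi->env_uses) {
        use.user->SetRawEnvAt(use.index, nullptr);
      }
      dead_phi->env_uses.clear();
    }
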
diff --git a/compiler/optimizing/ssa_type_propagation.cc b/compiler/optimizing/ssa_type_propagation.cc
index a860cb7..3828142 100644
--- a/compiler/optimizing/ssa_type_propagation.cc
+++ b/compiler/optimizing/ssa_type_propagation.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "ssa_builder.h"
 #include "ssa_type_propagation.h"
 
 #include "nodes.h"
@@ -38,15 +39,31 @@
 
 // Re-compute and update the type of the instruction. Returns
 // whether or not the type was changed.
-static bool UpdateType(HPhi* phi) {
+bool SsaTypePropagation::UpdateType(HPhi* phi) {
   Primitive::Type existing = phi->GetType();
 
-  Primitive::Type new_type = Primitive::kPrimVoid;
+  Primitive::Type new_type = existing;
   for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
     Primitive::Type input_type = phi->InputAt(i)->GetType();
     new_type = MergeTypes(new_type, input_type);
   }
   phi->SetType(new_type);
+
+  if (new_type == Primitive::kPrimDouble || new_type == Primitive::kPrimFloat) {
+    // If the phi is of floating point type, we need to update its inputs to that
+    // type. For inputs that are phis, we need to recompute their types.
+    for (size_t i = 0, e = phi->InputCount(); i < e; ++i) {
+      HInstruction* input = phi->InputAt(i);
+      if (input->GetType() != new_type) {
+        HInstruction* equivalent = SsaBuilder::GetFloatOrDoubleEquivalent(phi, input, new_type);
+        phi->ReplaceInput(equivalent, i);
+        if (equivalent->IsPhi()) {
+          AddToWorklist(equivalent->AsPhi());
+        }
+      }
+    }
+  }
+
   return existing != new_type;
 }
 
@@ -63,7 +80,12 @@
       HPhi* phi = it.Current()->AsPhi();
       // Set the initial type for the phi. Use the non back edge input for reaching
       // a fixed point faster.
-      phi->SetType(phi->InputAt(0)->GetType());
+      Primitive::Type phi_type = phi->GetType();
+      // We merge with the existing type, which has been set by the SSA builder.
+      DCHECK(phi_type == Primitive::kPrimVoid
+          || phi_type == Primitive::kPrimFloat
+          || phi_type == Primitive::kPrimDouble);
+      phi->SetType(MergeTypes(phi->InputAt(0)->GetType(), phi->GetType()));
       AddToWorklist(phi);
     }
   } else {
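For context on the retyping loop above: dex registers are untyped, so the SSA builder can initially give a phi, or one of its inputs (an integer zero constant flowing into a float merge, say), an int-like type. Once the merge settles on float or double, each mismatched input is replaced by its floating-point equivalent, and inputs that are themselves phis go back on the worklist so their own types get recomputed. A simplified sketch of that step, with stand-in types and a hypothetical FloatingPointEquivalent in place of SsaBuilder::GetFloatOrDoubleEquivalent:

    #include <deque>
    #include <vector>

    enum class Type { kVoid, kInt, kFloat, kDouble };

    struct Node {
      Type type = Type::kVoid;
      bool is_phi = false;
      std::vector<Node*> inputs;
    };

    // Hypothetical helper standing in for SsaBuilder::GetFloatOrDoubleEquivalent:
    // here it just retypes a copy; the real code reuses or creates an equivalent
    // instruction of the wanted floating-point type.
    Node* FloatingPointEquivalent(Node* input, Type wanted) {
      Node* twin = new Node(*input);
      twin->type = wanted;
      return twin;
    }

    void RetypeInputs(Node* phi, Type new_type, std::deque<Node*>* worklist) {
      if (new_type != Type::kFloat && new_type != Type::kDouble) return;
      for (Node*& input : phi->inputs) {
        if (input->type != new_type) {
          input = FloatingPointEquivalent(input, new_type);
          if (input->is_phi) worklist->push_back(input);  // Recompute its type too.
        }
      }
    }
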
diff --git a/compiler/optimizing/ssa_type_propagation.h b/compiler/optimizing/ssa_type_propagation.h
index 5f471a9..f4d3d63 100644
--- a/compiler/optimizing/ssa_type_propagation.h
+++ b/compiler/optimizing/ssa_type_propagation.h
@@ -34,6 +34,7 @@
   void ProcessWorklist();
   void AddToWorklist(HPhi* phi);
   void AddDependentInstructionsToWorklist(HPhi* phi);
+  bool UpdateType(HPhi* phi);
 
   HGraph* const graph_;
   GrowableArray<HPhi*> worklist_;
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index 3742913..5bfa462 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -373,7 +373,7 @@
         }
       } else {
         // This will output the assembly.
-        EXPECT_EQ(*data, *res.code) << "Outputs (and disassembly) not identical.";
+        EXPECT_EQ(*res.code, *data) << "Outputs (and disassembly) not identical.";
       }
     }
   }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index db7151c..f4c9862 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -317,7 +317,7 @@
   EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x28);
-  EmitXmmRegisterOperand(src.LowBits(), dst);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
@@ -354,7 +354,7 @@
 void X86_64Assembler::movd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
-  EmitOptionalRex32(dst, src);
+  EmitRex64(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x6E);
   EmitOperand(dst.LowBits(), Operand(src));
@@ -364,7 +364,7 @@
 void X86_64Assembler::movd(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
-  EmitOptionalRex32(src, dst);
+  EmitRex64(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x7E);
   EmitOperand(src.LowBits(), Operand(dst));
@@ -1748,6 +1748,10 @@
   EmitOptionalRex(false, true, dst.NeedsRex(), false, src.NeedsRex());
 }
 
+void X86_64Assembler::EmitRex64(XmmRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, true, dst.NeedsRex(), false, src.NeedsRex());
+}
+
 void X86_64Assembler::EmitRex64(CpuRegister dst, const Operand& operand) {
   uint8_t rex = 0x48 | operand.rex();  // REX.W000
   if (dst.NeedsRex()) {
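On the movd and movaps fixes in this file: moving a 64-bit general-purpose register to or from an XMM register needs a REX.W prefix; with only the optional REX (no W bit) the 32-bit movd form gets encoded and the upper half of a long or double value is dropped. The movaps fix simply passes destination and source to EmitXmmRegisterOperand in the right order. As a hedged reference, derived from the Intel SDM forms 66 REX.W 0F 6E /r (GPR to XMM) and 66 REX.W 0F 7E /r (XMM to GPR) and worth re-checking with a disassembler, the expected encodings for a few of the new test cases are:

    // Sketch of expected byte sequences after the fix (verify with objdump):
    //   movd %rax, %xmm0  ->  66 48 0F 6E C0   (REX.W only)
    //   movd %r11, %xmm8  ->  66 4D 0F 6E C3   (REX.W + REX.R for xmm8 + REX.B for r11)
    //   movd %xmm0, %rax  ->  66 48 0F 7E C0   (store form)
    //   movd %xmm8, %r11  ->  66 4D 0F 7E C3
    const unsigned char kMovdRaxToXmm0[] = {0x66, 0x48, 0x0F, 0x6E, 0xC0};
    const unsigned char kMovdXmm8ToR11[] = {0x66, 0x4D, 0x0F, 0x7E, 0xC3};
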
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 4ffb6b5..7e5859c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -666,6 +666,7 @@
   void EmitRex64(CpuRegister reg);
   void EmitRex64(CpuRegister dst, CpuRegister src);
   void EmitRex64(CpuRegister dst, const Operand& operand);
+  void EmitRex64(XmmRegister dst, CpuRegister src);
 
   // Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
   void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
@@ -692,7 +693,7 @@
 inline void X86_64Assembler::EmitRegisterOperand(uint8_t rm, uint8_t reg) {
   CHECK_GE(rm, 0);
   CHECK_LT(rm, 8);
-  buffer_.Emit<uint8_t>(0xC0 + (rm << 3) + reg);
+  buffer_.Emit<uint8_t>((0xC0 | (reg & 7)) + (rm << 3));
 }
 
 inline void X86_64Assembler::EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg) {
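On the EmitRegisterOperand change just above: a register/register ModRM byte packs the mod field into bits 7..6, one register number into bits 5..3 and the other into bits 2..0, and registers 8-15 contribute only their low three bits here because the fourth bit travels in the REX prefix. The old expression left the second register unmasked, so a value of 8 or more could spill into the neighbouring field. A minimal standalone sketch of the corrected packing (conventional reg/rm naming, not the assembler's own parameter order):

    #include <cassert>
    #include <cstdint>

    // ModRM for register-direct operands: mod=11, then reg bits 5..3, r/m bits 2..0.
    // Only the low three bits of each register number belong here; the high bit
    // of R8-R15 / XMM8-XMM15 is carried by REX.R / REX.B instead.
    uint8_t PackModRM(uint8_t reg_field, uint8_t rm_field) {
      return static_cast<uint8_t>(0xC0 | ((reg_field & 7) << 3) | (rm_field & 7));
    }

    int main() {
      // xmm0 in the reg field, r11 (low bits 011) in the r/m field -> 0xC3.
      assert(PackModRM(0, 11) == 0xC3);
      return 0;
    }
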
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 69a5fa0..37a0932 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -134,6 +134,32 @@
   DriverStr(RepeatRI(&x86_64::X86_64Assembler::xorq, 4U, "xorq ${imm}, %{reg}"), "xorqi");
 }
 
+TEST_F(AssemblerX86_64Test, Movaps) {
+  GetAssembler()->movaps(x86_64::XmmRegister(x86_64::XMM0), x86_64::XmmRegister(x86_64::XMM8));
+  DriverStr("movaps %xmm8, %xmm0", "movaps");
+}
+
+TEST_F(AssemblerX86_64Test, Movd) {
+  GetAssembler()->movd(x86_64::XmmRegister(x86_64::XMM0), x86_64::CpuRegister(x86_64::R11));
+  GetAssembler()->movd(x86_64::XmmRegister(x86_64::XMM0), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movd(x86_64::XmmRegister(x86_64::XMM8), x86_64::CpuRegister(x86_64::R11));
+  GetAssembler()->movd(x86_64::XmmRegister(x86_64::XMM8), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movd(x86_64::CpuRegister(x86_64::R11), x86_64::XmmRegister(x86_64::XMM0));
+  GetAssembler()->movd(x86_64::CpuRegister(x86_64::RAX), x86_64::XmmRegister(x86_64::XMM0));
+  GetAssembler()->movd(x86_64::CpuRegister(x86_64::R11), x86_64::XmmRegister(x86_64::XMM8));
+  GetAssembler()->movd(x86_64::CpuRegister(x86_64::RAX), x86_64::XmmRegister(x86_64::XMM8));
+  const char* expected =
+    "movd %r11, %xmm0\n"
+    "movd %rax, %xmm0\n"
+    "movd %r11, %xmm8\n"
+    "movd %rax, %xmm8\n"
+    "movd %xmm0, %r11\n"
+    "movd %xmm0, %rax\n"
+    "movd %xmm8, %r11\n"
+    "movd %xmm8, %rax\n";
+  DriverStr(expected, "movd");
+}
+
 TEST_F(AssemblerX86_64Test, Movl) {
   GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::CpuRegister(x86_64::R11));
   GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::CpuRegister(x86_64::R11));
diff --git a/runtime/check_reference_map_visitor.h b/runtime/check_reference_map_visitor.h
index 1a78d72..8c2293f 100644
--- a/runtime/check_reference_map_visitor.h
+++ b/runtime/check_reference_map_visitor.h
@@ -84,8 +84,12 @@
         case DexRegisterMap::kInRegister:
           CHECK_NE(register_mask & dex_register_map.GetValue(reg), 0u);
           break;
+        case DexRegisterMap::kInFpuRegister:
+          // A reference should never live in an FPU register.
+          CHECK(false);
+          break;
         case DexRegisterMap::kConstant:
-          CHECK_EQ(dex_register_map.GetValue(0), 0);
+          CHECK_EQ(dex_register_map.GetValue(reg), 0);
           break;
       }
     }
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 9b49d31..b1c46a9 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -88,6 +88,7 @@
     kNone,
     kInStack,
     kInRegister,
+    kInFpuRegister,
     kConstant
   };
 
diff --git a/test/410-floats/src/Main.java b/test/410-floats/src/Main.java
index d8d6fac..2300457 100644
--- a/test/410-floats/src/Main.java
+++ b/test/410-floats/src/Main.java
@@ -17,9 +17,10 @@
 public class Main {
   public static void main(String[] args) {
     assertEquals(4.2f, returnFloat());
-    float[] a = new float[1];
+    float[] a = new float[2];
     a[0] = 42.2f;
-    assertEquals(42.2f, returnFloat(a));
+    a[1] = 3.2f;
+    assertEquals(45.4f, returnFloat(a));
 
     assertEquals(4.4, returnDouble());
     double[] b = new double[1];
@@ -36,6 +37,9 @@
     assertEquals(3.1, invokeTakeADouble(3.1));
     assertEquals(12.7, invokeTakeThreeDouble(3.1, 4.4, 5.2));
     assertEquals(12.7f, invokeTakeThreeFloat(3.1f, 4.4f, 5.2f));
+
+    testArrayOperations(new float[2], 0, 1.2f, 3.4f);
+    testArrayOperations(new double[2], 0, 4.1, 7.6);
   }
 
   public static float invokeReturnFloat() {
@@ -51,7 +55,7 @@
   }
 
   public static float returnFloat(float[] a) {
-    return a[0];
+    return a[0] + a[1];
   }
 
   public static double returnDouble() {
@@ -94,6 +98,34 @@
     return takeThreeFloat(a, b, c);
   }
 
+  // Test simple operations on a float array to ensure the register allocator works
+  // properly.
+  public static void testArrayOperations(float[] a, int index, float value1, float value2) {
+    a[0] = value1;
+    a[1] = value2;
+    assertEquals(value1 + value2, a[0] + a[1]);
+    a[0] = 0.0f;
+    a[1] = 0.0f;
+    assertEquals(0.0f, a[0] + a[1]);
+    a[index] = value1;
+    a[index + 1] = value2;
+    assertEquals(value1 + value2, a[0] + a[1]);
+  }
+
+  // Test simple operations on a double array to ensure the register allocator works
+  // properly.
+  public static void testArrayOperations(double[] a, int index, double value1, double value2) {
+    a[0] = value1;
+    a[1] = value2;
+    assertEquals(value1 + value2, a[0] + a[1]);
+    a[0] = 0.0;
+    a[1] = 0.0;
+    assertEquals(0.0, a[0] + a[1]);
+    a[index] = value1;
+    a[index + 1] = value2;
+    assertEquals(value1 + value2, a[0] + a[1]);
+  }
+
   public static void assertEquals(float expected, float actual) {
     if (expected != actual) {
       throw new AssertionError("Expected " + expected + " got " + actual);