Implement register allocator for floating-point registers.

Also:
- Fix incorrect emission of the REX prefix in the x86_64 assembler.
- Fix movaps code generation in the x86_64 assembler.

Change-Id: Ib6dcf6e7c4a9c43368cfc46b02ba50f69ae69cbe
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 9df9d41..9e63f8b 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -191,12 +191,24 @@
   stream << X86_64ManagedRegister::FromXmmRegister(FloatRegister(reg));
 }
 
-void CodeGeneratorX86_64::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ movq(Address(CpuRegister(RSP), stack_location.GetStackIndex()), CpuRegister(reg_id));
+size_t CodeGeneratorX86_64::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ movq(Address(CpuRegister(RSP), stack_index), CpuRegister(reg_id));
+  return kX86_64WordSize;
 }
 
-void CodeGeneratorX86_64::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
-  __ movq(CpuRegister(reg_id), Address(CpuRegister(RSP), stack_location.GetStackIndex()));
+size_t CodeGeneratorX86_64::RestoreCoreRegister(size_t stack_index, uint32_t reg_id) {
+  __ movq(CpuRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  return kX86_64WordSize;
+}
+
+size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+  __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+  return kX86_64WordSize;
+}
+
+size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
+  __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  return kX86_64WordSize;
 }
 
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph)
@@ -727,6 +739,26 @@
   // Will be generated at use site.
 }
 
+void LocationsBuilderX86_64::VisitFloatConstant(HFloatConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorX86_64::VisitFloatConstant(HFloatConstant* constant) {
+  // Will be generated at use site.
+}
+
+void LocationsBuilderX86_64::VisitDoubleConstant(HDoubleConstant* constant) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(constant, LocationSummary::kNoCall);
+  locations->SetOut(Location::ConstantLocation(constant));
+}
+
+void InstructionCodeGeneratorX86_64::VisitDoubleConstant(HDoubleConstant* constant) {
+  // Will be generated at use site.
+}
+
 void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -995,7 +1027,7 @@
     case Primitive::kPrimDouble:
     case Primitive::kPrimFloat: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::Any());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     }
@@ -1032,21 +1064,12 @@
     }
 
     case Primitive::kPrimFloat: {
-      if (second.IsFpuRegister()) {
-        __ addss(first.As<XmmRegister>(), second.As<XmmRegister>());
-      } else {
-        __ addss(first.As<XmmRegister>(),
-                 Address(CpuRegister(RSP), second.GetStackIndex()));
-      }
+      __ addss(first.As<XmmRegister>(), second.As<XmmRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      if (second.IsFpuRegister()) {
-        __ addsd(first.As<XmmRegister>(), second.As<XmmRegister>());
-      } else {
-        __ addsd(first.As<XmmRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
-      }
+      __ addsd(first.As<XmmRegister>(), second.As<XmmRegister>());
       break;
     }
 
@@ -1482,10 +1505,30 @@
       break;
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
-      UNREACHABLE();
+    case Primitive::kPrimFloat: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
+      XmmRegister out = locations->Out().As<XmmRegister>();
+      if (index.IsConstant()) {
+        __ movss(out, Address(obj,
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset));
+      } else {
+        __ movss(out, Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset));
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
+      XmmRegister out = locations->Out().As<XmmRegister>();
+      if (index.IsConstant()) {
+        __ movsd(out, Address(obj,
+            (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset));
+      } else {
+        __ movsd(out, Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset));
+      }
+      break;
+    }
+
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
@@ -1509,6 +1552,8 @@
     locations->SetInAt(2, Location::RequiresRegister());
     if (value_type == Primitive::kPrimLong) {
       locations->SetInAt(2, Location::RequiresRegister());
+    } else if (value_type == Primitive::kPrimFloat || value_type == Primitive::kPrimDouble) {
+      locations->SetInAt(2, Location::RequiresFpuRegister());
     } else {
       locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)));
     }
@@ -1581,6 +1626,7 @@
           __ movl(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
                   value.As<CpuRegister>());
         } else {
+          DCHECK(value.IsConstant()) << value;
           __ movl(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
         }
@@ -1609,10 +1655,34 @@
       break;
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble:
-      LOG(FATAL) << "Unimplemented register type " << instruction->GetType();
-      UNREACHABLE();
+    case Primitive::kPrimFloat: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value();
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+        DCHECK(value.IsFpuRegister());
+        __ movss(Address(obj, offset), value.As<XmmRegister>());
+      } else {
+        DCHECK(value.IsFpuRegister());
+        __ movss(Address(obj, index.As<CpuRegister>(), TIMES_4, data_offset),
+                value.As<XmmRegister>());
+      }
+      break;
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value();
+      if (index.IsConstant()) {
+        size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
+        DCHECK(value.IsFpuRegister());
+        __ movsd(Address(obj, offset), value.As<XmmRegister>());
+      } else {
+        DCHECK(value.IsFpuRegister());
+        __ movsd(Address(obj, index.As<CpuRegister>(), TIMES_8, data_offset),
+                value.As<XmmRegister>());
+      }
+      break;
+    }
+
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
@@ -1746,6 +1816,9 @@
     if (destination.IsRegister()) {
       __ movl(destination.As<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else if (destination.IsFpuRegister()) {
+      __ movss(destination.As<XmmRegister>(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(destination.IsStackSlot());
       __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -1755,6 +1828,8 @@
     if (destination.IsRegister()) {
       __ movq(destination.As<CpuRegister>(),
               Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else if (destination.IsFpuRegister()) {
+      __ movsd(destination.As<XmmRegister>(), Address(CpuRegister(RSP), source.GetStackIndex()));
     } else {
       DCHECK(destination.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -1767,6 +1842,7 @@
       if (destination.IsRegister()) {
         __ movl(destination.As<CpuRegister>(), imm);
       } else {
+        DCHECK(destination.IsStackSlot()) << destination;
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
       }
     } else if (constant->IsLongConstant()) {
@@ -1774,14 +1850,42 @@
       if (destination.IsRegister()) {
         __ movq(destination.As<CpuRegister>(), Immediate(value));
       } else {
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
         __ movq(CpuRegister(TMP), Immediate(value));
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
+    } else if (constant->IsFloatConstant()) {
+      Immediate imm(bit_cast<float, int32_t>(constant->AsFloatConstant()->GetValue()));
+      if (destination.IsFpuRegister()) {
+        __ movl(CpuRegister(TMP), imm);
+        __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+      } else {
+        DCHECK(destination.IsStackSlot()) << destination;
+        __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
+      }
     } else {
-      LOG(FATAL) << "Unimplemented constant type";
+      DCHECK(constant->IsDoubleConstant()) << constant->DebugName();
+      Immediate imm(bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue()));
+      if (destination.IsFpuRegister()) {
+        __ movq(CpuRegister(TMP), imm);
+        __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+      } else {
+        DCHECK(destination.IsDoubleStackSlot()) << destination;
+        __ movq(CpuRegister(TMP), imm);
+        __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+      }
     }
-  } else {
-    LOG(FATAL) << "Unimplemented";
+  } else if (source.IsFpuRegister()) {
+    if (destination.IsFpuRegister()) {
+      __ movaps(destination.As<XmmRegister>(), source.As<XmmRegister>());
+    } else if (destination.IsStackSlot()) {
+      __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
+               source.As<XmmRegister>());
+    } else {
+      DCHECK(destination.IsDoubleStackSlot());
+      __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
+               source.As<XmmRegister>());
+    }
   }
 }
 
@@ -1823,6 +1927,18 @@
           CpuRegister(ensure_scratch.GetRegister()));
 }
 
+void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
+  __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
+  __ movss(Address(CpuRegister(RSP), mem), reg);
+  __ movd(reg, CpuRegister(TMP));
+}
+
+void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
+  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
+  __ movsd(Address(CpuRegister(RSP), mem), reg);
+  __ movd(reg, CpuRegister(TMP));
+}
+
 void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
   MoveOperands* move = moves_.Get(index);
   Location source = move->GetSource();
@@ -1842,8 +1958,20 @@
     Exchange64(destination.As<CpuRegister>(), source.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
     Exchange64(destination.GetStackIndex(), source.GetStackIndex());
+  } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
+    __ movd(CpuRegister(TMP), source.As<XmmRegister>());
+    __ movaps(source.As<XmmRegister>(), destination.As<XmmRegister>());
+    __ movd(destination.As<XmmRegister>(), CpuRegister(TMP));
+  } else if (source.IsFpuRegister() && destination.IsStackSlot()) {
+    Exchange32(source.As<XmmRegister>(), destination.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsFpuRegister()) {
+    Exchange32(destination.As<XmmRegister>(), source.GetStackIndex());
+  } else if (source.IsFpuRegister() && destination.IsDoubleStackSlot()) {
+    Exchange64(source.As<XmmRegister>(), destination.GetStackIndex());
+  } else if (source.IsDoubleStackSlot() && destination.IsFpuRegister()) {
+    Exchange64(destination.As<XmmRegister>(), source.GetStackIndex());
   } else {
-    LOG(FATAL) << "Unimplemented";
+    LOG(FATAL) << "Unimplemented swap between " << source << " and " << destination;
   }
 }