[optimizing compiler] Implement inline x86 FP '%'

Replace the calls to fmod/fmodf by inline code as is done in the Quick
compiler.

Remove the quick fmod/fmodf runtime entries, as they are no longer in
use.

64 bit code generator Move() routine needed to be enhanced to handle
constants, as Location::Any() allows them to be generated.

Change-Id: I6b6a42f6faeed4b0b3c940453e487daf5b25d184
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 196e0cf..dd6861f 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -47,6 +47,8 @@
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
 
+static constexpr int kC2ConditionMask = 0x400;
+
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, FloatRegister> {
  public:
   InvokeRuntimeCallingConvention()
@@ -583,8 +585,18 @@
     } else if (source.IsFpuRegister()) {
       __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
+    } else if (source.IsConstant()) {
+      HConstant* constant = source.GetConstant();
+      int32_t value;
+      if (constant->IsFloatConstant()) {
+        value = bit_cast<float, int32_t>(constant->AsFloatConstant()->GetValue());
+      } else {
+        DCHECK(constant->IsIntConstant());
+        value = constant->AsIntConstant()->GetValue();
+      }
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), Immediate(value));
     } else {
-      DCHECK(source.IsStackSlot());
+      DCHECK(source.IsStackSlot()) << source;
       __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
       __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
@@ -596,6 +608,17 @@
     } else if (source.IsFpuRegister()) {
       __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
+    } else if (source.IsConstant()) {
+      HConstant* constant = source.GetConstant();
+      int64_t value = constant->AsLongConstant()->GetValue();
+      if (constant->IsDoubleConstant()) {
+        value = bit_cast<double, int64_t>(constant->AsDoubleConstant()->GetValue());
+      } else {
+        DCHECK(constant->IsLongConstant());
+        value = constant->AsLongConstant()->GetValue();
+      }
+      __ movq(CpuRegister(TMP), Immediate(value));
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -2000,6 +2023,81 @@
   }
 }
 
+void InstructionCodeGeneratorX86_64::PushOntoFPStack(Location source, uint32_t temp_offset,
+                                                     uint32_t stack_adjustment, bool is_float) {
+  if (source.IsStackSlot()) {
+    DCHECK(is_float);
+    __ flds(Address(CpuRegister(RSP), source.GetStackIndex() + stack_adjustment));
+  } else if (source.IsDoubleStackSlot()) {
+    DCHECK(!is_float);
+    __ fldl(Address(CpuRegister(RSP), source.GetStackIndex() + stack_adjustment));
+  } else {
+    // Write the value to the temporary location on the stack and load to FP stack.
+    if (is_float) {
+      Location stack_temp = Location::StackSlot(temp_offset);
+      codegen_->Move(stack_temp, source);
+      __ flds(Address(CpuRegister(RSP), temp_offset));
+    } else {
+      Location stack_temp = Location::DoubleStackSlot(temp_offset);
+      codegen_->Move(stack_temp, source);
+      __ fldl(Address(CpuRegister(RSP), temp_offset));
+    }
+  }
+}
+
+void InstructionCodeGeneratorX86_64::GenerateRemFP(HRem *rem) {
+  Primitive::Type type = rem->GetResultType();
+  bool is_float = type == Primitive::kPrimFloat;
+  size_t elem_size = Primitive::ComponentSize(type);
+  LocationSummary* locations = rem->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  Location out = locations->Out();
+
+  // Create stack space for 2 elements.
+  // TODO: enhance register allocator to ask for stack temporaries.
+  __ subq(CpuRegister(RSP), Immediate(2 * elem_size));
+
+  // Load the values to the FP stack in reverse order, using temporaries if needed.
+  PushOntoFPStack(second, elem_size, 2 * elem_size, is_float);
+  PushOntoFPStack(first, 0, 2 * elem_size, is_float);
+
+  // Loop doing FPREM until we stabilize.
+  Label retry;
+  __ Bind(&retry);
+  __ fprem();
+
+  // Move FP status to AX.
+  __ fstsw();
+
+  // And see if the argument reduction is complete. This is signaled by the
+  // C2 FPU flag bit set to 0.
+  __ andl(CpuRegister(RAX), Immediate(kC2ConditionMask));
+  __ j(kNotEqual, &retry);
+
+  // We have settled on the final value. Retrieve it into an XMM register.
+  // Store FP top of stack to real stack.
+  if (is_float) {
+    __ fsts(Address(CpuRegister(RSP), 0));
+  } else {
+    __ fstl(Address(CpuRegister(RSP), 0));
+  }
+
+  // Pop the 2 items from the FP stack.
+  __ fucompp();
+
+  // Load the value from the stack into an XMM register.
+  DCHECK(out.IsFpuRegister()) << out;
+  if (is_float) {
+    __ movss(out.AsFpuRegister<XmmRegister>(), Address(CpuRegister(RSP), 0));
+  } else {
+    __ movsd(out.AsFpuRegister<XmmRegister>(), Address(CpuRegister(RSP), 0));
+  }
+
+  // And remove the temporary stack space we allocated.
+  __ addq(CpuRegister(RSP), Immediate(2 * elem_size));
+}
+
 void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) {
   DCHECK(instruction->IsDiv() || instruction->IsRem());
   Primitive::Type type = instruction->GetResultType();
@@ -2099,11 +2197,8 @@
 
 void LocationsBuilderX86_64::VisitRem(HRem* rem) {
   Primitive::Type type = rem->GetResultType();
-  LocationSummary::CallKind call_kind =
-      (type == Primitive::kPrimInt) || (type == Primitive::kPrimLong)
-      ? LocationSummary::kNoCall
-      : LocationSummary::kCall;
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(rem, call_kind);
+  LocationSummary* locations =
+    new (GetGraph()->GetArena()) LocationSummary(rem, LocationSummary::kNoCall);
 
   switch (type) {
     case Primitive::kPrimInt:
@@ -2117,11 +2212,10 @@
 
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      InvokeRuntimeCallingConvention calling_convention;
-      locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
-      locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
-      // The runtime helper puts the result in XMM0.
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
+      locations->SetInAt(0, Location::Any());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::RequiresFpuRegister());
+      locations->AddTemp(Location::RegisterLocation(RAX));
       break;
     }
 
@@ -2138,14 +2232,9 @@
       GenerateDivRemIntegral(rem);
       break;
     }
-    case Primitive::kPrimFloat: {
-      __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pFmodf), true));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
-      break;
-    }
+    case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pFmod), true));
-      codegen_->RecordPcInfo(rem, rem->GetDexPc());
+      GenerateRemFP(rem);
       break;
     }
     default: