X86: Use the constant area for more operations.

Allow FP HNeg to use the constant area to hold the constant to flip the
sign bit.

Enhance some math intrinsics to allow the use of the constant
area: Abs{Float,Double}, {Min,Max}{FloatFloat,DoubleDouble}.

Allow compares of floats/doubles to constants using the constant area.

These eliminate almost all uses of loading constants from the stack.

Change-Id: Ic4b831565825cbe9f0801b1b53c1013be7c87ae4
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 50c4ba2..94bdd55 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1386,6 +1386,41 @@
   __ j(final_condition, true_label);
 }
 
+void InstructionCodeGeneratorX86::GenerateFPCompare(Location lhs,
+                                                    Location rhs,
+                                                    HInstruction* insn,
+                                                    bool is_double) {
+  HX86LoadFromConstantTable* const_area = insn->InputAt(1)->AsX86LoadFromConstantTable();
+  if (is_double) {
+    if (rhs.IsFpuRegister()) {
+      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+    } else if (const_area != nullptr) {
+      DCHECK(!const_area->NeedsMaterialization());
+      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(),
+                 codegen_->LiteralDoubleAddress(
+                   const_area->GetConstant()->AsDoubleConstant()->GetValue(),
+                   const_area->GetLocations()->InAt(0).AsRegister<Register>()));
+    } else {
+      DCHECK(rhs.IsDoubleStackSlot());
+      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(), Address(ESP, rhs.GetStackIndex()));
+    }
+    return;
+  }
+
+  if (rhs.IsFpuRegister()) {
+    __ ucomiss(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+  } else if (const_area != nullptr) {
+    DCHECK(!const_area->NeedsMaterialization());
+    __ ucomiss(lhs.AsFpuRegister<XmmRegister>(),
+               codegen_->LiteralFloatAddress(
+                 const_area->GetConstant()->AsFloatConstant()->GetValue(),
+                 const_area->GetLocations()->InAt(0).AsRegister<Register>()));
+  } else {
+    DCHECK(rhs.IsStackSlot());
+    __ ucomiss(lhs.AsFpuRegister<XmmRegister>(), Address(ESP, rhs.GetStackIndex()));
+  }
+}
+
 template<class LabelType>
 void InstructionCodeGeneratorX86::GenerateCompareTestAndBranch(HCondition* condition,
                                                                LabelType* true_target_in,
@@ -1406,11 +1441,11 @@
       GenerateLongComparesAndJumps(condition, true_target, false_target);
       break;
     case Primitive::kPrimFloat:
-      __ ucomiss(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, condition, false);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     case Primitive::kPrimDouble:
-      __ ucomisd(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, condition, true);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     default:
@@ -1636,7 +1671,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::Any());
       if (cond->NeedsMaterialization()) {
         locations->SetOut(Location::RequiresRegister());
       }
@@ -1690,11 +1725,11 @@
       GenerateLongComparesAndJumps(cond, &true_label, &false_label);
       break;
     case Primitive::kPrimFloat:
-      __ ucomiss(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(lhs, rhs, cond, false);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
     case Primitive::kPrimDouble:
-      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(lhs, rhs, cond, true);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
   }
@@ -2130,6 +2165,32 @@
   }
 }
 
+void LocationsBuilderX86::VisitX86FPNeg(HX86FPNeg* neg) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(neg, LocationSummary::kNoCall);
+  DCHECK(Primitive::IsFloatingPointType(neg->GetType()));
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+  locations->AddTemp(Location::RequiresFpuRegister());
+}
+
+void InstructionCodeGeneratorX86::VisitX86FPNeg(HX86FPNeg* neg) {
+  LocationSummary* locations = neg->GetLocations();
+  Location out = locations->Out();
+  DCHECK(locations->InAt(0).Equals(out));
+
+  Register constant_area = locations->InAt(1).AsRegister<Register>();
+  XmmRegister mask = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  if (neg->GetType() == Primitive::kPrimFloat) {
+    __ movss(mask, codegen_->LiteralInt32Address(INT32_C(0x80000000), constant_area));
+    __ xorps(out.AsFpuRegister<XmmRegister>(), mask);
+  } else {
+     __ movsd(mask, codegen_->LiteralInt64Address(INT64_C(0x8000000000000000), constant_area));
+     __ xorpd(out.AsFpuRegister<XmmRegister>(), mask);
+  }
+}
+
 void LocationsBuilderX86::VisitTypeConversion(HTypeConversion* conversion) {
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
@@ -4012,7 +4073,7 @@
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::Any());
       locations->SetOut(Location::RequiresRegister());
       break;
     }
@@ -4073,12 +4134,12 @@
       break;
     }
     case Primitive::kPrimFloat: {
-      __ ucomiss(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, compare, false);
       __ j(kUnordered, compare->IsGtBias() ? &greater : &less);
       break;
     }
     case Primitive::kPrimDouble: {
-      __ ucomisd(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, compare, true);
       __ j(kUnordered, compare->IsGtBias() ? &greater : &less);
       break;
     }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 0aef478..d51b96f 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -296,6 +296,8 @@
                                    HBasicBlock* switch_block,
                                    HBasicBlock* default_block);
 
+  void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double);
+
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
 
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 3f5688b..aad9378 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -37,10 +37,12 @@
 
 static constexpr int kDoubleNaNHigh = 0x7FF80000;
 static constexpr int kDoubleNaNLow = 0x00000000;
-static constexpr int kFloatNaN = 0x7FC00000;
+static constexpr int64_t kDoubleNaN = INT64_C(0x7FF8000000000000);
+static constexpr int32_t kFloatNaN = INT32_C(0x7FC00000);
 
 IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen)
-  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
+  : arena_(codegen->GetGraph()->GetArena()),
+    codegen_(codegen) {
 }
 
 
@@ -256,15 +258,36 @@
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::RequiresFpuRegister());
-  // TODO: Allow x86 to work with memory. This requires assembler support, see below.
-  // locations->SetInAt(0, Location::Any());               // X86 can work on memory directly.
   locations->SetOut(Location::SameAsFirstInput());
+  HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
+  DCHECK(static_or_direct != nullptr);
+  if (invoke->InputAt(static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
+    // We need addressibility for the constant area.
+    locations->SetInAt(1, Location::RequiresRegister());
+    // We need a temporary to hold the constant.
+    locations->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
-static void MathAbsFP(LocationSummary* locations, bool is64bit, X86Assembler* assembler) {
+static void MathAbsFP(LocationSummary* locations,
+                      bool is64bit,
+                      X86Assembler* assembler,
+                      CodeGeneratorX86* codegen) {
   Location output = locations->Out();
 
-  if (output.IsFpuRegister()) {
+  DCHECK(output.IsFpuRegister());
+  if (locations->InAt(1).IsRegister()) {
+    // We also have a constant area pointer.
+    Register constant_area = locations->InAt(1).AsRegister<Register>();
+    XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+    if (is64bit) {
+      __ movsd(temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF), constant_area));
+      __ andpd(output.AsFpuRegister<XmmRegister>(), temp);
+    } else {
+      __ movss(temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF), constant_area));
+      __ andps(output.AsFpuRegister<XmmRegister>(), temp);
+    }
+  } else {
     // Create the right constant on an aligned stack.
     if (is64bit) {
       __ subl(ESP, Immediate(8));
@@ -277,19 +300,6 @@
       __ andps(output.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
     }
     __ addl(ESP, Immediate(16));
-  } else {
-    // TODO: update when assember support is available.
-    UNIMPLEMENTED(FATAL) << "Needs assembler support.";
-//  Once assembler support is available, in-memory operations look like this:
-//    if (is64bit) {
-//      DCHECK(output.IsDoubleStackSlot());
-//      __ andl(Address(Register(RSP), output.GetHighStackIndex(kX86WordSize)),
-//              Immediate(0x7FFFFFFF));
-//    } else {
-//      DCHECK(output.IsStackSlot());
-//      // Can use and with a literal directly.
-//      __ andl(Address(Register(RSP), output.GetStackIndex()), Immediate(0x7FFFFFFF));
-//    }
   }
 }
 
@@ -298,7 +308,7 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathAbsDouble(HInvoke* invoke) {
-  MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
+  MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathAbsFloat(HInvoke* invoke) {
@@ -306,7 +316,7 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathAbsFloat(HInvoke* invoke) {
-  MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
+  MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
 }
 
 static void CreateAbsIntLocation(ArenaAllocator* arena, HInvoke* invoke) {
@@ -388,8 +398,11 @@
   GenAbsLong(invoke->GetLocations(), GetAssembler());
 }
 
-static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double,
-                        X86Assembler* assembler) {
+static void GenMinMaxFP(LocationSummary* locations,
+                        bool is_min,
+                        bool is_double,
+                        X86Assembler* assembler,
+                        CodeGeneratorX86* codegen) {
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
   Location out_loc = locations->Out();
@@ -450,15 +463,25 @@
 
   // NaN handling.
   __ Bind(&nan);
-  if (is_double) {
-    __ pushl(Immediate(kDoubleNaNHigh));
-    __ pushl(Immediate(kDoubleNaNLow));
-    __ movsd(out, Address(ESP, 0));
-    __ addl(ESP, Immediate(8));
+  // Do we have a constant area pointer?
+  if (locations->InAt(2).IsRegister()) {
+    Register constant_area = locations->InAt(2).AsRegister<Register>();
+    if (is_double) {
+      __ movsd(out, codegen->LiteralInt64Address(kDoubleNaN, constant_area));
+    } else {
+      __ movss(out, codegen->LiteralInt32Address(kFloatNaN, constant_area));
+    }
   } else {
-    __ pushl(Immediate(kFloatNaN));
-    __ movss(out, Address(ESP, 0));
-    __ addl(ESP, Immediate(4));
+    if (is_double) {
+      __ pushl(Immediate(kDoubleNaNHigh));
+      __ pushl(Immediate(kDoubleNaNLow));
+      __ movsd(out, Address(ESP, 0));
+      __ addl(ESP, Immediate(8));
+    } else {
+      __ pushl(Immediate(kFloatNaN));
+      __ movss(out, Address(ESP, 0));
+      __ addl(ESP, Immediate(4));
+    }
   }
   __ jmp(&done);
 
@@ -483,6 +506,11 @@
   // The following is sub-optimal, but all we can do for now. It would be fine to also accept
   // the second input to be the output (we can simply swap inputs).
   locations->SetOut(Location::SameAsFirstInput());
+  HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
+  DCHECK(static_or_direct != nullptr);
+  if (invoke->InputAt(static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
+    locations->SetInAt(2, Location::RequiresRegister());
+  }
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMinDoubleDouble(HInvoke* invoke) {
@@ -490,7 +518,11 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMinDoubleDouble(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ true,
+              /* is_double */ true,
+              GetAssembler(),
+              codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMinFloatFloat(HInvoke* invoke) {
@@ -498,7 +530,11 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMinFloatFloat(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ true,
+              /* is_double */ false,
+              GetAssembler(),
+              codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMaxDoubleDouble(HInvoke* invoke) {
@@ -506,7 +542,11 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMaxDoubleDouble(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ false,
+              /* is_double */ true,
+              GetAssembler(),
+              codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMaxFloatFloat(HInvoke* invoke) {
@@ -514,7 +554,11 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMaxFloatFloat(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ false,
+              /* is_double */ false,
+              GetAssembler(),
+              codegen_);
 }
 
 static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 5246fd1..2e37f04 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1256,6 +1256,7 @@
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86(M)                            \
   M(X86ComputeBaseMethodAddress, Instruction)                           \
   M(X86LoadFromConstantTable, Instruction)                              \
+  M(X86FPNeg, Instruction)                                              \
   M(X86PackedSwitch, Instruction)
 #endif
 
diff --git a/compiler/optimizing/nodes_x86.h b/compiler/optimizing/nodes_x86.h
index 556217b..2037246 100644
--- a/compiler/optimizing/nodes_x86.h
+++ b/compiler/optimizing/nodes_x86.h
@@ -62,6 +62,25 @@
   DISALLOW_COPY_AND_ASSIGN(HX86LoadFromConstantTable);
 };
 
+// Version of HNeg with access to the constant table for FP types.
+class HX86FPNeg : public HExpression<2> {
+ public:
+  HX86FPNeg(Primitive::Type result_type,
+            HInstruction* input,
+            HX86ComputeBaseMethodAddress* method_base,
+            uint32_t dex_pc)
+      : HExpression(result_type, SideEffects::None(), dex_pc) {
+    DCHECK(Primitive::IsFloatingPointType(result_type));
+    SetRawInputAt(0, input);
+    SetRawInputAt(1, method_base);
+  }
+
+  DECLARE_INSTRUCTION(X86FPNeg);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HX86FPNeg);
+};
+
 // X86 version of HPackedSwitch that holds a pointer to the base method address.
 class HX86PackedSwitch : public HTemplateInstruction<2> {
  public:
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index 1394dfa..15d2024 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -53,6 +53,10 @@
     BinaryFP(div);
   }
 
+  void VisitCompare(HCompare* compare) OVERRIDE {
+    BinaryFP(compare);
+  }
+
   void VisitReturn(HReturn* ret) OVERRIDE {
     HConstant* value = ret->InputAt(0)->AsConstant();
     if ((value != nullptr && Primitive::IsFloatingPointType(value->GetType()))) {
@@ -74,11 +78,50 @@
 
   void BinaryFP(HBinaryOperation* bin) {
     HConstant* rhs = bin->InputAt(1)->AsConstant();
-    if (rhs != nullptr && Primitive::IsFloatingPointType(bin->GetResultType())) {
+    if (rhs != nullptr && Primitive::IsFloatingPointType(rhs->GetType())) {
       ReplaceInput(bin, rhs, 1, false);
     }
   }
 
+  void VisitEqual(HEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitNotEqual(HNotEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitLessThan(HLessThan* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitLessThanOrEqual(HLessThanOrEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitGreaterThan(HGreaterThan* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitGreaterThanOrEqual(HGreaterThanOrEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitNeg(HNeg* neg) OVERRIDE {
+    if (Primitive::IsFloatingPointType(neg->GetType())) {
+      // We need to replace the HNeg with a HX86FPNeg in order to address the constant area.
+      InitializePCRelativeBasePointer();
+      HGraph* graph = GetGraph();
+      HBasicBlock* block = neg->GetBlock();
+      HX86FPNeg* x86_fp_neg = new (graph->GetArena()) HX86FPNeg(
+          neg->GetType(),
+          neg->InputAt(0),
+          base_,
+          neg->GetDexPc());
+      block->ReplaceAndRemoveInstructionWith(neg, x86_fp_neg);
+    }
+  }
+
   void VisitPackedSwitch(HPackedSwitch* switch_insn) OVERRIDE {
     if (switch_insn->GetNumEntries() <=
         InstructionCodeGeneratorX86::kPackedSwitchJumpTableThreshold) {
@@ -124,11 +167,13 @@
     // If this is an invoke-static/-direct with PC-relative dex cache array
     // addressing, we need the PC-relative address base.
     HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
+    bool base_added = false;
     if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasPcRelativeDexCache()) {
       InitializePCRelativeBasePointer();
       // Add the extra parameter base_.
       DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
       invoke_static_or_direct->AddSpecialInput(base_);
+      base_added = true;
     }
     // Ensure that we can load FP arguments from the constant area.
     for (size_t i = 0, e = invoke->InputCount(); i < e; i++) {
@@ -137,6 +182,25 @@
         ReplaceInput(invoke, input, i, true);
       }
     }
+
+    // These intrinsics need the constant area.
+    switch (invoke->GetIntrinsic()) {
+      case Intrinsics::kMathAbsDouble:
+      case Intrinsics::kMathAbsFloat:
+      case Intrinsics::kMathMaxDoubleDouble:
+      case Intrinsics::kMathMaxFloatFloat:
+      case Intrinsics::kMathMinDoubleDouble:
+      case Intrinsics::kMathMinFloatFloat:
+        if (!base_added) {
+          DCHECK(invoke_static_or_direct != nullptr);
+          DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
+          InitializePCRelativeBasePointer();
+          invoke_static_or_direct->AddSpecialInput(base_);
+        }
+        break;
+      default:
+        break;
+    }
   }
 
   // The generated HX86ComputeBaseMethodAddress in the entry block needed as an