[optimizing] More x86_64 code improvements

Use the constant area some more, use 32-bit immediates in movq
instructions when possible, and make other small tweaks.
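
For illustration only, and not part of the patch itself: the 32-bit immediate
decision made below by Store64BitValueToStack (via IsInt<32>) amounts to a
signed-range check. A minimal standalone C++ sketch of that check, where
FitsInImm32 is a hypothetical stand-in for art::IsInt<32>():

    // Standalone sketch, not ART code: decide whether a 64-bit constant can be
    // stored with movq's sign-extended 32-bit immediate form, or must first be
    // materialized in a scratch register (TMP) and then stored.
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Hypothetical stand-in for art::IsInt<32>(): true if the value fits in a
    // sign-extended imm32.
    static bool FitsInImm32(int64_t value) {
      return value >= std::numeric_limits<int32_t>::min() &&
             value <= std::numeric_limits<int32_t>::max();
    }

    int main() {
      const int64_t samples[] = {0, -1, INT64_C(0x7FFFFFFF), INT64_C(0x80000000),
                                 INT64_C(0x7FFFFFFFFFFFFFFF)};
      for (int64_t v : samples) {
        std::printf("%#llx -> %s\n", static_cast<unsigned long long>(v),
                    FitsInImm32(v) ? "movq [rsp + off], imm32"
                                   : "load TMP, then movq [rsp + off], TMP");
      }
      return 0;
    }

The helper added in code_generator_x86_64.cc below applies the same test and
falls back to loading the value into TMP before the store.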

Remove the commented-out code for Math.Abs(float/double), as it would
fail for the baseline compiler due to the output being the same as the
input.
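
For background only (also not part of the patch): the register path the
intrinsic keeps loads a sign-clearing mask (0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF)
from the constant area and ANDs it into the value with andps/andpd. A
standalone C++ sketch of why clearing the IEEE-754 sign bit gives the
absolute value:

    // Standalone sketch, not ART code: Math.Abs via sign-bit masking, the same
    // operation the intrinsic performs with andps/andpd and a literal mask.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static float AbsViaMask32(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));   // reinterpret the IEEE-754 bits
      bits &= UINT32_C(0x7FFFFFFF);           // clear the sign bit
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }

    static double AbsViaMask64(double x) {
      uint64_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits &= UINT64_C(0x7FFFFFFFFFFFFFFF);
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }

    int main() {
      std::printf("%f %f\n", AbsViaMask32(-3.25f), AbsViaMask64(-2.5));
      return 0;
    }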

Change-Id: Ifa39f1865b94cec2e1c0a99af3066a645e9d3618
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 2b5fcbd..a0f45ed 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -104,7 +104,7 @@
       if (is_div_) {
         __ negl(cpu_reg_);
       } else {
-        __ movl(cpu_reg_, Immediate(0));
+        __ xorl(cpu_reg_, cpu_reg_);
       }
 
     } else {
@@ -749,8 +749,7 @@
         DCHECK(constant->IsLongConstant());
         value = constant->AsLongConstant()->GetValue();
       }
-      Load64BitValue(CpuRegister(TMP), value);
-      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+      Store64BitValueToStack(destination, value);
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
@@ -784,8 +783,7 @@
       if (location.IsRegister()) {
         Load64BitValue(location.AsRegister<CpuRegister>(), value);
       } else if (location.IsDoubleStackSlot()) {
-        Load64BitValue(CpuRegister(TMP), value);
-        __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
+        Store64BitValueToStack(location, value);
       } else {
         DCHECK(location.IsConstant());
         DCHECK_EQ(location.GetConstant(), const_to_move);
@@ -1849,14 +1847,12 @@
           // Processing a Dex `float-to-int' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         case Primitive::kPrimDouble:
           // Processing a Dex `double-to-int' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         default:
@@ -1884,14 +1880,12 @@
           // Processing a Dex `float-to-long' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         case Primitive::kPrimDouble:
           // Processing a Dex `double-to-long' instruction.
           locations->SetInAt(0, Location::RequiresFpuRegister());
           locations->SetOut(Location::RequiresRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
           break;
 
         default:
@@ -2067,14 +2061,11 @@
           // Processing a Dex `float-to-int' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
-          // temp = int-to-float(output)
-          __ cvtsi2ss(temp, output, false);
-          // if input >= temp goto done
-          __ comiss(input, temp);
+          // if input >= (float)INT_MAX goto done
+          __ comiss(input, codegen_->LiteralFloatAddress(kPrimIntMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -2092,14 +2083,11 @@
           // Processing a Dex `double-to-int' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
-          // temp = int-to-double(output)
-          __ cvtsi2sd(temp, output);
-          // if input >= temp goto done
-          __ comisd(input, temp);
+          // if input >= (double)INT_MAX goto done
+          __ comisd(input, codegen_->LiteralDoubleAddress(kPrimIntMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -2137,14 +2125,11 @@
           // Processing a Dex `float-to-long' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           codegen_->Load64BitValue(output, kPrimLongMax);
-          // temp = long-to-float(output)
-          __ cvtsi2ss(temp, output, true);
-          // if input >= temp goto done
-          __ comiss(input, temp);
+          // if input >= (float)LONG_MAX goto done
+          __ comiss(input, codegen_->LiteralFloatAddress(kPrimLongMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -2162,14 +2147,11 @@
           // Processing a Dex `double-to-long' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
           codegen_->Load64BitValue(output, kPrimLongMax);
-          // temp = long-to-double(output)
-          __ cvtsi2sd(temp, output, true);
-          // if input >= temp goto done
-          __ comisd(input, temp);
+          // if input >= (double)LONG_MAX goto done
+          __ comisd(input, codegen_->LiteralDoubleAddress(kPrimLongMax));
           __ j(kAboveEqual, &done);
           // if input == NaN goto nan
           __ j(kUnordered, &nan);
@@ -4336,8 +4318,7 @@
         codegen_->Load64BitValue(destination.AsRegister<CpuRegister>(), value);
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        codegen_->Load64BitValue(CpuRegister(TMP), value);
-        __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+        codegen_->Store64BitValueToStack(destination, value);
       }
     } else if (constant->IsFloatConstant()) {
       float fp_value = constant->AsFloatConstant()->GetValue();
@@ -4368,8 +4349,7 @@
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        codegen_->Load64BitValue(CpuRegister(TMP), value);
-        __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+        codegen_->Store64BitValueToStack(destination, value);
       }
     }
   } else if (source.IsFpuRegister()) {
@@ -4871,6 +4851,18 @@
   }
 }
 
+void CodeGeneratorX86_64::Store64BitValueToStack(Location dest, int64_t value) {
+  DCHECK(dest.IsDoubleStackSlot());
+  if (IsInt<32>(value)) {
+    // Can move directly as an int32 constant.
+    __ movq(Address(CpuRegister(RSP), dest.GetStackIndex()),
+            Immediate(static_cast<int32_t>(value)));
+  } else {
+    Load64BitValue(CpuRegister(TMP), value);
+    __ movq(Address(CpuRegister(RSP), dest.GetStackIndex()), CpuRegister(TMP));
+  }
+}
+
 void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
   // Generate the constant area if needed.
   X86_64Assembler* assembler = GetAssembler();
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 3b3915f..41bebac 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -322,6 +322,9 @@
   // Load a 64 bit value into a register in the most efficient manner.
   void Load64BitValue(CpuRegister dest, int64_t value);
 
+  // Store a 64 bit value into a DoubleStackSlot in the most efficient manner.
+  void Store64BitValueToStack(Location dest, int64_t value);
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 62cdb4c..85d40d7 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -282,8 +282,6 @@
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::RequiresFpuRegister());
-  // TODO: Allow x86 to work with memory. This requires assembler support, see below.
-  // locations->SetInAt(0, Location::Any());               // X86 can work on memory directly.
   locations->SetOut(Location::SameAsFirstInput());
   locations->AddTemp(Location::RequiresFpuRegister());  // FP reg to hold mask.
 }
@@ -294,34 +292,18 @@
                       CodeGeneratorX86_64* codegen) {
   Location output = locations->Out();
 
-  if (output.IsFpuRegister()) {
-    // In-register
-    XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  DCHECK(output.IsFpuRegister());
+  XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
 
-    // TODO: Can mask directly with constant area using pand if we can guarantee
-    // that the literal is aligned on a 16 byte boundary.  This will avoid a
-    // temporary.
-    if (is64bit) {
-      __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
-      __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
-    } else {
-      __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
-      __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
-    }
+  // TODO: Can mask directly with constant area using pand if we can guarantee
+  // that the literal is aligned on a 16 byte boundary.  This will avoid a
+  // temporary.
+  if (is64bit) {
+    __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
+    __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
   } else {
-    // TODO: update when assember support is available.
-    UNIMPLEMENTED(FATAL) << "Needs assembler support.";
-//  Once assembler support is available, in-memory operations look like this:
-//    if (is64bit) {
-//      DCHECK(output.IsDoubleStackSlot());
-//      // No 64b and with literal.
-//      __ movq(cpu_temp, Immediate(INT64_C(0x7FFFFFFFFFFFFFFF)));
-//      __ andq(Address(CpuRegister(RSP), output.GetStackIndex()), cpu_temp);
-//    } else {
-//      DCHECK(output.IsStackSlot());
-//      // Can use and with a literal directly.
-//      __ andl(Address(CpuRegister(RSP), output.GetStackIndex()), Immediate(INT64_C(0x7FFFFFFF)));
-//    }
+    __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
+    __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
   }
 }
 
@@ -736,6 +718,7 @@
-  __ movl(out, Immediate(kPrimIntMax));
+  // Load maxInt into out.
+  codegen_->Load64BitValue(out, kPrimIntMax);
 
   // if inPlusPointFive >= maxInt goto done
   __ comiss(inPlusPointFive, codegen_->LiteralFloatAddress(static_cast<float>(kPrimIntMax)));
   __ j(kAboveEqual, &done);
 
@@ -783,6 +766,7 @@
-  __ movq(out, Immediate(kPrimLongMax));
+  // Load maxLong into out.
+  codegen_->Load64BitValue(out, kPrimLongMax);
 
   // if inPlusPointFive >= maxLong goto done
   __ comisd(inPlusPointFive, codegen_->LiteralDoubleAddress(static_cast<double>(kPrimLongMax)));
   __ j(kAboveEqual, &done);
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index b8e5fb6..beca037 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -332,7 +332,7 @@
   void movq(CpuRegister dst, const Address& src);
   void movl(CpuRegister dst, const Address& src);
   void movq(const Address& dst, CpuRegister src);
-  void movq(const Address& dst, const Immediate& src);
+  void movq(const Address& dst, const Immediate& imm);
   void movl(const Address& dst, CpuRegister src);
   void movl(const Address& dst, const Immediate& imm);