[optimizing] Tune some x86_64 moves

Generate Moves of constant FP values by loading from the constant table.

Use 'movl' to load a 64 bit register for positive 32-bit values, saving
a byte in the generated code by taking advantage of the implicit
zero extension.

Change a couple of xorq(reg, reg) to xorl to (potentially) save a byte
of code per xor.

Change-Id: I5b2a807f0d3b29294fd4e7b8ef6d654491fa0b01
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5ac6866..9cf5ecb 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -99,7 +99,7 @@
       if (is_div_) {
         __ negq(cpu_reg_);
       } else {
-        __ movq(cpu_reg_, Immediate(0));
+        __ xorl(cpu_reg_, cpu_reg_);
       }
     }
     __ jmp(GetExitLabel());
@@ -671,7 +671,7 @@
         DCHECK(constant->IsLongConstant());
         value = constant->AsLongConstant()->GetValue();
       }
-      __ movq(CpuRegister(TMP), Immediate(value));
+      Load64BitValue(CpuRegister(TMP), value);
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     } else {
       DCHECK(source.IsDoubleStackSlot());
@@ -704,9 +704,9 @@
     } else if (const_to_move->IsLongConstant()) {
       int64_t value = const_to_move->AsLongConstant()->GetValue();
       if (location.IsRegister()) {
-        __ movq(location.AsRegister<CpuRegister>(), Immediate(value));
+        Load64BitValue(location.AsRegister<CpuRegister>(), value);
       } else if (location.IsDoubleStackSlot()) {
-        __ movq(CpuRegister(TMP), Immediate(value));
+        Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
       } else {
         DCHECK(location.IsConstant());
@@ -956,7 +956,7 @@
     LocationSummary* locations = comp->GetLocations();
     CpuRegister reg = locations->Out().AsRegister<CpuRegister>();
     // Clear register: setcc only sets the low byte.
-    __ xorq(reg, reg);
+    __ xorl(reg, reg);
     Location lhs = locations->InAt(0);
     Location rhs = locations->InAt(1);
     if (rhs.IsRegister()) {
@@ -1419,8 +1419,8 @@
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
 
   // Set the hidden argument.
-  __ movq(invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>(),
-          Immediate(invoke->GetDexMethodIndex()));
+  CpuRegister hidden_reg = invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>();
+  codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
 
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
@@ -1856,7 +1856,7 @@
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
-          __ movq(output, Immediate(kPrimLongMax));
+          codegen_->Load64BitValue(output, kPrimLongMax);
           // temp = long-to-float(output)
           __ cvtsi2ss(temp, output, true);
           // if input >= temp goto done
@@ -1869,7 +1869,7 @@
           __ jmp(&done);
           __ Bind(&nan);
           //  output = 0
-          __ xorq(output, output);
+          __ xorl(output, output);
           __ Bind(&done);
           break;
         }
@@ -1881,7 +1881,7 @@
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
           Label done, nan;
 
-          __ movq(output, Immediate(kPrimLongMax));
+          codegen_->Load64BitValue(output, kPrimLongMax);
           // temp = long-to-double(output)
           __ cvtsi2sd(temp, output, true);
           // if input >= temp goto done
@@ -1894,7 +1894,7 @@
           __ jmp(&done);
           __ Bind(&nan);
           //  output = 0
-          __ xorq(output, output);
+          __ xorl(output, output);
           __ Bind(&done);
           break;
         }
@@ -2483,7 +2483,7 @@
 
     case Primitive::kPrimLong: {
       if (instruction->IsRem()) {
-        __ xorq(output_register, output_register);
+        __ xorl(output_register, output_register);
       } else {
         __ movq(output_register, input_register);
         if (imm == -1) {
@@ -2527,7 +2527,7 @@
     DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong);
     CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>();
 
-    __ movq(rdx, Immediate(std::abs(imm) - 1));
+    codegen_->Load64BitValue(rdx, std::abs(imm) - 1);
     __ addq(rdx, numerator);
     __ testq(numerator, numerator);
     __ cmov(kGreaterEqual, rdx, numerator);
@@ -2624,7 +2624,7 @@
     __ movq(numerator, rax);
 
     // RAX = magic
-    __ movq(rax, Immediate(magic));
+    codegen_->Load64BitValue(rax, magic);
 
     // RDX:RAX = magic * numerator
     __ imulq(numerator);
@@ -2653,8 +2653,7 @@
       if (IsInt<32>(imm)) {
         __ imulq(rdx, Immediate(static_cast<int32_t>(imm)));
       } else {
-        __ movq(numerator, Immediate(imm));
-        __ imulq(rdx, numerator);
+        __ imulq(rdx, codegen_->LiteralInt64Address(imm));
       }
 
       __ subq(rax, rdx);
@@ -3020,8 +3019,8 @@
 void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
-  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
-
+  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+                           instruction->GetTypeIndex());
   __ gs()->call(
       Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
 
@@ -3042,7 +3041,8 @@
 void InstructionCodeGeneratorX86_64::VisitNewArray(HNewArray* instruction) {
   InvokeRuntimeCallingConvention calling_convention;
   codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(2)));
-  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
+  codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+                           instruction->GetTypeIndex());
 
   __ gs()->call(
       Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
@@ -3938,45 +3938,42 @@
     } else if (constant->IsLongConstant()) {
       int64_t value = constant->AsLongConstant()->GetValue();
       if (destination.IsRegister()) {
-        __ movq(destination.AsRegister<CpuRegister>(), Immediate(value));
+        codegen_->Load64BitValue(destination.AsRegister<CpuRegister>(), value);
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        __ movq(CpuRegister(TMP), Immediate(value));
+        codegen_->Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
     } else if (constant->IsFloatConstant()) {
       float fp_value = constant->AsFloatConstant()->GetValue();
       int32_t value = bit_cast<int32_t, float>(fp_value);
-      Immediate imm(value);
       if (destination.IsFpuRegister()) {
         XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
         if (value == 0) {
           // easy FP 0.0.
           __ xorps(dest, dest);
         } else {
-          __ movl(CpuRegister(TMP), imm);
-          __ movd(dest, CpuRegister(TMP));
+          __ movss(dest, codegen_->LiteralFloatAddress(fp_value));
         }
       } else {
         DCHECK(destination.IsStackSlot()) << destination;
+        Immediate imm(value);
         __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
       }
     } else {
       DCHECK(constant->IsDoubleConstant()) << constant->DebugName();
       double fp_value =  constant->AsDoubleConstant()->GetValue();
       int64_t value = bit_cast<int64_t, double>(fp_value);
-      Immediate imm(value);
       if (destination.IsFpuRegister()) {
         XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
         if (value == 0) {
           __ xorpd(dest, dest);
         } else {
-          __ movq(CpuRegister(TMP), imm);
-          __ movd(dest, CpuRegister(TMP));
+          __ movsd(dest, codegen_->LiteralDoubleAddress(fp_value));
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
-        __ movq(CpuRegister(TMP), imm);
+        codegen_->Load64BitValue(CpuRegister(TMP), value);
         __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
       }
     }
@@ -4435,6 +4432,17 @@
   LOG(FATAL) << "Unreachable";
 }
 
+void CodeGeneratorX86_64::Load64BitValue(CpuRegister dest, int64_t value) {
+  if (value == 0) {
+    __ xorl(dest, dest);
+  } else if (value > 0 && IsInt<32>(value)) {
+    // We can use a 32 bit move, as it will zero-extend and is one byte shorter.
+    __ movl(dest, Immediate(static_cast<int32_t>(value)));
+  } else {
+    __ movq(dest, Immediate(value));
+  }
+}
+
 void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
   // Generate the constant area if needed.
   X86_64Assembler* assembler = GetAssembler();