[optimizing] Tune some x86_64 moves
Generate Moves of constant FP values by loading from the constant table.
Use 'movl' to load a 64 bit register for positive 32-bit values, saving
a byte in the generated code by taking advantage of the implicit
zero extension.
Change a couple of xorq(reg, reg) to xorl to (potentially) save a byte
of code per xor.
Change-Id: I5b2a807f0d3b29294fd4e7b8ef6d654491fa0b01
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5ac6866..9cf5ecb 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -99,7 +99,7 @@
if (is_div_) {
__ negq(cpu_reg_);
} else {
- __ movq(cpu_reg_, Immediate(0));
+ __ xorl(cpu_reg_, cpu_reg_);
}
}
__ jmp(GetExitLabel());
@@ -671,7 +671,7 @@
DCHECK(constant->IsLongConstant());
value = constant->AsLongConstant()->GetValue();
}
- __ movq(CpuRegister(TMP), Immediate(value));
+ Load64BitValue(CpuRegister(TMP), value);
__ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
} else {
DCHECK(source.IsDoubleStackSlot());
@@ -704,9 +704,9 @@
} else if (const_to_move->IsLongConstant()) {
int64_t value = const_to_move->AsLongConstant()->GetValue();
if (location.IsRegister()) {
- __ movq(location.AsRegister<CpuRegister>(), Immediate(value));
+ Load64BitValue(location.AsRegister<CpuRegister>(), value);
} else if (location.IsDoubleStackSlot()) {
- __ movq(CpuRegister(TMP), Immediate(value));
+ Load64BitValue(CpuRegister(TMP), value);
__ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(TMP));
} else {
DCHECK(location.IsConstant());
@@ -956,7 +956,7 @@
LocationSummary* locations = comp->GetLocations();
CpuRegister reg = locations->Out().AsRegister<CpuRegister>();
// Clear register: setcc only sets the low byte.
- __ xorq(reg, reg);
+ __ xorl(reg, reg);
Location lhs = locations->InAt(0);
Location rhs = locations->InAt(1);
if (rhs.IsRegister()) {
@@ -1419,8 +1419,8 @@
size_t class_offset = mirror::Object::ClassOffset().SizeValue();
// Set the hidden argument.
- __ movq(invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>(),
- Immediate(invoke->GetDexMethodIndex()));
+ CpuRegister hidden_reg = invoke->GetLocations()->GetTemp(1).AsRegister<CpuRegister>();
+ codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
// temp = object->GetClass();
if (receiver.IsStackSlot()) {
@@ -1856,7 +1856,7 @@
XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
Label done, nan;
- __ movq(output, Immediate(kPrimLongMax));
+ codegen_->Load64BitValue(output, kPrimLongMax);
// temp = long-to-float(output)
__ cvtsi2ss(temp, output, true);
// if input >= temp goto done
@@ -1869,7 +1869,7 @@
__ jmp(&done);
__ Bind(&nan);
// output = 0
- __ xorq(output, output);
+ __ xorl(output, output);
__ Bind(&done);
break;
}
@@ -1881,7 +1881,7 @@
XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
Label done, nan;
- __ movq(output, Immediate(kPrimLongMax));
+ codegen_->Load64BitValue(output, kPrimLongMax);
// temp = long-to-double(output)
__ cvtsi2sd(temp, output, true);
// if input >= temp goto done
@@ -1894,7 +1894,7 @@
__ jmp(&done);
__ Bind(&nan);
// output = 0
- __ xorq(output, output);
+ __ xorl(output, output);
__ Bind(&done);
break;
}
@@ -2483,7 +2483,7 @@
case Primitive::kPrimLong: {
if (instruction->IsRem()) {
- __ xorq(output_register, output_register);
+ __ xorl(output_register, output_register);
} else {
__ movq(output_register, input_register);
if (imm == -1) {
@@ -2527,7 +2527,7 @@
DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong);
CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>();
- __ movq(rdx, Immediate(std::abs(imm) - 1));
+ codegen_->Load64BitValue(rdx, std::abs(imm) - 1);
__ addq(rdx, numerator);
__ testq(numerator, numerator);
__ cmov(kGreaterEqual, rdx, numerator);
@@ -2624,7 +2624,7 @@
__ movq(numerator, rax);
// RAX = magic
- __ movq(rax, Immediate(magic));
+ codegen_->Load64BitValue(rax, magic);
// RDX:RAX = magic * numerator
__ imulq(numerator);
@@ -2653,8 +2653,7 @@
if (IsInt<32>(imm)) {
__ imulq(rdx, Immediate(static_cast<int32_t>(imm)));
} else {
- __ movq(numerator, Immediate(imm));
- __ imulq(rdx, numerator);
+ __ imulq(rdx, codegen_->LiteralInt64Address(imm));
}
__ subq(rax, rdx);
@@ -3020,8 +3019,8 @@
void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
InvokeRuntimeCallingConvention calling_convention;
codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
- __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
-
+ codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+ instruction->GetTypeIndex());
__ gs()->call(
Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
@@ -3042,7 +3041,8 @@
void InstructionCodeGeneratorX86_64::VisitNewArray(HNewArray* instruction) {
InvokeRuntimeCallingConvention calling_convention;
codegen_->LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(2)));
- __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
+ codegen_->Load64BitValue(CpuRegister(calling_convention.GetRegisterAt(0)),
+ instruction->GetTypeIndex());
__ gs()->call(
Address::Absolute(GetThreadOffset<kX86_64WordSize>(instruction->GetEntrypoint()), true));
@@ -3938,45 +3938,42 @@
} else if (constant->IsLongConstant()) {
int64_t value = constant->AsLongConstant()->GetValue();
if (destination.IsRegister()) {
- __ movq(destination.AsRegister<CpuRegister>(), Immediate(value));
+ codegen_->Load64BitValue(destination.AsRegister<CpuRegister>(), value);
} else {
DCHECK(destination.IsDoubleStackSlot()) << destination;
- __ movq(CpuRegister(TMP), Immediate(value));
+ codegen_->Load64BitValue(CpuRegister(TMP), value);
__ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
}
} else if (constant->IsFloatConstant()) {
float fp_value = constant->AsFloatConstant()->GetValue();
int32_t value = bit_cast<int32_t, float>(fp_value);
- Immediate imm(value);
if (destination.IsFpuRegister()) {
XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
if (value == 0) {
// easy FP 0.0.
__ xorps(dest, dest);
} else {
- __ movl(CpuRegister(TMP), imm);
- __ movd(dest, CpuRegister(TMP));
+ __ movss(dest, codegen_->LiteralFloatAddress(fp_value));
}
} else {
DCHECK(destination.IsStackSlot()) << destination;
+ Immediate imm(value);
__ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), imm);
}
} else {
DCHECK(constant->IsDoubleConstant()) << constant->DebugName();
double fp_value = constant->AsDoubleConstant()->GetValue();
int64_t value = bit_cast<int64_t, double>(fp_value);
- Immediate imm(value);
if (destination.IsFpuRegister()) {
XmmRegister dest = destination.AsFpuRegister<XmmRegister>();
if (value == 0) {
__ xorpd(dest, dest);
} else {
- __ movq(CpuRegister(TMP), imm);
- __ movd(dest, CpuRegister(TMP));
+ __ movsd(dest, codegen_->LiteralDoubleAddress(fp_value));
}
} else {
DCHECK(destination.IsDoubleStackSlot()) << destination;
- __ movq(CpuRegister(TMP), imm);
+ codegen_->Load64BitValue(CpuRegister(TMP), value);
__ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
}
}
@@ -4435,6 +4432,17 @@
LOG(FATAL) << "Unreachable";
}
+void CodeGeneratorX86_64::Load64BitValue(CpuRegister dest, int64_t value) {
+ if (value == 0) {
+ __ xorl(dest, dest);
+ } else if (value > 0 && IsInt<32>(value)) {
+ // We can use a 32 bit move, as it will zero-extend and is one byte shorter.
+ __ movl(dest, Immediate(static_cast<int32_t>(value)));
+ } else {
+ __ movq(dest, Immediate(value));
+ }
+}
+
void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
// Generate the constant area if needed.
X86_64Assembler* assembler = GetAssembler();