Don't block Quick's callee-saved registers in the optimizing compiler.
This change builds on:
https://android-review.googlesource.com/#/c/118983/
- Also fix an x86_64 assembler bug triggered by this change.
- Fix (and improve) the x86 backend's byte-register usage.
- Fix a bug in the baseline register allocator: a fixed output
  register must be blocked so that inputs cannot be allocated
  to it (see the sketch after this list).
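
The allocator fix, as a minimal sketch rather than the literal patch: in
AllocateRegistersLocally(), every fixed input, temp and output register is
blocked before any unallocated location is handed a free register, using the
MaybeBlockPhysicalRegisters() helper introduced below. The is_output flag only
relaxes the double-blocking DCHECKs, since an output may legitimately reuse a
fixed input or temp register.

    // Block fixed registers first, so that a later free-register search
    // can never hand the fixed output register to an input or a temp.
    for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
      MaybeBlockPhysicalRegisters(locations->InAt(i), /* is_output */ false);
    }
    for (size_t i = 0, e = locations->GetTempCount(); i < e; ++i) {
      MaybeBlockPhysicalRegisters(locations->GetTemp(i), /* is_output */ false);
    }
    // New: a fixed output is now blocked as well.
    MaybeBlockPhysicalRegisters(locations->Out(), /* is_output */ true);
    SetupBlockedRegisters();
    // ... only then allocate the remaining unallocated inputs, temps and output.

On the x86 byte-register point: only EAX, ECX, EDX and EBX expose a low byte
(AL, CL, DL, BL), hence the kByteRegisters table and the ToByteRegister()
helper added in the x86 backend.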
Change-Id: I4883862e29b4e4b6470f1823cf7eab7e7863d8ad
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 461409d..5334031 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -177,6 +177,31 @@
}
}
+void CodeGenerator::MaybeBlockPhysicalRegisters(Location loc, bool is_output) const {
+ // The DCHECKS below check that a register is not specified twice in
+ // the summary.
+ // Note that fixed output registers are allowed to overlap with fixed input and
+ // temp registers: the writer of the location summary has to make sure they
+ // don't conflict with each other.
+ if (loc.IsRegister()) {
+ DCHECK(is_output || !blocked_core_registers_[loc.reg()]);
+ blocked_core_registers_[loc.reg()] = true;
+ } else if (loc.IsFpuRegister()) {
+ DCHECK(is_output || !blocked_fpu_registers_[loc.reg()]);
+ blocked_fpu_registers_[loc.reg()] = true;
+ } else if (loc.IsFpuRegisterPair()) {
+ DCHECK(is_output || !blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()]);
+ blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()] = true;
+ DCHECK(is_output || !blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()]);
+ blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()] = true;
+ } else if (loc.IsRegisterPair()) {
+ DCHECK(is_output || !blocked_core_registers_[loc.AsRegisterPairLow<int>()]);
+ blocked_core_registers_[loc.AsRegisterPairLow<int>()] = true;
+ DCHECK(is_output || !blocked_core_registers_[loc.AsRegisterPairHigh<int>()]);
+ blocked_core_registers_[loc.AsRegisterPairHigh<int>()] = true;
+ }
+}
+
void CodeGenerator::AllocateRegistersLocally(HInstruction* instruction) const {
LocationSummary* locations = instruction->GetLocations();
if (locations == nullptr) return;
@@ -196,43 +221,18 @@
// Mark all fixed input, temp and output registers as used.
for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
Location loc = locations->InAt(i);
- // The DCHECKS below check that a register is not specified twice in
- // the summary.
- if (loc.IsRegister()) {
- DCHECK(!blocked_core_registers_[loc.reg()]);
- blocked_core_registers_[loc.reg()] = true;
- } else if (loc.IsFpuRegister()) {
- DCHECK(!blocked_fpu_registers_[loc.reg()]);
- blocked_fpu_registers_[loc.reg()] = true;
- } else if (loc.IsFpuRegisterPair()) {
- DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()]);
- blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()] = true;
- DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()]);
- blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()] = true;
- } else if (loc.IsRegisterPair()) {
- DCHECK(!blocked_core_registers_[loc.AsRegisterPairLow<int>()]);
- blocked_core_registers_[loc.AsRegisterPairLow<int>()] = true;
- DCHECK(!blocked_core_registers_[loc.AsRegisterPairHigh<int>()]);
- blocked_core_registers_[loc.AsRegisterPairHigh<int>()] = true;
- }
+ MaybeBlockPhysicalRegisters(loc, false);
}
for (size_t i = 0, e = locations->GetTempCount(); i < e; ++i) {
Location loc = locations->GetTemp(i);
- // The DCHECKS below check that a register is not specified twice in
- // the summary.
- if (loc.IsRegister()) {
- DCHECK(!blocked_core_registers_[loc.reg()]);
- blocked_core_registers_[loc.reg()] = true;
- } else if (loc.IsFpuRegister()) {
- DCHECK(!blocked_fpu_registers_[loc.reg()]);
- blocked_fpu_registers_[loc.reg()] = true;
- } else {
- DCHECK(loc.GetPolicy() == Location::kRequiresRegister
- || loc.GetPolicy() == Location::kRequiresFpuRegister);
- }
+ MaybeBlockPhysicalRegisters(loc, false);
}
+ // If the output is a fixed register, mark it as used.
+ Location result_location = locations->Out();
+ MaybeBlockPhysicalRegisters(result_location, true);
+
SetupBlockedRegisters();
// Allocate all unallocated input locations.
@@ -276,9 +276,11 @@
<< loc.GetPolicy();
}
locations->SetTempAt(i, loc);
+ } else {
+ DCHECK(loc.IsFpuRegister() || loc.IsRegister());
}
}
- Location result_location = locations->Out();
+
if (result_location.IsUnallocated()) {
switch (result_location.GetPolicy()) {
case Location::kAny:
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 1d42c47..e03bc53 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -211,6 +211,7 @@
// Register allocation logic.
void AllocateRegistersLocally(HInstruction* instruction) const;
+ void MaybeBlockPhysicalRegisters(Location loc, bool is_output) const;
// Backend specific implementation for allocating a register.
virtual Location AllocateFreeRegister(Primitive::Type type) const = 0;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cbe5f0c..f43449d 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -38,7 +38,7 @@
static constexpr bool kExplicitStackOverflowCheck = false;
-static constexpr int kNumberOfPushedRegistersAtEntry = 1 + 2; // LR, R6, R7
+static constexpr int kNumberOfPushedRegistersAtEntry = 1; // LR.
static constexpr int kCurrentMethodStackOffset = 0;
static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1, R2, R3 };
@@ -444,21 +444,16 @@
blocked_core_registers_[LR] = true;
blocked_core_registers_[PC] = true;
+ // Reserve frame pointer register.
+ blocked_core_registers_[FP] = true;
+
// Reserve thread register.
blocked_core_registers_[TR] = true;
// Reserve temp register.
blocked_core_registers_[IP] = true;
- // TODO: We currently don't use Quick's callee saved registers.
- // We always save and restore R6 and R7 to make sure we can use three
- // register pairs for long operations.
- blocked_core_registers_[R4] = true;
- blocked_core_registers_[R5] = true;
- blocked_core_registers_[R8] = true;
- blocked_core_registers_[R10] = true;
- blocked_core_registers_[R11] = true;
-
+ // Reserve hard-float's callee saved registers.
blocked_fpu_registers_[S16] = true;
blocked_fpu_registers_[S17] = true;
blocked_fpu_registers_[S18] = true;
@@ -513,8 +508,8 @@
}
}
- core_spill_mask_ |= (1 << LR | 1 << R6 | 1 << R7);
- __ PushList(1 << LR | 1 << R6 | 1 << R7);
+ core_spill_mask_ |= (1 << LR);
+ __ Push(LR);
// The return PC has already been pushed on the stack.
__ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize));
@@ -523,7 +518,7 @@
void CodeGeneratorARM::GenerateFrameExit() {
__ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize);
- __ PopList(1 << PC | 1 << R6 | 1 << R7);
+ __ Pop(PC);
}
void CodeGeneratorARM::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b048c07..b2bb875 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -716,11 +716,8 @@
// tr
// lr
// sp is not part of the allocatable registers, so we don't need to block it.
- // TODO: Avoid blocking callee-saved registers, and instead preserve them
- // where necessary.
CPURegList reserved_core_registers = vixl_reserved_core_registers;
reserved_core_registers.Combine(runtime_reserved_core_registers);
- reserved_core_registers.Combine(quick_callee_saved_registers);
while (!reserved_core_registers.IsEmpty()) {
blocked_core_registers_[reserved_core_registers.PopLowestIndex().code()] = true;
}
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 1d5bfb7..38e2965 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -48,9 +48,6 @@
const vixl::CPURegList vixl_reserved_core_registers(vixl::ip0, vixl::ip1);
const vixl::CPURegList vixl_reserved_fp_registers(vixl::d31);
const vixl::CPURegList runtime_reserved_core_registers(tr, vixl::lr);
-const vixl::CPURegList quick_callee_saved_registers(vixl::CPURegister::kRegister,
- vixl::kXRegSize,
- kArm64CalleeSaveRefSpills);
Location ARM64ReturnLocation(Primitive::Type return_type);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4757235..18b1b32 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -42,6 +42,12 @@
static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
+static constexpr Register kByteRegisters[] = { EAX, ECX, EDX, EBX };
+
+static ByteRegister ToByteRegister(Register reg) {
+ return X86ManagedRegister::FromCpuRegister(reg).AsByteRegister();
+}
+
// Marker for places that can be updated once we don't follow the quick ABI.
static constexpr bool kFollowsQuickABI = true;
@@ -437,11 +443,8 @@
// Stack register is always reserved.
blocked_core_registers_[ESP] = true;
- // TODO: We currently don't use Quick's callee saved registers.
- DCHECK(kFollowsQuickABI);
+ // Frame register is always reserved.
blocked_core_registers_[EBP] = true;
- blocked_core_registers_[ESI] = true;
- blocked_core_registers_[EDI] = true;
UpdateBlockedPairRegisters();
}
@@ -929,7 +932,7 @@
locations->SetInAt(0, Location::RequiresRegister());
locations->SetInAt(1, Location::Any());
if (comp->NeedsMaterialization()) {
- locations->SetOut(Location::RequiresRegister());
+ locations->SetOut(Location::RegisterLocation(kByteRegisters[0]));
}
}
@@ -950,7 +953,7 @@
__ cmpl(locations->InAt(0).AsRegister<Register>(),
Address(ESP, locations->InAt(1).GetStackIndex()));
}
- __ setb(X86Condition(comp->GetCondition()), reg);
+ __ setb(X86Condition(comp->GetCondition()), ToByteRegister(reg));
}
}
@@ -1165,11 +1168,11 @@
case Primitive::kPrimShort:
case Primitive::kPrimInt:
case Primitive::kPrimNot:
- locations->SetOut(Location::RegisterLocation(EAX));
+ locations->SetOut(Location::RegisterLocation(EAX), Location::kNoOutputOverlap);
break;
case Primitive::kPrimLong:
- locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
+ locations->SetOut(Location::RegisterPairLocation(EAX, EDX), Location::kNoOutputOverlap);
break;
case Primitive::kPrimVoid:
@@ -1177,7 +1180,7 @@
case Primitive::kPrimDouble:
case Primitive::kPrimFloat:
- locations->SetOut(Location::FpuRegisterLocation(XMM0));
+ locations->SetOut(Location::FpuRegisterLocation(XMM0), Location::kNoOutputOverlap);
break;
}
@@ -1347,7 +1350,7 @@
case Primitive::kPrimInt:
case Primitive::kPrimChar:
// Processing a Dex `int-to-byte' instruction.
- locations->SetInAt(0, Location::Any());
+ locations->SetInAt(0, Location::RegisterLocation(kByteRegisters[0]));
locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
break;
@@ -1542,15 +1545,7 @@
case Primitive::kPrimInt:
case Primitive::kPrimChar:
// Processing a Dex `int-to-byte' instruction.
- if (in.IsRegister()) {
- __ movsxb(out.AsRegister<Register>(), in.AsRegister<ByteRegister>());
- } else if (in.IsStackSlot()) {
- __ movsxb(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
- } else {
- DCHECK(in.GetConstant()->IsIntConstant());
- int32_t value = in.GetConstant()->AsIntConstant()->GetValue();
- __ movl(out.AsRegister<Register>(), Immediate(static_cast<int8_t>(value)));
- }
+ __ movsxb(out.AsRegister<Register>(), ToByteRegister(in.AsRegister<Register>()));
break;
default:
@@ -2668,17 +2663,16 @@
|| (field_type == Primitive::kPrimByte);
// The register allocator does not support multiple
// inputs that die at entry with one in a specific register.
+ size_t byte_register_index = 0;
if (is_byte_type) {
- // Ensure the value is in a byte register.
- locations->SetInAt(1, Location::RegisterLocation(EAX));
+ locations->SetInAt(1, Location::RegisterLocation(kByteRegisters[byte_register_index++]));
} else {
locations->SetInAt(1, Location::RequiresRegister());
}
// Temporary registers for the write barrier.
if (needs_write_barrier) {
locations->AddTemp(Location::RequiresRegister());
- // Ensure the card is in a byte register.
- locations->AddTemp(Location::RegisterLocation(ECX));
+ locations->AddTemp(Location::RegisterLocation(kByteRegisters[byte_register_index]));
}
}
@@ -2691,7 +2685,7 @@
switch (field_type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte: {
- ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>();
+ ByteRegister value = ToByteRegister(locations->InAt(1).AsRegister<Register>());
__ movb(Address(obj, offset), value);
break;
}
@@ -2741,15 +2735,17 @@
}
}
-void CodeGeneratorX86::MarkGCCard(Register temp, Register card, Register object, Register value) {
+void CodeGeneratorX86::MarkGCCard(Register temp,
+ Register card,
+ Register object,
+ Register value) {
Label is_null;
__ testl(value, value);
__ j(kEqual, &is_null);
__ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86WordSize>().Int32Value()));
__ movl(temp, object);
__ shrl(temp, Immediate(gc::accounting::CardTable::kCardShift));
- __ movb(Address(temp, card, TIMES_1, 0),
- X86ManagedRegister::FromCpuRegister(card).AsByteRegister());
+ __ movb(Address(temp, card, TIMES_1, 0), ToByteRegister(card));
__ Bind(&is_null);
}
@@ -2980,17 +2976,17 @@
// inputs that die at entry with one in a specific register.
locations->SetInAt(0, Location::RequiresRegister());
locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
+ size_t byte_register_index = 0;
if (is_byte_type) {
- // Ensure the value is in a byte register.
- locations->SetInAt(2, Location::ByteRegisterOrConstant(EAX, instruction->InputAt(2)));
+ locations->SetInAt(2, Location::ByteRegisterOrConstant(
+ kByteRegisters[byte_register_index++], instruction->InputAt(2)));
} else {
locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)));
}
// Temporary registers for the write barrier.
if (needs_write_barrier) {
locations->AddTemp(Location::RequiresRegister());
- // Ensure the card is in a byte register.
- locations->AddTemp(Location::RegisterLocation(ECX));
+ locations->AddTemp(Location::RegisterLocation(kByteRegisters[byte_register_index]));
}
}
}
@@ -3012,7 +3008,7 @@
if (index.IsConstant()) {
size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
if (value.IsRegister()) {
- __ movb(Address(obj, offset), value.AsRegister<ByteRegister>());
+ __ movb(Address(obj, offset), ToByteRegister(value.AsRegister<Register>()));
} else {
__ movb(Address(obj, offset),
Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
@@ -3020,7 +3016,7 @@
} else {
if (value.IsRegister()) {
__ movb(Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset),
- value.AsRegister<ByteRegister>());
+ ToByteRegister(value.AsRegister<Register>()));
} else {
__ movb(Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset),
Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
@@ -3463,17 +3459,16 @@
|| (field_type == Primitive::kPrimByte);
// The register allocator does not support multiple
// inputs that die at entry with one in a specific register.
+ size_t byte_register_index = 0;
if (is_byte_type) {
- // Ensure the value is in a byte register.
- locations->SetInAt(1, Location::RegisterLocation(EAX));
+ locations->SetInAt(1, Location::RegisterLocation(kByteRegisters[byte_register_index++]));
} else {
locations->SetInAt(1, Location::RequiresRegister());
}
// Temporary registers for the write barrier.
if (needs_write_barrier) {
locations->AddTemp(Location::RequiresRegister());
- // Ensure the card is in a byte register.
- locations->AddTemp(Location::RegisterLocation(ECX));
+ locations->AddTemp(Location::RegisterLocation(kByteRegisters[byte_register_index]));
}
}
@@ -3486,7 +3481,7 @@
switch (field_type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte: {
- ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>();
+ ByteRegister value = ToByteRegister(locations->InAt(1).AsRegister<Register>());
__ movb(Address(cls, offset), value);
break;
}
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index f8651f6..854e93b 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -453,18 +453,8 @@
// Block the register used as TMP.
blocked_core_registers_[TMP] = true;
- // TODO: We currently don't use Quick's callee saved registers.
- blocked_core_registers_[RBX] = true;
+ // Block the frame pointer.
blocked_core_registers_[RBP] = true;
- blocked_core_registers_[R12] = true;
- blocked_core_registers_[R13] = true;
- blocked_core_registers_[R14] = true;
- blocked_core_registers_[R15] = true;
-
- blocked_fpu_registers_[XMM12] = true;
- blocked_fpu_registers_[XMM13] = true;
- blocked_fpu_registers_[XMM14] = true;
- blocked_fpu_registers_[XMM15] = true;
}
void CodeGeneratorX86_64::GenerateFrameEntry() {