Don't block Quick's callee-saved registers in the optimizing compiler.

This change builds on:
https://android-review.googlesource.com/#/c/118983/

- Also fix an x86_64 assembler bug triggered by this change.
- Fix (and improve) the x86 backend's byte-register usage: on x86-32 only
  EAX, ECX, EDX and EBX have 8-bit forms, so instructions such as setb and
  movb need their register operand to be one of those.
- Fix a bug in the baseline register allocator: a fixed output register
  must be blocked so that inputs cannot be allocated to it (see the sketch
  below).

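A minimal sketch of that allocator bug (hypothetical location summary, for
illustration only; the instruction and register are just examples):

  // An instruction whose result is fixed to EAX (e.g. a call return value)
  // and an input that merely requires "some" core register.
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RegisterLocation(EAX));
  // Before this change, AllocateRegistersLocally() did not mark EAX as
  // blocked before resolving the unallocated input, so input 0 could also
  // be assigned EAX and conflict with the fixed output. The new
  // MaybeBlockPhysicalRegisters(locations->Out(), /* is_output */ true)
  // call blocks EAX up front.
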
Change-Id: I4883862e29b4e4b6470f1823cf7eab7e7863d8ad
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 461409d..5334031 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -177,6 +177,31 @@
   }
 }
 
+void CodeGenerator::MaybeBlockPhysicalRegisters(Location loc, bool is_output) const {
+  // The DCHECKS below check that a register is not specified twice in
+  // the summary.
+  // Note that fixed output registers are allowed to overlap with fixed input and
+  // temp registers: the writer of the location summary has to make sure they
+  // don't conflict with each other.
+  if (loc.IsRegister()) {
+    DCHECK(is_output || !blocked_core_registers_[loc.reg()]);
+    blocked_core_registers_[loc.reg()] = true;
+  } else if (loc.IsFpuRegister()) {
+    DCHECK(is_output || !blocked_fpu_registers_[loc.reg()]);
+    blocked_fpu_registers_[loc.reg()] = true;
+  } else if (loc.IsFpuRegisterPair()) {
+    DCHECK(is_output || !blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()]);
+    blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()] = true;
+    DCHECK(is_output || !blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()]);
+    blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()] = true;
+  } else if (loc.IsRegisterPair()) {
+    DCHECK(is_output || !blocked_core_registers_[loc.AsRegisterPairLow<int>()]);
+    blocked_core_registers_[loc.AsRegisterPairLow<int>()] = true;
+    DCHECK(is_output || !blocked_core_registers_[loc.AsRegisterPairHigh<int>()]);
+    blocked_core_registers_[loc.AsRegisterPairHigh<int>()] = true;
+  }
+}
+
 void CodeGenerator::AllocateRegistersLocally(HInstruction* instruction) const {
   LocationSummary* locations = instruction->GetLocations();
   if (locations == nullptr) return;
@@ -196,43 +221,18 @@
   // Mark all fixed input, temp and output registers as used.
   for (size_t i = 0, e = locations->GetInputCount(); i < e; ++i) {
     Location loc = locations->InAt(i);
-    // The DCHECKS below check that a register is not specified twice in
-    // the summary.
-    if (loc.IsRegister()) {
-      DCHECK(!blocked_core_registers_[loc.reg()]);
-      blocked_core_registers_[loc.reg()] = true;
-    } else if (loc.IsFpuRegister()) {
-      DCHECK(!blocked_fpu_registers_[loc.reg()]);
-      blocked_fpu_registers_[loc.reg()] = true;
-    } else if (loc.IsFpuRegisterPair()) {
-      DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()]);
-      blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()] = true;
-      DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()]);
-      blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()] = true;
-    } else if (loc.IsRegisterPair()) {
-      DCHECK(!blocked_core_registers_[loc.AsRegisterPairLow<int>()]);
-      blocked_core_registers_[loc.AsRegisterPairLow<int>()] = true;
-      DCHECK(!blocked_core_registers_[loc.AsRegisterPairHigh<int>()]);
-      blocked_core_registers_[loc.AsRegisterPairHigh<int>()] = true;
-    }
+    MaybeBlockPhysicalRegisters(loc, false);
   }
 
   for (size_t i = 0, e = locations->GetTempCount(); i < e; ++i) {
     Location loc = locations->GetTemp(i);
-    // The DCHECKS below check that a register is not specified twice in
-    // the summary.
-    if (loc.IsRegister()) {
-      DCHECK(!blocked_core_registers_[loc.reg()]);
-      blocked_core_registers_[loc.reg()] = true;
-    } else if (loc.IsFpuRegister()) {
-      DCHECK(!blocked_fpu_registers_[loc.reg()]);
-      blocked_fpu_registers_[loc.reg()] = true;
-    } else {
-      DCHECK(loc.GetPolicy() == Location::kRequiresRegister
-             || loc.GetPolicy() == Location::kRequiresFpuRegister);
-    }
+    MaybeBlockPhysicalRegisters(loc, false);
   }
 
+  // If the output is a fixed register, mark it as used.
+  Location result_location = locations->Out();
+  MaybeBlockPhysicalRegisters(result_location, true);
+
   SetupBlockedRegisters();
 
   // Allocate all unallocated input locations.
@@ -276,9 +276,11 @@
                      << loc.GetPolicy();
       }
       locations->SetTempAt(i, loc);
+    } else {
+      DCHECK(loc.IsFpuRegister() || loc.IsRegister());
     }
   }
-  Location result_location = locations->Out();
+
   if (result_location.IsUnallocated()) {
     switch (result_location.GetPolicy()) {
       case Location::kAny:
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 1d42c47..e03bc53 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -211,6 +211,7 @@
 
   // Register allocation logic.
   void AllocateRegistersLocally(HInstruction* instruction) const;
+  void MaybeBlockPhysicalRegisters(Location loc, bool is_output) const;
 
   // Backend specific implementation for allocating a register.
   virtual Location AllocateFreeRegister(Primitive::Type type) const = 0;
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cbe5f0c..f43449d 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -38,7 +38,7 @@
 
 static constexpr bool kExplicitStackOverflowCheck = false;
 
-static constexpr int kNumberOfPushedRegistersAtEntry = 1 + 2;  // LR, R6, R7
+static constexpr int kNumberOfPushedRegistersAtEntry = 1;  // LR.
 static constexpr int kCurrentMethodStackOffset = 0;
 
 static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1, R2, R3 };
@@ -444,21 +444,16 @@
   blocked_core_registers_[LR] = true;
   blocked_core_registers_[PC] = true;
 
+  // Reserve frame pointer register.
+  blocked_core_registers_[FP] = true;
+
   // Reserve thread register.
   blocked_core_registers_[TR] = true;
 
   // Reserve temp register.
   blocked_core_registers_[IP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  // We always save and restore R6 and R7 to make sure we can use three
-  // register pairs for long operations.
-  blocked_core_registers_[R4] = true;
-  blocked_core_registers_[R5] = true;
-  blocked_core_registers_[R8] = true;
-  blocked_core_registers_[R10] = true;
-  blocked_core_registers_[R11] = true;
-
+  // Reserve hard-float's callee saved registers.
   blocked_fpu_registers_[S16] = true;
   blocked_fpu_registers_[S17] = true;
   blocked_fpu_registers_[S18] = true;
@@ -513,8 +508,8 @@
     }
   }
 
-  core_spill_mask_ |= (1 << LR | 1 << R6 | 1 << R7);
-  __ PushList(1 << LR | 1 << R6 | 1 << R7);
+  core_spill_mask_ |= (1 << LR);
+  __ Push(LR);
 
   // The return PC has already been pushed on the stack.
   __ AddConstant(SP, -(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize));
@@ -523,7 +518,7 @@
 
 void CodeGeneratorARM::GenerateFrameExit() {
   __ AddConstant(SP, GetFrameSize() - kNumberOfPushedRegistersAtEntry * kArmWordSize);
-  __ PopList(1 << PC | 1 << R6 | 1 << R7);
+  __ Pop(PC);
 }
 
 void CodeGeneratorARM::Bind(HBasicBlock* block) {
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b048c07..b2bb875 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -716,11 +716,8 @@
   //   tr
   //   lr
   // sp is not part of the allocatable registers, so we don't need to block it.
-  // TODO: Avoid blocking callee-saved registers, and instead preserve them
-  // where necessary.
   CPURegList reserved_core_registers = vixl_reserved_core_registers;
   reserved_core_registers.Combine(runtime_reserved_core_registers);
-  reserved_core_registers.Combine(quick_callee_saved_registers);
   while (!reserved_core_registers.IsEmpty()) {
     blocked_core_registers_[reserved_core_registers.PopLowestIndex().code()] = true;
   }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 1d5bfb7..38e2965 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -48,9 +48,6 @@
 const vixl::CPURegList vixl_reserved_core_registers(vixl::ip0, vixl::ip1);
 const vixl::CPURegList vixl_reserved_fp_registers(vixl::d31);
 const vixl::CPURegList runtime_reserved_core_registers(tr, vixl::lr);
-const vixl::CPURegList quick_callee_saved_registers(vixl::CPURegister::kRegister,
-                                                    vixl::kXRegSize,
-                                                    kArm64CalleeSaveRefSpills);
 
 Location ARM64ReturnLocation(Primitive::Type return_type);
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4757235..18b1b32 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -42,6 +42,12 @@
 static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
 static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
+static constexpr Register kByteRegisters[] = { EAX, ECX, EDX, EBX };
+
+static ByteRegister ToByteRegister(Register reg) {
+  return X86ManagedRegister::FromCpuRegister(reg).AsByteRegister();
+}
+
 // Marker for places that can be updated once we don't follow the quick ABI.
 static constexpr bool kFollowsQuickABI = true;
 
@@ -437,11 +443,8 @@
   // Stack register is always reserved.
   blocked_core_registers_[ESP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  DCHECK(kFollowsQuickABI);
+  // Frame register is always reserved.
   blocked_core_registers_[EBP] = true;
-  blocked_core_registers_[ESI] = true;
-  blocked_core_registers_[EDI] = true;
 
   UpdateBlockedPairRegisters();
 }
@@ -929,7 +932,7 @@
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::Any());
   if (comp->NeedsMaterialization()) {
-    locations->SetOut(Location::RequiresRegister());
+    locations->SetOut(Location::RegisterLocation(kByteRegisters[0]));
   }
 }
 
@@ -950,7 +953,7 @@
       __ cmpl(locations->InAt(0).AsRegister<Register>(),
               Address(ESP, locations->InAt(1).GetStackIndex()));
     }
-    __ setb(X86Condition(comp->GetCondition()), reg);
+    __ setb(X86Condition(comp->GetCondition()), ToByteRegister(reg));
   }
 }
 
@@ -1165,11 +1168,11 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimNot:
-      locations->SetOut(Location::RegisterLocation(EAX));
+      locations->SetOut(Location::RegisterLocation(EAX), Location::kNoOutputOverlap);
       break;
 
     case Primitive::kPrimLong:
-      locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
+      locations->SetOut(Location::RegisterPairLocation(EAX, EDX), Location::kNoOutputOverlap);
       break;
 
     case Primitive::kPrimVoid:
@@ -1177,7 +1180,7 @@
 
     case Primitive::kPrimDouble:
     case Primitive::kPrimFloat:
-      locations->SetOut(Location::FpuRegisterLocation(XMM0));
+      locations->SetOut(Location::FpuRegisterLocation(XMM0), Location::kNoOutputOverlap);
       break;
   }
 
@@ -1347,7 +1350,7 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-byte' instruction.
-          locations->SetInAt(0, Location::Any());
+          locations->SetInAt(0, Location::RegisterLocation(kByteRegisters[0]));
           locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
           break;
 
@@ -1542,15 +1545,7 @@
         case Primitive::kPrimInt:
         case Primitive::kPrimChar:
           // Processing a Dex `int-to-byte' instruction.
-          if (in.IsRegister()) {
-            __ movsxb(out.AsRegister<Register>(), in.AsRegister<ByteRegister>());
-          } else if (in.IsStackSlot()) {
-            __ movsxb(out.AsRegister<Register>(), Address(ESP, in.GetStackIndex()));
-          } else {
-            DCHECK(in.GetConstant()->IsIntConstant());
-            int32_t value = in.GetConstant()->AsIntConstant()->GetValue();
-            __ movl(out.AsRegister<Register>(), Immediate(static_cast<int8_t>(value)));
-          }
+          __ movsxb(out.AsRegister<Register>(), ToByteRegister(in.AsRegister<Register>()));
           break;
 
         default:
@@ -2668,17 +2663,16 @@
       || (field_type == Primitive::kPrimByte);
   // The register allocator does not support multiple
   // inputs that die at entry with one in a specific register.
+  size_t byte_register_index = 0;
   if (is_byte_type) {
-    // Ensure the value is in a byte register.
-    locations->SetInAt(1, Location::RegisterLocation(EAX));
+    locations->SetInAt(1, Location::RegisterLocation(kByteRegisters[byte_register_index++]));
   } else {
     locations->SetInAt(1, Location::RequiresRegister());
   }
   // Temporary registers for the write barrier.
   if (needs_write_barrier) {
     locations->AddTemp(Location::RequiresRegister());
-    // Ensure the card is in a byte register.
-    locations->AddTemp(Location::RegisterLocation(ECX));
+    locations->AddTemp(Location::RegisterLocation(kByteRegisters[byte_register_index]));
   }
 }
 
@@ -2691,7 +2685,7 @@
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>();
+      ByteRegister value = ToByteRegister(locations->InAt(1).AsRegister<Register>());
       __ movb(Address(obj, offset), value);
       break;
     }
@@ -2741,15 +2735,17 @@
   }
 }
 
-void CodeGeneratorX86::MarkGCCard(Register temp, Register card, Register object, Register value) {
+void CodeGeneratorX86::MarkGCCard(Register temp,
+                                  Register card,
+                                  Register object,
+                                  Register value) {
   Label is_null;
   __ testl(value, value);
   __ j(kEqual, &is_null);
   __ fs()->movl(card, Address::Absolute(Thread::CardTableOffset<kX86WordSize>().Int32Value()));
   __ movl(temp, object);
   __ shrl(temp, Immediate(gc::accounting::CardTable::kCardShift));
-  __ movb(Address(temp, card, TIMES_1, 0),
-          X86ManagedRegister::FromCpuRegister(card).AsByteRegister());
+  __ movb(Address(temp, card, TIMES_1, 0), ToByteRegister(card));
   __ Bind(&is_null);
 }
 
@@ -2980,17 +2976,17 @@
     // inputs that die at entry with one in a specific register.
     locations->SetInAt(0, Location::RequiresRegister());
     locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
+    size_t byte_register_index = 0;
     if (is_byte_type) {
-      // Ensure the value is in a byte register.
-      locations->SetInAt(2, Location::ByteRegisterOrConstant(EAX, instruction->InputAt(2)));
+      locations->SetInAt(2, Location::ByteRegisterOrConstant(
+          kByteRegisters[byte_register_index++], instruction->InputAt(2)));
     } else {
       locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)));
     }
     // Temporary registers for the write barrier.
     if (needs_write_barrier) {
       locations->AddTemp(Location::RequiresRegister());
-      // Ensure the card is in a byte register.
-      locations->AddTemp(Location::RegisterLocation(ECX));
+      locations->AddTemp(Location::RegisterLocation(kByteRegisters[byte_register_index]));
     }
   }
 }
@@ -3012,7 +3008,7 @@
       if (index.IsConstant()) {
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
         if (value.IsRegister()) {
-          __ movb(Address(obj, offset), value.AsRegister<ByteRegister>());
+          __ movb(Address(obj, offset), ToByteRegister(value.AsRegister<Register>()));
         } else {
           __ movb(Address(obj, offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
@@ -3020,7 +3016,7 @@
       } else {
         if (value.IsRegister()) {
           __ movb(Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset),
-                  value.AsRegister<ByteRegister>());
+                  ToByteRegister(value.AsRegister<Register>()));
         } else {
           __ movb(Address(obj, index.AsRegister<Register>(), TIMES_1, data_offset),
                   Immediate(value.GetConstant()->AsIntConstant()->GetValue()));
@@ -3463,17 +3459,16 @@
       || (field_type == Primitive::kPrimByte);
   // The register allocator does not support multiple
   // inputs that die at entry with one in a specific register.
+  size_t byte_register_index = 0;
   if (is_byte_type) {
-    // Ensure the value is in a byte register.
-    locations->SetInAt(1, Location::RegisterLocation(EAX));
+    locations->SetInAt(1, Location::RegisterLocation(kByteRegisters[byte_register_index++]));
   } else {
     locations->SetInAt(1, Location::RequiresRegister());
   }
   // Temporary registers for the write barrier.
   if (needs_write_barrier) {
     locations->AddTemp(Location::RequiresRegister());
-    // Ensure the card is in a byte register.
-    locations->AddTemp(Location::RegisterLocation(ECX));
+    locations->AddTemp(Location::RegisterLocation(kByteRegisters[byte_register_index]));
   }
 }
 
@@ -3486,7 +3481,7 @@
   switch (field_type) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte: {
-      ByteRegister value = locations->InAt(1).AsRegister<ByteRegister>();
+      ByteRegister value = ToByteRegister(locations->InAt(1).AsRegister<Register>());
       __ movb(Address(cls, offset), value);
       break;
     }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index f8651f6..854e93b 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -453,18 +453,8 @@
   // Block the register used as TMP.
   blocked_core_registers_[TMP] = true;
 
-  // TODO: We currently don't use Quick's callee saved registers.
-  blocked_core_registers_[RBX] = true;
+  // Block the frame pointer.
   blocked_core_registers_[RBP] = true;
-  blocked_core_registers_[R12] = true;
-  blocked_core_registers_[R13] = true;
-  blocked_core_registers_[R14] = true;
-  blocked_core_registers_[R15] = true;
-
-  blocked_fpu_registers_[XMM12] = true;
-  blocked_fpu_registers_[XMM13] = true;
-  blocked_fpu_registers_[XMM14] = true;
-  blocked_fpu_registers_[XMM15] = true;
 }
 
 void CodeGeneratorX86_64::GenerateFrameEntry() {