Optimize leaf methods.

Avoid suspend checks and stack changes when not needed.
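
A frame is now empty when the method allocates no spill slots, uses no
callee-save registers beyond the return PC, remains a leaf, and never
needs the current method on the stack. Such a frame consists of just
the return PC pushed by the call on x86/x86-64, and of nothing at all
on ARM/ARM64, so the prologue and epilogue emit no stack adjustment.
For methods that are still leaves at register allocation time, the
register allocator also removes the suspend check in the entry block,
and LoadCurrentMethod now asserts that the method was marked as
requiring the current method.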

Change-Id: I0fdb31e8c631e99091b818874a558c9aa04b1628
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index fd4e391..d0739a6 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -41,8 +41,6 @@
 }
 
 void CodeGenerator::CompileBaseline(CodeAllocator* allocator, bool is_leaf) {
-  DCHECK_EQ(frame_size_, kUninitializedFrameSize);
-
   Initialize();
   if (!is_leaf) {
     MarkNotLeaf();
@@ -59,7 +57,6 @@
 }
 
 void CodeGenerator::CompileInternal(CodeAllocator* allocator, bool is_baseline) {
-  HGraphVisitor* location_builder = GetLocationBuilder();
   HGraphVisitor* instruction_visitor = GetInstructionVisitor();
   DCHECK_EQ(current_block_index_, 0u);
   GenerateFrameEntry();
@@ -69,8 +66,7 @@
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       HInstruction* current = it.Current();
       if (is_baseline) {
-        current->Accept(location_builder);
-        InitLocations(current);
+        InitLocationsBaseline(current);
       }
       current->Accept(instruction_visitor);
     }
@@ -88,7 +84,6 @@
 void CodeGenerator::CompileOptimized(CodeAllocator* allocator) {
   // The register allocator already called `InitializeCodeGeneration`,
   // where the frame size has been computed.
-  DCHECK_NE(frame_size_, kUninitializedFrameSize);
   DCHECK(block_order_ != nullptr);
   Initialize();
   CompileInternal(allocator, /* is_baseline */ false);
@@ -138,13 +133,22 @@
   ComputeSpillMask();
   first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
 
-  SetFrameSize(RoundUp(
-      number_of_spill_slots * kVRegSize
-      + number_of_out_slots * kVRegSize
-      + maximum_number_of_live_core_registers * GetWordSize()
-      + maximum_number_of_live_fp_registers * GetFloatingPointSpillSlotSize()
-      + FrameEntrySpillSize(),
-      kStackAlignment));
+  if (number_of_spill_slots == 0
+      && !HasAllocatedCalleeSaveRegisters()
+      && IsLeafMethod()
+      && !RequiresCurrentMethod()) {
+    DCHECK_EQ(maximum_number_of_live_core_registers, 0u);
+    DCHECK_EQ(maximum_number_of_live_fp_registers, 0u);
+    SetFrameSize(CallPushesPC() ? GetWordSize() : 0);
+  } else {
+    SetFrameSize(RoundUp(
+        number_of_spill_slots * kVRegSize
+        + number_of_out_slots * kVRegSize
+        + maximum_number_of_live_core_registers * GetWordSize()
+        + maximum_number_of_live_fp_registers * GetFloatingPointSpillSlotSize()
+        + FrameEntrySpillSize(),
+        kStackAlignment));
+  }
 }
 
 Location CodeGenerator::GetTemporaryLocation(HTemporary* temp) const {
@@ -294,7 +298,8 @@
   }
 }
 
-void CodeGenerator::InitLocations(HInstruction* instruction) {
+void CodeGenerator::InitLocationsBaseline(HInstruction* instruction) {
+  AllocateLocations(instruction);
   if (instruction->GetLocations() == nullptr) {
     if (instruction->IsTemporary()) {
       HInstruction* previous = instruction->GetPrevious();
@@ -320,6 +325,19 @@
   }
 }
 
+void CodeGenerator::AllocateLocations(HInstruction* instruction) {
+  instruction->Accept(GetLocationBuilder());
+  LocationSummary* locations = instruction->GetLocations();
+  if (!instruction->IsSuspendCheckEntry()) {
+    if (locations != nullptr && locations->CanCall()) {
+      MarkNotLeaf();
+    }
+    if (instruction->NeedsCurrentMethod()) {
+      SetRequiresCurrentMethod();
+    }
+  }
+}
+
 bool CodeGenerator::GoesToNextBlock(HBasicBlock* current, HBasicBlock* next) const {
   DCHECK_EQ(block_order_->Get(current_block_index_), current);
   return (current_block_index_ < block_order_->Size() - 1)
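
A rough standalone sketch of the new frame size decision, with simplified
parameters standing in for the actual CodeGenerator state (illustrative
only, not ART code; kStackAlignment is assumed to be 16 bytes):

    #include <cstddef>

    constexpr size_t kVRegSize = 4;
    constexpr size_t kStackAlignment = 16;  // Assumed value, for illustration.

    constexpr size_t RoundUp(size_t x, size_t n) { return (x + n - 1) / n * n; }

    size_t ComputeFrameSize(size_t spill_slots, size_t out_slots,
                            size_t max_live_core, size_t max_live_fp,
                            size_t word_size, size_t fp_slot_size,
                            size_t entry_spill_size, bool has_callee_saves,
                            bool is_leaf, bool requires_current_method,
                            bool call_pushes_pc) {
      if (spill_slots == 0 && !has_callee_saves && is_leaf &&
          !requires_current_method) {
        // Empty frame: only the return PC that the call itself pushed
        // (x86/x86-64), or nothing where the return address stays in a
        // register (ARM/ARM64).
        return call_pushes_pc ? word_size : 0;
      }
      return RoundUp(spill_slots * kVRegSize + out_slots * kVRegSize +
                         max_live_core * word_size +
                         max_live_fp * fp_slot_size + entry_spill_size,
                     kStackAlignment);
    }
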
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index ab63b91..efd0c84 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -30,7 +30,6 @@
 namespace art {
 
 static size_t constexpr kVRegSize = 4;
-static size_t constexpr kUninitializedFrameSize = 0;
 
 // Binary encoding of 2^32 for type double.
 static int64_t constexpr k2Pow32EncodingForDouble = INT64_C(0x41F0000000000000);
@@ -107,8 +106,6 @@
   virtual void GenerateFrameExit() = 0;
   virtual void Bind(HBasicBlock* block) = 0;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) = 0;
-  virtual HGraphVisitor* GetLocationBuilder() = 0;
-  virtual HGraphVisitor* GetInstructionVisitor() = 0;
   virtual Assembler* GetAssembler() = 0;
   virtual size_t GetWordSize() const = 0;
   virtual size_t GetFloatingPointSpillSlotSize() const = 0;
@@ -196,6 +193,15 @@
 
   void MarkNotLeaf() {
     is_leaf_ = false;
+    requires_current_method_ = true;
+  }
+
+  void SetRequiresCurrentMethod() {
+    requires_current_method_ = true;
+  }
+
+  bool RequiresCurrentMethod() const {
+    return requires_current_method_;
   }
 
   // Clears the spill slots taken by loop phis in the `LocationSummary` of the
@@ -228,6 +234,8 @@
     allocated_registers_.Add(location);
   }
 
+  void AllocateLocations(HInstruction* instruction);
+
  protected:
   CodeGenerator(HGraph* graph,
                 size_t number_of_core_registers,
@@ -236,7 +244,7 @@
                 uint32_t core_callee_save_mask,
                 uint32_t fpu_callee_save_mask,
                 const CompilerOptions& compiler_options)
-      : frame_size_(kUninitializedFrameSize),
+      : frame_size_(0),
         core_spill_mask_(0),
         fpu_spill_mask_(0),
         first_register_slot_in_slow_path_(0),
@@ -255,6 +263,7 @@
         block_order_(nullptr),
         current_block_index_(0),
         is_leaf_(true),
+        requires_current_method_(false),
         stack_map_stream_(graph->GetArena()) {}
 
   // Register allocation logic.
@@ -269,11 +278,12 @@
   virtual Location GetStackLocation(HLoadLocal* load) const = 0;
 
   virtual ParallelMoveResolver* GetMoveResolver() = 0;
+  virtual HGraphVisitor* GetLocationBuilder() = 0;
+  virtual HGraphVisitor* GetInstructionVisitor() = 0;
 
   // Returns the location of the first spilled entry for floating point registers,
   // relative to the stack pointer.
   uint32_t GetFpuSpillStart() const {
-    DCHECK_NE(frame_size_, kUninitializedFrameSize);
     return GetFrameSize() - FrameEntrySpillSize();
   }
 
@@ -289,6 +299,21 @@
     return GetFpuSpillSize() + GetCoreSpillSize();
   }
 
+  bool HasAllocatedCalleeSaveRegisters() const {
+    // We check the core registers against 1 because the return PC is always part of the allocated set.
+    return (POPCOUNT(allocated_registers_.GetCoreRegisters() & core_callee_save_mask_) != 1)
+      || (POPCOUNT(allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_) != 0);
+  }
+
+  bool CallPushesPC() const {
+    InstructionSet instruction_set = GetInstructionSet();
+    return instruction_set == kX86 || instruction_set == kX86_64;
+  }
+
+  bool HasEmptyFrame() const {
+    return GetFrameSize() == (CallPushesPC() ? GetWordSize() : 0);
+  }
+
   // Frame size required for this method.
   uint32_t frame_size_;
   uint32_t core_spill_mask_;
@@ -311,7 +336,7 @@
   const uint32_t fpu_callee_save_mask_;
 
  private:
-  void InitLocations(HInstruction* instruction);
+  void InitLocationsBaseline(HInstruction* instruction);
   size_t GetStackOffsetOfSavedRegister(size_t index);
   void CompileInternal(CodeAllocator* allocator, bool is_baseline);
 
@@ -328,8 +353,12 @@
   // we are generating code for.
   size_t current_block_index_;
 
+  // Whether the method is a leaf method.
   bool is_leaf_;
 
+  // Whether an instruction in the graph accesses the current method.
+  bool requires_current_method_;
+
   StackMapStream stack_map_stream_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGenerator);
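
The "!= 1" above accounts for the return PC, which is always in the allocated
set. A minimal sketch of the same check using std::popcount (C++20) in place
of ART's POPCOUNT macro; names are illustrative, not the ART interface:

    #include <bit>
    #include <cstdint>

    bool HasAllocatedCalleeSaves(uint32_t allocated_core, uint32_t core_callee_mask,
                                 uint32_t allocated_fp, uint32_t fpu_callee_mask) {
      // Exactly one callee-save core register (the return PC) means the method
      // uses no callee-saves beyond the PC; any FP callee-save register counts.
      return std::popcount(allocated_core & core_callee_mask) != 1 ||
             std::popcount(allocated_fp & fpu_callee_mask) != 0;
    }
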
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 78fd181..1841f06226 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -396,10 +396,6 @@
       move_resolver_(graph->GetArena(), this),
       assembler_(true),
       isa_features_(isa_features) {
-  // Save one extra register for baseline. Note that on thumb2, there is no easy
-  // instruction to restore just the PC, so this actually helps both baseline
-  // and non-baseline to save and restore at least two registers at entry and exit.
-  AddAllocatedRegister(Location::RegisterLocation(kCoreSavedRegisterForBaseline));
   // Save the PC register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(PC));
 }
@@ -508,6 +504,10 @@
 
 void CodeGeneratorARM::ComputeSpillMask() {
   core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_;
+  // Save one extra register for baseline. Note that on thumb2, there is no easy
+  // instruction to restore just the PC, so this actually helps both baseline
+  // and non-baseline to save and restore at least two registers at entry and exit.
+  core_spill_mask_ |= (1 << kCoreSavedRegisterForBaseline);
   DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved";
   fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_;
   // We use vpush and vpop for saving and restoring floating point registers, which take
@@ -529,6 +529,10 @@
   DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
   __ Bind(&frame_entry_label_);
 
+  if (HasEmptyFrame()) {
+    return;
+  }
+
   if (!skip_overflow_check) {
     __ AddConstant(IP, SP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm)));
     __ LoadFromOffset(kLoadWord, IP, IP, 0);
@@ -547,6 +551,10 @@
 }
 
 void CodeGeneratorARM::GenerateFrameExit() {
+  if (HasEmptyFrame()) {
+    __ bx(LR);
+    return;
+  }
   __ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize());
   if (fpu_spill_mask_ != 0) {
     SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_));
@@ -1172,6 +1180,7 @@
 }
 
 void CodeGeneratorARM::LoadCurrentMethod(Register reg) {
+  DCHECK(RequiresCurrentMethod());
   __ LoadFromOffset(kLoadWord, reg, SP, kCurrentMethodStackOffset);
 }
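
Each backend follows the same shape: GenerateFrameEntry() and
GenerateFrameExit() become no-ops for an empty frame, and LoadCurrentMethod()
now asserts RequiresCurrentMethod(). The one exception is the ARM exit path
above: its regular epilogue returns by popping the saved PC, so with no frame
to pop it must return explicitly through bx lr.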
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 7588a29..46f1a9b 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -453,24 +453,27 @@
     RecordPcInfo(nullptr, 0);
   }
 
-  int frame_size = GetFrameSize();
-  __ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex));
-  __ PokeCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize());
-  __ PokeCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize());
-
-  // Stack layout:
-  //      sp[frame_size - 8]        : lr.
-  //      ...                       : other preserved core registers.
-  //      ...                       : other preserved fp registers.
-  //      ...                       : reserved frame space.
-  //      sp[0]                     : current method.
+  if (!HasEmptyFrame()) {
+    int frame_size = GetFrameSize();
+    // Stack layout:
+    //      sp[frame_size - 8]        : lr.
+    //      ...                       : other preserved core registers.
+    //      ...                       : other preserved fp registers.
+    //      ...                       : reserved frame space.
+    //      sp[0]                     : current method.
+    __ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex));
+    __ PokeCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize());
+    __ PokeCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize());
+  }
 }
 
 void CodeGeneratorARM64::GenerateFrameExit() {
-  int frame_size = GetFrameSize();
-  __ PeekCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize());
-  __ PeekCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize());
-  __ Drop(frame_size);
+  if (!HasEmptyFrame()) {
+    int frame_size = GetFrameSize();
+    __ PeekCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize());
+    __ PeekCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize());
+    __ Drop(frame_size);
+  }
 }
 
 void CodeGeneratorARM64::Bind(HBasicBlock* block) {
@@ -961,6 +964,7 @@
 }
 
 void CodeGeneratorARM64::LoadCurrentMethod(vixl::Register current_method) {
+  DCHECK(RequiresCurrentMethod());
   DCHECK(current_method.IsW());
   __ Ldr(current_method, MemOperand(sp, kCurrentMethodStackOffset));
 }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 98f93a4..1a95f41 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -470,12 +470,16 @@
     RecordPcInfo(nullptr, 0);
   }
 
-  __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
-  __ movl(Address(ESP, kCurrentMethodStackOffset), EAX);
+  if (!HasEmptyFrame()) {
+    __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
+    __ movl(Address(ESP, kCurrentMethodStackOffset), EAX);
+  }
 }
 
 void CodeGeneratorX86::GenerateFrameExit() {
-  __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
+  if (!HasEmptyFrame()) {
+    __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize()));
+  }
 }
 
 void CodeGeneratorX86::Bind(HBasicBlock* block) {
@@ -483,6 +487,7 @@
 }
 
 void CodeGeneratorX86::LoadCurrentMethod(Register reg) {
+  DCHECK(RequiresCurrentMethod());
   __ movl(reg, Address(ESP, kCurrentMethodStackOffset));
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 2ff53a0..b2b427f 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -487,6 +487,10 @@
     RecordPcInfo(nullptr, 0);
   }
 
+  if (HasEmptyFrame()) {
+    return;
+  }
+
   for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) {
     Register reg = kCoreCalleeSaves[i];
     if (allocated_registers_.ContainsCoreRegister(reg)) {
@@ -509,6 +513,9 @@
 }
 
 void CodeGeneratorX86_64::GenerateFrameExit() {
+  if (HasEmptyFrame()) {
+    return;
+  }
   uint32_t xmm_spill_location = GetFpuSpillStart();
   size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize();
   for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) {
@@ -533,6 +540,7 @@
 }
 
 void CodeGeneratorX86_64::LoadCurrentMethod(CpuRegister reg) {
+  DCHECK(RequiresCurrentMethod());
   __ movl(reg, Address(CpuRegister(RSP), kCurrentMethodStackOffset));
 }
 
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 30d869d..9bb91d2 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1015,6 +1015,18 @@
   void SetLiveInterval(LiveInterval* interval) { live_interval_ = interval; }
   bool HasLiveInterval() const { return live_interval_ != nullptr; }
 
+  bool IsSuspendCheckEntry() const { return IsSuspendCheck() && GetBlock()->IsEntryBlock(); }
+
+  // Returns whether code generation for this instruction requires access to the
+  // current method. Such instructions are:
+  // (1): Instructions that require an environment, as calling the runtime requires
+  //      walking the stack and having the current method stored at a specific stack address.
+  // (2): Object literals like classes and strings, which are loaded from the dex cache
+  //      fields of the current method.
+  bool NeedsCurrentMethod() const {
+    return NeedsEnvironment() || IsLoadClass() || IsLoadString();
+  }
+
  private:
   HInstruction* previous_;
   HInstruction* next_;
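
How the two flags are derived can be summarized with a small model of
AllocateLocations(); the structs below are illustrative stand-ins for
HInstruction and CodeGenerator, not ART types:

    struct Insn {
      bool can_call;                // LocationSummary::CanCall()
      bool needs_current_method;    // NeedsEnvironment() || IsLoadClass() || IsLoadString()
      bool is_entry_suspend_check;  // IsSuspendCheck() && in the entry block
    };

    struct Flags {
      bool is_leaf = true;
      bool requires_current_method = false;
    };

    void Visit(Flags* f, const Insn& insn) {
      if (insn.is_entry_suspend_check) {
        return;  // Neither marks the method non-leaf nor forces the current
                 // method; the register allocator may remove it later.
      }
      if (insn.can_call) {
        f->is_leaf = false;
        f->requires_current_method = true;  // MarkNotLeaf() sets both flags.
      }
      if (insn.needs_current_method) {
        f->requires_current_method = true;
      }
    }
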
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 3809720..bfbe63f 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -252,8 +252,13 @@
       && (instruction->GetType() != Primitive::kPrimFloat);
 
   if (locations->CanCall()) {
-    if (!instruction->IsSuspendCheck()) {
-      codegen_->MarkNotLeaf();
+    if (codegen_->IsLeafMethod()) {
+      // TODO: We remove the suspend check here because we do not want it to
+      // artificially create live registers. We should find a better place, but
+      // this is currently the simplest one.
+      DCHECK(instruction->IsSuspendCheckEntry());
+      instruction->GetBlock()->RemoveInstruction(instruction);
+      return;
     }
     safepoints_.Add(instruction);
     if (locations->OnlyCallsOnSlowPath()) {
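
The DCHECK above holds because of AllocateLocations(): every other instruction
whose locations can call has already marked the method as not a leaf before
register allocation runs, so a call-capable instruction encountered while the
method is still a leaf can only be the suspend check in the entry block, which
is removed rather than allowed to create artificial live registers.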
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 1b06315..2a84735 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -115,14 +115,13 @@
   // to differentiate between the start and end of an instruction. Adding 2 to
   // the lifetime position for each instruction ensures the start of an
   // instruction is different than the end of the previous instruction.
-  HGraphVisitor* location_builder = codegen_->GetLocationBuilder();
   for (HLinearOrderIterator it(*this); !it.Done(); it.Advance()) {
     HBasicBlock* block = it.Current();
     block->SetLifetimeStart(lifetime_position);
 
     for (HInstructionIterator inst_it(block->GetPhis()); !inst_it.Done(); inst_it.Advance()) {
       HInstruction* current = inst_it.Current();
-      current->Accept(location_builder);
+      codegen_->AllocateLocations(current);
       LocationSummary* locations = current->GetLocations();
       if (locations != nullptr && locations->Out().IsValid()) {
         instructions_from_ssa_index_.Add(current);
@@ -140,7 +139,7 @@
     for (HInstructionIterator inst_it(block->GetInstructions()); !inst_it.Done();
          inst_it.Advance()) {
       HInstruction* current = inst_it.Current();
-      current->Accept(codegen_->GetLocationBuilder());
+      codegen_->AllocateLocations(current);
       LocationSummary* locations = current->GetLocations();
       if (locations != nullptr && locations->Out().IsValid()) {
         instructions_from_ssa_index_.Add(current);