Revert "Revert "Use trampolines for calls to helpers""

This reverts commit 081f73e888b3c246cf7635db37b7f1105cf1a2ff.

Change-Id: Ibd777f8ce73cf8ed6c4cb81d50bf6437ac28cb61

Conflicts:
	compiler/dex/quick/mir_to_lir.h
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index b66082d..ba15d2a 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -22,6 +22,8 @@
 #include <vector>
 #include <unistd.h>
 
+#include "arch/arm/final_relocations_arm.h"
+#include "base/hex_dump.h"
 #include "base/stl_util.h"
 #include "base/timing_logger.h"
 #include "class_linker.h"
@@ -504,6 +506,7 @@
   UniquePtr<ThreadPool> thread_pool(new ThreadPool("Compiler driver thread pool", thread_count_ - 1));
   PreCompile(class_loader, dex_files, thread_pool.get(), timings);
   Compile(class_loader, dex_files, thread_pool.get(), timings);
+  PostCompile();
   if (dump_stats_) {
     stats_->Dump();
   }
@@ -615,6 +618,10 @@
   UpdateImageClasses(timings);
 }
 
+void CompilerDriver::PostCompile() {
+  BuildEntrypointTrampolineCode();  // Build code for the entrypoint trampoline table.
+}
+
 bool CompilerDriver::IsImageClass(const char* descriptor) const {
   if (!IsImage()) {
     return true;
@@ -1238,6 +1245,27 @@
   return result;
 }
 
+uint32_t CompilerDriver::AddEntrypointTrampoline(uint32_t entrypoint) {  // Uses current thread.
+  return entrypoint_trampolines_.AddEntrypoint(Thread::Current(), entrypoint);
+}
+
+
+// Builds trampoline code for each entry of the entrypoint trampoline table, visiting the
+// entries in table order.  Only kArm and kThumb2 have a trampoline builder.
+void CompilerDriver::BuildEntrypointTrampolineCode() {
+  const bool is_arm = (instruction_set_ == kArm) || (instruction_set_ == kThumb2);
+  for (uint32_t entrypoint_offset : entrypoint_trampolines_.GetTrampolineTable()) {
+    if (is_arm) {
+      // ARM and Thumb2 go through the same ARM trampoline builder.
+      BuildArmEntrypointTrampolineCall(ThreadOffset<4>(entrypoint_offset));
+    } else {
+      // Only reached when the table is non-empty on an unsupported instruction set.
+      UNIMPLEMENTED(FATAL) << "No entrypoint trampolines for this architecture";
+    }
+  }
+}
+
+
 void CompilerDriver::AddCodePatch(const DexFile* dex_file,
                                   uint16_t referrer_class_def_idx,
                                   uint32_t referrer_method_idx,
@@ -2071,4 +2099,17 @@
   }
   return !compile;
 }
+
+// Returns a newly allocated ARM relocation set for kArm/kThumb2; the caller receives the raw
+// pointer.  Other instruction sets hit UNIMPLEMENTED(FATAL).  |cu| is not used here.
+FinalEntrypointRelocationSet* CompilerDriver::AllocateFinalEntrypointRelocationSet(
+    CompilationUnit* cu) const {
+  const bool is_arm = (instruction_set_ == kArm) || (instruction_set_ == kThumb2);
+  if (is_arm) {
+    return new FinalEntrypointRelocationSetArm(this);
+  }
+  UNIMPLEMENTED(FATAL) << "Cannot allocate FinalEntrypointRelocationSet for non-ARM";
+  return nullptr;
+}
+
 }  // namespace art
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index d49523a..dcf99b8 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -601,6 +601,118 @@
   // Should the compiler run on this method given profile information?
   bool SkipCompilation(const std::string& method_name);
 
+  // Entrypoint trampolines.
+  //
+  // The idea here is that we can save code size by collecting the branches
+  // to the entrypoints (helper functions called by the generated code) into a
+  // table and then branching relative to that table from the code.  On ARM 32 this
+  // will save 2 bytes per call.  Only the entrypoints used by the program (the whole
+  // program - these are global) are in this table and are in no particular order.
+  //
+  // The trampolines will be placed right at the start of the .text section in the file
+  // and will consist of a table of instructions, each of which will branch relative to
+  // the thread register (r9 on ARM) to an entrypoint.  On ARM this would look like:
+  //
+  // trampolines:
+  // 1: ldr pc, [r9, #40]
+  // 2: ldr pc, [r9, #8]
+  //    ...
+  //
+  // Then a call to an entrypoint would be an immediate BL instruction to the appropriate
+  // label (1 or 2 in the above example).  Because the entrypoint table has the lower bit
+  // of the address already set, the ldr pc will switch from ARM to Thumb for the entrypoint as
+  // necessary.
+  //
+  // On ARM, the range of a BL instruction is +-32MB, so this is more than enough for an
+  // immediate BL instruction in the generated code.
+  //
+  // The actual address of the trampoline for a particular entrypoint is not known until
+  // the OAT file is written and we know the addresses of all the branch instructions in
+  // the program.  At this point we can rewrite the BL instruction to have the correct relative
+  // offset.
+  class EntrypointTrampolines {
+   public:
+    EntrypointTrampolines() : current_offset_(0), lock_("Entrypoint Trampolines") {}
+    ~EntrypointTrampolines() {}
+
+    // Registers entrypoint |ep| and returns its index in the trampoline table.  Registering
+    // the same entrypoint again returns the index handed out the first time.
+    uint32_t AddEntrypoint(Thread* self, uint32_t ep) LOCKS_EXCLUDED(lock_) {
+      MutexLock mu(self, lock_);
+      // One lookup only: insert() keeps an existing mapping and reports where it lives.
+      auto inserted = trampolines_.insert(std::make_pair(ep, current_offset_));
+      if (!inserted.second) {
+        return inserted.first->second;
+      }
+      trampoline_table_.push_back(ep);
+      LOG(DEBUG) << "adding new trampoline for " << ep << " at offset " << current_offset_;
+      return current_offset_++;
+    }
+
+    // Not locked.  NOTE(review): presumably only read once compilation is done - confirm.
+    const std::vector<uint32_t>& GetTrampolineTable() const {
+      return trampoline_table_;
+    }
+
+    uint32_t GetTrampolineTableSize() const {
+      return current_offset_;
+    }
+
+   private:
+    uint32_t current_offset_;
+    // Maps an entrypoint offset to its index in the trampoline table.
+    typedef std::map<uint32_t, uint32_t> Trampolines;
+    Trampolines trampolines_ GUARDED_BY(lock_);
+
+    // Table of all registered offsets in order of registration.
+    std::vector<uint32_t> trampoline_table_;
+    Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  };
+
+  uint32_t AddEntrypointTrampoline(uint32_t entrypoint);
+
+  const std::vector<uint32_t>& GetEntrypointTrampolineTable() const {
+    return entrypoint_trampolines_.GetTrampolineTable();  // Entrypoints in registration order.
+  }
+
+  // Size of the trampoline table for the current instruction set: the number of registered
+  // trampolines multiplied by 4 on ARM/Thumb2, and the plain count everywhere else.
+  // NOTE(review): 4 is presumably the byte size of one trampoline instruction - confirm.
+  uint32_t GetEntrypointTrampolineTableSize() const {
+    const uint32_t num_trampolines = entrypoint_trampolines_.GetTrampolineTableSize();
+    if (instruction_set_ == kThumb2 || instruction_set_ == kArm) {
+      return num_trampolines * 4;
+    }
+    return num_trampolines;
+  }
+
+  // Get the maximum offset between entrypoint trampoline islands.  Different architectures
+  // have limitations on the max offset for a call instruction.  This function is used
+  // to determine when we need to generate a new trampoline island in the output to keep
+  // subsequent calls in range.
+  size_t GetMaxEntrypointTrampolineOffset() const {
+    const bool is_arm = (instruction_set_ == kThumb2) || (instruction_set_ == kArm);
+    if (!is_arm) {
+      // A zero result means no trampoline island gets generated.
+      return 0;
+    }
+    // A Thumb2 BL instruction reaches at most 16MB; stay a little below that limit so
+    // there is some slack.
+    const size_t max_island_distance = 15 * MB;
+    return max_island_distance;
+  }
+
+  void BuildEntrypointTrampolineCode();
+
+  // Architecture specific Entrypoint trampoline builder.
+  void BuildArmEntrypointTrampolineCall(ThreadOffset<4> offset);
+
+  const std::vector<uint8_t>& GetEntrypointTrampolineTableCode() const {
+    return entrypoint_trampoline_code_;  // Presumably the emitted trampoline bytes - TODO confirm.
+  }
+
+  FinalEntrypointRelocationSet* AllocateFinalEntrypointRelocationSet(CompilationUnit* cu) const;
+
  private:
   // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics.
   // The only external contract is that unresolved method has flags 0 and resolved non-0.
@@ -638,6 +750,7 @@
       LOCKS_EXCLUDED(Locks::mutator_lock_);
 
   void LoadImageClasses(TimingLogger* timings);
+  void PostCompile() LOCKS_EXCLUDED(Locks::mutator_lock_);
 
   // Attempt to resolve all type, methods, fields, and strings
   // referenced from code in the dex file following PathClassLoader
@@ -798,6 +911,10 @@
   DedupeSet<std::vector<uint8_t>, size_t, DedupeHashFunc, 4> dedupe_gc_map_;
   DedupeSet<std::vector<uint8_t>, size_t, DedupeHashFunc, 4> dedupe_cfi_info_;
 
+  EntrypointTrampolines entrypoint_trampolines_;
+
+  std::vector<uint8_t> entrypoint_trampoline_code_;
+
   DISALLOW_COPY_AND_ASSIGN(CompilerDriver);
 };
 
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 20c6bc8..52248a6 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -50,7 +50,8 @@
     small_method_threshold_(kDefaultSmallMethodThreshold),
     tiny_method_threshold_(kDefaultTinyMethodThreshold),
     num_dex_methods_threshold_(kDefaultNumDexMethodsThreshold),
-    generate_gdb_information_(false)
+    generate_gdb_information_(false),
+    generate_helper_trampolines_(false)
 #ifdef ART_SEA_IR_MODE
     , sea_ir_mode_(false)
 #endif
@@ -62,7 +63,8 @@
                   size_t small_method_threshold,
                   size_t tiny_method_threshold,
                   size_t num_dex_methods_threshold,
-                  bool generate_gdb_information
+                  bool generate_gdb_information,
+                  bool generate_helper_trampolines
 #ifdef ART_SEA_IR_MODE
                   , bool sea_ir_mode
 #endif
@@ -73,7 +75,8 @@
     small_method_threshold_(small_method_threshold),
     tiny_method_threshold_(tiny_method_threshold),
     num_dex_methods_threshold_(num_dex_methods_threshold),
-    generate_gdb_information_(generate_gdb_information)
+    generate_gdb_information_(generate_gdb_information),
+    generate_helper_trampolines_(generate_helper_trampolines)
 #ifdef ART_SEA_IR_MODE
     , sea_ir_mode_(sea_ir_mode)
 #endif
@@ -140,6 +143,10 @@
     return generate_gdb_information_;
   }
 
+  bool GenerateHelperTrampolines() const {
+    return generate_helper_trampolines_;  // False unless the constructor was given true.
+  }
+
  private:
   CompilerFilter compiler_filter_;
   size_t huge_method_threshold_;
@@ -148,6 +155,7 @@
   size_t tiny_method_threshold_;
   size_t num_dex_methods_threshold_;
   bool generate_gdb_information_;
+  bool generate_helper_trampolines_;
 
 #ifdef ART_SEA_IR_MODE
   bool sea_ir_mode_;