Instruction scheduling for ARM.

Performance improvements on various benchmarks with this CL:

benchmarks     improvements
---------------------------
algorithm                1%
benchmarksgame           2%
caffeinemark             2%
math                     3%
stanford                 4%

Tested on ARM Cortex-A53 CPU.

The code size impact is negligible.

Test: m test-art-host
Test: m test-art-target
Change-Id: I314c90c09ce27e3d224fc686ef73c7d94a6b5a2c
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index 9236a0e..73e8087 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -23,6 +23,7 @@
 #include "driver/compiler_driver.h"
 #include "nodes.h"
 #include "optimization.h"
+#include "code_generator.h"
 
 namespace art {
 
@@ -469,8 +470,9 @@
 
 class HInstructionScheduling : public HOptimization {
  public:
-  HInstructionScheduling(HGraph* graph, InstructionSet instruction_set)
+  HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr)
       : HOptimization(graph, kInstructionScheduling),
+        codegen_(cg),
         instruction_set_(instruction_set) {}
 
   void Run() {
@@ -480,6 +482,7 @@
 
   static constexpr const char* kInstructionScheduling = "scheduler";
 
+  CodeGenerator* const codegen_;
   const InstructionSet instruction_set_;
 
  private: