AArch64: Add HInstruction scheduling support.

This commit adds a new `HInstructionScheduling` pass that performs
basic scheduling on the `HGraph`.

Currently, scheduling is performed at the block level, so no
`HInstruction` ever leaves its block in this pass.

The scheduling process iterates through blocks in the graph. For
blocks that we can and want to schedule:
1) Build a dependency graph for instructions. It includes data
   dependencies (inputs/uses), but also environment dependencies and
   side-effect dependencies.
2) Schedule the dependency graph. This is a topological sort of the
   dependency graph, using heuristics to decide what node to schedule
   first when there are multiple candidates. Currently the heuristics
   consider only instruction latencies and schedule the instructions on
   the critical path first. (A minimal illustrative sketch of this
   selection loop is included below.)
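
For illustration only (not part of this patch), here is a minimal,
self-contained C++ sketch of the critical-path heuristic described in
step 2. It does not use the SchedulingNode/SchedulingGraph classes added
by this patch (which schedule each block backwards); the names Node,
users, ComputeCriticalPath and Schedule are hypothetical.

    #include <algorithm>
    #include <cstdint>
    #include <tuple>
    #include <vector>

    struct Node {
      uint32_t latency = 1;         // Cost attributed to this node.
      uint32_t critical_path = 0;   // Longest latency chain starting here.
      std::vector<Node*> users;     // Nodes consuming this node's result.
      size_t unscheduled_deps = 0;  // Set while building the dependency graph.
    };

    // Longest latency chain from `node` to the end of the graph (memoized).
    uint32_t ComputeCriticalPath(Node* node) {
      if (node->critical_path != 0) return node->critical_path;
      uint32_t longest = 0;
      for (Node* user : node->users) {
        longest = std::max(longest, ComputeCriticalPath(user));
      }
      return node->critical_path = node->latency + longest;
    }

    // Topological sort of the dependency graph that always emits the ready
    // node with the longest critical path, breaking ties by latency.
    std::vector<Node*> Schedule(std::vector<Node*> ready) {
      for (Node* node : ready) ComputeCriticalPath(node);
      std::vector<Node*> order;
      while (!ready.empty()) {
        auto best = std::max_element(
            ready.begin(), ready.end(), [](const Node* a, const Node* b) {
              return std::tie(a->critical_path, a->latency) <
                     std::tie(b->critical_path, b->latency);
            });
        Node* node = *best;
        ready.erase(best);
        order.push_back(node);
        for (Node* user : node->users) {
          if (--user->unscheduled_deps == 0) ready.push_back(user);
        }
      }
      return order;
    }

Nodes with no unscheduled dependencies seed `ready`, and the node with the
longest remaining latency chain is always emitted next, so instructions on
the critical path are never left stalling behind unrelated work.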

Test: m test-art-host
Test: m test-art-target

Change-Id: Iec103177d4f059666d7c9626e5770531fbc5ccdc
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 46f3358..f6a4db4 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -80,6 +80,7 @@
         "optimizing/register_allocator_graph_color.cc",
         "optimizing/register_allocator_linear_scan.cc",
         "optimizing/select_generator.cc",
+        "optimizing/scheduler.cc",
         "optimizing/sharpening.cc",
         "optimizing/side_effects_analysis.cc",
         "optimizing/ssa_builder.cc",
@@ -123,6 +124,7 @@
                 "jni/quick/arm64/calling_convention_arm64.cc",
                 "linker/arm64/relative_patcher_arm64.cc",
                 "optimizing/code_generator_arm64.cc",
+                "optimizing/scheduler_arm64.cc",
                 "optimizing/instruction_simplifier_arm64.cc",
                 "optimizing/intrinsics_arm64.cc",
                 "optimizing/nodes_arm64.cc",
@@ -362,6 +364,7 @@
         "jni/jni_cfi_test.cc",
         "optimizing/codegen_test.cc",
         "optimizing/optimizing_cfi_test.cc",
+        "optimizing/scheduler_test.cc",
     ],
 
     codegen: {
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index e3f3df0..d131509 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -17,30 +17,15 @@
 #include <functional>
 #include <memory>
 
-#include "arch/instruction_set.h"
-#include "arch/arm/instruction_set_features_arm.h"
-#include "arch/arm/registers_arm.h"
-#include "arch/arm64/instruction_set_features_arm64.h"
-#include "arch/mips/instruction_set_features_mips.h"
-#include "arch/mips/registers_mips.h"
-#include "arch/mips64/instruction_set_features_mips64.h"
-#include "arch/mips64/registers_mips64.h"
-#include "arch/x86/instruction_set_features_x86.h"
-#include "arch/x86/registers_x86.h"
-#include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "base/macros.h"
 #include "builder.h"
-#include "code_simulator_container.h"
-#include "common_compiler_test.h"
+#include "codegen_test_utils.h"
 #include "dex_file.h"
 #include "dex_instruction.h"
 #include "driver/compiler_options.h"
-#include "graph_checker.h"
 #include "nodes.h"
 #include "optimizing_unit_test.h"
-#include "prepare_for_register_allocation.h"
 #include "register_allocator_linear_scan.h"
-#include "ssa_liveness_analysis.h"
 #include "utils.h"
 #include "utils/arm/assembler_arm_vixl.h"
 #include "utils/arm/managed_register_arm.h"
@@ -48,324 +33,10 @@
 #include "utils/mips64/managed_register_mips64.h"
 #include "utils/x86/managed_register_x86.h"
 
-#ifdef ART_ENABLE_CODEGEN_arm
-#include "code_generator_arm.h"
-#include "code_generator_arm_vixl.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_arm64
-#include "code_generator_arm64.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86
-#include "code_generator_x86.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86_64
-#include "code_generator_x86_64.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips
-#include "code_generator_mips.h"
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips64
-#include "code_generator_mips64.h"
-#endif
-
 #include "gtest/gtest.h"
 
 namespace art {
 
-typedef CodeGenerator* (*CreateCodegenFn)(HGraph*, const CompilerOptions&);
-
-class CodegenTargetConfig {
- public:
-  CodegenTargetConfig(InstructionSet isa, CreateCodegenFn create_codegen)
-      : isa_(isa), create_codegen_(create_codegen) {
-  }
-  InstructionSet GetInstructionSet() const { return isa_; }
-  CodeGenerator* CreateCodeGenerator(HGraph* graph, const CompilerOptions& compiler_options) {
-    return create_codegen_(graph, compiler_options);
-  }
-
- private:
-  CodegenTargetConfig() {}
-  InstructionSet isa_;
-  CreateCodegenFn create_codegen_;
-};
-
-#ifdef ART_ENABLE_CODEGEN_arm
-// Provide our own codegen, that ensures the C calling conventions
-// are preserved. Currently, ART and C do not match as R4 is caller-save
-// in ART, and callee-save in C. Alternatively, we could use or write
-// the stub that saves and restores all registers, but it is easier
-// to just overwrite the code generator.
-class TestCodeGeneratorARM : public arm::CodeGeneratorARM {
- public:
-  TestCodeGeneratorARM(HGraph* graph,
-                       const ArmInstructionSetFeatures& isa_features,
-                       const CompilerOptions& compiler_options)
-      : arm::CodeGeneratorARM(graph, isa_features, compiler_options) {
-    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
-    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
-  }
-
-  void SetupBlockedRegisters() const OVERRIDE {
-    arm::CodeGeneratorARM::SetupBlockedRegisters();
-    blocked_core_registers_[arm::R4] = true;
-    blocked_core_registers_[arm::R6] = false;
-    blocked_core_registers_[arm::R7] = false;
-  }
-};
-
-// A way to test the VIXL32-based code generator on ARM. This will replace
-// TestCodeGeneratorARM when the VIXL32-based backend replaces the existing one.
-class TestCodeGeneratorARMVIXL : public arm::CodeGeneratorARMVIXL {
- public:
-  TestCodeGeneratorARMVIXL(HGraph* graph,
-                           const ArmInstructionSetFeatures& isa_features,
-                           const CompilerOptions& compiler_options)
-      : arm::CodeGeneratorARMVIXL(graph, isa_features, compiler_options) {
-    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
-    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
-  }
-
-  void SetupBlockedRegisters() const OVERRIDE {
-    arm::CodeGeneratorARMVIXL::SetupBlockedRegisters();
-    blocked_core_registers_[arm::R4] = true;
-    blocked_core_registers_[arm::R6] = false;
-    blocked_core_registers_[arm::R7] = false;
-  }
-};
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86
-class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 {
- public:
-  TestCodeGeneratorX86(HGraph* graph,
-                       const X86InstructionSetFeatures& isa_features,
-                       const CompilerOptions& compiler_options)
-      : x86::CodeGeneratorX86(graph, isa_features, compiler_options) {
-    // Save edi, we need it for getting enough registers for long multiplication.
-    AddAllocatedRegister(Location::RegisterLocation(x86::EDI));
-  }
-
-  void SetupBlockedRegisters() const OVERRIDE {
-    x86::CodeGeneratorX86::SetupBlockedRegisters();
-    // ebx is a callee-save register in C, but caller-save for ART.
-    blocked_core_registers_[x86::EBX] = true;
-
-    // Make edi available.
-    blocked_core_registers_[x86::EDI] = false;
-  }
-};
-#endif
-
-class InternalCodeAllocator : public CodeAllocator {
- public:
-  InternalCodeAllocator() : size_(0) { }
-
-  virtual uint8_t* Allocate(size_t size) {
-    size_ = size;
-    memory_.reset(new uint8_t[size]);
-    return memory_.get();
-  }
-
-  size_t GetSize() const { return size_; }
-  uint8_t* GetMemory() const { return memory_.get(); }
-
- private:
-  size_t size_;
-  std::unique_ptr<uint8_t[]> memory_;
-
-  DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator);
-};
-
-static bool CanExecuteOnHardware(InstructionSet target_isa) {
-  return (target_isa == kRuntimeISA)
-      // Handle the special case of ARM, with two instructions sets (ARM32 and Thumb-2).
-      || (kRuntimeISA == kArm && target_isa == kThumb2);
-}
-
-static bool CanExecute(InstructionSet target_isa) {
-  CodeSimulatorContainer simulator(target_isa);
-  return CanExecuteOnHardware(target_isa) || simulator.CanSimulate();
-}
-
-template <typename Expected>
-static Expected SimulatorExecute(CodeSimulator* simulator, Expected (*f)());
-
-template <>
-bool SimulatorExecute<bool>(CodeSimulator* simulator, bool (*f)()) {
-  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
-  return simulator->GetCReturnBool();
-}
-
-template <>
-int32_t SimulatorExecute<int32_t>(CodeSimulator* simulator, int32_t (*f)()) {
-  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
-  return simulator->GetCReturnInt32();
-}
-
-template <>
-int64_t SimulatorExecute<int64_t>(CodeSimulator* simulator, int64_t (*f)()) {
-  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
-  return simulator->GetCReturnInt64();
-}
-
-template <typename Expected>
-static void VerifyGeneratedCode(InstructionSet target_isa,
-                                Expected (*f)(),
-                                bool has_result,
-                                Expected expected) {
-  ASSERT_TRUE(CanExecute(target_isa)) << "Target isa is not executable.";
-
-  // Verify on simulator.
-  CodeSimulatorContainer simulator(target_isa);
-  if (simulator.CanSimulate()) {
-    Expected result = SimulatorExecute<Expected>(simulator.Get(), f);
-    if (has_result) {
-      ASSERT_EQ(expected, result);
-    }
-  }
-
-  // Verify on hardware.
-  if (CanExecuteOnHardware(target_isa)) {
-    Expected result = f();
-    if (has_result) {
-      ASSERT_EQ(expected, result);
-    }
-  }
-}
-
-template <typename Expected>
-static void Run(const InternalCodeAllocator& allocator,
-                const CodeGenerator& codegen,
-                bool has_result,
-                Expected expected) {
-  InstructionSet target_isa = codegen.GetInstructionSet();
-
-  typedef Expected (*fptr)();
-  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
-  fptr f = reinterpret_cast<fptr>(allocator.GetMemory());
-  if (target_isa == kThumb2) {
-    // For thumb we need the bottom bit set.
-    f = reinterpret_cast<fptr>(reinterpret_cast<uintptr_t>(f) + 1);
-  }
-  VerifyGeneratedCode(target_isa, f, has_result, expected);
-}
-
-static void ValidateGraph(HGraph* graph) {
-  GraphChecker graph_checker(graph);
-  graph_checker.Run();
-  if (!graph_checker.IsValid()) {
-    for (const auto& error : graph_checker.GetErrors()) {
-      std::cout << error << std::endl;
-    }
-  }
-  ASSERT_TRUE(graph_checker.IsValid());
-}
-
-template <typename Expected>
-static void RunCodeNoCheck(CodeGenerator* codegen,
-                           HGraph* graph,
-                           const std::function<void(HGraph*)>& hook_before_codegen,
-                           bool has_result,
-                           Expected expected) {
-  SsaLivenessAnalysis liveness(graph, codegen);
-  PrepareForRegisterAllocation(graph).Run();
-  liveness.Analyze();
-  RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters();
-  hook_before_codegen(graph);
-  InternalCodeAllocator allocator;
-  codegen->Compile(&allocator);
-  Run(allocator, *codegen, has_result, expected);
-}
-
-template <typename Expected>
-static void RunCode(CodeGenerator* codegen,
-                    HGraph* graph,
-                    std::function<void(HGraph*)> hook_before_codegen,
-                    bool has_result,
-                    Expected expected) {
-  ValidateGraph(graph);
-  RunCodeNoCheck(codegen, graph, hook_before_codegen, has_result, expected);
-}
-
-template <typename Expected>
-static void RunCode(CodegenTargetConfig target_config,
-                    HGraph* graph,
-                    std::function<void(HGraph*)> hook_before_codegen,
-                    bool has_result,
-                    Expected expected) {
-  CompilerOptions compiler_options;
-  std::unique_ptr<CodeGenerator> codegen(target_config.CreateCodeGenerator(graph, compiler_options));
-  RunCode(codegen.get(), graph, hook_before_codegen, has_result, expected);
-}
-
-#ifdef ART_ENABLE_CODEGEN_arm
-CodeGenerator* create_codegen_arm(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
-      ArmInstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena()) TestCodeGeneratorARM(graph,
-                                                      *features_arm.get(),
-                                                      compiler_options);
-}
-
-CodeGenerator* create_codegen_arm_vixl32(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
-      ArmInstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      TestCodeGeneratorARMVIXL(graph, *features_arm.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_arm64
-CodeGenerator* create_codegen_arm64(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64(
-      Arm64InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena()) arm64::CodeGeneratorARM64(graph,
-                                                           *features_arm64.get(),
-                                                           compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86
-CodeGenerator* create_codegen_x86(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
-      X86InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena()) TestCodeGeneratorX86(graph, *features_x86.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_x86_64
-CodeGenerator* create_codegen_x86_64(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
-     X86_64InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      x86_64::CodeGeneratorX86_64(graph, *features_x86_64.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips
-CodeGenerator* create_codegen_mips(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const MipsInstructionSetFeatures> features_mips(
-      MipsInstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      mips::CodeGeneratorMIPS(graph, *features_mips.get(), compiler_options);
-}
-#endif
-
-#ifdef ART_ENABLE_CODEGEN_mips64
-CodeGenerator* create_codegen_mips64(HGraph* graph, const CompilerOptions& compiler_options) {
-  std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64(
-      Mips64InstructionSetFeatures::FromCppDefines());
-  return new (graph->GetArena())
-      mips64::CodeGeneratorMIPS64(graph, *features_mips64.get(), compiler_options);
-}
-#endif
-
 // Return all combinations of ISA and code generator that are executable on
 // hardware, or on simulator, and that we'd like to test.
 static ::std::vector<CodegenTargetConfig> GetTargetConfigs() {
diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h
new file mode 100644
index 0000000..cd95404
--- /dev/null
+++ b/compiler/optimizing/codegen_test_utils.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_CODEGEN_TEST_UTILS_H_
+#define ART_COMPILER_OPTIMIZING_CODEGEN_TEST_UTILS_H_
+
+#include "arch/arm/instruction_set_features_arm.h"
+#include "arch/arm/registers_arm.h"
+#include "arch/arm64/instruction_set_features_arm64.h"
+#include "arch/instruction_set.h"
+#include "arch/mips/instruction_set_features_mips.h"
+#include "arch/mips/registers_mips.h"
+#include "arch/mips64/instruction_set_features_mips64.h"
+#include "arch/mips64/registers_mips64.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "arch/x86/registers_x86.h"
+#include "arch/x86_64/instruction_set_features_x86_64.h"
+#include "code_simulator_container.h"
+#include "common_compiler_test.h"
+#include "graph_checker.h"
+#include "prepare_for_register_allocation.h"
+#include "ssa_liveness_analysis.h"
+
+#ifdef ART_ENABLE_CODEGEN_arm
+#include "code_generator_arm.h"
+#include "code_generator_arm_vixl.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "code_generator_arm64.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+#include "code_generator_x86.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86_64
+#include "code_generator_x86_64.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips
+#include "code_generator_mips.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips64
+#include "code_generator_mips64.h"
+#endif
+
+namespace art {
+
+typedef CodeGenerator* (*CreateCodegenFn)(HGraph*, const CompilerOptions&);
+
+class CodegenTargetConfig {
+ public:
+  CodegenTargetConfig(InstructionSet isa, CreateCodegenFn create_codegen)
+      : isa_(isa), create_codegen_(create_codegen) {
+  }
+  InstructionSet GetInstructionSet() const { return isa_; }
+  CodeGenerator* CreateCodeGenerator(HGraph* graph, const CompilerOptions& compiler_options) {
+    return create_codegen_(graph, compiler_options);
+  }
+
+ private:
+  CodegenTargetConfig() {}
+  InstructionSet isa_;
+  CreateCodegenFn create_codegen_;
+};
+
+#ifdef ART_ENABLE_CODEGEN_arm
+// Provide our own codegen that ensures the C calling conventions
+// are preserved. Currently, ART and C do not match as R4 is caller-save
+// in ART, and callee-save in C. Alternatively, we could use or write
+// the stub that saves and restores all registers, but it is easier
+// to just overwrite the code generator.
+class TestCodeGeneratorARM : public arm::CodeGeneratorARM {
+ public:
+  TestCodeGeneratorARM(HGraph* graph,
+                       const ArmInstructionSetFeatures& isa_features,
+                       const CompilerOptions& compiler_options)
+      : arm::CodeGeneratorARM(graph, isa_features, compiler_options) {
+    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
+    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
+  }
+
+  void SetupBlockedRegisters() const OVERRIDE {
+    arm::CodeGeneratorARM::SetupBlockedRegisters();
+    blocked_core_registers_[arm::R4] = true;
+    blocked_core_registers_[arm::R6] = false;
+    blocked_core_registers_[arm::R7] = false;
+  }
+};
+
+// A way to test the VIXL32-based code generator on ARM. This will replace
+// TestCodeGeneratorARM when the VIXL32-based backend replaces the existing one.
+class TestCodeGeneratorARMVIXL : public arm::CodeGeneratorARMVIXL {
+ public:
+  TestCodeGeneratorARMVIXL(HGraph* graph,
+                           const ArmInstructionSetFeatures& isa_features,
+                           const CompilerOptions& compiler_options)
+      : arm::CodeGeneratorARMVIXL(graph, isa_features, compiler_options) {
+    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
+    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
+  }
+
+  void SetupBlockedRegisters() const OVERRIDE {
+    arm::CodeGeneratorARMVIXL::SetupBlockedRegisters();
+    blocked_core_registers_[arm::R4] = true;
+    blocked_core_registers_[arm::R6] = false;
+    blocked_core_registers_[arm::R7] = false;
+  }
+};
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 {
+ public:
+  TestCodeGeneratorX86(HGraph* graph,
+                       const X86InstructionSetFeatures& isa_features,
+                       const CompilerOptions& compiler_options)
+      : x86::CodeGeneratorX86(graph, isa_features, compiler_options) {
+    // Save edi; we need it for getting enough registers for long multiplication.
+    AddAllocatedRegister(Location::RegisterLocation(x86::EDI));
+  }
+
+  void SetupBlockedRegisters() const OVERRIDE {
+    x86::CodeGeneratorX86::SetupBlockedRegisters();
+    // ebx is a callee-save register in C, but caller-save for ART.
+    blocked_core_registers_[x86::EBX] = true;
+
+    // Make edi available.
+    blocked_core_registers_[x86::EDI] = false;
+  }
+};
+#endif
+
+class InternalCodeAllocator : public CodeAllocator {
+ public:
+  InternalCodeAllocator() : size_(0) { }
+
+  virtual uint8_t* Allocate(size_t size) {
+    size_ = size;
+    memory_.reset(new uint8_t[size]);
+    return memory_.get();
+  }
+
+  size_t GetSize() const { return size_; }
+  uint8_t* GetMemory() const { return memory_.get(); }
+
+ private:
+  size_t size_;
+  std::unique_ptr<uint8_t[]> memory_;
+
+  DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator);
+};
+
+static bool CanExecuteOnHardware(InstructionSet target_isa) {
+  return (target_isa == kRuntimeISA)
+      // Handle the special case of ARM, with two instruction sets (ARM32 and Thumb-2).
+      || (kRuntimeISA == kArm && target_isa == kThumb2);
+}
+
+static bool CanExecute(InstructionSet target_isa) {
+  CodeSimulatorContainer simulator(target_isa);
+  return CanExecuteOnHardware(target_isa) || simulator.CanSimulate();
+}
+
+template <typename Expected>
+inline static Expected SimulatorExecute(CodeSimulator* simulator, Expected (*f)());
+
+template <>
+inline bool SimulatorExecute<bool>(CodeSimulator* simulator, bool (*f)()) {
+  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
+  return simulator->GetCReturnBool();
+}
+
+template <>
+inline int32_t SimulatorExecute<int32_t>(CodeSimulator* simulator, int32_t (*f)()) {
+  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
+  return simulator->GetCReturnInt32();
+}
+
+template <>
+inline int64_t SimulatorExecute<int64_t>(CodeSimulator* simulator, int64_t (*f)()) {
+  simulator->RunFrom(reinterpret_cast<intptr_t>(f));
+  return simulator->GetCReturnInt64();
+}
+
+template <typename Expected>
+static void VerifyGeneratedCode(InstructionSet target_isa,
+                                Expected (*f)(),
+                                bool has_result,
+                                Expected expected) {
+  ASSERT_TRUE(CanExecute(target_isa)) << "Target isa is not executable.";
+
+  // Verify on simulator.
+  CodeSimulatorContainer simulator(target_isa);
+  if (simulator.CanSimulate()) {
+    Expected result = SimulatorExecute<Expected>(simulator.Get(), f);
+    if (has_result) {
+      ASSERT_EQ(expected, result);
+    }
+  }
+
+  // Verify on hardware.
+  if (CanExecuteOnHardware(target_isa)) {
+    Expected result = f();
+    if (has_result) {
+      ASSERT_EQ(expected, result);
+    }
+  }
+}
+
+template <typename Expected>
+static void Run(const InternalCodeAllocator& allocator,
+                const CodeGenerator& codegen,
+                bool has_result,
+                Expected expected) {
+  InstructionSet target_isa = codegen.GetInstructionSet();
+
+  typedef Expected (*fptr)();
+  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
+  fptr f = reinterpret_cast<fptr>(allocator.GetMemory());
+  if (target_isa == kThumb2) {
+    // For thumb we need the bottom bit set.
+    f = reinterpret_cast<fptr>(reinterpret_cast<uintptr_t>(f) + 1);
+  }
+  VerifyGeneratedCode(target_isa, f, has_result, expected);
+}
+
+static void ValidateGraph(HGraph* graph) {
+  GraphChecker graph_checker(graph);
+  graph_checker.Run();
+  if (!graph_checker.IsValid()) {
+    for (const auto& error : graph_checker.GetErrors()) {
+      std::cout << error << std::endl;
+    }
+  }
+  ASSERT_TRUE(graph_checker.IsValid());
+}
+
+template <typename Expected>
+static void RunCodeNoCheck(CodeGenerator* codegen,
+                           HGraph* graph,
+                           const std::function<void(HGraph*)>& hook_before_codegen,
+                           bool has_result,
+                           Expected expected) {
+  SsaLivenessAnalysis liveness(graph, codegen);
+  PrepareForRegisterAllocation(graph).Run();
+  liveness.Analyze();
+  RegisterAllocator::Create(graph->GetArena(), codegen, liveness)->AllocateRegisters();
+  hook_before_codegen(graph);
+  InternalCodeAllocator allocator;
+  codegen->Compile(&allocator);
+  Run(allocator, *codegen, has_result, expected);
+}
+
+template <typename Expected>
+static void RunCode(CodeGenerator* codegen,
+                    HGraph* graph,
+                    std::function<void(HGraph*)> hook_before_codegen,
+                    bool has_result,
+                    Expected expected) {
+  ValidateGraph(graph);
+  RunCodeNoCheck(codegen, graph, hook_before_codegen, has_result, expected);
+}
+
+template <typename Expected>
+static void RunCode(CodegenTargetConfig target_config,
+                    HGraph* graph,
+                    std::function<void(HGraph*)> hook_before_codegen,
+                    bool has_result,
+                    Expected expected) {
+  CompilerOptions compiler_options;
+  std::unique_ptr<CodeGenerator> codegen(target_config.CreateCodeGenerator(graph, compiler_options));
+  RunCode(codegen.get(), graph, hook_before_codegen, has_result, expected);
+}
+
+#ifdef ART_ENABLE_CODEGEN_arm
+CodeGenerator* create_codegen_arm(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
+      ArmInstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena()) TestCodeGeneratorARM(graph,
+                                                      *features_arm.get(),
+                                                      compiler_options);
+}
+
+CodeGenerator* create_codegen_arm_vixl32(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
+      ArmInstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      TestCodeGeneratorARMVIXL(graph, *features_arm.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+CodeGenerator* create_codegen_arm64(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64(
+      Arm64InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena()) arm64::CodeGeneratorARM64(graph,
+                                                           *features_arm64.get(),
+                                                           compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+CodeGenerator* create_codegen_x86(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena()) TestCodeGeneratorX86(graph, *features_x86.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86_64
+CodeGenerator* create_codegen_x86_64(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
+     X86_64InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      x86_64::CodeGeneratorX86_64(graph, *features_x86_64.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips
+CodeGenerator* create_codegen_mips(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const MipsInstructionSetFeatures> features_mips(
+      MipsInstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      mips::CodeGeneratorMIPS(graph, *features_mips.get(), compiler_options);
+}
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips64
+CodeGenerator* create_codegen_mips64(HGraph* graph, const CompilerOptions& compiler_options) {
+  std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64(
+      Mips64InstructionSetFeatures::FromCppDefines());
+  return new (graph->GetArena())
+      mips64::CodeGeneratorMIPS64(graph, *features_mips64.get(), compiler_options);
+}
+#endif
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_CODEGEN_TEST_UTILS_H_
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index d15145e..76900f2 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1354,13 +1354,15 @@
   return os;
 }
 
-void HInstruction::MoveBefore(HInstruction* cursor) {
-  DCHECK(!IsPhi());
-  DCHECK(!IsControlFlow());
-  DCHECK(CanBeMoved() ||
-         // HShouldDeoptimizeFlag can only be moved by CHAGuardOptimization.
-         IsShouldDeoptimizeFlag());
-  DCHECK(!cursor->IsPhi());
+void HInstruction::MoveBefore(HInstruction* cursor, bool do_checks) {
+  if (do_checks) {
+    DCHECK(!IsPhi());
+    DCHECK(!IsControlFlow());
+    DCHECK(CanBeMoved() ||
+           // HShouldDeoptimizeFlag can only be moved by CHAGuardOptimization.
+           IsShouldDeoptimizeFlag());
+    DCHECK(!cursor->IsPhi());
+  }
 
   next_->previous_ = previous_;
   if (previous_ != nullptr) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index f0ea9e2..acf14aa 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -2065,8 +2065,8 @@
     other->ReplaceInput(this, use_index);
   }
 
-  // Move `this` instruction before `cursor`.
-  void MoveBefore(HInstruction* cursor);
+  // Move `this` instruction before `cursor`.
+  void MoveBefore(HInstruction* cursor, bool do_checks = true);
 
   // Move `this` before its first user and out of any loops. If there is no
   // out-of-loop user that dominates all other users, move the instruction
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 297500b..1ab6710 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -90,6 +90,7 @@
 #include "reference_type_propagation.h"
 #include "register_allocator_linear_scan.h"
 #include "select_generator.h"
+#include "scheduler.h"
 #include "sharpening.h"
 #include "side_effects_analysis.h"
 #include "ssa_builder.h"
@@ -658,10 +659,13 @@
           new (arena) arm64::InstructionSimplifierArm64(graph, stats);
       SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
       GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch");
+      HInstructionScheduling* scheduling =
+          new (arena) HInstructionScheduling(graph, instruction_set);
       HOptimization* arm64_optimizations[] = {
         simplifier,
         side_effects,
-        gvn
+        gvn,
+        scheduling,
       };
       RunOptimizations(arm64_optimizations, arraysize(arm64_optimizations), pass_observer);
       break;
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index 58d9017..bf963b8 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -64,6 +64,9 @@
 void RemoveSuspendChecks(HGraph* graph) {
   for (HBasicBlock* block : graph->GetBlocks()) {
     if (block != nullptr) {
+      if (block->GetLoopInformation() != nullptr) {
+        block->GetLoopInformation()->SetSuspendCheck(nullptr);
+      }
       for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
         HInstruction* current = it.Current();
         if (current->IsSuspendCheck()) {
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
new file mode 100644
index 0000000..d65d20c
--- /dev/null
+++ b/compiler/optimizing/scheduler.cc
@@ -0,0 +1,610 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string>
+
+#include "prepare_for_register_allocation.h"
+#include "scheduler.h"
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "scheduler_arm64.h"
+#endif
+
+namespace art {
+
+void SchedulingGraph::AddDependency(SchedulingNode* node,
+                                    SchedulingNode* dependency,
+                                    bool is_data_dependency) {
+  if (node == nullptr || dependency == nullptr) {
+    // A `nullptr` node indicates an instruction out of scheduling range (e.g. in
+    // another block), so we do not need to add a dependency edge to the graph.
+    return;
+  }
+
+  if (is_data_dependency) {
+    if (!HasImmediateDataDependency(node, dependency)) {
+      node->AddDataPredecessor(dependency);
+    }
+  } else if (!HasImmediateOtherDependency(node, dependency)) {
+    node->AddOtherPredecessor(dependency);
+  }
+}
+
+static bool MayHaveReorderingDependency(SideEffects node, SideEffects other) {
+  // Read after write.
+  if (node.MayDependOn(other)) {
+    return true;
+  }
+
+  // Write after read.
+  if (other.MayDependOn(node)) {
+    return true;
+  }
+
+  // Memory write after write.
+  if (node.DoesAnyWrite() && other.DoesAnyWrite()) {
+    return true;
+  }
+
+  return false;
+}
+
+
+// Check whether `node` depends on `other`, taking into account `SideEffect`
+// information and `CanThrow` information.
+static bool HasSideEffectDependency(const HInstruction* node, const HInstruction* other) {
+  if (MayHaveReorderingDependency(node->GetSideEffects(), other->GetSideEffects())) {
+    return true;
+  }
+
+  if (other->CanThrow() && node->GetSideEffects().DoesAnyWrite()) {
+    return true;
+  }
+
+  if (other->GetSideEffects().DoesAnyWrite() && node->CanThrow()) {
+    return true;
+  }
+
+  if (other->CanThrow() && node->CanThrow()) {
+    return true;
+  }
+
+  // Check side-effect dependency between ArrayGet and BoundsCheck.
+  if (node->IsArrayGet() && other->IsBoundsCheck() && node->InputAt(1) == other) {
+    return true;
+  }
+
+  return false;
+}
+
+void SchedulingGraph::AddDependencies(HInstruction* instruction, bool is_scheduling_barrier) {
+  SchedulingNode* instruction_node = GetNode(instruction);
+
+  // Define-use dependencies.
+  for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
+    AddDataDependency(GetNode(use.GetUser()), instruction_node);
+  }
+
+  // Scheduling barrier dependencies.
+  DCHECK(!is_scheduling_barrier || contains_scheduling_barrier_);
+  if (contains_scheduling_barrier_) {
+    // A barrier depends on the instructions after it, and the instructions
+    // before the barrier depend on it.
+    for (HInstruction* other = instruction->GetNext(); other != nullptr; other = other->GetNext()) {
+      SchedulingNode* other_node = GetNode(other);
+      bool other_is_barrier = other_node->IsSchedulingBarrier();
+      if (is_scheduling_barrier || other_is_barrier) {
+        AddOtherDependency(other_node, instruction_node);
+      }
+      if (other_is_barrier) {
+        // This other scheduling barrier guarantees ordering of instructions after
+        // it, so avoid creating additional useless dependencies in the graph.
+        // For example if we have
+        //     instr_1
+        //     barrier_2
+        //     instr_3
+        //     barrier_4
+        //     instr_5
+        // we only create the following non-data dependencies
+        //     1 -> 2
+        //     2 -> 3
+        //     2 -> 4
+        //     3 -> 4
+        //     4 -> 5
+        // and do not create
+        //     1 -> 4
+        //     2 -> 5
+        // Note that in this example we could also avoid creating the dependency
+        // `2 -> 4`.  But if we remove `instr_3` that dependency is required to
+        // order the barriers. So we generate it to avoid a special case.
+        break;
+      }
+    }
+  }
+
+  // Side effect dependencies.
+  if (!instruction->GetSideEffects().DoesNothing() || instruction->CanThrow()) {
+    for (HInstruction* other = instruction->GetNext(); other != nullptr; other = other->GetNext()) {
+      SchedulingNode* other_node = GetNode(other);
+      if (other_node->IsSchedulingBarrier()) {
+        // We have reached a scheduling barrier so we can stop further
+        // processing.
+        DCHECK(HasImmediateOtherDependency(other_node, instruction_node));
+        break;
+      }
+      if (HasSideEffectDependency(other, instruction)) {
+        AddOtherDependency(other_node, instruction_node);
+      }
+    }
+  }
+
+  // Environment dependencies.
+  // We do not need to process those if the instruction is a scheduling barrier,
+  // since the barrier already has non-data dependencies on all following
+  // instructions.
+  if (!is_scheduling_barrier) {
+    for (const HUseListNode<HEnvironment*>& use : instruction->GetEnvUses()) {
+      // Note that here we could stop processing if the environment holder is
+      // across a scheduling barrier. But checking this would likely require
+      // more work than simply iterating through environment uses.
+      AddOtherDependency(GetNode(use.GetUser()->GetHolder()), instruction_node);
+    }
+  }
+}
+
+bool SchedulingGraph::HasImmediateDataDependency(const SchedulingNode* node,
+                                                 const SchedulingNode* other) const {
+  return ContainsElement(node->GetDataPredecessors(), other);
+}
+
+bool SchedulingGraph::HasImmediateDataDependency(const HInstruction* instruction,
+                                                 const HInstruction* other_instruction) const {
+  const SchedulingNode* node = GetNode(instruction);
+  const SchedulingNode* other = GetNode(other_instruction);
+  if (node == nullptr || other == nullptr) {
+    // Both instructions must be in the current basic block, i.e. the SchedulingGraph can see
+    // their corresponding SchedulingNode in the graph and tell whether there is a dependency.
+    // Otherwise there is no dependency from the SchedulingGraph's perspective, for example when
+    // instruction and other_instruction are in different basic blocks.
+    return false;
+  }
+  return HasImmediateDataDependency(node, other);
+}
+
+bool SchedulingGraph::HasImmediateOtherDependency(const SchedulingNode* node,
+                                                  const SchedulingNode* other) const {
+  return ContainsElement(node->GetOtherPredecessors(), other);
+}
+
+bool SchedulingGraph::HasImmediateOtherDependency(const HInstruction* instruction,
+                                                  const HInstruction* other_instruction) const {
+  const SchedulingNode* node = GetNode(instruction);
+  const SchedulingNode* other = GetNode(other_instruction);
+  if (node == nullptr || other == nullptr) {
+    // Both instructions must be in the current basic block, i.e. the SchedulingGraph can see
+    // their corresponding SchedulingNode in the graph and tell whether there is a dependency.
+    // Otherwise there is no dependency from the SchedulingGraph's perspective, for example when
+    // instruction and other_instruction are in different basic blocks.
+    return false;
+  }
+  return HasImmediateOtherDependency(node, other);
+}
+
+static const std::string InstructionTypeId(const HInstruction* instruction) {
+  std::string id;
+  Primitive::Type type = instruction->GetType();
+  if (type == Primitive::kPrimNot) {
+    id.append("l");
+  } else {
+    id.append(Primitive::Descriptor(instruction->GetType()));
+  }
+  // Use lower-case to be closer to the `HGraphVisualizer` output.
+  id[0] = std::tolower(id[0]);
+  id.append(std::to_string(instruction->GetId()));
+  return id;
+}
+
+// Ideally we would reuse the graph visualizer code, but it is not available
+// from here and it is not worth moving all that code only for our use.
+static void DumpAsDotNode(std::ostream& output, const SchedulingNode* node) {
+  const HInstruction* instruction = node->GetInstruction();
+  // Use the instruction typed id as the node identifier.
+  std::string instruction_id = InstructionTypeId(instruction);
+  output << instruction_id << "[shape=record, label=\""
+      << instruction_id << ' ' << instruction->DebugName() << " [";
+  // List the instruction's inputs in its description. When visualizing the
+  // graph, this helps differentiate data inputs from other dependencies.
+  const char* separator = "";
+  for (const HInstruction* input : instruction->GetInputs()) {
+    output << separator << InstructionTypeId(input);
+    separator = ",";
+  }
+  output << "]";
+  // Other properties of the node.
+  output << "\\ninternal_latency: " << node->GetInternalLatency();
+  output << "\\ncritical_path: " << node->GetCriticalPath();
+  if (node->IsSchedulingBarrier()) {
+    output << "\\n(barrier)";
+  }
+  output << "\"];\n";
+  // We want program order to go from top to bottom in the graph output, so we
+  // reverse the edges and specify `dir=back`.
+  for (const SchedulingNode* predecessor : node->GetDataPredecessors()) {
+    const HInstruction* predecessor_instruction = predecessor->GetInstruction();
+    output << InstructionTypeId(predecessor_instruction) << ":s -> " << instruction_id << ":n "
+        << "[label=\"" << predecessor->GetLatency() << "\",dir=back]\n";
+  }
+  for (const SchedulingNode* predecessor : node->GetOtherPredecessors()) {
+    const HInstruction* predecessor_instruction = predecessor->GetInstruction();
+    output << InstructionTypeId(predecessor_instruction) << ":s -> " << instruction_id << ":n "
+        << "[dir=back,color=blue]\n";
+  }
+}
+
+void SchedulingGraph::DumpAsDotGraph(const std::string& description,
+                                     const ArenaVector<SchedulingNode*>& initial_candidates) {
+  // TODO(xueliang): ideally we should move scheduling information into HInstruction, after that
+  // we should move this dotty graph dump feature to visualizer, and have a compiler option for it.
+  std::ofstream output("scheduling_graphs.dot", std::ofstream::out | std::ofstream::app);
+  // Description of this graph, as a comment.
+  output << "// " << description << "\n";
+  // Start the dot graph. Use an increasing index for easier differentiation.
+  output << "digraph G {\n";
+  for (const auto& entry : nodes_map_) {
+    DumpAsDotNode(output, entry.second);
+  }
+  // Create a fake 'end_of_scheduling' node to help visualization of critical_paths.
+  for (auto node : initial_candidates) {
+    const HInstruction* instruction = node->GetInstruction();
+    output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n "
+      << "[label=\"" << node->GetLatency() << "\",dir=back]\n";
+  }
+  // End of the dot graph.
+  output << "}\n";
+  output.close();
+}
+
+SchedulingNode* CriticalPathSchedulingNodeSelector::SelectMaterializedCondition(
+    ArenaVector<SchedulingNode*>* nodes, const SchedulingGraph& graph) const {
+  // Schedule condition inputs that can be materialized immediately before their use.
+  // In the following example, after we've scheduled HSelect, we want LessThan to be scheduled
+  // immediately, because it is a materialized condition and will be emitted right before HSelect
+  // in the codegen phase.
+  //
+  // i20 HLessThan [...]                  HLessThan    HAdd      HAdd
+  // i21 HAdd [...]                ===>      |          |         |
+  // i22 HAdd [...]                          +----------+---------+
+  // i23 HSelect [i21, i22, i20]                     HSelect
+
+  if (prev_select_ == nullptr) {
+    return nullptr;
+  }
+
+  const HInstruction* instruction = prev_select_->GetInstruction();
+  const HCondition* condition = nullptr;
+  DCHECK(instruction != nullptr);
+
+  if (instruction->IsIf()) {
+    condition = instruction->AsIf()->InputAt(0)->AsCondition();
+  } else if (instruction->IsSelect()) {
+    condition = instruction->AsSelect()->GetCondition()->AsCondition();
+  }
+
+  SchedulingNode* condition_node = (condition != nullptr) ? graph.GetNode(condition) : nullptr;
+
+  if ((condition_node != nullptr) &&
+      condition->HasOnlyOneNonEnvironmentUse() &&
+      ContainsElement(*nodes, condition_node)) {
+    DCHECK(!condition_node->HasUnscheduledSuccessors());
+    // Remove the condition from the list of candidates and schedule it.
+    RemoveElement(*nodes, condition_node);
+    return condition_node;
+  }
+
+  return nullptr;
+}
+
+SchedulingNode* CriticalPathSchedulingNodeSelector::PopHighestPriorityNode(
+    ArenaVector<SchedulingNode*>* nodes, const SchedulingGraph& graph) {
+  DCHECK(!nodes->empty());
+  SchedulingNode* select_node = nullptr;
+
+  // Optimize for a materialized condition and its emit-before-use scenario.
+  select_node = SelectMaterializedCondition(nodes, graph);
+
+  if (select_node == nullptr) {
+    // Get highest priority node based on critical path information.
+    select_node = (*nodes)[0];
+    size_t select = 0;
+    for (size_t i = 1, e = nodes->size(); i < e; i++) {
+      SchedulingNode* check = (*nodes)[i];
+      SchedulingNode* candidate = (*nodes)[select];
+      select_node = GetHigherPrioritySchedulingNode(candidate, check);
+      if (select_node == check) {
+        select = i;
+      }
+    }
+    DeleteNodeAtIndex(nodes, select);
+  }
+
+  prev_select_ = select_node;
+  return select_node;
+}
+
+SchedulingNode* CriticalPathSchedulingNodeSelector::GetHigherPrioritySchedulingNode(
+    SchedulingNode* candidate, SchedulingNode* check) const {
+  uint32_t candidate_path = candidate->GetCriticalPath();
+  uint32_t check_path = check->GetCriticalPath();
+  // First look at the critical_path.
+  if (check_path != candidate_path) {
+    return check_path < candidate_path ? check : candidate;
+  }
+  // If both critical paths are equal, schedule instructions with a higher latency
+  // first in program order.
+  return check->GetLatency() < candidate->GetLatency() ? check : candidate;
+}
+
+void HScheduler::Schedule(HGraph* graph) {
+  for (HBasicBlock* block : graph->GetReversePostOrder()) {
+    if (IsSchedulable(block)) {
+      Schedule(block);
+    }
+  }
+}
+
+void HScheduler::Schedule(HBasicBlock* block) {
+  ArenaVector<SchedulingNode*> scheduling_nodes(arena_->Adapter(kArenaAllocScheduler));
+
+  // Build the scheduling graph.
+  scheduling_graph_.Clear();
+  for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    HInstruction* instruction = it.Current();
+    SchedulingNode* node = scheduling_graph_.AddNode(instruction, IsSchedulingBarrier(instruction));
+    CalculateLatency(node);
+    scheduling_nodes.push_back(node);
+  }
+
+  if (scheduling_graph_.Size() <= 1) {
+    scheduling_graph_.Clear();
+    return;
+  }
+
+  cursor_ = block->GetLastInstruction();
+
+  // Find the initial candidates for scheduling.
+  candidates_.clear();
+  for (SchedulingNode* node : scheduling_nodes) {
+    if (!node->HasUnscheduledSuccessors()) {
+      node->MaybeUpdateCriticalPath(node->GetLatency());
+      candidates_.push_back(node);
+    }
+  }
+
+  ArenaVector<SchedulingNode*> initial_candidates(arena_->Adapter(kArenaAllocScheduler));
+  if (kDumpDotSchedulingGraphs) {
+    // Remember the list of initial candidates for debug output purposes.
+    initial_candidates.assign(candidates_.begin(), candidates_.end());
+  }
+
+  // Schedule all nodes.
+  while (!candidates_.empty()) {
+    Schedule(selector_->PopHighestPriorityNode(&candidates_, scheduling_graph_));
+  }
+
+  if (kDumpDotSchedulingGraphs) {
+    // Dump the graph in `dot` format.
+    HGraph* graph = block->GetGraph();
+    std::stringstream description;
+    description << graph->GetDexFile().PrettyMethod(graph->GetMethodIdx())
+        << " B" << block->GetBlockId();
+    scheduling_graph_.DumpAsDotGraph(description.str(), initial_candidates);
+  }
+}
+
+void HScheduler::Schedule(SchedulingNode* scheduling_node) {
+  // Check whether any of the node's predecessors will be valid candidates after
+  // this node is scheduled.
+  uint32_t path_to_node = scheduling_node->GetCriticalPath();
+  for (SchedulingNode* predecessor : scheduling_node->GetDataPredecessors()) {
+    predecessor->MaybeUpdateCriticalPath(
+        path_to_node + predecessor->GetInternalLatency() + predecessor->GetLatency());
+    predecessor->DecrementNumberOfUnscheduledSuccessors();
+    if (!predecessor->HasUnscheduledSuccessors()) {
+      candidates_.push_back(predecessor);
+    }
+  }
+  for (SchedulingNode* predecessor : scheduling_node->GetOtherPredecessors()) {
+    // Do not update the critical path.
+    // The 'other' (so 'non-data') dependencies (usually) do not represent a
+    // 'material' dependency of nodes on others. They exist for program
+    // correctness. So we do not use them to compute the critical path.
+    predecessor->DecrementNumberOfUnscheduledSuccessors();
+    if (!predecessor->HasUnscheduledSuccessors()) {
+      candidates_.push_back(predecessor);
+    }
+  }
+
+  Schedule(scheduling_node->GetInstruction());
+}
+
+// Move an instruction after the cursor instruction within the same basic block.
+static void MoveAfterInBlock(HInstruction* instruction, HInstruction* cursor) {
+  DCHECK_EQ(instruction->GetBlock(), cursor->GetBlock());
+  DCHECK_NE(cursor, cursor->GetBlock()->GetLastInstruction());
+  DCHECK(!instruction->IsControlFlow());
+  DCHECK(!cursor->IsControlFlow());
+  instruction->MoveBefore(cursor->GetNext(), /* do_checks */ false);
+}
+
+void HScheduler::Schedule(HInstruction* instruction) {
+  if (instruction == cursor_) {
+    cursor_ = cursor_->GetPrevious();
+  } else {
+    MoveAfterInBlock(instruction, cursor_);
+  }
+}
+
+bool HScheduler::IsSchedulable(const HInstruction* instruction) const {
+  // We want to avoid exhaustively listing all instructions, so we first check
+  // for instruction categories that we know are safe.
+  if (instruction->IsControlFlow() ||
+      instruction->IsConstant()) {
+    return true;
+  }
+  // Currently all unary and binary operations are safe to schedule, so avoid
+  // checking for each of them individually.
+  // Since nothing prevents a new scheduling-unsafe HInstruction from subclassing
+  // HUnaryOperation (or HBinaryOperation), check in debug mode that we have
+  // the exhaustive lists here.
+  if (instruction->IsUnaryOperation()) {
+    DCHECK(instruction->IsBooleanNot() ||
+           instruction->IsNot() ||
+           instruction->IsNeg()) << "unexpected instruction " << instruction->DebugName();
+    return true;
+  }
+  if (instruction->IsBinaryOperation()) {
+    DCHECK(instruction->IsAdd() ||
+           instruction->IsAnd() ||
+           instruction->IsCompare() ||
+           instruction->IsCondition() ||
+           instruction->IsDiv() ||
+           instruction->IsMul() ||
+           instruction->IsOr() ||
+           instruction->IsRem() ||
+           instruction->IsRor() ||
+           instruction->IsShl() ||
+           instruction->IsShr() ||
+           instruction->IsSub() ||
+           instruction->IsUShr() ||
+           instruction->IsXor()) << "unexpected instruction " << instruction->DebugName();
+    return true;
+  }
+  // The scheduler should not see any of these.
+  DCHECK(!instruction->IsParallelMove()) << "unexpected instruction " << instruction->DebugName();
+  // List of instructions explicitly excluded:
+  //    HClearException
+  //    HClinitCheck
+  //    HDeoptimize
+  //    HLoadClass
+  //    HLoadException
+  //    HMemoryBarrier
+  //    HMonitorOperation
+  //    HNativeDebugInfo
+  //    HThrow
+  //    HTryBoundary
+  // TODO: Some of the instructions above may be safe to schedule (maybe as
+  // scheduling barriers).
+  return instruction->IsArrayGet() ||
+      instruction->IsArraySet() ||
+      instruction->IsArrayLength() ||
+      instruction->IsBoundType() ||
+      instruction->IsBoundsCheck() ||
+      instruction->IsCheckCast() ||
+      instruction->IsClassTableGet() ||
+      instruction->IsCurrentMethod() ||
+      instruction->IsDivZeroCheck() ||
+      instruction->IsInstanceFieldGet() ||
+      instruction->IsInstanceFieldSet() ||
+      instruction->IsInstanceOf() ||
+      instruction->IsInvokeInterface() ||
+      instruction->IsInvokeStaticOrDirect() ||
+      instruction->IsInvokeUnresolved() ||
+      instruction->IsInvokeVirtual() ||
+      instruction->IsLoadString() ||
+      instruction->IsNewArray() ||
+      instruction->IsNewInstance() ||
+      instruction->IsNullCheck() ||
+      instruction->IsPackedSwitch() ||
+      instruction->IsParameterValue() ||
+      instruction->IsPhi() ||
+      instruction->IsReturn() ||
+      instruction->IsReturnVoid() ||
+      instruction->IsSelect() ||
+      instruction->IsStaticFieldGet() ||
+      instruction->IsStaticFieldSet() ||
+      instruction->IsSuspendCheck() ||
+      instruction->IsTypeConversion() ||
+      instruction->IsUnresolvedInstanceFieldGet() ||
+      instruction->IsUnresolvedInstanceFieldSet() ||
+      instruction->IsUnresolvedStaticFieldGet() ||
+      instruction->IsUnresolvedStaticFieldSet();
+}
+
+bool HScheduler::IsSchedulable(const HBasicBlock* block) const {
+  // We may only be interested in loop blocks.
+  if (only_optimize_loop_blocks_ && !block->IsInLoop()) {
+    return false;
+  }
+  if (block->GetTryCatchInformation() != nullptr) {
+    // Do not schedule blocks that are part of try-catch: the scheduler cannot
+    // see whether the catch block makes assumptions about the instruction order
+    // in the try block. In the following example, if we enable the scheduler for
+    // the try block, MultiplyAccumulate may be scheduled before DivZeroCheck,
+    // which can result in an incorrect value in the catch block.
+    //   try {
+    //     a = a/b;    // DivZeroCheck
+    //                 // Div
+    //     c = c*d+e;  // MultiplyAccumulate
+    //   } catch {System.out.print(c); }
+    return false;
+  }
+  // Check whether all instructions in this block are schedulable.
+  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+    if (!IsSchedulable(it.Current())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool HScheduler::IsSchedulingBarrier(const HInstruction* instr) const {
+  return instr->IsControlFlow() ||
+      // Don't break calling convention.
+      instr->IsParameterValue() ||
+      // Code generation of goto relies on SuspendCheck's position.
+      instr->IsSuspendCheck();
+}
+
+void HInstructionScheduling::Run(bool only_optimize_loop_blocks,
+                                 bool schedule_randomly) {
+  // Avoid a compilation error when compiling for an unsupported instruction set.
+  UNUSED(only_optimize_loop_blocks);
+  UNUSED(schedule_randomly);
+  switch (instruction_set_) {
+#ifdef ART_ENABLE_CODEGEN_arm64
+    case kArm64: {
+      // Phase-local allocator that allocates the scheduler's internal data structures,
+      // like scheduling nodes, the internal nodes map, dependencies, etc.
+      ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool());
+
+      CriticalPathSchedulingNodeSelector critical_path_selector;
+      RandomSchedulingNodeSelector random_selector;
+      SchedulingNodeSelector* selector = schedule_randomly
+          ? static_cast<SchedulingNodeSelector*>(&random_selector)
+          : static_cast<SchedulingNodeSelector*>(&critical_path_selector);
+
+      arm64::HSchedulerARM64 scheduler(&arena_allocator, selector);
+      scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
+      scheduler.Schedule(graph_);
+      break;
+    }
+#endif
+    default:
+      break;
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
new file mode 100644
index 0000000..ab0dad4
--- /dev/null
+++ b/compiler/optimizing/scheduler.h
@@ -0,0 +1,487 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_H_
+#define ART_COMPILER_OPTIMIZING_SCHEDULER_H_
+
+#include <fstream>
+
+#include "base/time_utils.h"
+#include "driver/compiler_driver.h"
+#include "nodes.h"
+#include "optimization.h"
+
+namespace art {
+
+// General description of instruction scheduling.
+//
+// This pass tries to improve the quality of the generated code by reordering
+// instructions in the graph to avoid execution delays caused by execution
+// dependencies.
+// Currently, scheduling is performed at the block level, so no `HInstruction`
+// ever leaves its block in this pass.
+//
+// The scheduling process iterates through blocks in the graph. For blocks that
+// we can and want to schedule:
+// 1) Build a dependency graph for instructions.
+//    It includes data dependencies (inputs/uses), but also environment
+//    dependencies and side-effect dependencies.
+// 2) Schedule the dependency graph.
+//    This is a topological sort of the dependency graph, using heuristics to
+//    decide which node to schedule first when there are multiple candidates.
+//
+// A few factors impacting the quality of the scheduling are:
+// - The heuristics used to decide what node to schedule in the topological sort
+//   when there are multiple valid candidates. There is a wide range of
+//   complexity possible here, going from a simple model only considering
+//   latencies, to a super detailed CPU pipeline model.
+// - Fewer dependencies in the dependency graph give more freedom for the
+//   scheduling heuristics. For example, de-aliasing can open up opportunities
+//   for reordering memory accesses.
+// - The level of abstraction of the IR. It is easier to evaluate scheduling for
+//   IRs that translate to a single assembly instruction than for IRs
+//   that generate multiple assembly instructions or generate different code
+//   depending on properties of the IR.
+// - Scheduling is performed before register allocation, so it is not aware of
+//   the impact that moving instructions has on register allocation.
+//
+//
+// The scheduling code uses the terms predecessors, successors, and dependencies.
+// This can be confusing at times, so here are clarifications.
+// These terms are used from the point of view of the program dependency graph:
+// the inputs of an instruction are part of its dependencies, and hence part of
+// its predecessors; likewise, the uses of an instruction are (part of) its
+// successors.
+// (Side-effect dependencies can yield predecessors or successors that are not
+// inputs or uses.)
+//
+// Here is a trivial example. For the Java code:
+//
+//    int a = 1 + 2;
+//
+// we would have the instructions
+//
+//    i1 HIntConstant 1
+//    i2 HIntConstant 2
+//    i3 HAdd [i1,i2]
+//
+// `i1` and `i2` are predecessors of `i3`.
+// `i3` is a successor of `i1` and a successor of `i2`.
+// In a scheduling graph for this code we would have three nodes `n1`, `n2`,
+// and `n3` (respectively for instructions `i1`, `i2`, and `i3`).
+// Conceptually the program dependency graph for this would contain two edges
+//
+//    n1 -> n3
+//    n2 -> n3
+//
+// Since we schedule backwards (starting from the last instruction in each basic
+// block), each node keeps a list of pointers to its predecessors. So `n3` would
+// keep pointers to its predecessors `n1` and `n2`.
+//
+// Node dependencies are also referred to from the program dependency graph
+// point of view: we say that node `B` immediately depends on `A` if there is an
+// edge from `A` to `B` in the program dependency graph. `A` is a predecessor of
+// `B`, `B` is a successor of `A`. In the example above `n3` depends on `n1` and
+// `n2`.
+// Since nodes in the scheduling graph keep a list of their predecessors, node
+// `B` will have a pointer to its predecessor `A`.
+// As we schedule backwards, `B` will be selected for scheduling before `A` is.
+//
+// So the scheduling for the example above could happen as follows.
+//
+//    |---------------------------+------------------------|
+//    | candidates for scheduling | instructions scheduled |
+//    | --------------------------+------------------------|
+//
+// The only node without successors is `n3`, so it is the only initial
+// candidate.
+//
+//    | n3                        | (none)                 |
+//
+// We schedule `n3` as the last (and only) instruction. All its predecessors
+// that do not have any unscheduled successors become candidates. Here, that
+// means `n1` and `n2`.
+//
+//    | n1, n2                    | n3                     |
+//
+// One of the candidates is selected. In practice this is where scheduling
+// heuristics kick in, to decide which of the candidates should be selected.
+// In this example, let it be `n1`. It is scheduled before previously scheduled
+// nodes (in program order). There are no other nodes to add to the list of
+// candidates.
+//
+//    | n2                        | n1                     |
+//    |                           | n3                     |
+//
+// The only candidate available for scheduling is `n2`. Schedule it before
+// (in program order) the previously scheduled nodes.
+//
+//    | (none)                    | n2                     |
+//    |                           | n1                     |
+//    |                           | n3                     |
+//    |---------------------------+------------------------|
+//
+// So finally the instructions will be executed in the order `i2`, `i1`, and `i3`.
+// In this trivial example, it does not matter which of `i1` and `i2` is
+// scheduled first since they are constants. However the same process would
+// apply if `i1` and `i2` were actual operations (for example `HMul` and `HDiv`).
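+//
+// As a rough sketch (not the exact implementation, which lives in
+// `HScheduler::Schedule()`), the backwards list-scheduling loop described above
+// can be pictured with the `SchedulingNode` API declared below:
+//
+//    // `candidates` initially holds the nodes with no unscheduled successors.
+//    while (!candidates.empty()) {
+//      SchedulingNode* node = selector->PopHighestPriorityNode(&candidates, graph);
+//      // Insert `node`'s instruction before the previously scheduled ones.
+//      for (SchedulingNode* predecessor : node->GetDataPredecessors()) {
+//        predecessor->DecrementNumberOfUnscheduledSuccessors();
+//        if (!predecessor->HasUnscheduledSuccessors()) {
+//          candidates.push_back(predecessor);
+//        }
+//      }
+//      // The same update is applied to `GetOtherPredecessors()`.
+//    }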
+
+// Set to true to have instruction scheduling dump scheduling graphs to the file
+// `scheduling_graphs.dot`. See `SchedulingGraph::DumpAsDotGraph()`.
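+// The generated `.dot` file can then be rendered with Graphviz, for example
+// (assuming `dot` is installed): `dot -Tpdf scheduling_graphs.dot -o graphs.pdf`.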
+static constexpr bool kDumpDotSchedulingGraphs = false;
+
+// Typically used as a default instruction latency.
+static constexpr uint32_t kGenericInstructionLatency = 1;
+
+class HScheduler;
+
+/**
+ * A node representing an `HInstruction` in the `SchedulingGraph`.
+ */
+class SchedulingNode : public ArenaObject<kArenaAllocScheduler> {
+ public:
+  SchedulingNode(HInstruction* instr, ArenaAllocator* arena, bool is_scheduling_barrier)
+      : latency_(0),
+        internal_latency_(0),
+        critical_path_(0),
+        instruction_(instr),
+        is_scheduling_barrier_(is_scheduling_barrier),
+        data_predecessors_(arena->Adapter(kArenaAllocScheduler)),
+        other_predecessors_(arena->Adapter(kArenaAllocScheduler)),
+        num_unscheduled_successors_(0) {
+    data_predecessors_.reserve(kPreallocatedPredecessors);
+  }
+
+  void AddDataPredecessor(SchedulingNode* predecessor) {
+    data_predecessors_.push_back(predecessor);
+    predecessor->num_unscheduled_successors_++;
+  }
+
+  void AddOtherPredecessor(SchedulingNode* predecessor) {
+    other_predecessors_.push_back(predecessor);
+    predecessor->num_unscheduled_successors_++;
+  }
+
+  void DecrementNumberOfUnscheduledSuccessors() {
+    num_unscheduled_successors_--;
+  }
+
+  void MaybeUpdateCriticalPath(uint32_t other_critical_path) {
+    critical_path_ = std::max(critical_path_, other_critical_path);
+  }
+
+  bool HasUnscheduledSuccessors() const {
+    return num_unscheduled_successors_ != 0;
+  }
+
+  HInstruction* GetInstruction() const { return instruction_; }
+  uint32_t GetLatency() const { return latency_; }
+  void SetLatency(uint32_t latency) { latency_ = latency; }
+  uint32_t GetInternalLatency() const { return internal_latency_; }
+  void SetInternalLatency(uint32_t internal_latency) { internal_latency_ = internal_latency; }
+  uint32_t GetCriticalPath() const { return critical_path_; }
+  bool IsSchedulingBarrier() const { return is_scheduling_barrier_; }
+  const ArenaVector<SchedulingNode*>& GetDataPredecessors() const { return data_predecessors_; }
+  const ArenaVector<SchedulingNode*>& GetOtherPredecessors() const { return other_predecessors_; }
+
+ private:
+  // The latency of this node. It represents the latency from the moment the
+  // last instruction for this node has executed to the moment the result
+  // produced by this node is available to its users.
+  uint32_t latency_;
+  // This represents the time spent *within* the generated code for this node.
+  // It should be zero for nodes that only generate a single instruction.
+  uint32_t internal_latency_;
+
+  // The critical path from this instruction to the end of scheduling. It is
+  // used by the scheduling heuristics to measure the priority of this instruction.
+  // It is defined as
+  //     critical_path_ = latency_ + max((use.internal_latency_ + use.critical_path_) for all uses)
+  // (Note that here 'uses' is equivalent to 'data successors'. Also see comments in
+  // `HScheduler::Schedule(SchedulingNode* scheduling_node)`).
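+  // For example, with latency_ == 2 and two uses whose (use.internal_latency_ +
+  // use.critical_path_) values are 3 and 7, critical_path_ == 2 + max(3, 7) == 9.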
+  uint32_t critical_path_;
+
+  // The instruction that this node represents.
+  HInstruction* const instruction_;
+
+  // If a node is a scheduling barrier, other nodes cannot be scheduled before it.
+  const bool is_scheduling_barrier_;
+
+  // The lists of predecessors. They cannot be scheduled before this node. Once
+  // this node is scheduled, we check whether any of its predecessors has become a
+  // valid candidate for scheduling.
+  // Predecessors in `data_predecessors_` are data dependencies. Those in
+  // `other_predecessors_` contain side-effect dependencies, environment
+  // dependencies, and scheduling barrier dependencies.
+  ArenaVector<SchedulingNode*> data_predecessors_;
+  ArenaVector<SchedulingNode*> other_predecessors_;
+
+  // The number of unscheduled successors for this node. This number is
+  // decremented as successors are scheduled. When it reaches zero this node
+  // becomes a valid candidate to schedule.
+  uint32_t num_unscheduled_successors_;
+
+  static constexpr size_t kPreallocatedPredecessors = 4;
+};
+
+/*
+ * Directed acyclic graph for scheduling.
+ */
+class SchedulingGraph : public ValueObject {
+ public:
+  SchedulingGraph(const HScheduler* scheduler, ArenaAllocator* arena)
+      : scheduler_(scheduler),
+        arena_(arena),
+        contains_scheduling_barrier_(false),
+        nodes_map_(arena_->Adapter(kArenaAllocScheduler)) {}
+
+  SchedulingNode* AddNode(HInstruction* instr, bool is_scheduling_barrier = false) {
+    SchedulingNode* node = new (arena_) SchedulingNode(instr, arena_, is_scheduling_barrier);
+    nodes_map_.Insert(std::make_pair(instr, node));
+    contains_scheduling_barrier_ |= is_scheduling_barrier;
+    AddDependencies(instr, is_scheduling_barrier);
+    return node;
+  }
+
+  void Clear() {
+    nodes_map_.Clear();
+    contains_scheduling_barrier_ = false;
+  }
+
+  SchedulingNode* GetNode(const HInstruction* instr) const {
+    auto it = nodes_map_.Find(instr);
+    if (it == nodes_map_.end()) {
+      return nullptr;
+    } else {
+      return it->second;
+    }
+  }
+
+  bool IsSchedulingBarrier(const HInstruction* instruction) const;
+
+  bool HasImmediateDataDependency(const SchedulingNode* node, const SchedulingNode* other) const;
+  bool HasImmediateDataDependency(const HInstruction* node, const HInstruction* other) const;
+  bool HasImmediateOtherDependency(const SchedulingNode* node, const SchedulingNode* other) const;
+  bool HasImmediateOtherDependency(const HInstruction* node, const HInstruction* other) const;
+
+  size_t Size() const {
+    return nodes_map_.Size();
+  }
+
+  // Dump the scheduling graph, in dot file format, appending it to the file
+  // `scheduling_graphs.dot`.
+  void DumpAsDotGraph(const std::string& description,
+                      const ArenaVector<SchedulingNode*>& initial_candidates);
+
+ protected:
+  void AddDependency(SchedulingNode* node, SchedulingNode* dependency, bool is_data_dependency);
+  void AddDataDependency(SchedulingNode* node, SchedulingNode* dependency) {
+    AddDependency(node, dependency, /*is_data_dependency*/true);
+  }
+  void AddOtherDependency(SchedulingNode* node, SchedulingNode* dependency) {
+    AddDependency(node, dependency, /*is_data_dependency*/false);
+  }
+
+  // Add dependencies for the given `HInstruction`: inputs, environment, and side-effects.
+  void AddDependencies(HInstruction* instruction, bool is_scheduling_barrier = false);
+
+  const HScheduler* const scheduler_;
+
+  ArenaAllocator* const arena_;
+
+  bool contains_scheduling_barrier_;
+
+  ArenaHashMap<const HInstruction*, SchedulingNode*> nodes_map_;
+};
+
+/*
+ * The visitors derived from this base class are used by schedulers to evaluate
+ * the latencies of `HInstruction`s.
+ */
+class SchedulingLatencyVisitor : public HGraphDelegateVisitor {
+ public:
+  // This class and its sub-classes will never be used to drive a visit of an
+  // `HGraph` but only to visit `HInstructions` one at a time, so we do not need
+  // to pass a valid graph to `HGraphDelegateVisitor()`.
+  SchedulingLatencyVisitor() : HGraphDelegateVisitor(nullptr) {}
+
+  void VisitInstruction(HInstruction* instruction) OVERRIDE {
+    LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". "
+        "Architecture-specific scheduling latency visitors must handle all instructions"
+        " (potentially by overriding the generic `VisitInstruction()`.";
+    UNREACHABLE();
+  }
+
+  void Visit(HInstruction* instruction) {
+    instruction->Accept(this);
+  }
+
+  void CalculateLatency(SchedulingNode* node) {
+    // By default nodes have no internal latency.
+    last_visited_internal_latency_ = 0;
+    Visit(node->GetInstruction());
+  }
+
+  uint32_t GetLastVisitedLatency() const { return last_visited_latency_; }
+  uint32_t GetLastVisitedInternalLatency() const { return last_visited_internal_latency_; }
+
+ protected:
+  // The latency of the most recently visited SchedulingNode.
+  // This is for reporting the latency value to the user of this visitor.
+  uint32_t last_visited_latency_;
+  // This represents the time spent *within* the generated code for the most recently visited
+  // SchedulingNode. This is for reporting the internal latency value to the user of this visitor.
+  uint32_t last_visited_internal_latency_;
+};
+
+class SchedulingNodeSelector : public ArenaObject<kArenaAllocScheduler> {
+ public:
+  virtual SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
+                                                 const SchedulingGraph& graph) = 0;
+  virtual ~SchedulingNodeSelector() {}
+ protected:
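+  // Removes the node at `index` without preserving order: the last element is
+  // moved into its slot and the vector is shrunk by one.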
+  static void DeleteNodeAtIndex(ArenaVector<SchedulingNode*>* nodes, size_t index) {
+    (*nodes)[index] = nodes->back();
+    nodes->pop_back();
+  }
+};
+
+/*
+ * Select a `SchedulingNode` at random within the candidates.
+ */
+class RandomSchedulingNodeSelector : public SchedulingNodeSelector {
+ public:
+  RandomSchedulingNodeSelector() : seed_(0) {
+    seed_ = static_cast<uint32_t>(NanoTime());
+    srand(seed_);
+  }
+
+  SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
+                                         const SchedulingGraph& graph) OVERRIDE {
+    UNUSED(graph);
+    DCHECK(!nodes->empty());
+    size_t select = rand_r(&seed_) % nodes->size();
+    SchedulingNode* select_node = (*nodes)[select];
+    DeleteNodeAtIndex(nodes, select);
+    return select_node;
+  }
+
+  uint32_t seed_;
+};
+
+/*
+ * Select a `SchedulingNode` according to critical path information,
+ * with heuristics to favor certain instruction patterns like materialized conditions.
+ */
+class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
+ public:
+  CriticalPathSchedulingNodeSelector() : prev_select_(nullptr) {}
+
+  SchedulingNode* PopHighestPriorityNode(ArenaVector<SchedulingNode*>* nodes,
+                                         const SchedulingGraph& graph) OVERRIDE;
+
+ protected:
+  SchedulingNode* GetHigherPrioritySchedulingNode(SchedulingNode* candidate,
+                                                  SchedulingNode* check) const;
+
+  SchedulingNode* SelectMaterializedCondition(ArenaVector<SchedulingNode*>* nodes,
+                                               const SchedulingGraph& graph) const;
+
+ private:
+  const SchedulingNode* prev_select_;
+};
+
+class HScheduler {
+ public:
+  HScheduler(ArenaAllocator* arena,
+             SchedulingLatencyVisitor* latency_visitor,
+             SchedulingNodeSelector* selector)
+      : arena_(arena),
+        latency_visitor_(latency_visitor),
+        selector_(selector),
+        only_optimize_loop_blocks_(true),
+        scheduling_graph_(this, arena),
+        candidates_(arena_->Adapter(kArenaAllocScheduler)) {}
+  virtual ~HScheduler() {}
+
+  void Schedule(HGraph* graph);
+
+  void SetOnlyOptimizeLoopBlocks(bool loop_only) { only_optimize_loop_blocks_ = loop_only; }
+
+  // Instructions cannot be rescheduled across a scheduling barrier.
+  virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
+
+ protected:
+  void Schedule(HBasicBlock* block);
+  void Schedule(SchedulingNode* scheduling_node);
+  void Schedule(HInstruction* instruction);
+
+  // Any instruction returning `false` via this method will prevent its
+  // containing basic block from being scheduled.
+  // This method is used to restrict scheduling to instructions that we know are
+  // safe to handle.
+  virtual bool IsSchedulable(const HInstruction* instruction) const;
+  bool IsSchedulable(const HBasicBlock* block) const;
+
+  void CalculateLatency(SchedulingNode* node) {
+    latency_visitor_->CalculateLatency(node);
+    node->SetLatency(latency_visitor_->GetLastVisitedLatency());
+    node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
+  }
+
+  ArenaAllocator* const arena_;
+  SchedulingLatencyVisitor* const latency_visitor_;
+  SchedulingNodeSelector* const selector_;
+  bool only_optimize_loop_blocks_;
+
+  // We instantiate the members below as part of this class to avoid
+  // instantiating them locally every time a block is scheduled.
+  SchedulingGraph scheduling_graph_;
+  // A pointer indicating where the next instruction to be scheduled will be inserted.
+  HInstruction* cursor_;
+  // The list of candidates for scheduling. A node becomes a candidate when all
+  // its predecessors have been scheduled.
+  ArenaVector<SchedulingNode*> candidates_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HScheduler);
+};
+
+inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction) const {
+  return scheduler_->IsSchedulingBarrier(instruction);
+}
+
+class HInstructionScheduling : public HOptimization {
+ public:
+  HInstructionScheduling(HGraph* graph, InstructionSet instruction_set)
+      : HOptimization(graph, kInstructionScheduling),
+        instruction_set_(instruction_set) {}
+
+  void Run() {
+    Run(/*only_optimize_loop_blocks*/ true, /*schedule_randomly*/ false);
+  }
+  void Run(bool only_optimize_loop_blocks, bool schedule_randomly);
+
+  static constexpr const char* kInstructionScheduling = "scheduler";
+
+  const InstructionSet instruction_set_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HInstructionScheduling);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_H_
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
new file mode 100644
index 0000000..e3701fb
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scheduler_arm64.h"
+#include "code_generator_utils.h"
+
+namespace art {
+namespace arm64 {
+
+void SchedulingLatencyVisitorARM64::VisitBinaryOperation(HBinaryOperation* instr) {
+  last_visited_latency_ = Primitive::IsFloatingPointType(instr->GetResultType())
+      ? kArm64FloatingPointOpLatency
+      : kArm64IntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitBitwiseNegatedRight(
+    HBitwiseNegatedRight* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64IntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArm64DataProcWithShifterOp(
+    HArm64DataProcWithShifterOp* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64DataProcWithShifterOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitIntermediateAddress(
+    HIntermediateAddress* ATTRIBUTE_UNUSED) {
+  // Although the generated code is a simple `add` instruction, we found empirically
+  // that spacing it apart from its uses in memory accesses is beneficial.
+  last_visited_latency_ = kArm64IntegerOpLatency + 2;
+}
+
+void SchedulingLatencyVisitorARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MulIntegerLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArrayGet(HArrayGet* instruction) {
+  if (!instruction->GetArray()->IsIntermediateAddress()) {
+    // Take the intermediate address computation into account.
+    last_visited_internal_latency_ = kArm64IntegerOpLatency;
+  }
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArrayLength(HArrayLength* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitArraySet(HArraySet* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryStoreLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64IntegerOpLatency;
+  // Users do not use any data results.
+  last_visited_latency_ = 0;
+}
+
+void SchedulingLatencyVisitorARM64::VisitDiv(HDiv* instr) {
+  Primitive::Type type = instr->GetResultType();
+  switch (type) {
+    case Primitive::kPrimFloat:
+      last_visited_latency_ = kArm64DivFloatLatency;
+      break;
+    case Primitive::kPrimDouble:
+      last_visited_latency_ = kArm64DivDoubleLatency;
+      break;
+    default:
+      // Follow the code path used by code generation.
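+      // For a constant divisor the generated code avoids the `sdiv`: nothing is
+      // emitted for 0 (a DivZeroCheck is expected to throw first), +/-1 needs a
+      // single integer op, powers of two use an add+shift sequence, and other
+      // constants use a multiply-based sequence; the latencies below mirror
+      // those paths.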
+      if (instr->GetRight()->IsConstant()) {
+        int64_t imm = Int64FromConstant(instr->GetRight()->AsConstant());
+        if (imm == 0) {
+          last_visited_internal_latency_ = 0;
+          last_visited_latency_ = 0;
+        } else if (imm == 1 || imm == -1) {
+          last_visited_internal_latency_ = 0;
+          last_visited_latency_ = kArm64IntegerOpLatency;
+        } else if (IsPowerOfTwo(AbsOrMin(imm))) {
+          last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+          last_visited_latency_ = kArm64IntegerOpLatency;
+        } else {
+          DCHECK(imm <= -2 || imm >= 2);
+          last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+          last_visited_latency_ = kArm64MulIntegerLatency;
+        }
+      } else {
+        last_visited_latency_ = kArm64DivIntegerLatency;
+      }
+      break;
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitInstanceFieldGet(HInstanceFieldGet* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64CallInternalLatency;
+  last_visited_latency_ = kArm64IntegerOpLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64CallInternalLatency;
+  last_visited_latency_ = kArm64CallLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64LoadStringInternalLatency;
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitMul(HMul* instr) {
+  last_visited_latency_ = Primitive::IsFloatingPointType(instr->GetResultType())
+      ? kArm64MulFloatingPointLatency
+      : kArm64MulIntegerLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) {
+  last_visited_internal_latency_ = kArm64IntegerOpLatency + kArm64CallInternalLatency;
+  last_visited_latency_ = kArm64CallLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitNewInstance(HNewInstance* instruction) {
+  if (instruction->IsStringAlloc()) {
+    last_visited_internal_latency_ = 2 + kArm64MemoryLoadLatency + kArm64CallInternalLatency;
+  } else {
+    last_visited_internal_latency_ = kArm64CallInternalLatency;
+  }
+  last_visited_latency_ = kArm64CallLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitRem(HRem* instruction) {
+  if (Primitive::IsFloatingPointType(instruction->GetResultType())) {
+    last_visited_internal_latency_ = kArm64CallInternalLatency;
+    last_visited_latency_ = kArm64CallLatency;
+  } else {
+    // Follow the code path used by code generation.
+    if (instruction->GetRight()->IsConstant()) {
+      int64_t imm = Int64FromConstant(instruction->GetRight()->AsConstant());
+      if (imm == 0) {
+        last_visited_internal_latency_ = 0;
+        last_visited_latency_ = 0;
+      } else if (imm == 1 || imm == -1) {
+        last_visited_internal_latency_ = 0;
+        last_visited_latency_ = kArm64IntegerOpLatency;
+      } else if (IsPowerOfTwo(AbsOrMin(imm))) {
+        last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+        last_visited_latency_ = kArm64IntegerOpLatency;
+      } else {
+        DCHECK(imm <= -2 || imm >= 2);
+        last_visited_internal_latency_ = 4 * kArm64IntegerOpLatency;
+        last_visited_latency_ = kArm64MulIntegerLatency;
+      }
+    } else {
+      last_visited_internal_latency_ = kArm64DivIntegerLatency;
+      last_visited_latency_ = kArm64MulIntegerLatency;
+    }
+  }
+}
+
+void SchedulingLatencyVisitorARM64::VisitStaticFieldGet(HStaticFieldGet* ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64MemoryLoadLatency;
+}
+
+void SchedulingLatencyVisitorARM64::VisitSuspendCheck(HSuspendCheck* instruction) {
+  HBasicBlock* block = instruction->GetBlock();
+  DCHECK((block->GetLoopInformation() != nullptr) ||
+         (block->IsEntryBlock() && instruction->GetNext()->IsGoto()));
+  // Users do not use any data results.
+  last_visited_latency_ = 0;
+}
+
+void SchedulingLatencyVisitorARM64::VisitTypeConversion(HTypeConversion* instr) {
+  if (Primitive::IsFloatingPointType(instr->GetResultType()) ||
+      Primitive::IsFloatingPointType(instr->GetInputType())) {
+    last_visited_latency_ = kArm64TypeConversionFloatingPointIntegerLatency;
+  } else {
+    last_visited_latency_ = kArm64IntegerOpLatency;
+  }
+}
+
+}  // namespace arm64
+}  // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
new file mode 100644
index 0000000..702027c
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
+#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
+
+#include "scheduler.h"
+
+namespace art {
+namespace arm64 {
+
+static constexpr uint32_t kArm64MemoryLoadLatency = 5;
+static constexpr uint32_t kArm64MemoryStoreLatency = 3;
+
+static constexpr uint32_t kArm64CallInternalLatency = 10;
+static constexpr uint32_t kArm64CallLatency = 5;
+
+// AArch64 instruction latencies.
+// We currently assume that all arm64 CPUs share the same instruction latency list.
+static constexpr uint32_t kArm64IntegerOpLatency = 2;
+static constexpr uint32_t kArm64FloatingPointOpLatency = 5;
+
+static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
+static constexpr uint32_t kArm64DivDoubleLatency = 30;
+static constexpr uint32_t kArm64DivFloatLatency = 15;
+static constexpr uint32_t kArm64DivIntegerLatency = 5;
+static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
+static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
+static constexpr uint32_t kArm64MulIntegerLatency = 6;
+static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
+
+class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
+ public:
+  // Default visitor for instructions not handled specifically below.
+  void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) {
+    last_visited_latency_ = kArm64IntegerOpLatency;
+  }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
+  M(ArrayGet         , unused)                   \
+  M(ArrayLength      , unused)                   \
+  M(ArraySet         , unused)                   \
+  M(BinaryOperation  , unused)                   \
+  M(BoundsCheck      , unused)                   \
+  M(Div              , unused)                   \
+  M(InstanceFieldGet , unused)                   \
+  M(InstanceOf       , unused)                   \
+  M(Invoke           , unused)                   \
+  M(LoadString       , unused)                   \
+  M(Mul              , unused)                   \
+  M(NewArray         , unused)                   \
+  M(NewInstance      , unused)                   \
+  M(Rem              , unused)                   \
+  M(StaticFieldGet   , unused)                   \
+  M(SuspendCheck     , unused)                   \
+  M(TypeConversion   , unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+  M(BitwiseNegatedRight, unused)                 \
+  M(MultiplyAccumulate, unused)                  \
+  M(IntermediateAddress, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused)  \
+  void Visit##type(H##type* instruction) OVERRIDE;
+
+  FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+};
+
+class HSchedulerARM64 : public HScheduler {
+ public:
+  HSchedulerARM64(ArenaAllocator* arena, SchedulingNodeSelector* selector)
+      : HScheduler(arena, &arm64_latency_visitor_, selector) {}
+  ~HSchedulerARM64() OVERRIDE {}
+
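+  // IRs covered by the ARM64 latency visitor (shared and ARM64-specific ones)
+  // are always schedulable; anything else defers to the conservative check in
+  // the base class.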
+  bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
+#define CASE_INSTRUCTION_KIND(type, unused) case \
+  HInstruction::InstructionKind::k##type:
+    switch (instruction->GetKind()) {
+      FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
+        return true;
+      FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND)
+        return true;
+      default:
+        return HScheduler::IsSchedulable(instruction);
+    }
+#undef CASE_INSTRUCTION_KIND
+  }
+
+ private:
+  SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
+  DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);
+};
+
+}  // namespace arm64
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM64_H_
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
new file mode 100644
index 0000000..31d13e2
--- /dev/null
+++ b/compiler/optimizing/scheduler_test.cc
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/arena_allocator.h"
+#include "builder.h"
+#include "codegen_test_utils.h"
+#include "common_compiler_test.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+#include "pc_relative_fixups_x86.h"
+#include "register_allocator.h"
+#include "scheduler.h"
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "scheduler_arm64.h"
+#endif
+
+namespace art {
+
+// Return all combinations of ISA and code generator that are executable on
+// hardware or on a simulator, and that we'd like to test.
+static ::std::vector<CodegenTargetConfig> GetTargetConfigs() {
+  ::std::vector<CodegenTargetConfig> v;
+  ::std::vector<CodegenTargetConfig> test_config_candidates = {
+#ifdef ART_ENABLE_CODEGEN_arm
+    CodegenTargetConfig(kArm, create_codegen_arm),
+    CodegenTargetConfig(kThumb2, create_codegen_arm),
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
+    CodegenTargetConfig(kArm64, create_codegen_arm64),
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
+    CodegenTargetConfig(kX86, create_codegen_x86),
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86_64
+    CodegenTargetConfig(kX86_64, create_codegen_x86_64),
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
+    CodegenTargetConfig(kMips, create_codegen_mips),
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips64
+    CodegenTargetConfig(kMips64, create_codegen_mips64)
+#endif
+  };
+
+  for (auto test_config : test_config_candidates) {
+    if (CanExecute(test_config.GetInstructionSet())) {
+      v.push_back(test_config);
+    }
+  }
+
+  return v;
+}
+
+class SchedulerTest : public CommonCompilerTest {};
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+TEST_F(SchedulerTest, DependencyGraph) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = CreateGraph(&allocator);
+  HBasicBlock* entry = new (&allocator) HBasicBlock(graph);
+  HBasicBlock* block1 = new (&allocator) HBasicBlock(graph);
+  graph->AddBlock(entry);
+  graph->AddBlock(block1);
+  graph->SetEntryBlock(entry);
+
+  // entry:
+  // array         ParameterValue
+  // c1            IntConstant
+  // c2            IntConstant
+  // block1:
+  // add1          Add [c1, c2]
+  // add2          Add [add1, c2]
+  // mul           Mul [add1, add2]
+  // div_check     DivZeroCheck [add2] (env: add2, mul)
+  // div           Div [add1, div_check]
+  // array_get1    ArrayGet [array, add1]
+  // array_set1    ArraySet [array, add1, add2]
+  // array_get2    ArrayGet [array, add1]
+  // array_set2    ArraySet [array, add1, add2]
+
+  HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(),
+                                                         dex::TypeIndex(0),
+                                                         0,
+                                                         Primitive::kPrimNot);
+  HInstruction* c1 = graph->GetIntConstant(1);
+  HInstruction* c2 = graph->GetIntConstant(10);
+  HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2);
+  HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2);
+  HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2);
+  HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0);
+  HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0);
+  HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0);
+  HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
+  HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0);
+  HInstruction* array_set2 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0);
+
+  DCHECK(div_check->CanThrow());
+
+  entry->AddInstruction(array);
+
+  HInstruction* block_instructions[] = {add1,
+                                        add2,
+                                        mul,
+                                        div_check,
+                                        div,
+                                        array_get1,
+                                        array_set1,
+                                        array_get2,
+                                        array_set2};
+  for (auto instr : block_instructions) {
+    block1->AddInstruction(instr);
+  }
+
+  HEnvironment* environment = new (&allocator) HEnvironment(&allocator,
+                                                            2,
+                                                            graph->GetArtMethod(),
+                                                            0,
+                                                            div_check);
+  div_check->SetRawEnvironment(environment);
+  environment->SetRawEnvAt(0, add2);
+  add2->AddEnvUseAt(div_check->GetEnvironment(), 0);
+  environment->SetRawEnvAt(1, mul);
+  mul->AddEnvUseAt(div_check->GetEnvironment(), 1);
+
+  ArenaAllocator* arena = graph->GetArena();
+  CriticalPathSchedulingNodeSelector critical_path_selector;
+  arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector);
+  SchedulingGraph scheduling_graph(&scheduler, arena);
+  // Instructions must be inserted in reverse order into the scheduling graph.
+  for (auto instr : ReverseRange(block_instructions)) {
+    scheduling_graph.AddNode(instr);
+  }
+
+  // Dependencies should not cross basic blocks.
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1));
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2));
+
+  // Define-use dependency.
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1));
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2));
+  ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2));
+
+  // Read and write dependencies.
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2));
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1));
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1));
+
+  // Env dependency.
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul));
+  ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check));
+
+  // CanThrow.
+  ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check));
+}
+#endif
+
+static void CompileWithRandomSchedulerAndRun(const uint16_t* data,
+                                             bool has_result,
+                                             int expected) {
+  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+    HGraph* graph = CreateCFG(&arena, data);
+
+    // Schedule the graph randomly.
+    HInstructionScheduling scheduling(graph, target_config.GetInstructionSet());
+    scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true);
+
+    RunCode(target_config,
+            graph,
+            [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); },
+            has_result, expected);
+  }
+}
+
+TEST_F(SchedulerTest, RandomScheduling) {
+  //
+  //  Java source: crafted code to check that (random) scheduling produces the correct result.
+  //
+  //  int result = 0;
+  //  float fr = 10.0f;
+  //  for (int i = 1; i < 10; i++) {
+  //    fr ++;
+  //    int t1 = result >> i;
+  //    int t2 = result * i;
+  //    result = result + t1 - t2;
+  //    fr = fr / i;
+  //    result += (int)fr;
+  //  }
+  //  return result;
+  //
+  const uint16_t data[] = SIX_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 0 << 12 | 2 << 8,          // const/4 v2, #int 0
+    Instruction::CONST_HIGH16 | 0 << 8, 0x4120,       // const/high16 v0, #float 10.0 // #41200000
+    Instruction::CONST_4 | 1 << 12 | 1 << 8,          // const/4 v1, #int 1
+    Instruction::CONST_16 | 5 << 8, 0x000a,           // const/16 v5, #int 10
+    Instruction::IF_GE | 5 << 12 | 1 << 8, 0x0014,    // if-ge v1, v5, 001a // +0014
+    Instruction::CONST_HIGH16 | 5 << 8, 0x3f80,       // const/high16 v5, #float 1.0 // #3f800000
+    Instruction::ADD_FLOAT_2ADDR | 5 << 12 | 0 << 8,  // add-float/2addr v0, v5
+    Instruction::SHR_INT | 3 << 8, 1 << 8 | 2,        // shr-int v3, v2, v1
+    Instruction::MUL_INT | 4 << 8, 1 << 8 | 2,        // mul-int v4, v2, v1
+    Instruction::ADD_INT | 5 << 8, 3 << 8 | 2,        // add-int v5, v2, v3
+    Instruction::SUB_INT | 2 << 8, 4 << 8 | 5,        // sub-int v2, v5, v4
+    Instruction::INT_TO_FLOAT | 1 << 12 | 5 << 8,     // int-to-float v5, v1
+    Instruction::DIV_FLOAT_2ADDR | 5 << 12 | 0 << 8,  // div-float/2addr v0, v5
+    Instruction::FLOAT_TO_INT | 0 << 12 | 5 << 8,     // float-to-int v5, v0
+    Instruction::ADD_INT_2ADDR | 5 << 12 | 2 << 8,    // add-int/2addr v2, v5
+    Instruction::ADD_INT_LIT8 | 1 << 8, 1 << 8 | 1,   // add-int/lit8 v1, v1, #int 1 // #01
+    Instruction::GOTO | 0xeb << 8,                    // goto 0004 // -0015
+    Instruction::RETURN | 2 << 8);                    // return v2
+
+  constexpr int kNumberOfRuns = 10;
+  for (int i = 0; i < kNumberOfRuns; ++i) {
+    CompileWithRandomSchedulerAndRun(data, true, 138774);
+  }
+}
+
+}  // namespace art