Merge "ART: Refactor CompileOptimized"
diff --git a/compiler/compiler.h b/compiler/compiler.h
index a04641e..94b0fe3 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -63,13 +63,6 @@
   virtual uintptr_t GetEntryPointOf(mirror::ArtMethod* method) const
      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
 
-  virtual bool WriteElf(art::File* file,
-                        OatWriter* oat_writer,
-                        const std::vector<const art::DexFile*>& dex_files,
-                        const std::string& android_root,
-                        bool is_host) const
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) = 0;
-
   uint64_t GetMaximumCompilationTimeBeforeWarning() const {
     return maximum_compilation_time_before_warning_;
   }
@@ -107,9 +100,6 @@
     return driver_;
   }
 
-  // Whether to produce 64-bit ELF files for 64-bit targets. Leave this off for now.
-  static constexpr bool kProduce64BitELFFiles = false;
-
  private:
   CompilerDriver* const driver_;
   const uint64_t maximum_compilation_time_before_warning_;
diff --git a/compiler/dex/quick/quick_compiler.cc b/compiler/dex/quick/quick_compiler.cc
index 2c0bd47..fc3e687 100644
--- a/compiler/dex/quick/quick_compiler.cc
+++ b/compiler/dex/quick/quick_compiler.cc
@@ -793,20 +793,6 @@
       InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet())));
 }
 
-bool QuickCompiler::WriteElf(art::File* file,
-                             OatWriter* oat_writer,
-                             const std::vector<const art::DexFile*>& dex_files,
-                             const std::string& android_root,
-                             bool is_host) const {
-  if (kProduce64BitELFFiles && Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet())) {
-    return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host,
-                                         *GetCompilerDriver());
-  } else {
-    return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host,
-                                         *GetCompilerDriver());
-  }
-}
-
 Mir2Lir* QuickCompiler::GetCodeGenerator(CompilationUnit* cu, void* compilation_unit) {
   UNUSED(compilation_unit);
   Mir2Lir* mir_to_lir = nullptr;
diff --git a/compiler/dex/quick/quick_compiler.h b/compiler/dex/quick/quick_compiler.h
index 09b08ac..8d2c324 100644
--- a/compiler/dex/quick/quick_compiler.h
+++ b/compiler/dex/quick/quick_compiler.h
@@ -52,14 +52,6 @@
   uintptr_t GetEntryPointOf(mirror::ArtMethod* method) const OVERRIDE
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  bool WriteElf(art::File* file,
-                OatWriter* oat_writer,
-                const std::vector<const art::DexFile*>& dex_files,
-                const std::string& android_root,
-                bool is_host) const
-    OVERRIDE
-    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   static Mir2Lir* GetCodeGenerator(CompilationUnit* cu, void* compilation_unit);
 
   void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index c2b8375..f263f6d 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -40,6 +40,7 @@
 #include "dex/verified_method.h"
 #include "dex/quick/dex_file_method_inliner.h"
 #include "driver/compiler_options.h"
+#include "elf_writer_quick.h"
 #include "jni_internal.h"
 #include "object_lock.h"
 #include "profiler.h"
@@ -72,6 +73,9 @@
 
 static constexpr bool kTimeCompileMethod = !kIsDebugBuild;
 
+// Whether to produce 64-bit ELF files for 64-bit targets. Leave this off for now.
+static constexpr bool kProduce64BitELFFiles = false;
+
 static double Percentage(size_t x, size_t y) {
   return 100.0 * (static_cast<double>(x)) / (static_cast<double>(x + y));
 }
@@ -2368,7 +2372,11 @@
                               OatWriter* oat_writer,
                               art::File* file)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  return compiler_->WriteElf(file, oat_writer, dex_files, android_root, is_host);
+  if (kProduce64BitELFFiles && Is64BitInstructionSet(GetInstructionSet())) {
+    return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host, *this);
+  } else {
+    return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host, *this);
+  }
 }
 
 bool CompilerDriver::SkipCompilation(const std::string& method_name) {
diff --git a/compiler/optimizing/code_generator_utils.cc b/compiler/optimizing/code_generator_utils.cc
index 26cab2f..921c1d8 100644
--- a/compiler/optimizing/code_generator_utils.cc
+++ b/compiler/optimizing/code_generator_utils.cc
@@ -18,13 +18,15 @@
 
 #include "base/logging.h"
 
+namespace art {
+
 void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long,
                                      int64_t* magic, int* shift) {
   // It does not make sense to calculate magic and shift for zero divisor.
   DCHECK_NE(divisor, 0);
 
-  /* According to implementation from H.S.Warren's "Hacker's Delight" (Addison Wesley, 2002)
-   * Chapter 10 and T,Grablund, P.L.Montogomery's "Division by Invariant Integers Using
+   * Implementation according to H. S. Warren's "Hacker's Delight" (Addison-Wesley, 2002),
+   * Chapter 10, and T. Granlund, P. L. Montgomery's "Division by Invariant Integers Using
    * Multiplication" (PLDI 1994).
    * The magic number M and shift S can be calculated in the following way:
    * Let nc be the most positive value of numerator(n) such that nc = kd - 1,
@@ -39,11 +41,11 @@
    * 2^p > nc * (d - 2^p % d), where d >= 2
    * 2^p > nc * (d + 2^p % d), where d <= -2.
    *
-   * The magic number M is calcuated by
+   * The magic number M is calculated by
    * M = (2^p + d - 2^p % d) / d, where d >= 2
    * M = (2^p - d - 2^p % d) / d, where d <= -2.
    *
-   * Notice that p is always bigger than or equal to 32 (resp. 64), so we just return 32-p
+   * Notice that p is always bigger than or equal to 32 (resp. 64), so we just return p - 32
   * (resp. p - 64) as the shift number S.
    */
 
@@ -52,9 +54,10 @@
 
   // Initialize the computations.
   uint64_t abs_d = (divisor >= 0) ? divisor : -divisor;
-  uint64_t tmp = exp + (is_long ? static_cast<uint64_t>(divisor) >> 63 :
-                                    static_cast<uint32_t>(divisor) >> 31);
-  uint64_t abs_nc = tmp - 1 - tmp % abs_d;
+  uint64_t sign_bit = is_long ? static_cast<uint64_t>(divisor) >> 63 :
+                                static_cast<uint32_t>(divisor) >> 31;
+  uint64_t tmp = exp + sign_bit;
+  uint64_t abs_nc = tmp - 1 - (tmp % abs_d);
   uint64_t quotient1 = exp / abs_nc;
   uint64_t remainder1 = exp % abs_nc;
   uint64_t quotient2 = exp / abs_d;
@@ -91,3 +94,4 @@
   *shift = is_long ? p - 64 : p - 32;
 }
 
+}  // namespace art
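
As a point of reference, here is a minimal standalone sketch (not part of this change) of how a magic number and shift produced by CalculateMagicAndShiftForDivRem are consumed at runtime, following the Hacker's Delight scheme cited in the comment above. The divisor 7 and its 32-bit constants (magic 0x92492493, shift 2) are the standard textbook values; the `hi += n` step mirrors the "RDX += numerator" correction the x86-64 code generator applies when the divisor is positive but the magic number is negative.

#include <cassert>
#include <cstdint>

// Truncating 32-bit division by 7 with magic = 0x92492493 and shift = 2.
int32_t DivBy7(int32_t n) {
  const int32_t magic = static_cast<int32_t>(0x92492493);  // negative as int32
  const int shift = 2;
  // High half of the 64-bit product magic * n.
  int32_t hi = static_cast<int32_t>((static_cast<int64_t>(magic) * n) >> 32);
  hi += n;                  // divisor > 0, magic < 0: add the numerator back
  int32_t q = hi >> shift;  // arithmetic shift
  q += static_cast<uint32_t>(q) >> 31;  // round toward zero for negative q
  return q;
}

int main() {
  const int32_t samples[] = {0, 1, 6, 7, 100, -1, -7, -100, 2147483647, -2147483647};
  for (int32_t n : samples) {
    assert(DivBy7(n) == n / 7);
  }
  return 0;
}
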
diff --git a/compiler/optimizing/code_generator_utils.h b/compiler/optimizing/code_generator_utils.h
index 742d675..59b495c 100644
--- a/compiler/optimizing/code_generator_utils.h
+++ b/compiler/optimizing/code_generator_utils.h
@@ -19,7 +19,12 @@
 
 #include <cstdint>
 
-// Computes the magic number and the shift needed in the div/rem by constant algorithm
+namespace art {
+
+// Computes the magic number and the shift needed in the div/rem by constant algorithm, as out
+// arguments `magic` and `shift`.
 void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long, int64_t* magic, int* shift);
 
+}  // namespace art
+
 #endif  // ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4d74683..a6fb07f 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -2304,10 +2304,11 @@
 
   LocationSummary* locations = instruction->GetLocations();
   DCHECK(locations->InAt(1).IsConstant());
+  DCHECK(locations->InAt(1).GetConstant()->IsIntConstant());
 
   Register out_register = locations->Out().AsRegister<Register>();
   Register input_register = locations->InAt(0).AsRegister<Register>();
-  int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
 
   DCHECK(imm == 1 || imm == -1);
 
@@ -2322,16 +2323,14 @@
 }
 
 
-void InstructionCodeGeneratorX86::DivByPowerOfTwo(HBinaryOperation* instruction) {
-  DCHECK(instruction->IsDiv());
-
+void InstructionCodeGeneratorX86::DivByPowerOfTwo(HDiv* instruction) {
   LocationSummary* locations = instruction->GetLocations();
 
   Register out_register = locations->Out().AsRegister<Register>();
   Register input_register = locations->InAt(0).AsRegister<Register>();
-  int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
 
-  DCHECK(instruction->IsDiv() && IsPowerOfTwo(std::abs(imm)));
+  DCHECK(IsPowerOfTwo(std::abs(imm)));
   Register num = locations->GetTemp(0).AsRegister<Register>();
 
   __ leal(num, Address(input_register, std::abs(imm) - 1));
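
The `leal` above materializes `input + (2^k - 1)` for a divisor of magnitude 2^k; the remainder of the emitted sequence (outside this hunk) keeps the unbiased value for non-negative inputs, shifts arithmetically, and negates for a negative divisor. A minimal sketch of the underlying identity for a positive power-of-two divisor (not ART code):

#include <cassert>
#include <cstdint>

// Truncating signed division by 1 << k (1 <= k <= 30): bias negative inputs
// by 2^k - 1, then shift arithmetically. Unsigned addition models the
// wrapping `leal`; the conditional models the `test`/`cmov` pair.
int32_t DivByPowerOfTwo(int32_t x, int k) {
  int32_t biased = static_cast<int32_t>(static_cast<uint32_t>(x) + ((1u << k) - 1));
  int32_t num = (x >= 0) ? x : biased;
  return num >> k;  // arithmetic shift, like `sarl`
}

int main() {
  const int32_t xs[] = {0, 1, -1, 5, -5, 1000, -1000, 123456, -123457};
  for (int k = 1; k <= 10; ++k) {
    for (int32_t x : xs) {
      assert(DivByPowerOfTwo(x, k) == x / (1 << k));
    }
  }
  return 0;
}
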
@@ -2440,15 +2439,15 @@
       DCHECK_EQ(EAX, first.AsRegister<Register>());
       DCHECK_EQ(is_div ? EAX : EDX, out.AsRegister<Register>());
 
-      if (second.IsConstant()) {
-        int imm = second.GetConstant()->AsIntConstant()->GetValue();
+      if (instruction->InputAt(1)->IsIntConstant()) {
+        int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
 
         if (imm == 0) {
           // Do not generate anything for 0. DivZeroCheck would forbid any generated code.
         } else if (imm == 1 || imm == -1) {
           DivRemOneOrMinusOne(instruction);
         } else if (is_div && IsPowerOfTwo(std::abs(imm))) {
-          DivByPowerOfTwo(instruction);
+          DivByPowerOfTwo(instruction->AsDiv());
         } else {
           DCHECK(imm <= -2 || imm >= 2);
           GenerateDivRemWithAnyConstant(instruction);
@@ -2519,7 +2518,7 @@
       // We need to save the numerator while we tweak eax and edx. As we are using imul in a way
       // which enforces results to be in EAX and EDX, things are simpler if we use EAX also as
       // output and request another temp.
-      if (div->InputAt(1)->IsConstant()) {
+      if (div->InputAt(1)->IsIntConstant()) {
         locations->AddTemp(Location::RequiresRegister());
       }
       break;
@@ -2593,7 +2592,7 @@
       // We need to save the numerator while we tweak eax and edx. As we are using imul in a way
       // which enforces results to be in EAX and EDX, things are simpler if we use EDX also as
       // output and request another temp.
-      if (rem->InputAt(1)->IsConstant()) {
+      if (rem->InputAt(1)->IsIntConstant()) {
         locations->AddTemp(Location::RequiresRegister());
       }
       break;
@@ -3958,23 +3957,43 @@
 }
 
 void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, src + stack_offset));
-  __ movl(Address(ESP, dst + stack_offset), temp_reg);
+  ScratchRegisterScope possible_scratch(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp = possible_scratch.GetRegister();
+  if (temp == kNoRegister) {
+    // Use the stack.
+    __ pushl(Address(ESP, src));
+    __ popl(Address(ESP, dst));
+  } else {
+    Register temp_reg = static_cast<Register>(temp);
+    __ movl(temp_reg, Address(ESP, src));
+    __ movl(Address(ESP, dst), temp_reg);
+  }
 }
 
 void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, src + stack_offset));
-  __ movl(Address(ESP, dst + stack_offset), temp_reg);
-  __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize));
-  __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg);
+  ScratchRegisterScope possible_scratch(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp = possible_scratch.GetRegister();
+  if (temp == kNoRegister) {
+    // Use the stack instead.
+    // Push src low word.
+    __ pushl(Address(ESP, src));
+    // Push src high word. Stack offset = 4.
+    __ pushl(Address(ESP, src + 4 /* offset */ + kX86WordSize /* high */));
+
+    // Pop into dst high word. Stack offset = 8.
+    // Pop with ESP address uses the 'after increment' value of ESP.
+    __ popl(Address(ESP, dst + 4 /* offset */ + kX86WordSize /* high */));
+    // Finally dst low word. Stack offset = 4.
+    __ popl(Address(ESP, dst));
+  } else {
+    Register temp_reg = static_cast<Register>(temp);
+    __ movl(temp_reg, Address(ESP, src));
+    __ movl(Address(ESP, dst), temp_reg);
+    __ movl(temp_reg, Address(ESP, src + kX86WordSize));
+    __ movl(Address(ESP, dst + kX86WordSize), temp_reg);
+  }
 }
 
 void ParallelMoveResolverX86::EmitMove(size_t index) {
@@ -4039,10 +4058,18 @@
           __ xorps(dest, dest);
         } else {
           ScratchRegisterScope ensure_scratch(
-              this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-          Register temp = static_cast<Register>(ensure_scratch.GetRegister());
-          __ movl(temp, Immediate(value));
-          __ movd(dest, temp);
+              this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+          int temp_reg = ensure_scratch.GetRegister();
+          if (temp_reg == kNoRegister) {
+            // Avoid spilling/restoring a scratch register by using the stack.
+            __ pushl(Immediate(value));
+            __ movss(dest, Address(ESP, 0));
+            __ addl(ESP, Immediate(4));
+          } else {
+            Register temp = static_cast<Register>(temp_reg);
+            __ movl(temp, Immediate(value));
+            __ movd(dest, temp);
+          }
         }
       } else {
         DCHECK(destination.IsStackSlot()) << destination;
@@ -4091,42 +4118,96 @@
   }
 }
 
-void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
-  Register suggested_scratch = reg == EAX ? EBX : EAX;
-  ScratchRegisterScope ensure_scratch(
-      this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+void ParallelMoveResolverX86::Exchange(Register reg1, Register reg2) {
+  // Prefer to avoid xchg as it isn't speedy on smaller processors.
+  ScratchRegisterScope possible_scratch(
+      this, reg1, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg = possible_scratch.GetRegister();
+  if (temp_reg == kNoRegister || temp_reg == reg2) {
+    __ pushl(reg1);
+    __ movl(reg1, reg2);
+    __ popl(reg2);
+  } else {
+    Register temp = static_cast<Register>(temp_reg);
+    __ movl(temp, reg1);
+    __ movl(reg1, reg2);
+    __ movl(reg2, temp);
+  }
+}
 
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset));
-  __ movl(Address(ESP, mem + stack_offset), reg);
-  __ movl(reg, static_cast<Register>(ensure_scratch.GetRegister()));
+void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
+  ScratchRegisterScope possible_scratch(
+      this, reg, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg = possible_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    __ pushl(Address(ESP, mem));
+    __ movl(Address(ESP, mem + kX86WordSize), reg);
+    __ popl(reg);
+  } else {
+    Register temp = static_cast<Register>(temp_reg);
+    __ movl(temp, Address(ESP, mem));
+    __ movl(Address(ESP, mem), reg);
+    __ movl(reg, temp);
+  }
 }
 
 void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, mem + stack_offset));
-  __ movss(Address(ESP, mem + stack_offset), reg);
-  __ movd(reg, temp_reg);
+  ScratchRegisterScope possible_scratch(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg = possible_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    __ pushl(Address(ESP, mem));
+    __ movss(Address(ESP, mem + kX86WordSize), reg);
+    __ movss(reg, Address(ESP, 0));
+    __ addl(ESP, Immediate(kX86WordSize));
+  } else {
+    Register temp = static_cast<Register>(temp_reg);
+    __ movl(temp, Address(ESP, mem));
+    __ movss(Address(ESP, mem), reg);
+    __ movd(reg, temp);
+  }
 }
 
 void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
-  ScratchRegisterScope ensure_scratch1(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+  ScratchRegisterScope possible_scratch1(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg1 = possible_scratch1.GetRegister();
+  if (temp_reg1 == kNoRegister) {
+    // No free registers.  Use the stack.
+    __ pushl(Address(ESP, mem1));
+    __ pushl(Address(ESP, mem2 + kX86WordSize));
+    // Pop with ESP address uses the 'after increment' value of ESP.
+    __ popl(Address(ESP, mem1 + kX86WordSize));
+    __ popl(Address(ESP, mem2));
+  } else {
+    // Got the first one.  Try for a second.
+    ScratchRegisterScope possible_scratch2(
+        this, temp_reg1, codegen_->GetNumberOfCoreRegisters());
+    int temp_reg2 = possible_scratch2.GetRegister();
+    if (temp_reg2 == kNoRegister) {
+      Register temp = static_cast<Register>(temp_reg1);
+      // Bummer.  Only have one free register to use.
+      // Save mem1 on the stack.
+      __ pushl(Address(ESP, mem1));
 
-  Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX;
-  ScratchRegisterScope ensure_scratch2(
-      this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+      // Copy mem2 into mem1.
+      __ movl(temp, Address(ESP, mem2 + kX86WordSize));
+      __ movl(Address(ESP, mem1 + kX86WordSize), temp);
 
-  int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
-  stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
-  __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
-  __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
-  __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
-  __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
+      // Now pop mem1 into mem2.
+      // Pop with ESP address uses the 'after increment' value of ESP.
+      __ popl(Address(ESP, mem2));
+    } else {
+      // Great.  We have 2 registers to play with.
+      Register temp1 = static_cast<Register>(temp_reg1);
+      Register temp2 = static_cast<Register>(temp_reg2);
+      DCHECK_NE(temp1, temp2);
+      __ movl(temp1, Address(ESP, mem1));
+      __ movl(temp2, Address(ESP, mem2));
+      __ movl(Address(ESP, mem2), temp1);
+      __ movl(Address(ESP, mem1), temp2);
+    }
+  }
 }
 
 void ParallelMoveResolverX86::EmitSwap(size_t index) {
@@ -4135,7 +4216,7 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgl(destination.AsRegister<Register>(), source.AsRegister<Register>());
+    Exchange(destination.AsRegister<Register>(), source.AsRegister<Register>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange(source.AsRegister<Register>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index e6e7fb7..8c56e35 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -106,6 +106,7 @@
   X86Assembler* GetAssembler() const;
 
  private:
+  void Exchange(Register reg1, Register reg2);
   void Exchange(Register reg, int mem);
   void Exchange(int mem1, int mem2);
   void Exchange32(XmmRegister reg, int mem);
@@ -164,7 +165,7 @@
   void HandleBitwiseOperation(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
-  void DivByPowerOfTwo(HBinaryOperation* instruction);
+  void DivByPowerOfTwo(HDiv* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateRemFP(HRem *rem);
   void HandleShift(HBinaryOperation* instruction);
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5710ec5..01b24ea 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -2348,12 +2348,7 @@
 
   CpuRegister output_register = locations->Out().AsRegister<CpuRegister>();
   CpuRegister input_register = locations->InAt(0).AsRegister<CpuRegister>();
-  int64_t imm;
-  if (second.GetConstant()->IsLongConstant()) {
-    imm = second.GetConstant()->AsLongConstant()->GetValue();
-  } else {
-    imm = second.GetConstant()->AsIntConstant()->GetValue();
-  }
+  int64_t imm = Int64FromConstant(second.GetConstant());
 
   DCHECK(imm == 1 || imm == -1);
 
@@ -2383,25 +2378,18 @@
     }
 
     default:
-      LOG(FATAL) << "Unreachable";
+      LOG(FATAL) << "Unexpected type for div by (-)1 " << instruction->GetResultType();
   }
 }
 
-void InstructionCodeGeneratorX86_64::DivByPowerOfTwo(HBinaryOperation* instruction) {
-  DCHECK(instruction->IsDiv());
-
+void InstructionCodeGeneratorX86_64::DivByPowerOfTwo(HDiv* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   Location second = locations->InAt(1);
 
   CpuRegister output_register = locations->Out().AsRegister<CpuRegister>();
   CpuRegister numerator = locations->InAt(0).AsRegister<CpuRegister>();
 
-  int64_t imm;
-  if (instruction->GetResultType() == Primitive::kPrimLong) {
-    imm = second.GetConstant()->AsLongConstant()->GetValue();
-  } else {
-    imm = second.GetConstant()->AsIntConstant()->GetValue();
-  }
+  int64_t imm = Int64FromConstant(second.GetConstant());
 
   DCHECK(IsPowerOfTwo(std::abs(imm)));
 
@@ -2462,7 +2450,7 @@
   int64_t magic;
   int shift;
 
-  // TODO: can these branch be written as one?
+  // TODO: can these branches be written as one?
   if (instruction->GetResultType() == Primitive::kPrimInt) {
     int imm = second.GetConstant()->AsIntConstant()->GetValue();
 
@@ -2526,7 +2514,7 @@
     __ imulq(numerator);
 
     if (imm > 0 && magic < 0) {
-      // RDX += numeratorerator
+      // RDX += numerator
       __ addq(rdx, numerator);
     } else if (imm < 0 && magic > 0) {
       // RDX -= numerator
@@ -2576,19 +2564,14 @@
   DCHECK_EQ(is_div ? RAX : RDX, out.AsRegister());
 
   if (second.IsConstant()) {
-    int64_t imm;
-    if (second.GetConstant()->AsLongConstant()) {
-      imm = second.GetConstant()->AsLongConstant()->GetValue();
-    } else {
-      imm = second.GetConstant()->AsIntConstant()->GetValue();
-    }
+    int64_t imm = Int64FromConstant(second.GetConstant());
 
     if (imm == 0) {
       // Do not generate anything. DivZeroCheck would prevent any code to be executed.
     } else if (imm == 1 || imm == -1) {
       DivRemOneOrMinusOne(instruction);
     } else if (instruction->IsDiv() && IsPowerOfTwo(std::abs(imm))) {
-      DivByPowerOfTwo(instruction);
+      DivByPowerOfTwo(instruction->AsDiv());
     } else {
       DCHECK(imm <= -2 || imm >= 2);
       GenerateDivRemWithAnyConstant(instruction);
@@ -3838,15 +3821,27 @@
 
 void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch(
-      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+      this, TMP, codegen_->GetNumberOfCoreRegisters());
 
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
-  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
-  __ movq(CpuRegister(ensure_scratch.GetRegister()),
-          Address(CpuRegister(RSP), mem2 + stack_offset));
-  __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
-  __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
-          CpuRegister(ensure_scratch.GetRegister()));
+  int temp_reg = ensure_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    // Use the stack as a temporary.
+    // Save mem1 on the stack.
+    __ pushq(Address(CpuRegister(RSP), mem1));
+
+    // Copy mem2 into mem1.
+    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize));
+    __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP));
+
+    // Now pop mem1 into mem2.
+    __ popq(Address(CpuRegister(RSP), mem2));
+  } else {
+    CpuRegister temp = CpuRegister(temp_reg);
+    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1));
+    __ movq(temp, Address(CpuRegister(RSP), mem2));
+    __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP));
+    __ movq(Address(CpuRegister(RSP), mem1), temp);
+  }
 }
 
 void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
@@ -3855,6 +3850,13 @@
   __ movd(reg, CpuRegister(TMP));
 }
 
+void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
+  // Prefer to avoid xchg as it isn't speedy on smaller processors.
+  __ movq(CpuRegister(TMP), reg1);
+  __ movq(reg1, reg2);
+  __ movq(reg2, CpuRegister(TMP));
+}
+
 void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
   __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
   __ movsd(Address(CpuRegister(RSP), mem), reg);
@@ -3867,7 +3869,7 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
+    Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index aae7de0..61bf6ac 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -118,6 +118,7 @@
   void Exchange32(CpuRegister reg, int mem);
   void Exchange32(XmmRegister reg, int mem);
   void Exchange32(int mem1, int mem2);
+  void Exchange64(CpuRegister reg1, CpuRegister reg2);
   void Exchange64(CpuRegister reg, int mem);
   void Exchange64(XmmRegister reg, int mem);
   void Exchange64(int mem1, int mem2);
@@ -174,7 +175,7 @@
   void HandleBitwiseOperation(HBinaryOperation* operation);
   void GenerateRemFP(HRem *rem);
   void DivRemOneOrMinusOne(HBinaryOperation* instruction);
-  void DivByPowerOfTwo(HBinaryOperation* instruction);
+  void DivByPowerOfTwo(HDiv* instruction);
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleShift(HBinaryOperation* operation);
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 56ec8a7..afbc490 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -24,9 +24,21 @@
 class InstructionSimplifierVisitor : public HGraphVisitor {
  public:
   InstructionSimplifierVisitor(HGraph* graph, OptimizingCompilerStats* stats)
-      : HGraphVisitor(graph), stats_(stats) {}
+      : HGraphVisitor(graph),
+        stats_(stats) {}
+
+  void Run();
 
  private:
+  void RecordSimplification() {
+    simplification_occurred_ = true;
+    simplifications_at_current_position_++;
+    if (stats_) {
+      stats_->RecordStat(kInstructionSimplifications);
+    }
+  }
+
+  bool TryMoveNegOnInputsAfterBinop(HBinaryOperation* binop);
   void VisitShift(HBinaryOperation* shift);
 
   void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE;
@@ -40,6 +52,8 @@
   void VisitAnd(HAnd* instruction) OVERRIDE;
   void VisitDiv(HDiv* instruction) OVERRIDE;
   void VisitMul(HMul* instruction) OVERRIDE;
+  void VisitNeg(HNeg* instruction) OVERRIDE;
+  void VisitNot(HNot* instruction) OVERRIDE;
   void VisitOr(HOr* instruction) OVERRIDE;
   void VisitShl(HShl* instruction) OVERRIDE;
   void VisitShr(HShr* instruction) OVERRIDE;
@@ -48,11 +62,38 @@
   void VisitXor(HXor* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
+  bool simplification_occurred_ = false;
+  int simplifications_at_current_position_ = 0;
+  // We ensure we do not loop infinitely. The value is a finger-in-the-air guess
+  // that should allow enough simplification.
+  static constexpr int kMaxSamePositionSimplifications = 10;
 };
 
 void InstructionSimplifier::Run() {
   InstructionSimplifierVisitor visitor(graph_, stats_);
-  visitor.VisitInsertionOrder();
+  visitor.Run();
+}
+
+void InstructionSimplifierVisitor::Run() {
+  for (HReversePostOrderIterator it(*GetGraph()); !it.Done();) {
+    // The simplification of an instruction to another instruction may yield
+    // possibilities for other simplifications. So although we perform a reverse
+    // post order visit, we sometimes need to revisit an instruction index.
+    simplification_occurred_ = false;
+    VisitBasicBlock(it.Current());
+    if (simplification_occurred_ &&
+        (simplifications_at_current_position_ < kMaxSamePositionSimplifications)) {
+      // New simplifications may be applicable to the instruction at the
+      // current index, so don't advance the iterator.
+      continue;
+    }
+    if (simplifications_at_current_position_ >= kMaxSamePositionSimplifications) {
+      LOG(WARNING) << "Too many simplifications (" << simplifications_at_current_position_
+          << ") occurred at the current position.";
+    }
+    simplifications_at_current_position_ = 0;
+    it.Advance();
+  }
 }
 
 namespace {
@@ -63,6 +104,35 @@
 
 }  // namespace
 
+// Returns true if the code was simplified to use only one negation operation
+// after the binary operation instead of one on each of the inputs.
+bool InstructionSimplifierVisitor::TryMoveNegOnInputsAfterBinop(HBinaryOperation* binop) {
+  DCHECK(binop->IsAdd() || binop->IsSub());
+  DCHECK(binop->GetLeft()->IsNeg() && binop->GetRight()->IsNeg());
+  HNeg* left_neg = binop->GetLeft()->AsNeg();
+  HNeg* right_neg = binop->GetRight()->AsNeg();
+  if (!left_neg->HasOnlyOneNonEnvironmentUse() ||
+      !right_neg->HasOnlyOneNonEnvironmentUse()) {
+    return false;
+  }
+  // Replace code looking like
+  //    NEG tmp1, a
+  //    NEG tmp2, b
+  //    ADD dst, tmp1, tmp2
+  // with
+  //    ADD tmp, a, b
+  //    NEG dst, tmp
+  binop->ReplaceInput(left_neg->GetInput(), 0);
+  binop->ReplaceInput(right_neg->GetInput(), 1);
+  left_neg->GetBlock()->RemoveInstruction(left_neg);
+  right_neg->GetBlock()->RemoveInstruction(right_neg);
+  HNeg* neg = new (GetGraph()->GetArena()) HNeg(binop->GetType(), binop);
+  binop->GetBlock()->InsertInstructionBefore(neg, binop->GetNext());
+  binop->ReplaceWithExceptInReplacementAtIndex(neg, 0);
+  RecordSimplification();
+  return true;
+}
+
 void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) {
   DCHECK(instruction->IsShl() || instruction->IsShr() || instruction->IsUShr());
   HConstant* input_cst = instruction->GetConstantRight();
@@ -182,6 +252,36 @@
     //    src
     instruction->ReplaceWith(input_other);
     instruction->GetBlock()->RemoveInstruction(instruction);
+    return;
+  }
+
+  HInstruction* left = instruction->GetLeft();
+  HInstruction* right = instruction->GetRight();
+  bool left_is_neg = left->IsNeg();
+  bool right_is_neg = right->IsNeg();
+
+  if (left_is_neg && right_is_neg) {
+    if (TryMoveNegOnInputsAfterBinop(instruction)) {
+      return;
+    }
+  }
+
+  HNeg* neg = left_is_neg ? left->AsNeg() : right->AsNeg();
+  if ((left_is_neg ^ right_is_neg) && neg->HasOnlyOneNonEnvironmentUse()) {
+    // Replace code looking like
+    //    NEG tmp, b
+    //    ADD dst, a, tmp
+    // with
+    //    SUB dst, a, b
+    // We do not perform the optimization if the input negation has environment
+    // uses or multiple non-environment uses as it could lead to worse code. In
+    // particular, we do not want the live range of `b` to be extended if we are
+    // not sure the initial 'NEG' instruction can be removed.
+    HInstruction* other = left_is_neg ? right : left;
+    HSub* sub = new(GetGraph()->GetArena()) HSub(instruction->GetType(), other, neg->GetInput());
+    instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, sub);
+    RecordSimplification();
+    neg->GetBlock()->RemoveInstruction(neg);
   }
 }
 
@@ -201,7 +301,7 @@
 
   // We assume that GVN has run before, so we only perform a pointer comparison.
   // If for some reason the values are equal but the pointers are different, we
-  // are still correct and only miss an optimisation opportunity.
+  // are still correct and only miss an optimization opportunity.
   if (instruction->GetLeft() == instruction->GetRight()) {
     // Replace code looking like
     //    AND dst, src, src
@@ -235,6 +335,7 @@
     //    NEG dst, src
     instruction->GetBlock()->ReplaceAndRemoveInstructionWith(
         instruction, (new (GetGraph()->GetArena()) HNeg(type, input_other)));
+    RecordSimplification();
   }
 }
 
@@ -267,6 +368,7 @@
     //    NEG dst, src
     HNeg* neg = new (allocator) HNeg(type, input_other);
     block->ReplaceAndRemoveInstructionWith(instruction, neg);
+    RecordSimplification();
     return;
   }
 
@@ -280,6 +382,7 @@
     // The 'int' and 'long' cases are handled below.
     block->ReplaceAndRemoveInstructionWith(instruction,
                                            new (allocator) HAdd(type, input_other, input_other));
+    RecordSimplification();
     return;
   }
 
@@ -295,10 +398,75 @@
       HIntConstant* shift = GetGraph()->GetIntConstant(WhichPowerOf2(factor));
       HShl* shl = new(allocator) HShl(type, input_other, shift);
       block->ReplaceAndRemoveInstructionWith(instruction, shl);
+      RecordSimplification();
     }
   }
 }
 
+void InstructionSimplifierVisitor::VisitNeg(HNeg* instruction) {
+  HInstruction* input = instruction->GetInput();
+  if (input->IsNeg()) {
+    // Replace code looking like
+    //    NEG tmp, src
+    //    NEG dst, tmp
+    // with
+    //    src
+    HNeg* previous_neg = input->AsNeg();
+    instruction->ReplaceWith(previous_neg->GetInput());
+    instruction->GetBlock()->RemoveInstruction(instruction);
+    // We perform the optimization even if the input negation has environment
+    // uses since it allows removing the current instruction. But we only delete
+    // the input negation if it does not have any uses left.
+    if (!previous_neg->HasUses()) {
+      previous_neg->GetBlock()->RemoveInstruction(previous_neg);
+    }
+    RecordSimplification();
+    return;
+  }
+
+  if (input->IsSub() && input->HasOnlyOneNonEnvironmentUse()) {
+    // Replace code looking like
+    //    SUB tmp, a, b
+    //    NEG dst, tmp
+    // with
+    //    SUB dst, b, a
+    // We do not perform the optimization if the input subtraction has
+    // environment uses or multiple non-environment uses as it could lead to
+    // worse code. In particular, we do not want the live ranges of `a` and `b`
+    // to be extended if we are not sure the initial 'SUB' instruction can be
+    // removed.
+    HSub* sub = input->AsSub();
+    HSub* new_sub =
+        new (GetGraph()->GetArena()) HSub(instruction->GetType(), sub->GetRight(), sub->GetLeft());
+    instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, new_sub);
+    if (!sub->HasUses()) {
+      sub->GetBlock()->RemoveInstruction(sub);
+    }
+    RecordSimplification();
+  }
+}
+
+void InstructionSimplifierVisitor::VisitNot(HNot* instruction) {
+  HInstruction* input = instruction->GetInput();
+  if (input->IsNot()) {
+    // Replace code looking like
+    //    NOT tmp, src
+    //    NOT dst, tmp
+    // with
+    //    src
+    // We perform the optimization even if the input NOT has environment
+    // uses since it allows removing the current instruction. But we only
+    // delete the input NOT if it does not have any uses left.
+    HNot* previous_not = input->AsNot();
+    instruction->ReplaceWith(previous_not->GetInput());
+    instruction->GetBlock()->RemoveInstruction(instruction);
+    if (!previous_not->HasUses()) {
+      previous_not->GetBlock()->RemoveInstruction(previous_not);
+    }
+    RecordSimplification();
+  }
+}
+
 void InstructionSimplifierVisitor::VisitOr(HOr* instruction) {
   HConstant* input_cst = instruction->GetConstantRight();
   HInstruction* input_other = instruction->GetLeastConstantLeft();
@@ -315,7 +483,7 @@
 
   // We assume that GVN has run before, so we only perform a pointer comparison.
   // If for some reason the values are equal but the pointers are different, we
-  // are still correct and only miss an optimisation opportunity.
+  // are still correct and only miss an optimization opportunity.
   if (instruction->GetLeft() == instruction->GetRight()) {
     // Replace code looking like
     //    OR dst, src, src
@@ -356,20 +524,61 @@
   HBasicBlock* block = instruction->GetBlock();
   ArenaAllocator* allocator = GetGraph()->GetArena();
 
-  if (instruction->GetLeft()->IsConstant()) {
-    int64_t left = Int64FromConstant(instruction->GetLeft()->AsConstant());
-    if (left == 0) {
+  HInstruction* left = instruction->GetLeft();
+  HInstruction* right = instruction->GetRight();
+  if (left->IsConstant()) {
+    if (Int64FromConstant(left->AsConstant()) == 0) {
       // Replace code looking like
       //    SUB dst, 0, src
       // with
       //    NEG dst, src
-      // Note that we cannot optimise `0.0 - x` to `-x` for floating-point. When
+      // Note that we cannot optimize `0.0 - x` to `-x` for floating-point. When
      // `x` is `0.0`, the former expression yields `0.0`, while the latter
       // yields `-0.0`.
-      HNeg* neg = new (allocator) HNeg(type, instruction->GetRight());
+      HNeg* neg = new (allocator) HNeg(type, right);
       block->ReplaceAndRemoveInstructionWith(instruction, neg);
+      RecordSimplification();
+      return;
     }
   }
+
+  if (left->IsNeg() && right->IsNeg()) {
+    if (TryMoveNegOnInputsAfterBinop(instruction)) {
+      return;
+    }
+  }
+
+  if (right->IsNeg() && right->HasOnlyOneNonEnvironmentUse()) {
+    // Replace code looking like
+    //    NEG tmp, b
+    //    SUB dst, a, tmp
+    // with
+    //    ADD dst, a, b
+    HAdd* add = new(GetGraph()->GetArena()) HAdd(type, left, right->AsNeg()->GetInput());
+    instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, add);
+    RecordSimplification();
+    right->GetBlock()->RemoveInstruction(right);
+    return;
+  }
+
+  if (left->IsNeg() && left->HasOnlyOneNonEnvironmentUse()) {
+    // Replace code looking like
+    //    NEG tmp, a
+    //    SUB dst, tmp, b
+    // with
+    //    ADD tmp, a, b
+    //    NEG dst, tmp
+    // The second version is not intrinsically better, but enables more
+    // transformations.
+    HAdd* add = new(GetGraph()->GetArena()) HAdd(type, left->AsNeg()->GetInput(), right);
+    instruction->GetBlock()->InsertInstructionBefore(add, instruction);
+    HNeg* neg = new (GetGraph()->GetArena()) HNeg(instruction->GetType(), add);
+    instruction->GetBlock()->InsertInstructionBefore(neg, instruction);
+    instruction->ReplaceWith(neg);
+    instruction->GetBlock()->RemoveInstruction(instruction);
+    RecordSimplification();
+    left->GetBlock()->RemoveInstruction(left);
+  }
 }
 
 void InstructionSimplifierVisitor::VisitUShr(HUShr* instruction) {
@@ -397,6 +606,7 @@
     //    NOT dst, src
     HNot* bitwise_not = new (GetGraph()->GetArena()) HNot(instruction->GetType(), input_other);
     instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, bitwise_not);
+    RecordSimplification();
     return;
   }
 }
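
The new Add/Sub/Neg/Not rules rest on a handful of two's-complement identities. Below is a small standalone check of those identities (not ART code), with the arithmetic done on uint32_t so it wraps the way a Java int does:

#include <cassert>
#include <cstdint>

int32_t Wrap(uint32_t v) { return static_cast<int32_t>(v); }

int main() {
  const int32_t samples[] = {0, 1, -1, 42, -42, 2147483647, -2147483647 - 1};
  for (int32_t a : samples) {
    for (int32_t b : samples) {
      uint32_t ua = static_cast<uint32_t>(a);
      uint32_t ub = static_cast<uint32_t>(b);
      // (-a) + (-b) == -(a + b) and (-a) - (-b) == -(a - b)
      //   -> TryMoveNegOnInputsAfterBinop
      assert(Wrap(-ua + -ub) == Wrap(-(ua + ub)));
      assert(Wrap(-ua - -ub) == Wrap(-(ua - ub)));
      // a + (-b) == a - b       -> VisitAdd
      assert(Wrap(ua + -ub) == Wrap(ua - ub));
      // a - (-b) == a + b       -> VisitSub, negation on the right
      assert(Wrap(ua - -ub) == Wrap(ua + ub));
      // (-a) - b == -(a + b)    -> VisitSub, negation on the left
      assert(Wrap(-ua - ub) == Wrap(-(ua + ub)));
      // -(a - b) == b - a       -> VisitNeg of a Sub
      assert(Wrap(-(ua - ub)) == Wrap(ub - ua));
      // -(-a) == a, ~~a == a    -> VisitNeg of a Neg, VisitNot of a Not
      assert(Wrap(-(-ua)) == a);
      assert(~~a == a);
    }
  }
  return 0;
}
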
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index f764eb4..5f50494 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1177,6 +1177,9 @@
   bool HasUses() const { return !uses_.IsEmpty() || !env_uses_.IsEmpty(); }
   bool HasEnvironmentUses() const { return !env_uses_.IsEmpty(); }
   bool HasNonEnvironmentUses() const { return !uses_.IsEmpty(); }
+  bool HasOnlyOneNonEnvironmentUse() const {
+    return !HasEnvironmentUses() && GetUses().HasOnlyOneUse();
+  }
 
   // Does this instruction strictly dominate `other_instruction`?
   // Returns false if this instruction and `other_instruction` are the same.
@@ -1214,6 +1217,13 @@
   void ReplaceWith(HInstruction* instruction);
   void ReplaceInput(HInstruction* replacement, size_t index);
 
+  // This is almost the same as doing `ReplaceWith()`. But in this helper, the
+  // use of this instruction at index `use_index` in `other` is *not* updated.
+  void ReplaceWithExceptInReplacementAtIndex(HInstruction* other, size_t use_index) {
+    ReplaceWith(other);
+    other->ReplaceInput(this, use_index);
+  }
+
   // Move `this` instruction before `cursor`.
   void MoveBefore(HInstruction* cursor);
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index f642dfc..c2b5c99 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -200,20 +200,6 @@
         InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet())));
   }
 
-  bool WriteElf(art::File* file,
-                OatWriter* oat_writer,
-                const std::vector<const art::DexFile*>& dex_files,
-                const std::string& android_root,
-                bool is_host) const OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (kProduce64BitELFFiles && Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet())) {
-      return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host,
-                                           *GetCompilerDriver());
-    } else {
-      return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host,
-                                           *GetCompilerDriver());
-    }
-  }
-
   void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE;
 
   void Init() OVERRIDE;
diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h
index b97a667..4d5b8d0 100644
--- a/compiler/optimizing/optimizing_compiler_stats.h
+++ b/compiler/optimizing/optimizing_compiler_stats.h
@@ -47,6 +47,7 @@
   kNotCompiledUnhandledInstruction,
   kRemovedCheckedCast,
   kRemovedNullCheck,
+  kInstructionSimplifications,
   kLastStat
 };
 
@@ -110,6 +111,7 @@
       case kNotCompiledUnhandledInstruction : return "kNotCompiledUnhandledInstruction";
       case kRemovedCheckedCast: return "kRemovedCheckedCast";
       case kRemovedNullCheck: return "kRemovedNullCheck";
+      case kInstructionSimplifications: return "kInstructionSimplifications";
       default: LOG(FATAL) << "invalid stat";
     }
     return "";
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index 9df8f56..4936685 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -269,6 +269,20 @@
 }
 
 
+int ParallelMoveResolver::AllocateScratchRegister(int blocked,
+                                                  int register_count) {
+  int scratch = -1;
+  for (int reg = 0; reg < register_count; ++reg) {
+    if ((blocked != reg) && IsScratchLocation(Location::RegisterLocation(reg))) {
+      scratch = reg;
+      break;
+    }
+  }
+
+  return scratch;
+}
+
+
 ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
     ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
@@ -282,6 +296,16 @@
 }
 
 
+ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
+    ParallelMoveResolver* resolver, int blocked, int number_of_registers)
+    : resolver_(resolver),
+      reg_(kNoRegister),
+      spilled_(false) {
+  // We don't want to spill a register if none are free.
+  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers);
+}
+
+
 ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
   if (spilled_) {
     resolver_->RestoreScratch(reg_);
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 3fa1b37..173cffc 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -42,10 +42,15 @@
  protected:
   class ScratchRegisterScope : public ValueObject {
    public:
+    // Spill a scratch register if no regs are free.
     ScratchRegisterScope(ParallelMoveResolver* resolver,
                          int blocked,
                          int if_scratch,
                          int number_of_registers);
+    // Grab a scratch register only if available.
+    ScratchRegisterScope(ParallelMoveResolver* resolver,
+                         int blocked,
+                         int number_of_registers);
     ~ScratchRegisterScope();
 
     int GetRegister() const { return reg_; }
@@ -62,6 +67,8 @@
   // Allocate a scratch register for performing a move. The method will try to use
   // a register that is the destination of a move, but that move has not been emitted yet.
   int AllocateScratchRegister(int blocked, int if_scratch, int register_count, bool* spilled);
+  // As above, but return -1 if no free register.
+  int AllocateScratchRegister(int blocked, int register_count);
 
   // Emit a move.
   virtual void EmitMove(size_t index) = 0;
diff --git a/runtime/mirror/art_method-inl.h b/runtime/mirror/art_method-inl.h
index 0ccf5db..fb427dc 100644
--- a/runtime/mirror/art_method-inl.h
+++ b/runtime/mirror/art_method-inl.h
@@ -36,7 +36,7 @@
 namespace mirror {
 
 inline uint32_t ArtMethod::ClassSize() {
-  uint32_t vtable_entries = Object::kVTableLength + 8;
+  uint32_t vtable_entries = Object::kVTableLength + 7;
   return Class::ComputeClassSize(true, vtable_entries, 0, 0, 0, 0, 0);
 }
 
diff --git a/runtime/mirror/art_method.h b/runtime/mirror/art_method.h
index 22481ce..55b8068 100644
--- a/runtime/mirror/art_method.h
+++ b/runtime/mirror/art_method.h
@@ -183,6 +183,10 @@
     SetField32<false>(OFFSET_OF_OBJECT_MEMBER(ArtMethod, method_index_), new_method_index);
   }
 
+  static MemberOffset DexMethodIndexOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_method_index_);
+  }
+
   static MemberOffset MethodIndexOffset() {
     return OFFSET_OF_OBJECT_MEMBER(ArtMethod, method_index_);
   }
@@ -214,6 +218,8 @@
     return OFFSET_OF_OBJECT_MEMBER(ArtMethod, dex_cache_resolved_types_);
   }
 
+  ALWAYS_INLINE ObjectArray<ArtMethod>* GetDexCacheResolvedMethods()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   ALWAYS_INLINE ArtMethod* GetDexCacheResolvedMethod(uint16_t method_idx)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   ALWAYS_INLINE void SetDexCacheResolvedMethod(uint16_t method_idx, ArtMethod* new_method)
@@ -434,10 +440,6 @@
         EntryPointFromJniOffset(pointer_size), entrypoint, pointer_size);
   }
 
-  static MemberOffset GetMethodIndexOffset() {
-    return OFFSET_OF_OBJECT_MEMBER(ArtMethod, method_index_);
-  }
-
   // Is this a CalleSaveMethod or ResolutionMethod and therefore doesn't adhere to normal
   // conventions for a method of managed code. Returns false for Proxy methods.
   bool IsRuntimeMethod() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -604,9 +606,6 @@
  private:
   ALWAYS_INLINE void CheckObjectSizeEqualsMirrorSize() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  ALWAYS_INLINE ObjectArray<ArtMethod>* GetDexCacheResolvedMethods()
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-
   ALWAYS_INLINE ObjectArray<Class>* GetDexCacheResolvedTypes()
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 288e88e..6758e22 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -27,7 +27,7 @@
 namespace mirror {
 
 inline uint32_t DexCache::ClassSize() {
-  uint32_t vtable_entries = Object::kVTableLength + 1;
+  uint32_t vtable_entries = Object::kVTableLength + 5;
   return Class::ComputeClassSize(true, vtable_entries, 0, 0, 0, 0, 0);
 }
 
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index c893f0a..b657aec 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -91,6 +91,18 @@
   return soa.AddLocalReference<jclass>(c.Get());
 }
 
+static jobject Class_findOverriddenMethodIfProxy(JNIEnv* env, jclass, jobject art_method) {
+  ScopedFastNativeObjectAccess soa(env);
+  mirror::ArtMethod* method = soa.Decode<mirror::ArtMethod*>(art_method);
+  mirror::Class* declaring_klass = method->GetDeclaringClass();
+  if (!declaring_klass->IsProxyClass()) {
+    return art_method;
+  }
+  uint32_t dex_method_index = method->GetDexMethodIndex();
+  mirror::ArtMethod* overridden_method = method->GetDexCacheResolvedMethods()->Get(dex_method_index);
+  return soa.AddLocalReference<jobject>(overridden_method);
+}
+
 static jstring Class_getNameNative(JNIEnv* env, jobject javaThis) {
   ScopedFastNativeObjectAccess soa(env);
   StackHandleScope<1> hs(soa.Self());
@@ -264,6 +276,8 @@
 
 static JNINativeMethod gMethods[] = {
   NATIVE_METHOD(Class, classForName, "!(Ljava/lang/String;ZLjava/lang/ClassLoader;)Ljava/lang/Class;"),
+  NATIVE_METHOD(Class, findOverriddenMethodIfProxy,
+                "!(Ljava/lang/reflect/ArtMethod;)Ljava/lang/reflect/ArtMethod;"),
   NATIVE_METHOD(Class, getNameNative, "!()Ljava/lang/String;"),
   NATIVE_METHOD(Class, getProxyInterfaces, "!()[Ljava/lang/Class;"),
   NATIVE_METHOD(Class, getDeclaredFields, "!()[Ljava/lang/reflect/Field;"),
diff --git a/runtime/native/java_lang_DexCache.cc b/runtime/native/java_lang_DexCache.cc
index 27eae46..8944270 100644
--- a/runtime/native/java_lang_DexCache.cc
+++ b/runtime/native/java_lang_DexCache.cc
@@ -48,8 +48,38 @@
                                       args);
 }
 
+static jobject DexCache_getResolvedType(JNIEnv* env, jobject javaDexCache, jint type_index) {
+  ScopedFastNativeObjectAccess soa(env);
+  mirror::DexCache* dex_cache = soa.Decode<mirror::DexCache*>(javaDexCache);
+  return soa.AddLocalReference<jobject>(dex_cache->GetResolvedType(type_index));
+}
+
+static jobject DexCache_getResolvedString(JNIEnv* env, jobject javaDexCache, jint string_index) {
+  ScopedFastNativeObjectAccess soa(env);
+  mirror::DexCache* dex_cache = soa.Decode<mirror::DexCache*>(javaDexCache);
+  return soa.AddLocalReference<jobject>(dex_cache->GetResolvedString(string_index));
+}
+
+static void DexCache_setResolvedType(JNIEnv* env, jobject javaDexCache, jint type_index,
+                                     jobject type) {
+  ScopedFastNativeObjectAccess soa(env);
+  mirror::DexCache* dex_cache = soa.Decode<mirror::DexCache*>(javaDexCache);
+  dex_cache->SetResolvedType(type_index, soa.Decode<mirror::Class*>(type));
+}
+
+static void DexCache_setResolvedString(JNIEnv* env, jobject javaDexCache, jint string_index,
+                                       jobject string) {
+  ScopedFastNativeObjectAccess soa(env);
+  mirror::DexCache* dex_cache = soa.Decode<mirror::DexCache*>(javaDexCache);
+  dex_cache->SetResolvedString(string_index, soa.Decode<mirror::String*>(string));
+}
+
 static JNINativeMethod gMethods[] = {
   NATIVE_METHOD(DexCache, getDexNative, "!()Lcom/android/dex/Dex;"),
+  NATIVE_METHOD(DexCache, getResolvedType, "!(I)Ljava/lang/Class;"),
+  NATIVE_METHOD(DexCache, getResolvedString, "!(I)Ljava/lang/String;"),
+  NATIVE_METHOD(DexCache, setResolvedType, "!(ILjava/lang/Class;)V"),
+  NATIVE_METHOD(DexCache, setResolvedString, "!(ILjava/lang/String;)V"),
 };
 
 void register_java_lang_DexCache(JNIEnv* env) {
diff --git a/test/458-checker-instruction-simplification/src/Main.java b/test/458-checker-instruction-simplification/src/Main.java
index 1f0017e..3cbcebb 100644
--- a/test/458-checker-instruction-simplification/src/Main.java
+++ b/test/458-checker-instruction-simplification/src/Main.java
@@ -309,6 +309,457 @@
     return arg ^ -1;
   }
 
+  /**
+   * Test that addition or subtraction operations with both inputs negated are
+   * optimized to use a single negation after the operation.
+   * The transformation tested is implemented in
+   * `InstructionSimplifierVisitor::TryMoveNegOnInputsAfterBinop`.
+   */
+
+  // CHECK-START: int Main.AddNegs1(int, int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg1:i\d+]]     Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Neg2:i\d+]]     Neg [ [[Arg2]] ]
+  // CHECK-DAG:     [[Add:i\d+]]      Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:                       Return [ [[Add]] ]
+
+  // CHECK-START: int Main.AddNegs1(int, int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-NOT:                       Neg
+  // CHECK-DAG:     [[Add:i\d+]]      Add [ [[Arg1]] [[Arg2]] ]
+  // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Add]] ]
+  // CHECK-DAG:                       Return [ [[Neg]] ]
+
+  public static int AddNegs1(int arg1, int arg2) {
+    return -arg1 + -arg2;
+  }
+
+  /**
+   * This is similar to the test-case AddNegs1, but the negations have
+   * multiple uses.
+   * The transformation tested is implemented in
+   * `InstructionSimplifierVisitor::TryMoveNegOnInputsAfterBinop`.
+   * The current code won't perform the previous optimization. The
+   * transformations do not look at other uses of their inputs. As they don't
+   * know what will happen with other uses, they do not take the risk of
+   * increasing the register pressure by creating or extending live ranges.
+   */
+
+  // CHECK-START: int Main.AddNegs2(int, int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg1:i\d+]]     Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Neg2:i\d+]]     Neg [ [[Arg2]] ]
+  // CHECK-DAG:     [[Add1:i\d+]]     Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:     [[Add2:i\d+]]     Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:     [[Or:i\d+]]       Or [ [[Add1]] [[Add2]] ]
+  // CHECK-DAG:                       Return [ [[Or]] ]
+
+  // CHECK-START: int Main.AddNegs2(int, int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg1:i\d+]]     Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Neg2:i\d+]]     Neg [ [[Arg2]] ]
+  // CHECK-DAG:     [[Add1:i\d+]]     Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:     [[Add2:i\d+]]     Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-NOT:                       Neg
+  // CHECK-DAG:     [[Or:i\d+]]       Or [ [[Add1]] [[Add2]] ]
+  // CHECK-DAG:                       Return [ [[Or]] ]
+
+  public static int AddNegs2(int arg1, int arg2) {
+    int temp1 = -arg1;
+    int temp2 = -arg2;
+    return (temp1 + temp2) | (temp1 + temp2);
+  }
+
+  /**
+   * This follows test-cases AddNegs1 and AddNegs2.
+   * The transformation tested is implemented in
+   * `InstructionSimplifierVisitor::TryMoveNegOnInputsAfterBinop`.
+   * The optimization should not happen if it would move an additional instruction
+   * into the loop.
+   */
+
+  // CHECK-START: long Main.AddNegs3(long, long) instruction_simplifier (before)
+  // -------------- Arguments and initial negation operations.
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg1:j\d+]]     Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Neg2:j\d+]]     Neg [ [[Arg2]] ]
+  // CHECK:                           Goto
+  // -------------- Loop
+  // CHECK:                           SuspendCheck
+  // CHECK:         [[Add:j\d+]]      Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK:                           Goto
+
+  // CHECK-START: long Main.AddNegs3(long, long) instruction_simplifier (after)
+  // -------------- Arguments and initial negation operations.
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg1:j\d+]]     Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Neg2:j\d+]]     Neg [ [[Arg2]] ]
+  // CHECK:                           Goto
+  // -------------- Loop
+  // CHECK:                           SuspendCheck
+  // CHECK:         [[Add:j\d+]]      Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-NOT:                       Neg
+  // CHECK:                           Goto
+
+  public static long AddNegs3(long arg1, long arg2) {
+    long res = 0;
+    long n_arg1 = -arg1;
+    long n_arg2 = -arg2;
+    for (long i = 0; i < 1; i++) {
+      res += n_arg1 + n_arg2 + i;
+    }
+    return res;
+  }
+
+  /**
+   * Test the simplification of an addition with a negated argument into a
+   * subtraction.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitAdd`.
+   */
+
+  // CHECK-START: long Main.AddNeg1(long, long) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:j\d+]]      Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Add:j\d+]]      Add [ [[Neg]] [[Arg2]] ]
+  // CHECK-DAG:                       Return [ [[Add]] ]
+
+  // CHECK-START: long Main.AddNeg1(long, long) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Sub:j\d+]]      Sub [ [[Arg2]] [[Arg1]] ]
+  // CHECK-DAG:                       Return [ [[Sub]] ]
+
+  // CHECK-START: long Main.AddNeg1(long, long) instruction_simplifier (after)
+  // CHECK-NOT:                       Neg
+  // CHECK-NOT:                       Add
+
+  public static long AddNeg1(long arg1, long arg2) {
+    return -arg1 + arg2;
+  }
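+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): the rewrite checked in AddNeg1 relies on the
+  // identity -a + b == b - a, which holds for all long values under
+  // two's-complement wrap-around.
+  public static long AddNeg1Reference(long arg1, long arg2) {
+    return arg2 - arg1;
+  }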
+
+  /**
+   * This is similar to test-case AddNeg1, but the negation has two uses.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitAdd`.
+   * The simplifier does not apply the optimization here: the transformations
+   * do not inspect the other uses of their inputs, and since they cannot tell
+   * how those uses will be compiled, they do not take the risk of increasing
+   * register pressure by creating or extending live ranges.
+   */
+
+  // CHECK-START: long Main.AddNeg2(long, long) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:j\d+]]      Neg [ [[Arg2]] ]
+  // CHECK-DAG:     [[Add1:j\d+]]     Add [ [[Arg1]] [[Neg]] ]
+  // CHECK-DAG:     [[Add2:j\d+]]     Add [ [[Arg1]] [[Neg]] ]
+  // CHECK-DAG:     [[Res:j\d+]]      Or [ [[Add1]] [[Add2]] ]
+  // CHECK-DAG:                       Return [ [[Res]] ]
+
+  // CHECK-START: long Main.AddNeg2(long, long) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:j\d+]]      Neg [ [[Arg2]] ]
+  // CHECK-DAG:     [[Add1:j\d+]]     Add [ [[Arg1]] [[Neg]] ]
+  // CHECK-DAG:     [[Add2:j\d+]]     Add [ [[Arg1]] [[Neg]] ]
+  // CHECK-DAG:     [[Res:j\d+]]      Or [ [[Add1]] [[Add2]] ]
+  // CHECK-DAG:                       Return [ [[Res]] ]
+
+  // CHECK-START: long Main.AddNeg2(long, long) instruction_simplifier (after)
+  // CHECK-NOT:                       Sub
+
+  public static long AddNeg2(long arg1, long arg2) {
+    long temp = -arg2;
+    return (arg1 + temp) | (arg1 + temp);
+  }
+
+  /**
+   * Test simplification of the `-(-var)` pattern.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitNeg`.
+   */
+
+  // CHECK-START: long Main.NegNeg1(long) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:j\d+]]      ParameterValue
+  // CHECK-DAG:     [[Neg1:j\d+]]     Neg [ [[Arg]] ]
+  // CHECK-DAG:     [[Neg2:j\d+]]     Neg [ [[Neg1]] ]
+  // CHECK-DAG:                       Return [ [[Neg2]] ]
+
+  // CHECK-START: long Main.NegNeg1(long) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:j\d+]]      ParameterValue
+  // CHECK-DAG:                       Return [ [[Arg]] ]
+
+  // CHECK-START: long Main.NegNeg1(long) instruction_simplifier (after)
+  // CHECK-NOT:                       Neg
+
+  public static long NegNeg1(long arg) {
+    return -(-arg);
+  }
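+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): the rewrite checked in NegNeg1 relies on the
+  // identity -(-x) == x, which holds for all long values, including
+  // Long.MIN_VALUE, whose negation is itself.
+  public static long NegNeg1Reference(long arg) {
+    return arg;
+  }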
+
+  /**
+   * Test 'multi-step' simplification, where a first transformation yields a
+   * new simplification possibility for the current instruction.
+   * The transformations tested are implemented in `InstructionSimplifierVisitor::VisitNeg`
+   * and in `InstructionSimplifierVisitor::VisitAdd`.
+   */
+
+  // CHECK-START: int Main.NegNeg2(int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:i\d+]]      ParameterValue
+  // CHECK-DAG:     [[Neg1:i\d+]]     Neg [ [[Arg]] ]
+  // CHECK-DAG:     [[Neg2:i\d+]]     Neg [ [[Neg1]] ]
+  // CHECK-DAG:     [[Add:i\d+]]      Add [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:                       Return [ [[Add]] ]
+
+  // CHECK-START: int Main.NegNeg2(int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:i\d+]]      ParameterValue
+  // CHECK-DAG:     [[Sub:i\d+]]      Sub [ [[Arg]] [[Arg]] ]
+  // CHECK-DAG:                       Return [ [[Sub]] ]
+
+  // CHECK-START: int Main.NegNeg2(int) instruction_simplifier (after)
+  // CHECK-NOT:                       Neg
+  // CHECK-NOT:                       Add
+
+  public static int NegNeg2(int arg) {
+    int temp = -arg;
+    return temp + -temp;
+  }
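+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): per the CHECK lines above, NegNeg2 is expected to
+  // simplify down to a single Sub(arg, arg), which evaluates to 0 for every
+  // int value; this helper just spells out that end result.
+  public static int NegNeg2Reference(int arg) {
+    return arg - arg;
+  }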
+
+  /**
+   * Test another 'multi-step' simplification, where a first transformation
+   * yields a new simplification possibility for the current instruction.
+   * The transformations tested are implemented in `InstructionSimplifierVisitor::VisitNeg`
+   * and in `InstructionSimplifierVisitor::VisitSub`.
+   */
+
+  // CHECK-START: long Main.NegNeg3(long) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:j\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const0:j\d+]]   LongConstant 0
+  // CHECK-DAG:     [[Neg:j\d+]]      Neg [ [[Arg]] ]
+  // CHECK-DAG:     [[Sub:j\d+]]      Sub [ [[Const0]] [[Neg]] ]
+  // CHECK-DAG:                       Return [ [[Sub]] ]
+
+  // CHECK-START: long Main.NegNeg3(long) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:j\d+]]      ParameterValue
+  // CHECK-DAG:                       Return [ [[Arg]] ]
+
+  // CHECK-START: long Main.NegNeg3(long) instruction_simplifier (after)
+  // CHECK-NOT:                       Neg
+  // CHECK-NOT:                       Sub
+
+  public static long NegNeg3(long arg) {
+    return 0 - -arg;
+  }
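+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): the rewrite checked in NegNeg3 relies on the
+  // identity 0 - (-x) == x, which holds for all long values under
+  // two's-complement wrap-around.
+  public static long NegNeg3Reference(long arg) {
+    return arg;
+  }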
+
+  /**
+   * Test that a negated subtraction is simplified to a subtraction with its
+   * arguments reversed.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitNeg`.
+   */
+
+  // CHECK-START: int Main.NegSub1(int, int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Sub:i\d+]]      Sub [ [[Arg1]] [[Arg2]] ]
+  // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Sub]] ]
+  // CHECK-DAG:                       Return [ [[Neg]] ]
+
+  // CHECK-START: int Main.NegSub1(int, int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Sub:i\d+]]      Sub [ [[Arg2]] [[Arg1]] ]
+  // CHECK-DAG:                       Return [ [[Sub]] ]
+
+  // CHECK-START: int Main.NegSub1(int, int) instruction_simplifier (after)
+  // CHECK-NOT:                       Neg
+
+  public static int NegSub1(int arg1, int arg2) {
+    return -(arg1 - arg2);
+  }
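+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): the rewrite checked in NegSub1 relies on the
+  // identity -(a - b) == b - a, which holds for all int values under
+  // two's-complement wrap-around.
+  public static int NegSub1Reference(int arg1, int arg2) {
+    return arg2 - arg1;
+  }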
+
+  /**
+   * This is similar to test-case NegSub1, but the subtraction has
+   * multiple uses.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitNeg`.
+   * The simplifier does not apply the optimization here: the transformations
+   * do not inspect the other uses of their inputs, and since they cannot tell
+   * how those uses will be compiled, they do not take the risk of increasing
+   * register pressure by creating or extending live ranges.
+   */
+
+  // CHECK-START: int Main.NegSub2(int, int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Sub:i\d+]]      Sub [ [[Arg1]] [[Arg2]] ]
+  // CHECK-DAG:     [[Neg1:i\d+]]     Neg [ [[Sub]] ]
+  // CHECK-DAG:     [[Neg2:i\d+]]     Neg [ [[Sub]] ]
+  // CHECK-DAG:     [[Or:i\d+]]       Or [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:                       Return [ [[Or]] ]
+
+  // CHECK-START: int Main.NegSub2(int, int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Sub:i\d+]]      Sub [ [[Arg1]] [[Arg2]] ]
+  // CHECK-DAG:     [[Neg1:i\d+]]     Neg [ [[Sub]] ]
+  // CHECK-DAG:     [[Neg2:i\d+]]     Neg [ [[Sub]] ]
+  // CHECK-DAG:     [[Or:i\d+]]       Or [ [[Neg1]] [[Neg2]] ]
+  // CHECK-DAG:                       Return [ [[Or]] ]
+
+  public static int NegSub2(int arg1, int arg2) {
+    int temp = arg1 - arg2;
+    return -temp | -temp;
+  }
+
+  /**
+   * Test simplification of the `~~var` pattern.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitNot`.
+   */
+
+  // CHECK-START: long Main.NotNot1(long) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:j\d+]]      ParameterValue
+  // CHECK-DAG:     [[ConstF1:j\d+]]  LongConstant -1
+  // CHECK-DAG:     [[Xor1:j\d+]]     Xor [ [[Arg]] [[ConstF1]] ]
+  // CHECK-DAG:     [[Xor2:j\d+]]     Xor [ [[Xor1]] [[ConstF1]] ]
+  // CHECK-DAG:                       Return [ [[Xor2]] ]
+
+  // CHECK-START: long Main.NotNot1(long) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:j\d+]]      ParameterValue
+  // CHECK-DAG:                       Return [ [[Arg]] ]
+
+  // CHECK-START: long Main.NotNot1(long) instruction_simplifier (after)
+  // CHECK-NOT:                       Xor
+
+  public static long NotNot1(long arg) {
+    return ~~arg;
+  }
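+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): the rewrite checked in NotNot1 relies on the
+  // identity ~~x == x; as the CHECK lines above show, each ~ reaches this
+  // pass as an Xor with the constant -1.
+  public static long NotNot1Reference(long arg) {
+    return arg;
+  }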
+
+  // CHECK-START: int Main.NotNot2(int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:i\d+]]      ParameterValue
+  // CHECK-DAG:     [[ConstF1:i\d+]]  IntConstant -1
+  // CHECK-DAG:     [[Xor1:i\d+]]     Xor [ [[Arg]] [[ConstF1]] ]
+  // CHECK-DAG:     [[Xor2:i\d+]]     Xor [ [[Xor1]] [[ConstF1]] ]
+  // CHECK-DAG:     [[Add:i\d+]]      Add [ [[Xor1]] [[Xor2]] ]
+  // CHECK-DAG:                       Return [ [[Add]] ]
+
+  // CHECK-START: int Main.NotNot2(int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:i\d+]]      ParameterValue
+  // CHECK-DAG:     [[Not:i\d+]]      Not [ [[Arg]] ]
+  // CHECK-DAG:     [[Add:i\d+]]      Add [ [[Not]] [[Arg]] ]
+  // CHECK-DAG:                       Return [ [[Add]] ]
+
+  // CHECK-START: int Main.NotNot2(int) instruction_simplifier (after)
+  // CHECK-NOT:                       Xor
+
+  public static int NotNot2(int arg) {
+    int temp = ~arg;
+    return temp + ~temp;
+  }
+
+  /**
+   * Test the simplification of a subtraction with a negated argument.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitSub`.
+   */
+
+  // CHECK-START: int Main.SubNeg1(int, int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Sub:i\d+]]      Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK-DAG:                       Return [ [[Sub]] ]
+
+  // CHECK-START: int Main.SubNeg1(int, int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Add:i\d+]]      Add [ [[Arg1]] [[Arg2]] ]
+  // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Add]] ]
+  // CHECK-DAG:                       Return [ [[Neg]] ]
+
+  // CHECK-START: int Main.SubNeg1(int, int) instruction_simplifier (after)
+  // CHECK-NOT:                       Sub
+
+  public static int SubNeg1(int arg1, int arg2) {
+    return -arg1 - arg2;
+  }
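+
+  // Illustrative sketch (hypothetical helper, not referenced by any CHECK
+  // directive or by main()): the rewrite checked in SubNeg1 relies on the
+  // identity -a - b == -(a + b), which holds for all int values under
+  // two's-complement wrap-around.
+  public static int SubNeg1Reference(int arg1, int arg2) {
+    return -(arg1 + arg2);
+  }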
+
+  /**
+   * This is similar to test-case SubNeg1, but the negation has
+   * multiple uses.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitSub`.
+   * The simplifier does not apply the optimization here: the transformations
+   * do not inspect the other uses of their inputs, and since they cannot tell
+   * how those uses will be compiled, they do not take the risk of increasing
+   * register pressure by creating or extending live ranges.
+   */
+
+  // CHECK-START: int Main.SubNeg2(int, int) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Sub1:i\d+]]     Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK-DAG:     [[Sub2:i\d+]]     Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK-DAG:     [[Or:i\d+]]       Or [ [[Sub1]] [[Sub2]] ]
+  // CHECK-DAG:                       Return [ [[Or]] ]
+
+  // CHECK-START: int Main.SubNeg2(int, int) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg1:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:i\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:i\d+]]      Neg [ [[Arg1]] ]
+  // CHECK-DAG:     [[Sub1:i\d+]]     Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK-DAG:     [[Sub2:i\d+]]     Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK-DAG:     [[Or:i\d+]]       Or [ [[Sub1]] [[Sub2]] ]
+  // CHECK-DAG:                       Return [ [[Or]] ]
+
+  // CHECK-START: int Main.SubNeg2(int, int) instruction_simplifier (after)
+  // CHECK-NOT:                       Add
+
+  public static int SubNeg2(int arg1, int arg2) {
+    int temp = -arg1;
+    return (temp - arg2) | (temp - arg2);
+  }
+
+  /**
+   * This follows test-cases SubNeg1 and SubNeg2.
+   * The transformation tested is implemented in `InstructionSimplifierVisitor::VisitSub`.
+   * The optimization must not apply when it would move an additional
+   * instruction into the loop.
+   */
+
+  // CHECK-START: long Main.SubNeg3(long, long) instruction_simplifier (before)
+  // -------------- Arguments and initial negation operation.
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:j\d+]]      Neg [ [[Arg1]] ]
+  // CHECK:                           Goto
+  // -------------- Loop
+  // CHECK:                           SuspendCheck
+  // CHECK:         [[Sub:j\d+]]      Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK:                           Goto
+
+  // CHECK-START: long Main.SubNeg3(long, long) instruction_simplifier (after)
+  // -------------- Arguments and initial negation operation.
+  // CHECK-DAG:     [[Arg1:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Arg2:j\d+]]     ParameterValue
+  // CHECK-DAG:     [[Neg:j\d+]]      Neg [ [[Arg1]] ]
+  // CHECK:                           Goto
+  // -------------- Loop
+  // CHECK:                           SuspendCheck
+  // CHECK:         [[Sub:j\d+]]      Sub [ [[Neg]] [[Arg2]] ]
+  // CHECK-NOT:                       Neg
+  // CHECK:                           Goto
+
+  public static long SubNeg3(long arg1, long arg2) {
+    long res = 0;
+    long temp = -arg1;
+    for (long i = 0; i < 1; i++) {
+      res += temp - arg2 - i;
+    }
+    return res;
+  }
+
   public static void main(String[] args) {
     int arg = 123456;
 
@@ -328,5 +779,20 @@
     assertLongEquals(UShr0(arg), arg);
     assertIntEquals(Xor0(arg), arg);
     assertIntEquals(XorAllOnes(arg), ~arg);
+    assertIntEquals(AddNegs1(arg, arg + 1), -(arg + arg + 1));
+    assertIntEquals(AddNegs2(arg, arg + 1), -(arg + arg + 1));
+    assertLongEquals(AddNegs3(arg, arg + 1), -(2 * arg + 1));
+    assertLongEquals(AddNeg1(arg, arg + 1), 1);
+    assertLongEquals(AddNeg2(arg, arg + 1), -1);
+    assertLongEquals(NegNeg1(arg), arg);
+    assertIntEquals(NegNeg2(arg), 0);
+    assertLongEquals(NegNeg3(arg), arg);
+    assertIntEquals(NegSub1(arg, arg + 1), 1);
+    assertIntEquals(NegSub2(arg, arg + 1), 1);
+    assertLongEquals(NotNot1(arg), arg);
+    assertIntEquals(NotNot2(arg), -1);
+    assertIntEquals(SubNeg1(arg, arg + 1), -(arg + arg + 1));
+    assertIntEquals(SubNeg2(arg, arg + 1), -(arg + arg + 1));
+    assertLongEquals(SubNeg3(arg, arg + 1), -(2 * arg + 1));
   }
 }