Merge "Use mmapped boot image class table for PIC app HLoadClass."
diff --git a/compiler/Android.bp b/compiler/Android.bp
index d0b5192..c798d97 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -54,6 +54,7 @@
         "optimizing/code_generator_utils.cc",
         "optimizing/code_sinking.cc",
         "optimizing/constant_folding.cc",
+        "optimizing/constructor_fence_redundancy_elimination.cc",
         "optimizing/dead_code_elimination.cc",
         "optimizing/escape.cc",
         "optimizing/graph_checker.cc",
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 9095ecd..18a55c8 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -27,12 +27,13 @@
 using helpers::ARM64EncodableConstantOrRegister;
 using helpers::Arm64CanEncodeConstantAsImmediate;
 using helpers::DRegisterFrom;
-using helpers::VRegisterFrom;
 using helpers::HeapOperand;
 using helpers::InputRegisterAt;
 using helpers::Int64ConstantFrom;
-using helpers::XRegisterFrom;
+using helpers::OutputRegister;
+using helpers::VRegisterFrom;
 using helpers::WRegisterFrom;
+using helpers::XRegisterFrom;
 
 #define __ GetVIXLAssembler()->
 
@@ -127,20 +128,51 @@
   }
 }
 
-void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+void LocationsBuilderARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
-void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderARM64::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorARM64::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+void InstructionCodeGeneratorARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister src = VRegisterFrom(locations->InAt(0));
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Umov(OutputRegister(instruction), src.V4S(), 0);
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Umov(OutputRegister(instruction), src.V2D(), 0);
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 4u);
+      DCHECK(locations->InAt(0).Equals(locations->Out()));  // no code required
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
 // Helper to set up locations for vector unary operations.
@@ -169,6 +201,46 @@
   }
 }
 
+void LocationsBuilderARM64::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARM64::VisitVecReduce(HVecReduce* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister src = VRegisterFrom(locations->InAt(0));
+  VRegister dst = DRegisterFrom(locations->Out());
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ Addv(dst.S(), src.V4S());
+          break;
+        case HVecReduce::kMin:
+          __ Sminv(dst.S(), src.V4S());
+          break;
+        case HVecReduce::kMax:
+          __ Smaxv(dst.S(), src.V4S());
+          break;
+      }
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ Addp(dst.D(), src.V2D());
+          break;
+        default:
+          LOG(FATAL) << "Unsupported SIMD min/max";
+          UNREACHABLE();
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderARM64::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -263,6 +335,7 @@
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
   }
 }
 
@@ -805,6 +878,77 @@
   }
 }
 
+void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister dst = VRegisterFrom(locations->Out());
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  // Zero out all other elements first.
+  __ Movi(dst.V16B(), 0);
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    return;
+  }
+
+  // Set required elements.
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0));
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0));
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0));
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0));
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
   switch (instr->GetPackedType()) {
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 527691d..7a11dff 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -73,19 +73,11 @@
   }
 }
 
-void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) {
+void LocationsBuilderARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) {
+void InstructionCodeGeneratorARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
@@ -112,6 +104,14 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecReduce(HVecReduce* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -621,6 +621,14 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
   LOG(FATAL) << "No SIMD for " << instr->GetId();
 }
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index 6bf28ab..c2fbf7f 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -88,19 +88,11 @@
   }
 }
 
-void LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) {
+void LocationsBuilderMIPS::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderMIPS::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorMIPS::VisitVecSumReduce(HVecSumReduce* instruction) {
+void InstructionCodeGeneratorMIPS::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
@@ -133,6 +125,14 @@
   }
 }
 
+void LocationsBuilderMIPS::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecReduce(HVecReduce* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
 void LocationsBuilderMIPS::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -818,6 +818,14 @@
   }
 }
 
+void LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
 void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
   switch (instr->GetPackedType()) {
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 75bf7a7..9d3a777 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -91,19 +91,11 @@
   }
 }
 
-void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) {
+void LocationsBuilderMIPS64::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) {
+void InstructionCodeGeneratorMIPS64::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
@@ -136,6 +128,14 @@
   }
 }
 
+void LocationsBuilderMIPS64::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecReduce(HVecReduce* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
 void LocationsBuilderMIPS64::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -822,6 +822,14 @@
   }
 }
 
+void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
 void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
   switch (instr->GetPackedType()) {
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index e7aec76..37190f8 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -27,9 +27,99 @@
 
 void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimLong:
-      // Long needs extra temporary to load the register pair.
+      // Long needs extra temporary to load from the register pair.
+      if (!is_zero) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      FALLTHROUGH_INTENDED;
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(is_zero ? Location::RequiresFpuRegister()
+                                : Location::SameAsFirstInput());
+
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    __ xorps(dst, dst);
+    return;
+  }
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      __ punpcklbw(dst, dst);
+      __ punpcklwd(dst, dst);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      __ punpcklwd(dst, dst);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimLong: {
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>());
+      __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>());
+      __ punpckldq(dst, tmp);
+      __ punpcklqdq(dst, dst);
+      break;
+    }
+    case Primitive::kPrimFloat:
+      DCHECK(locations->InAt(0).Equals(locations->Out()));
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ shufps(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK(locations->InAt(0).Equals(locations->Out()));
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ shufpd(dst, dst, Immediate(0));
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimLong:
+      // Long needs extra temporary to store into the register pair.
       locations->AddTemp(Location::RequiresFpuRegister());
       FALLTHROUGH_INTENDED;
     case Primitive::kPrimBoolean:
@@ -37,8 +127,8 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-      locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetOut(Location::RequiresFpuRegister());
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
@@ -51,48 +141,34 @@
   }
 }
 
-void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+void InstructionCodeGeneratorX86::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<Register>());
-      __ punpcklbw(reg, reg);
-      __ punpcklwd(reg, reg);
-      __ pshufd(reg, reg, Immediate(0));
-      break;
     case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<Register>());
-      __ punpcklwd(reg, reg);
-      __ pshufd(reg, reg, Immediate(0));
-      break;
+    case Primitive::kPrimShort:  // TODO: sub-word (boolean/byte/char/short) extraction not yet implemented.
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<Register>());
-      __ pshufd(reg, reg, Immediate(0));
+      DCHECK_LE(4u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 16u);
+      __ movd(locations->Out().AsRegister<Register>(), src);
       break;
     case Primitive::kPrimLong: {
       XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>());
-      __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>());
-      __ punpckldq(reg, tmp);
-      __ punpcklqdq(reg, reg);
+      __ movd(locations->Out().AsRegisterPairLow<Register>(), src);
+      __ pshufd(tmp, src, Immediate(1));
+      __ movd(locations->Out().AsRegisterPairHigh<Register>(), tmp);
       break;
     }
     case Primitive::kPrimFloat:
-      DCHECK(locations->InAt(0).Equals(locations->Out()));
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ shufps(reg, reg, Immediate(0));
-      break;
     case Primitive::kPrimDouble:
-      DCHECK(locations->InAt(0).Equals(locations->Out()));
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ shufpd(reg, reg, Immediate(0));
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 4u);
+      DCHECK(locations->InAt(0).Equals(locations->Out()));  // no code required
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -100,22 +176,6 @@
   }
 }
 
-void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
 // Helper to set up locations for vector unary operations.
 static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) {
   LocationSummary* locations = new (arena) LocationSummary(instruction);
@@ -137,6 +197,73 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Long reduction or min/max require a temporary.
+  if (instruction->GetPackedType() == Primitive::kPrimLong ||
+      instruction->GetKind() == HVecReduce::kMin ||
+      instruction->GetKind() == HVecReduce::kMax) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitVecReduce(HVecReduce* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ movaps(dst, src);
+          __ phaddd(dst, dst);
+          __ phaddd(dst, dst);
+          break;
+        case HVecReduce::kMin: {
+          XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ psrldq(tmp, Immediate(8));
+          __ pminsd(dst, tmp);
+          __ psrldq(tmp, Immediate(4));
+          __ pminsd(dst, tmp);
+          break;
+        }
+        case HVecReduce::kMax: {
+          XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ psrldq(tmp, Immediate(8));
+          __ pmaxsd(dst, tmp);
+          __ psrldq(tmp, Immediate(4));
+          __ pmaxsd(dst, tmp);
+          break;
+        }
+      }
+      break;
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ punpckhqdq(tmp, tmp);
+          __ paddq(dst, tmp);
+          break;
+        case HVecReduce::kMin:
+        case HVecReduce::kMax:
+          LOG(FATAL) << "Unsupported SIMD min/max";
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -821,6 +948,91 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimLong:
+      // Long needs extra temporary to load from register pairs.
+      if (!is_zero) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      FALLTHROUGH_INTENDED;
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  // Zero out all other elements first.
+  __ xorps(dst, dst);
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    return;
+  }
+
+  // Set required elements.
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:  // TODO: sub-word (boolean/byte/char/short) set not yet implemented.
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<Register>());
+      break;
+    case Primitive::kPrimLong: {
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ xorps(tmp, tmp);
+      __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>());
+      __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>());
+      __ punpckldq(dst, tmp);
+      break;
+    }
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movss(dst, locations->InAt(0).AsFpuRegister<XmmRegister>());
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movsd(dst, locations->InAt(0).AsFpuRegister<XmmRegister>());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
   LOG(FATAL) << "No SIMD for " << instr->GetId();
 }
@@ -868,6 +1080,7 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  // Incorporate the string or array offset in the address computation.
   uint32_t offset = is_string_char_at
       ? mirror::String::ValueOffset().Uint32Value()
       : mirror::Array::DataOffset(size).Uint32Value();
@@ -902,7 +1115,7 @@
         __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1));
         __ j(kNotZero, &not_compressed);
         // Zero extend 8 compressed bytes into 8 chars.
-        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt()));
         __ pxor(tmp, tmp);
         __ punpcklbw(reg, tmp);
         __ jmp(&done);
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index c7ee81c..edd0209 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -27,6 +27,8 @@
 
 void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -34,11 +36,89 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
       locations->SetOut(Location::RequiresFpuRegister());
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(is_zero ? Location::RequiresFpuRegister()
+                                : Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    __ xorps(dst, dst);
+    return;
+  }
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>(), /*64-bit*/ false);
+      __ punpcklbw(dst, dst);
+      __ punpcklwd(dst, dst);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>(), /*64-bit*/ false);
+      __ punpcklwd(dst, dst);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>(), /*64-bit*/ false);
+      __ pshufd(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>(), /*64-bit*/ true);
+      __ punpcklqdq(dst, dst);
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      DCHECK(locations->InAt(0).Equals(locations->Out()));
+      __ shufps(dst, dst, Immediate(0));
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      DCHECK(locations->InAt(0).Equals(locations->Out()));
+      __ shufpd(dst, dst, Immediate(0));
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86_64::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
@@ -48,44 +128,29 @@
   }
 }
 
-void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+void InstructionCodeGeneratorX86_64::VisitVecExtractScalar(HVecExtractScalar* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>();
+  XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>());
-      __ punpcklbw(reg, reg);
-      __ punpcklwd(reg, reg);
-      __ pshufd(reg, reg, Immediate(0));
-      break;
     case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>());
-      __ punpcklwd(reg, reg);
-      __ pshufd(reg, reg, Immediate(0));
-      break;
+    case Primitive::kPrimShort:  // TODO: also implement extraction for the sub-word types above?
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
     case Primitive::kPrimInt:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>());
-      __ pshufd(reg, reg, Immediate(0));
+      __ movd(locations->Out().AsRegister<CpuRegister>(), src, /*64-bit*/ false);
       break;
     case Primitive::kPrimLong:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>());  // is 64-bit
-      __ punpcklqdq(reg, reg);
+      __ movd(locations->Out().AsRegister<CpuRegister>(), src, /*64-bit*/ true);
       break;
     case Primitive::kPrimFloat:
-      DCHECK(locations->InAt(0).Equals(locations->Out()));
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ shufps(reg, reg, Immediate(0));
-      break;
     case Primitive::kPrimDouble:
-      DCHECK(locations->InAt(0).Equals(locations->Out()));
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ shufpd(reg, reg, Immediate(0));
+      DCHECK_LE(2u, instruction->GetVectorLength());
+      DCHECK_LE(instruction->GetVectorLength(), 4u);
+      DCHECK(locations->InAt(0).Equals(locations->Out()));  // no code required
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -93,22 +158,6 @@
   }
 }
 
-void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void LocationsBuilderX86_64::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecSumReduce(HVecSumReduce* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-}
-
 // Helper to set up locations for vector unary operations.
 static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) {
   LocationSummary* locations = new (arena) LocationSummary(instruction);
@@ -130,6 +179,73 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitVecReduce(HVecReduce* instruction) {
+  CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
+  // Long reduction or min/max require a temporary.
+  if (instruction->GetPackedType() == Primitive::kPrimLong ||
+      instruction->GetKind() == HVecReduce::kMin ||
+      instruction->GetKind() == HVecReduce::kMax) {
+    instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  }
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecReduce(HVecReduce* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ movaps(dst, src);
+          __ phaddd(dst, dst);
+          __ phaddd(dst, dst);
+          break;
+        case HVecReduce::kMin: {
+          XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ psrldq(tmp, Immediate(8));
+          __ pminsd(dst, tmp);
+          __ psrldq(tmp, Immediate(4));
+          __ pminsd(dst, tmp);
+          break;
+        }
+        case HVecReduce::kMax: {
+          XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ psrldq(tmp, Immediate(8));
+          __ pmaxsd(dst, tmp);
+          __ psrldq(tmp, Immediate(4));
+          __ pmaxsd(dst, tmp);
+          break;
+        }
+      }
+      break;
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+      switch (instruction->GetKind()) {
+        case HVecReduce::kSum:
+          __ movaps(tmp, src);
+          __ movaps(dst, src);
+          __ punpckhqdq(tmp, tmp);
+          __ paddq(dst, tmp);
+          break;
+        case HVecReduce::kMin:
+        case HVecReduce::kMax:
+          LOG(FATAL) << "Unsupported SIMD type";
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecCnv(HVecCnv* instruction) {
   CreateVecUnOpLocations(GetGraph()->GetArena(), instruction);
 }
@@ -814,12 +930,87 @@
   }
 }
 
-void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  HInstruction* input = instruction->InputAt(0);
+  bool is_zero = IsZeroBitPattern(input);
+
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant())
+                                    : Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresFpuRegister());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
-void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+
+  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+
+  // Zero out all other elements first.
+  __ xorps(dst, dst);
+
+  // Shorthand for any type of zero.
+  if (IsZeroBitPattern(instruction->InputAt(0))) {
+    return;
+  }
+
+  // Set required elements.
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:  // TODO: also implement set-scalars for the sub-word types above?
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>());
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>());  // is 64-bit
+      break;
+    case Primitive::kPrimFloat:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ movss(dst, locations->InAt(0).AsFpuRegister<XmmRegister>());
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ movsd(dst, locations->InAt(0).AsFpuRegister<XmmRegister>());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 // Helper to set up locations for vector memory operations.
@@ -861,6 +1052,7 @@
     case 8: scale = TIMES_8; break;
     default: break;
   }
+  // Incorporate the string or array offset in the address computation.
   uint32_t offset = is_string_char_at
       ? mirror::String::ValueOffset().Uint32Value()
       : mirror::Array::DataOffset(size).Uint32Value();
@@ -895,7 +1087,7 @@
         __ testb(Address(locations->InAt(0).AsRegister<CpuRegister>(), count_offset), Immediate(1));
         __ j(kNotZero, &not_compressed);
         // Zero extend 8 compressed bytes into 8 chars.
-        __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true));
+        __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt()));
         __ pxor(tmp, tmp);
         __ punpcklbw(reg, tmp);
         __ jmp(&done);
diff --git a/compiler/optimizing/code_sinking.cc b/compiler/optimizing/code_sinking.cc
index 6c3a9fd..b558eb1 100644
--- a/compiler/optimizing/code_sinking.cc
+++ b/compiler/optimizing/code_sinking.cc
@@ -64,6 +64,11 @@
     // A fence with "0" inputs is dead and should've been removed in a prior pass.
     DCHECK_NE(0u, ctor_fence->InputCount());
 
+    // TODO: this should be simplified to 'return true' since it's
+    // potentially pessimizing any code sinking for inlined constructors with final fields.
+    // TODO: double check that if the final field assignments are not moved,
+    // then the fence is not moved either.
+
     return ctor_fence->GetAssociatedAllocation() != nullptr;
   }
 
diff --git a/compiler/optimizing/constructor_fence_redundancy_elimination.cc b/compiler/optimizing/constructor_fence_redundancy_elimination.cc
new file mode 100644
index 0000000..ff7ce60
--- /dev/null
+++ b/compiler/optimizing/constructor_fence_redundancy_elimination.cc
@@ -0,0 +1,261 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "constructor_fence_redundancy_elimination.h"
+
+#include "base/arena_allocator.h"
+
+namespace art {
+
+static constexpr bool kCfreLogFenceInputCount = false;
+
+// TODO: refactor this code by reusing escape analysis.
+class CFREVisitor : public HGraphVisitor {
+ public:
+  CFREVisitor(HGraph* graph, OptimizingCompilerStats* stats)
+      : HGraphVisitor(graph),
+        scoped_allocator_(graph->GetArena()->GetArenaPool()),
+        candidate_fences_(scoped_allocator_.Adapter(kArenaAllocCFRE)),
+        candidate_fence_targets_(scoped_allocator_.Adapter(kArenaAllocCFRE)),
+        stats_(stats) {}
+
+  void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
+    // Visit all instructions in block.
+    HGraphVisitor::VisitBasicBlock(block);
+
+    // If there were any unmerged fences left, merge them together,
+    // the objects are considered 'published' at the end of the block.
+    MergeCandidateFences();
+  }
+
+  void VisitConstructorFence(HConstructorFence* constructor_fence) OVERRIDE {
+    candidate_fences_.push_back(constructor_fence);
+
+    for (size_t input_idx = 0; input_idx < constructor_fence->InputCount(); ++input_idx) {
+      candidate_fence_targets_.Insert(constructor_fence->InputAt(input_idx));
+    }
+  }
+
+  void VisitBoundType(HBoundType* bound_type) OVERRIDE {
+    VisitAlias(bound_type);
+  }
+
+  void VisitNullCheck(HNullCheck* null_check) OVERRIDE {
+    VisitAlias(null_check);
+  }
+
+  void VisitSelect(HSelect* select) OVERRIDE {
+    VisitAlias(select);
+  }
+
+  void VisitInstanceFieldSet(HInstanceFieldSet* instruction) OVERRIDE {
+    HInstruction* value = instruction->InputAt(1);
+    VisitSetLocation(instruction, value);
+  }
+
+  void VisitStaticFieldSet(HStaticFieldSet* instruction) OVERRIDE {
+    HInstruction* value = instruction->InputAt(1);
+    VisitSetLocation(instruction, value);
+  }
+
+  void VisitArraySet(HArraySet* instruction) OVERRIDE {
+    HInstruction* value = instruction->InputAt(2);
+    VisitSetLocation(instruction, value);
+  }
+
+  void VisitDeoptimize(HDeoptimize* instruction ATTRIBUTE_UNUSED) {
+    // Pessimize: Merge all fences.
+    MergeCandidateFences();
+  }
+
+  void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE {
+    HandleInvoke(invoke);
+  }
+
+  void VisitInvokeVirtual(HInvokeVirtual* invoke) OVERRIDE {
+    HandleInvoke(invoke);
+  }
+
+  void VisitInvokeInterface(HInvokeInterface* invoke) OVERRIDE {
+    HandleInvoke(invoke);
+  }
+
+  void VisitInvokeUnresolved(HInvokeUnresolved* invoke) OVERRIDE {
+    HandleInvoke(invoke);
+  }
+
+  void VisitInvokePolymorphic(HInvokePolymorphic* invoke) OVERRIDE {
+    HandleInvoke(invoke);
+  }
+
+  void VisitClinitCheck(HClinitCheck* clinit) OVERRIDE {
+    HandleInvoke(clinit);
+  }
+
+  void VisitUnresolvedInstanceFieldGet(HUnresolvedInstanceFieldGet* instruction) OVERRIDE {
+    // Conservatively treat it as an invocation.
+    HandleInvoke(instruction);
+  }
+
+  void VisitUnresolvedInstanceFieldSet(HUnresolvedInstanceFieldSet* instruction) OVERRIDE {
+    // Conservatively treat it as an invocation.
+    HandleInvoke(instruction);
+  }
+
+  void VisitUnresolvedStaticFieldGet(HUnresolvedStaticFieldGet* instruction) OVERRIDE {
+    // Conservatively treat it as an invocation.
+    HandleInvoke(instruction);
+  }
+
+  void VisitUnresolvedStaticFieldSet(HUnresolvedStaticFieldSet* instruction) OVERRIDE {
+    // Conservatively treat it as an invocation.
+    HandleInvoke(instruction);
+  }
+
+ private:
+  void HandleInvoke(HInstruction* invoke) {
+    // An object is considered "published" if it escapes into an invoke as any of the parameters.
+    if (HasInterestingPublishTargetAsInput(invoke)) {
+        MergeCandidateFences();
+    }
+  }
+
+  // Called by any instruction visitor that may create an alias.
+  // These instructions may create an alias:
+  // - BoundType
+  // - NullCheck
+  // - Select
+  //
+  // These also create an alias, but are not handled by this function:
+  // - Phi: propagates values across blocks, but we always merge at the end of a block.
+  // - Invoke: this is handled by HandleInvoke.
+  void VisitAlias(HInstruction* aliasing_inst) {
+    // An object is considered "published" if it becomes aliased by other instructions.
+    if (HasInterestingPublishTargetAsInput(aliasing_inst))  {
+      // Note that constructing a "NullCheck" for new-instance, new-array,
+      // or a 'this' (receiver) reference is impossible.
+      //
+      // If by some reason we actually encounter such a NullCheck(FenceTarget),
+      // we LOG(WARNING).
+      if (UNLIKELY(aliasing_inst->IsNullCheck())) {
+        LOG(kIsDebugBuild ? FATAL : WARNING)
+            << "Unexpected instruction: NullCheck; should not be legal in graph";
+        // We then do a best-effort to handle this case.
+      }
+      MergeCandidateFences();
+    }
+  }
+
+  void VisitSetLocation(HInstruction* inst ATTRIBUTE_UNUSED, HInstruction* store_input) {
+    // An object is considered "published" if it's stored onto the heap.
+    // Sidenote: A later "LSE" pass can still remove the fence if it proves the
+    // object doesn't actually escape.
+    if (IsInterestingPublishTarget(store_input)) {
+      // Merge all constructor fences that we've seen since
+      // the last interesting store (or since the beginning).
+      MergeCandidateFences();
+    }
+  }
+
+  bool HasInterestingPublishTargetAsInput(HInstruction* inst) {
+    for (size_t input_count = 0; input_count < inst->InputCount(); ++input_count) {
+      if (IsInterestingPublishTarget(inst->InputAt(input_count))) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  // Merges all the existing fences we've seen so far into the last-most fence.
+  //
+  // This resets the list of candidate fences and their targets back to {}.
+  void MergeCandidateFences() {
+    if (candidate_fences_.empty()) {
+      // Nothing to do, need 1+ fences to merge.
+      return;
+    }
+
+    // The merge target is always the "last" candidate fence.
+    HConstructorFence* merge_target = candidate_fences_[candidate_fences_.size() - 1];
+
+    for (HConstructorFence* fence : candidate_fences_) {
+      MaybeMerge(merge_target, fence);
+    }
+
+    if (kCfreLogFenceInputCount) {
+      LOG(INFO) << "CFRE-MergeCandidateFences: Post-merge fence input count "
+                << merge_target->InputCount();
+    }
+
+    // Each merge acts as a cut-off point. The optimization is reset completely.
+    // In theory, we could push the fence as far as its publish, but in practice
+    // there is no benefit to this extra complexity unless we also reordered
+    // the stores to come later.
+    candidate_fences_.clear();
+    candidate_fence_targets_.Clear();
+  }
+
+  // A publishing 'store' is only interesting if the value being stored
+  // is one of the fence `targets` in `candidate_fences`.
+  bool IsInterestingPublishTarget(HInstruction* store_input) const {
+    return candidate_fence_targets_.Find(store_input) != candidate_fence_targets_.end();
+  }
+
+  void MaybeMerge(HConstructorFence* target, HConstructorFence* src) {
+    if (target == src) {
+      return;  // Don't merge a fence into itself.
+      // This is mostly for stats-purposes, we don't want to count merge(x,x)
+      // as removing a fence because it's a no-op.
+    }
+
+    target->Merge(src);
+
+    MaybeRecordStat(stats_, MethodCompilationStat::kConstructorFenceRemovedCFRE);
+  }
+
+  // Phase-local heap memory allocator for CFRE optimizer. Storage obtained
+  // through this allocator is immediately released when the CFRE optimizer is done.
+  ArenaAllocator scoped_allocator_;
+
+  // Set of constructor fences that we've seen in the current block.
+  // Each constructor fence acts as a guard for one or more `targets`.
+  // There exist no stores to any `targets` between any of these fences.
+  //
+  // Fences are in succession order (e.g. fence[i] succeeds fence[i-1]
+  // within the same basic block).
+  ArenaVector<HConstructorFence*> candidate_fences_;
+
+  // Stores a set of the fence targets, to allow faster lookup of whether
+  // a detected publish is a target of one of the candidate fences.
+  ArenaHashSet<HInstruction*> candidate_fence_targets_;
+
+  // Used to record stats about the optimization.
+  OptimizingCompilerStats* const stats_;
+
+  DISALLOW_COPY_AND_ASSIGN(CFREVisitor);
+};
+
+void ConstructorFenceRedundancyElimination::Run() {
+  CFREVisitor cfre_visitor(graph_, stats_);
+
+  // Arbitrarily visit in reverse-post order.
+  // The exact block visit order does not matter, as the algorithm
+  // only operates on a single block at a time.
+  cfre_visitor.VisitReversePostOrder();
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/constructor_fence_redundancy_elimination.h b/compiler/optimizing/constructor_fence_redundancy_elimination.h
new file mode 100644
index 0000000..d89210c
--- /dev/null
+++ b/compiler/optimizing/constructor_fence_redundancy_elimination.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_CONSTRUCTOR_FENCE_REDUNDANCY_ELIMINATION_H_
+#define ART_COMPILER_OPTIMIZING_CONSTRUCTOR_FENCE_REDUNDANCY_ELIMINATION_H_
+
+#include "optimization.h"
+
+namespace art {
+
+/*
+ * Constructor Fence Redundancy Elimination (CFRE).
+ *
+ * A local optimization pass that merges redundant constructor fences
+ * together within the same basic block.
+ *
+ * Abbreviations:
+ * - CF: Constructor Fence
+ * - CFS: Constructor Fence Set
+ * - CFTargets: The unique set of the inputs of all the instructions in CFS.
+ *
+ * Given any CFS = { CF(x), CF(y), CF(z), ... }, define CFTargets = { x, y, z, ... }.
+ * - Publish(R) must not exist for any R in CFTargets if this Publish(R) is between any CF in CFS.
+ * - This type of Publish(R) is called an "interesting publish".
+ *
+ * A Publish(R) is considered any instruction at which the reference to "R"
+ * may escape (e.g. invoke, store, return, etc) to another thread.
+ *
+ * Starting at the beginning of the block:
+ * - Find the largest contiguous CFS.
+ * - If we see an interesting publish, merge all instructions in CFS into a single CF(CFTargets).
+ * - Repeat until the block is fully visited.
+ * - At the end of the block, merge all instructions in CFS into a single CF(CFTargets).
+ */
+class ConstructorFenceRedundancyElimination : public HOptimization {
+ public:
+  ConstructorFenceRedundancyElimination(HGraph* graph,
+                                        OptimizingCompilerStats* stats)
+      : HOptimization(graph, kPassName, stats) {}
+
+  void Run() OVERRIDE;
+
+  static constexpr const char* kPassName = "constructor_fence_redundancy_elimination";
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ConstructorFenceRedundancyElimination);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_CONSTRUCTOR_FENCE_REDUNDANCY_ELIMINATION_H_
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 089340e..191d3d1 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -670,6 +670,15 @@
       return AddValue(GetFetch(instruction->InputAt(0), trip, in_body, is_min),
                       Value(static_cast<int32_t>(value)));
     }
+  } else if (instruction->IsSub()) {
+    // Incorporate suitable constants in the chased value.
+    if (IsInt64AndGet(instruction->InputAt(0), &value) && CanLongValueFitIntoInt(value)) {
+      return SubValue(Value(static_cast<int32_t>(value)),
+                      GetFetch(instruction->InputAt(1), trip, in_body, !is_min));
+    } else if (IsInt64AndGet(instruction->InputAt(1), &value) && CanLongValueFitIntoInt(value)) {
+      return SubValue(GetFetch(instruction->InputAt(0), trip, in_body, is_min),
+                      Value(static_cast<int32_t>(value)));
+    }
   } else if (instruction->IsArrayLength()) {
     // Exploit length properties when chasing constants or chase into a new array declaration.
     if (chase_hint_ == nullptr) {
diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc
index 2b82b33..9437014 100644
--- a/compiler/optimizing/induction_var_range_test.cc
+++ b/compiler/optimizing/induction_var_range_test.cc
@@ -723,6 +723,29 @@
   ExpectEqual(Value(x_, 1, 0), GetMax(CreateFetch(array_length), nullptr));
 }
 
+TEST_F(InductionVarRangeTest, AddOrSubAndConstant) {
+  HInstruction* add = new (&allocator_)
+      HAdd(Primitive::kPrimInt, x_, graph_->GetIntConstant(-1));
+  HInstruction* alt = new (&allocator_)
+      HAdd(Primitive::kPrimInt, graph_->GetIntConstant(-1), x_);
+  HInstruction* sub = new (&allocator_)
+      HSub(Primitive::kPrimInt, x_, graph_->GetIntConstant(1));
+  HInstruction* rev = new (&allocator_)
+      HSub(Primitive::kPrimInt, graph_->GetIntConstant(1), x_);
+  entry_block_->AddInstruction(add);
+  entry_block_->AddInstruction(alt);
+  entry_block_->AddInstruction(sub);
+  entry_block_->AddInstruction(rev);
+  ExpectEqual(Value(x_, 1, -1), GetMin(CreateFetch(add), nullptr));
+  ExpectEqual(Value(x_, 1, -1), GetMax(CreateFetch(add), nullptr));
+  ExpectEqual(Value(x_, 1, -1), GetMin(CreateFetch(alt), nullptr));
+  ExpectEqual(Value(x_, 1, -1), GetMax(CreateFetch(alt), nullptr));
+  ExpectEqual(Value(x_, 1, -1), GetMin(CreateFetch(sub), nullptr));
+  ExpectEqual(Value(x_, 1, -1), GetMax(CreateFetch(sub), nullptr));
+  ExpectEqual(Value(x_, -1, 1), GetMin(CreateFetch(rev), nullptr));
+  ExpectEqual(Value(x_, -1, 1), GetMax(CreateFetch(rev), nullptr));
+}
+
 //
 // Tests on public methods.
 //
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 027ba77..e150b65 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -285,6 +285,19 @@
   return false;
 }
 
+// Translates operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
+  if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+    return HVecReduce::kSum;
+  } else if (reduction->IsVecMin()) {
+    return HVecReduce::kMin;
+  } else if (reduction->IsVecMax()) {
+    return HVecReduce::kMax;
+  }
+  LOG(FATAL) << "Unsupported SIMD reduction";
+  UNREACHABLE();
+}
+
 // Test vector restrictions.
 static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
   return (restrictions & tested) != 0;
@@ -318,8 +331,9 @@
 
 HLoopOptimization::HLoopOptimization(HGraph* graph,
                                      CompilerDriver* compiler_driver,
-                                     HInductionVarAnalysis* induction_analysis)
-    : HOptimization(graph, kLoopOptimizationPassName),
+                                     HInductionVarAnalysis* induction_analysis,
+                                     OptimizingCompilerStats* stats)
+    : HOptimization(graph, kLoopOptimizationPassName, stats),
       compiler_driver_(compiler_driver),
       induction_range_(induction_analysis),
       loop_allocator_(nullptr),
@@ -334,7 +348,8 @@
       vector_peeling_candidate_(nullptr),
       vector_runtime_test_a_(nullptr),
       vector_runtime_test_b_(nullptr),
-      vector_map_(nullptr) {
+      vector_map_(nullptr),
+      vector_permanent_map_(nullptr) {
 }
 
 void HLoopOptimization::Run() {
@@ -388,11 +403,14 @@
     ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     ArenaSafeMap<HInstruction*, HInstruction*> map(
         std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    ArenaSafeMap<HInstruction*, HInstruction*> perm(
+        std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     // Attach.
     iset_ = &iset;
     reductions_ = &reds;
     vector_refs_ = &refs;
     vector_map_ = &map;
+    vector_permanent_map_ = &perm;
     // Traverse.
     TraverseLoopsInnerToOuter(top_loop_);
     // Detach.
@@ -400,6 +418,7 @@
     reductions_ = nullptr;
     vector_refs_ = nullptr;
     vector_map_ = nullptr;
+    vector_permanent_map_ = nullptr;
   }
 }
 
@@ -603,11 +622,11 @@
   // Vectorize loop, if possible and valid.
   if (kEnableVectorization &&
       TrySetSimpleLoopHeader(header, &main_phi) &&
-      reductions_->empty() &&  // TODO: possible with some effort
       ShouldVectorize(node, body, trip_count) &&
       TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) {
     Vectorize(node, body, exit, trip_count);
     graph_->SetHasSIMD(true);  // flag SIMD usage
+    MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized);
     return true;
   }
   return false;
@@ -802,6 +821,13 @@
                     /*unroll*/ 1);
   }
 
+  // Link reductions to their final uses.
+  for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
+    if (i->first->IsPhi()) {
+      i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+    }
+  }
+
   // Remove the original loop by disconnecting the body block
   // and removing all instructions from the header.
   block->DisconnectAndDelete();
@@ -841,21 +867,10 @@
   vector_header_->AddInstruction(cond);
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
   vector_index_ = phi;
+  vector_permanent_map_->clear();  // preserved over unrolling
   for (uint32_t u = 0; u < unroll; u++) {
-    // Clear map, leaving loop invariants setup during unrolling.
-    if (u == 0) {
-      vector_map_->clear();
-    } else {
-      for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
-        if (i->second->IsVecReplicateScalar()) {
-          DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
-          ++i;
-        } else {
-          i = vector_map_->erase(i);
-        }
-      }
-    }
     // Generate instruction map.
+    vector_map_->clear();
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
       DCHECK(vectorized_def);
@@ -872,9 +887,17 @@
         }
       }
     }
+    // Generate the induction.
     vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
     Insert(vector_body_, vector_index_);
   }
+  // Finalize phi inputs for the reductions (if any).
+  for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
+    if (!i->first->IsPhi()) {
+      DCHECK(i->second->IsPhi());
+      GenerateVecReductionPhiInputs(i->second->AsPhi(), i->first);
+    }
+  }
   // Finalize phi inputs for the loop index.
   phi->AddInput(lo);
   phi->AddInput(vector_index_);
@@ -910,6 +933,23 @@
     }
     return false;
   }
+  // Accept a left-hand-side reduction for
+  // (1) supported vector type,
+  // (2) vectorizable right-hand-side value.
+  auto redit = reductions_->find(instruction);
+  if (redit != reductions_->end()) {
+    Primitive::Type type = instruction->GetType();
+    if (TrySetVectorType(type, &restrictions) &&
+        VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+      if (generate_code) {
+        HInstruction* new_red = vector_map_->Get(instruction);
+        vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
+        vector_permanent_map_->Overwrite(redit->second, new_red);
+      }
+      return true;
+    }
+    return false;
+  }
   // Branch back okay.
   if (instruction->IsGoto()) {
     return true;
@@ -965,6 +1005,21 @@
       }
       return true;
     }
+  } else if (instruction->IsPhi()) {
+    // Accept particular phi operations.
+    if (reductions_->find(instruction) != reductions_->end()) {
+      // Deal with vector restrictions.
+      if (HasVectorRestrictions(restrictions, kNoReduction)) {
+        return false;
+      }
+      // Accept a reduction.
+      if (generate_code) {
+        GenerateVecReductionPhi(instruction->AsPhi());
+      }
+      return true;
+    }
+    // TODO: accept right-hand-side induction?
+    return false;
   } else if (instruction->IsTypeConversion()) {
     // Accept particular type conversions.
     HTypeConversion* conversion = instruction->AsTypeConversion();
@@ -1155,14 +1210,14 @@
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(8);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoStringCharAt;
+          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
           return TrySetVectorLength(4);
         case Primitive::kPrimInt:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(2);
         default:
           break;
@@ -1174,11 +1229,11 @@
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(16);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(8);
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
@@ -1187,8 +1242,10 @@
           *restrictions |= kNoDiv | kNoMul | kNoMinMax;
           return TrySetVectorLength(2);
         case Primitive::kPrimFloat:
+          *restrictions |= kNoReduction;
           return TrySetVectorLength(4);
         case Primitive::kPrimDouble:
+          *restrictions |= kNoReduction;
           return TrySetVectorLength(2);
         default:
           return false;
@@ -1200,11 +1257,12 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
+            *restrictions |=
+                kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
+            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
             *restrictions |= kNoDiv;
@@ -1213,10 +1271,10 @@
             *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
-            *restrictions |= kNoMinMax;  // -0.0 vs +0.0
+            *restrictions |= kNoMinMax | kNoReduction;  // minmax: -0.0 vs +0.0
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
-            *restrictions |= kNoMinMax;  // -0.0 vs +0.0
+            *restrictions |= kNoMinMax | kNoReduction;  // minmax: -0.0 vs +0.0
             return TrySetVectorLength(2);
           default:
             break;
@@ -1228,23 +1286,23 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoStringCharAt;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(2);
           default:
             break;
@@ -1256,23 +1314,23 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoStringCharAt;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(2);
           default:
             break;
@@ -1305,9 +1363,16 @@
       return;
     }
     // In vector code, explicit scalar expansion is needed.
-    HInstruction* vector = new (global_allocator_) HVecReplicateScalar(
-        global_allocator_, org, type, vector_length_);
-    vector_map_->Put(org, Insert(vector_preheader_, vector));
+    HInstruction* vector = nullptr;
+    auto it = vector_permanent_map_->find(org);
+    if (it != vector_permanent_map_->end()) {
+      vector = it->second;  // reuse during unrolling
+    } else {
+      vector = new (global_allocator_) HVecReplicateScalar(
+          global_allocator_, org, type, vector_length_);
+      vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+    }
+    vector_map_->Put(org, vector);
   }
 }
 
@@ -1362,6 +1427,78 @@
   vector_map_->Put(org, vector);
 }
 
+void HLoopOptimization::GenerateVecReductionPhi(HPhi* phi) {
+  DCHECK(reductions_->find(phi) != reductions_->end());
+  DCHECK(reductions_->Get(phi->InputAt(1)) == phi);
+  HInstruction* vector = nullptr;
+  if (vector_mode_ == kSequential) {
+    HPhi* new_phi = new (global_allocator_) HPhi(
+        global_allocator_, kNoRegNumber, 0, phi->GetType());
+    vector_header_->AddPhi(new_phi);
+    vector = new_phi;
+  } else {
+    // Link vector reduction back to prior unrolled update, or a first phi.
+    auto it = vector_permanent_map_->find(phi);
+    if (it != vector_permanent_map_->end()) {
+      vector = it->second;
+    } else {
+      HPhi* new_phi = new (global_allocator_) HPhi(
+          global_allocator_, kNoRegNumber, 0, HVecOperation::kSIMDType);
+      vector_header_->AddPhi(new_phi);
+      vector = new_phi;
+    }
+  }
+  vector_map_->Put(phi, vector);
+}
+
+void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction) {
+  HInstruction* new_phi = vector_map_->Get(phi);
+  HInstruction* new_init = reductions_->Get(phi);
+  HInstruction* new_red = vector_map_->Get(reduction);
+  // Link unrolled vector loop back to new phi.
+  for (; !new_phi->IsPhi(); new_phi = vector_permanent_map_->Get(new_phi)) {
+    DCHECK(new_phi->IsVecOperation());
+  }
+  // Prepare the new initialization.
+  if (vector_mode_ == kVector) {
+    // Generate a [initial, 0, .., 0] vector.
+    new_init = Insert(
+            vector_preheader_,
+            new (global_allocator_) HVecSetScalars(
+                global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+  } else {
+    new_init = ReduceAndExtractIfNeeded(new_init);
+  }
+  // Set the phi inputs.
+  DCHECK(new_phi->IsPhi());
+  new_phi->AsPhi()->AddInput(new_init);
+  new_phi->AsPhi()->AddInput(new_red);
+  // New feed value for next phi (safe mutation in iteration).
+  reductions_->find(phi)->second = new_phi;
+}
+
+HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruction) {
+  if (instruction->IsPhi()) {
+    HInstruction* input = instruction->InputAt(1);
+    if (input->IsVecOperation()) {
+      Primitive::Type type = input->AsVecOperation()->GetPackedType();
+      HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
+      // Generate a vector reduction and scalar extract
+      //    x = REDUCE( [x_1, .., x_n] )
+      //    y = x_1
+      // along the exit of the defining loop.
+      HVecReduce::ReductionKind kind = GetReductionKind(input);
+      HInstruction* reduce = new (global_allocator_) HVecReduce(
+          global_allocator_, instruction, type, vector_length_, kind);
+      exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
+      instruction = new (global_allocator_) HVecExtractScalar(
+          global_allocator_, reduce, type, vector_length_, 0);
+      exit->InsertInstructionAfter(instruction, reduce);
+    }
+  }
+  return instruction;
+}
+
 #define GENERATE_VEC(x, y) \
   if (vector_mode_ == kVector) { \
     vector = (x); \
@@ -1542,10 +1679,9 @@
   // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1
   // (note whether the sign bit in wider precision is shifted in has no effect
   // on the narrow precision computed by the idiom).
-  int64_t distance = 0;
   if ((instruction->IsShr() ||
        instruction->IsUShr()) &&
-      IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) {
+      IsInt64Value(instruction->InputAt(1), 1)) {
     // Test for (a + b + c) >> 1 for optional constant c.
     HInstruction* a = nullptr;
     HInstruction* b = nullptr;
@@ -1590,6 +1726,7 @@
                 vector_length_,
                 is_unsigned,
                 is_rounded));
+            MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
           } else {
             GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type);
           }
@@ -1624,21 +1761,33 @@
   vector_peeling_candidate_ = candidate;
 }
 
+static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
+static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
+
 uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
-  // counts and small loop bodies.
-  // TODO: refine with operation count, remaining iterations, etc.
-  //       Artem had some really cool ideas for this already.
   switch (compiler_driver_->GetInstructionSet()) {
-    case kArm64:
-    case kX86:
-    case kX86_64: {
-      size_t num_instructions = block->GetInstructions().CountSize();
-      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
-        return 2;
+    case kArm64: {
+      DCHECK_NE(vector_length_, 0u);
+      // TODO: Unroll loops with unknown trip count.
+      if (trip_count < 2 * vector_length_) {
+        return 1;
       }
-      return 1;
+
+      uint32_t instruction_count = block->GetInstructions().CountSize();
+
+      // Find a beneficial unroll factor with the following restrictions:
+      //  - At least one iteration of the transformed loop should be executed.
+      //  - The loop body shouldn't be "too big" (heuristic).
+      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
+      uint32_t uf2 = trip_count / vector_length_;
+      uint32_t unroll_factor =
+          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
+      DCHECK_GE(unroll_factor, 1u);
+
+      return unroll_factor;
     }
+    case kX86:
+    case kX86_64:
     default:
       return 1;
   }
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 49be8a3..f347518 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -34,7 +34,8 @@
  public:
   HLoopOptimization(HGraph* graph,
                     CompilerDriver* compiler_driver,
-                    HInductionVarAnalysis* induction_analysis);
+                    HInductionVarAnalysis* induction_analysis,
+                    OptimizingCompilerStats* stats);
 
   void Run() OVERRIDE;
 
@@ -62,17 +63,18 @@
    * Vectorization restrictions (bit mask).
    */
   enum VectorRestrictions {
-    kNone            = 0,    // no restrictions
-    kNoMul           = 1,    // no multiplication
-    kNoDiv           = 2,    // no division
-    kNoShift         = 4,    // no shift
-    kNoShr           = 8,    // no arithmetic shift right
-    kNoHiBits        = 16,   // "wider" operations cannot bring in higher order bits
-    kNoSignedHAdd    = 32,   // no signed halving add
-    kNoUnroundedHAdd = 64,   // no unrounded halving add
-    kNoAbs           = 128,  // no absolute value
-    kNoMinMax        = 256,  // no min/max
-    kNoStringCharAt  = 512,  // no StringCharAt
+    kNone            = 0,        // no restrictions
+    kNoMul           = 1 << 0,   // no multiplication
+    kNoDiv           = 1 << 1,   // no division
+    kNoShift         = 1 << 2,   // no shift
+    kNoShr           = 1 << 3,   // no arithmetic shift right
+    kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
+    kNoSignedHAdd    = 1 << 5,   // no signed halving add
+    kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
+    kNoAbs           = 1 << 7,   // no absolute value
+    kNoMinMax        = 1 << 8,   // no min/max
+    kNoStringCharAt  = 1 << 9,   // no StringCharAt
+    kNoReduction     = 1 << 10,  // no reduction
   };
 
   /*
@@ -155,6 +157,9 @@
                       HInstruction* opb,
                       HInstruction* offset,
                       Primitive::Type type);
+  void GenerateVecReductionPhi(HPhi* phi);
+  void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
+  HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
   void GenerateVecOp(HInstruction* org,
                      HInstruction* opa,
                      HInstruction* opb,
@@ -253,6 +258,10 @@
   // Contents reside in phase-local heap memory.
   ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
 
+  // Permanent mapping used during vectorization synthesis.
+  // Contents reside in phase-local heap memory.
+  ArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
+
   // Temporary vectorization bookkeeping.
   VectorMode vector_mode_;  // synthesis mode
   HBasicBlock* vector_preheader_;  // preheader of the new loop
diff --git a/compiler/optimizing/loop_optimization_test.cc b/compiler/optimizing/loop_optimization_test.cc
index b5b03d8..1c5603d 100644
--- a/compiler/optimizing/loop_optimization_test.cc
+++ b/compiler/optimizing/loop_optimization_test.cc
@@ -31,7 +31,7 @@
         allocator_(&pool_),
         graph_(CreateGraph(&allocator_)),
         iva_(new (&allocator_) HInductionVarAnalysis(graph_)),
-        loop_opt_(new (&allocator_) HLoopOptimization(graph_, nullptr, iva_)) {
+        loop_opt_(new (&allocator_) HLoopOptimization(graph_, nullptr, iva_, nullptr)) {
     BuildGraph();
   }
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 2a7017c..9cff6b0 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1289,18 +1289,59 @@
   return remove_count;
 }
 
-HInstruction* HConstructorFence::GetAssociatedAllocation() {
+void HConstructorFence::Merge(HConstructorFence* other) {
+  // Do not delete yourself from the graph.
+  DCHECK(this != other);
+  // Don't try to merge with an instruction not associated with a block.
+  DCHECK(other->GetBlock() != nullptr);
+  // A constructor fence's return type is "kPrimVoid"
+  // and therefore it cannot have any environment uses.
+  DCHECK(!other->HasEnvironmentUses());
+
+  auto has_input = [](HInstruction* haystack, HInstruction* needle) {
+    // Check if `haystack` has `needle` as any of its inputs.
+    for (size_t input_count = 0; input_count < haystack->InputCount(); ++input_count) {
+      if (haystack->InputAt(input_count) == needle) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // Add any inputs from `other` into `this` if it wasn't already an input.
+  for (size_t input_count = 0; input_count < other->InputCount(); ++input_count) {
+    HInstruction* other_input = other->InputAt(input_count);
+    if (!has_input(this, other_input)) {
+      AddInput(other_input);
+    }
+  }
+
+  other->GetBlock()->RemoveInstruction(other);
+}
+
+HInstruction* HConstructorFence::GetAssociatedAllocation(bool ignore_inputs) {
   HInstruction* new_instance_inst = GetPrevious();
   // Check if the immediately preceding instruction is a new-instance/new-array.
   // Otherwise this fence is for protecting final fields.
   if (new_instance_inst != nullptr &&
       (new_instance_inst->IsNewInstance() || new_instance_inst->IsNewArray())) {
-    // TODO: Need to update this code to handle multiple inputs.
-    DCHECK_EQ(InputCount(), 1u);
-    return new_instance_inst;
-  } else {
-    return nullptr;
+    if (ignore_inputs) {
+      // If inputs are ignored, simply check if the predecessor is
+      // *any* HNewInstance/HNewArray.
+      //
+      // Inputs are normally only ignored for prepare_for_register_allocation,
+      // at which point *any* prior HNewInstance/Array can be considered
+      // associated.
+      return new_instance_inst;
+    } else {
+      // Normal case: There must be exactly 1 input and the previous instruction
+      // must be that input.
+      if (InputCount() == 1u && InputAt(0) == new_instance_inst) {
+        return new_instance_inst;
+      }
+    }
   }
+  return nullptr;
 }
 
 #define DEFINE_ACCEPT(name, super)                                             \
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index b551f37..a6d0da1 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1374,7 +1374,8 @@
   M(UShr, BinaryOperation)                                              \
   M(Xor, BinaryOperation)                                               \
   M(VecReplicateScalar, VecUnaryOperation)                              \
-  M(VecSumReduce, VecUnaryOperation)                                    \
+  M(VecExtractScalar, VecUnaryOperation)                                \
+  M(VecReduce, VecUnaryOperation)                                       \
   M(VecCnv, VecUnaryOperation)                                          \
   M(VecNeg, VecUnaryOperation)                                          \
   M(VecAbs, VecUnaryOperation)                                          \
@@ -6648,13 +6649,24 @@
   // Returns how many HConstructorFence instructions were removed from graph.
   static size_t RemoveConstructorFences(HInstruction* instruction);
 
+  // Combine all inputs of `this` and `other` instruction and remove
+  // `other` from the graph.
+  //
+  // Inputs are unique after the merge.
+  //
+  // Requirement: `this` must not be the same as `other`.
+  void Merge(HConstructorFence* other);
+
   // Check if this constructor fence is protecting
   // an HNewInstance or HNewArray that is also the immediate
   // predecessor of `this`.
   //
+  // If `ignore_inputs` is true, then the immediate predecessor doesn't need
+  // to be one of the inputs of `this`.
+  //
   // Returns the associated HNewArray or HNewInstance,
   // or null otherwise.
-  HInstruction* GetAssociatedAllocation();
+  HInstruction* GetAssociatedAllocation(bool ignore_inputs = false);
 
   DECLARE_INSTRUCTION(ConstructorFence);
 
@@ -7042,6 +7054,17 @@
   return false;
 }
 
+// Returns true iff instruction is the given integral constant.
+inline bool IsInt64Value(HInstruction* instruction, int64_t value) {
+  int64_t val = 0;
+  return IsInt64AndGet(instruction, &val) && val == value;
+}
+
+// Returns true iff instruction is a zero bit pattern.
+inline bool IsZeroBitPattern(HInstruction* instruction) {
+  return instruction->IsConstant() && instruction->AsConstant()->IsZeroBitPattern();
+}
+
 #define INSTRUCTION_TYPE_CHECK(type, super)                                    \
   inline bool HInstruction::Is##type() const { return GetKind() == k##type; }  \
   inline const H##type* HInstruction::As##type() const {                       \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 6261171..886d75e 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -63,6 +63,10 @@
 // GetVectorLength() x GetPackedType() operations simultaneously.
 class HVecOperation : public HVariableInputSizeInstruction {
  public:
+  // A SIMD operation looks like a FPU location.
+  // TODO: we could introduce SIMD types in HIR.
+  static constexpr Primitive::Type kSIMDType = Primitive::kPrimDouble;
+
   HVecOperation(ArenaAllocator* arena,
                 Primitive::Type packed_type,
                 SideEffects side_effects,
@@ -89,10 +93,9 @@
     return vector_length_ * Primitive::ComponentSize(GetPackedType());
   }
 
-  // Returns the type of the vector operation: a SIMD operation looks like a FPU location.
-  // TODO: we could introduce SIMD types in HIR.
+  // Returns the type of the vector operation.
   Primitive::Type GetType() const OVERRIDE {
-    return Primitive::kPrimDouble;
+    return kSIMDType;
   }
 
   // Returns the true component type packed in a vector.
@@ -220,8 +223,11 @@
   DISALLOW_COPY_AND_ASSIGN(HVecMemoryOperation);
 };
 
-// Packed type consistency checker (same vector length integral types may mix freely).
+// Packed type consistency checker ("same vector length" integral types may mix freely).
 inline static bool HasConsistentPackedTypes(HInstruction* input, Primitive::Type type) {
+  if (input->IsPhi()) {
+    return input->GetType() == HVecOperation::kSIMDType;  // carries SIMD
+  }
   DCHECK(input->IsVecOperation());
   Primitive::Type input_type = input->AsVecOperation()->GetPackedType();
   switch (input_type) {
@@ -265,27 +271,77 @@
   DISALLOW_COPY_AND_ASSIGN(HVecReplicateScalar);
 };
 
-// Sum-reduces the given vector into a shorter vector (m < n) or scalar (m = 1),
-// viz. sum-reduce[ x1, .. , xn ] = [ y1, .., ym ], where yi = sum_j x_j.
-class HVecSumReduce FINAL : public HVecUnaryOperation {
-  HVecSumReduce(ArenaAllocator* arena,
-                HInstruction* input,
-                Primitive::Type packed_type,
-                size_t vector_length,
-                uint32_t dex_pc = kNoDexPc)
+// Extracts a particular scalar from the given vector,
+// viz. extract[ x1, .. , xn ] = x_i.
+//
+// TODO: for now only i == 1 case supported.
+class HVecExtractScalar FINAL : public HVecUnaryOperation {
+ public:
+  HVecExtractScalar(ArenaAllocator* arena,
+                    HInstruction* input,
+                    Primitive::Type packed_type,
+                    size_t vector_length,
+                    size_t index,
+                    uint32_t dex_pc = kNoDexPc)
       : HVecUnaryOperation(arena, input, packed_type, vector_length, dex_pc) {
     DCHECK(HasConsistentPackedTypes(input, packed_type));
+    DCHECK_LT(index, vector_length);
+    DCHECK_EQ(index, 0u);
+  }
+
+  // Yields a single component in the vector.
+  Primitive::Type GetType() const OVERRIDE {
+    return GetPackedType();
+  }
+
+  // An extract needs to stay in place, since SIMD registers are not
+  // kept alive across vector loop boundaries (yet).
+  bool CanBeMoved() const OVERRIDE { return false; }
+
+  DECLARE_INSTRUCTION(VecExtractScalar);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecExtractScalar);
+};
+
+// Reduces the given vector into the first element as sum/min/max,
+// viz. sum-reduce[ x1, .. , xn ] = [ y, ---- ], where y = sum xi
+// and the "-" denotes "don't care" (implementation dependent).
+class HVecReduce FINAL : public HVecUnaryOperation {
+ public:
+  enum ReductionKind {
+    kSum = 1,
+    kMin = 2,
+    kMax = 3
+  };
+
+  HVecReduce(ArenaAllocator* arena,
+             HInstruction* input,
+             Primitive::Type packed_type,
+             size_t vector_length,
+             ReductionKind kind,
+             uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, input, packed_type, vector_length, dex_pc),
+        kind_(kind) {
+    DCHECK(HasConsistentPackedTypes(input, packed_type));
   }
 
-  // TODO: probably integral promotion
-  Primitive::Type GetType() const OVERRIDE { return GetPackedType(); }
+  ReductionKind GetKind() const { return kind_; }
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  DECLARE_INSTRUCTION(VecSumReduce);
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    DCHECK(other->IsVecReduce());
+    const HVecReduce* o = other->AsVecReduce();
+    return HVecOperation::InstructionDataEquals(o) && GetKind() == o->GetKind();
+  }
+
+  DECLARE_INSTRUCTION(VecReduce);
 
  private:
-  DISALLOW_COPY_AND_ASSIGN(HVecSumReduce);
+  const ReductionKind kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HVecReduce);
 };
 
 // Converts every component in the vector,
@@ -754,20 +810,23 @@
 //
 
 // Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ].
+// viz. set( array(x1, .., xn) ) = [ x1, .. ,           xn ] if n == m,
+//      set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m <  n.
 class HVecSetScalars FINAL : public HVecOperation {
+ public:
   HVecSetScalars(ArenaAllocator* arena,
                  HInstruction** scalars,  // array
                  Primitive::Type packed_type,
                  size_t vector_length,
+                 size_t number_of_scalars,
                  uint32_t dex_pc = kNoDexPc)
       : HVecOperation(arena,
                       packed_type,
                       SideEffects::None(),
-                      /* number_of_inputs */ vector_length,
+                      number_of_scalars,
                       vector_length,
                       dex_pc) {
-    for (size_t i = 0; i < vector_length; i++) {
+    for (size_t i = 0; i < number_of_scalars; i++) {
       DCHECK(!scalars[i]->IsVecOperation());
       SetRawInputAt(0, scalars[i]);
     }
diff --git a/compiler/optimizing/nodes_vector_test.cc b/compiler/optimizing/nodes_vector_test.cc
index 0238ea4..5a56a2c 100644
--- a/compiler/optimizing/nodes_vector_test.cc
+++ b/compiler/optimizing/nodes_vector_test.cc
@@ -332,4 +332,32 @@
   EXPECT_FALSE(v1->Equals(v3));  // different vector lengths
 }
 
+TEST_F(NodesVectorTest, VectorKindMattersOnReduce) {
+  HVecOperation* v0 = new (&allocator_)
+      HVecReplicateScalar(&allocator_, parameter_, Primitive::kPrimInt, 4);
+
+  HVecReduce* v1 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kSum);
+  HVecReduce* v2 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kMin);
+  HVecReduce* v3 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kMax);
+
+  EXPECT_FALSE(v0->CanBeMoved());
+  EXPECT_TRUE(v1->CanBeMoved());
+  EXPECT_TRUE(v2->CanBeMoved());
+  EXPECT_TRUE(v3->CanBeMoved());
+
+  EXPECT_EQ(HVecReduce::kSum, v1->GetKind());
+  EXPECT_EQ(HVecReduce::kMin, v2->GetKind());
+  EXPECT_EQ(HVecReduce::kMax, v3->GetKind());
+
+  EXPECT_TRUE(v1->Equals(v1));
+  EXPECT_TRUE(v2->Equals(v2));
+  EXPECT_TRUE(v3->Equals(v3));
+
+  EXPECT_FALSE(v1->Equals(v2));  // different kinds
+  EXPECT_FALSE(v1->Equals(v3));
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index e98c97c..399cd98 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -53,6 +53,7 @@
 #include "compiled_method.h"
 #include "compiler.h"
 #include "constant_folding.h"
+#include "constructor_fence_redundancy_elimination.h"
 #include "dead_code_elimination.h"
 #include "debug/elf_debug_writer.h"
 #include "debug/method_debug_info.h"
@@ -509,11 +510,13 @@
   } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
     return new (arena) SideEffectsAnalysis(graph);
   } else if (opt_name == HLoopOptimization::kLoopOptimizationPassName) {
-    return new (arena) HLoopOptimization(graph, driver, most_recent_induction);
+    return new (arena) HLoopOptimization(graph, driver, most_recent_induction, stats);
   } else if (opt_name == CHAGuardOptimization::kCHAGuardOptimizationPassName) {
     return new (arena) CHAGuardOptimization(graph);
   } else if (opt_name == CodeSinking::kCodeSinkingPassName) {
     return new (arena) CodeSinking(graph, stats);
+  } else if (opt_name == ConstructorFenceRedundancyElimination::kPassName) {
+    return new (arena) ConstructorFenceRedundancyElimination(graph, stats);
 #ifdef ART_ENABLE_CODEGEN_arm
   } else if (opt_name == arm::InstructionSimplifierArm::kInstructionSimplifierArmPassName) {
     return new (arena) arm::InstructionSimplifierArm(graph, stats);
@@ -770,7 +773,7 @@
   LICM* licm = new (arena) LICM(graph, *side_effects1, stats);
   HInductionVarAnalysis* induction = new (arena) HInductionVarAnalysis(graph);
   BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects1, induction);
-  HLoopOptimization* loop = new (arena) HLoopOptimization(graph, driver, induction);
+  HLoopOptimization* loop = new (arena) HLoopOptimization(graph, driver, induction, stats);
   LoadStoreAnalysis* lsa = new (arena) LoadStoreAnalysis(graph);
   LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2, *lsa, stats);
   HSharpening* sharpening = new (arena) HSharpening(
@@ -784,6 +787,8 @@
   IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, stats);
   CHAGuardOptimization* cha_guard = new (arena) CHAGuardOptimization(graph);
   CodeSinking* code_sinking = new (arena) CodeSinking(graph, stats);
+  ConstructorFenceRedundancyElimination* cfre =
+      new (arena) ConstructorFenceRedundancyElimination(graph, stats);
 
   HOptimization* optimizations1[] = {
     intrinsics,
@@ -821,6 +826,8 @@
     // can satisfy. For example, the code generator does not expect to see a
     // HTypeConversion from a type to the same type.
     simplify4,
+    cfre,  // Eliminate constructor fences after code sinking to avoid
+           // complicated sinking logic to split a fence with many inputs.
   };
   RunOptimizations(optimizations2, arraysize(optimizations2), pass_observer);
 
diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h
index d6da73c..07f9635 100644
--- a/compiler/optimizing/optimizing_compiler_stats.h
+++ b/compiler/optimizing/optimizing_compiler_stats.h
@@ -63,6 +63,8 @@
   kBooleanSimplified,
   kIntrinsicRecognized,
   kLoopInvariantMoved,
+  kLoopVectorized,
+  kLoopVectorizedIdiom,
   kSelectGenerated,
   kRemovedInstanceOf,
   kInlinedInvokeVirtualOrInterface,
@@ -91,6 +93,7 @@
   kConstructorFenceGeneratedFinal,
   kConstructorFenceRemovedLSE,
   kConstructorFenceRemovedPFRA,
+  kConstructorFenceRemovedCFRE,
   kLastStat
 };
 
@@ -183,6 +186,8 @@
       case kBooleanSimplified : name = "BooleanSimplified"; break;
       case kIntrinsicRecognized : name = "IntrinsicRecognized"; break;
       case kLoopInvariantMoved : name = "LoopInvariantMoved"; break;
+      case kLoopVectorized : name = "LoopVectorized"; break;
+      case kLoopVectorizedIdiom : name = "LoopVectorizedIdiom"; break;
       case kSelectGenerated : name = "SelectGenerated"; break;
       case kRemovedInstanceOf: name = "RemovedInstanceOf"; break;
       case kInlinedInvokeVirtualOrInterface: name = "InlinedInvokeVirtualOrInterface"; break;
@@ -211,6 +216,7 @@
       case kConstructorFenceGeneratedFinal: name = "ConstructorFenceGeneratedFinal"; break;
       case kConstructorFenceRemovedLSE: name = "ConstructorFenceRemovedLSE"; break;
       case kConstructorFenceRemovedPFRA: name = "ConstructorFenceRemovedPFRA"; break;
+      case kConstructorFenceRemovedCFRE: name = "ConstructorFenceRemovedCFRE"; break;
 
       case kLastStat:
         LOG(FATAL) << "invalid stat "
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
index 510619f..1d9d28a 100644
--- a/compiler/optimizing/scheduler_arm64.cc
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -215,12 +215,12 @@
   last_visited_latency_ = kArm64SIMDReplicateOpLatency;
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecExtractScalar(HVecExtractScalar* instr) {
+  HandleSimpleArithmeticSIMD(instr);
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecSumReduce(HVecSumReduce* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecReduce(HVecReduce* instr) {
+  HandleSimpleArithmeticSIMD(instr);
 }
 
 void SchedulingLatencyVisitorARM64::VisitVecCnv(HVecCnv* instr ATTRIBUTE_UNUSED) {
@@ -283,8 +283,8 @@
   last_visited_latency_ = kArm64SIMDIntegerOpLatency;
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
 }
 
 void SchedulingLatencyVisitorARM64::VisitVecOr(HVecOr* instr ATTRIBUTE_UNUSED) {
@@ -307,6 +307,10 @@
   HandleSimpleArithmeticSIMD(instr);
 }
 
+void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
 void SchedulingLatencyVisitorARM64::VisitVecMultiplyAccumulate(
     HVecMultiplyAccumulate* instr ATTRIBUTE_UNUSED) {
   last_visited_latency_ = kArm64SIMDMulIntegerLatency;
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 63d5b7d..e1a80ec 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -83,8 +83,8 @@
   M(SuspendCheck         , unused)                   \
   M(TypeConversion       , unused)                   \
   M(VecReplicateScalar   , unused)                   \
-  M(VecSetScalars        , unused)                   \
-  M(VecSumReduce         , unused)                   \
+  M(VecExtractScalar     , unused)                   \
+  M(VecReduce            , unused)                   \
   M(VecCnv               , unused)                   \
   M(VecNeg               , unused)                   \
   M(VecAbs               , unused)                   \
@@ -103,6 +103,7 @@
   M(VecShl               , unused)                   \
   M(VecShr               , unused)                   \
   M(VecUShr              , unused)                   \
+  M(VecSetScalars        , unused)                   \
   M(VecMultiplyAccumulate, unused)                   \
   M(VecLoad              , unused)                   \
   M(VecStore             , unused)
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 4c8fb68..dd6dcd1 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -86,7 +86,7 @@
   int32_t y = High32Bits(value);
 
   if (x == y) {
-    return (IsUint<16>(x) || IsInt<16>(x) || ((x & 0xFFFF) == 0 && IsInt<16>(value >> 16))) ? 2 : 3;
+    return (IsUint<16>(x) || IsInt<16>(x) || ((x & 0xFFFF) == 0)) ? 2 : 3;
   }
 
   return INT_MAX;
diff --git a/openjdkjvmti/OpenjdkJvmTi.cc b/openjdkjvmti/OpenjdkJvmTi.cc
index 6c0d492..277f611 100644
--- a/openjdkjvmti/OpenjdkJvmTi.cc
+++ b/openjdkjvmti/OpenjdkJvmTi.cc
@@ -810,11 +810,11 @@
   }
 
   static jvmtiError GetObjectMonitorUsage(jvmtiEnv* env,
-                                          jobject object ATTRIBUTE_UNUSED,
-                                          jvmtiMonitorUsage* info_ptr ATTRIBUTE_UNUSED) {
+                                          jobject object,
+                                          jvmtiMonitorUsage* info_ptr) {
     ENSURE_VALID_ENV(env);
     ENSURE_HAS_CAP(env, can_get_monitor_info);
-    return ERR(NOT_IMPLEMENTED);
+    return ObjectUtil::GetObjectMonitorUsage(env, object, info_ptr);
   }
 
   static jvmtiError GetFieldName(jvmtiEnv* env,
diff --git a/openjdkjvmti/art_jvmti.h b/openjdkjvmti/art_jvmti.h
index 93eee28..d3f52f6 100644
--- a/openjdkjvmti/art_jvmti.h
+++ b/openjdkjvmti/art_jvmti.h
@@ -226,7 +226,7 @@
     .can_get_synthetic_attribute                     = 1,
     .can_get_owned_monitor_info                      = 1,
     .can_get_current_contended_monitor               = 0,
-    .can_get_monitor_info                            = 0,
+    .can_get_monitor_info                            = 1,
     .can_pop_frame                                   = 0,
     .can_redefine_classes                            = 1,
     .can_signal_thread                               = 0,
diff --git a/openjdkjvmti/ti_object.cc b/openjdkjvmti/ti_object.cc
index 2506aca..89ce352 100644
--- a/openjdkjvmti/ti_object.cc
+++ b/openjdkjvmti/ti_object.cc
@@ -35,6 +35,8 @@
 #include "mirror/object-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread-current-inl.h"
+#include "thread_list.h"
+#include "ti_thread.h"
 
 namespace openjdkjvmti {
 
@@ -73,4 +75,59 @@
   return ERR(NONE);
 }
 
+jvmtiError ObjectUtil::GetObjectMonitorUsage(
+    jvmtiEnv* baseenv, jobject obj, jvmtiMonitorUsage* usage) {
+  ArtJvmTiEnv* env = ArtJvmTiEnv::AsArtJvmTiEnv(baseenv);
+  if (obj == nullptr) {
+    return ERR(INVALID_OBJECT);
+  }
+  if (usage == nullptr) {
+    return ERR(NULL_POINTER);
+  }
+  art::Thread* self = art::Thread::Current();
+  ThreadUtil::SuspendCheck(self);
+  art::JNIEnvExt* jni = self->GetJniEnv();
+  std::vector<jthread> wait;
+  std::vector<jthread> notify_wait;
+  {
+    art::ScopedObjectAccess soa(self);      // Now we know we have the shared lock.
+    art::ScopedThreadSuspension sts(self, art::kNative);
+    art::ScopedSuspendAll ssa("GetObjectMonitorUsage", /*long_suspend*/false);
+    art::ObjPtr<art::mirror::Object> target(self->DecodeJObject(obj));
+    // This gets the list of threads trying to lock or wait on the monitor.
+    art::MonitorInfo info(target.Ptr());
+    usage->owner = info.owner_ != nullptr ?
+        jni->AddLocalReference<jthread>(info.owner_->GetPeerFromOtherThread()) : nullptr;
+    usage->entry_count = info.entry_count_;
+    for (art::Thread* thd : info.waiters_) {
+      // RI seems to consider waiting for notify to be included in those waiting to acquire the
+      // monitor. We will match this behavior.
+      notify_wait.push_back(jni->AddLocalReference<jthread>(thd->GetPeerFromOtherThread()));
+      wait.push_back(jni->AddLocalReference<jthread>(thd->GetPeerFromOtherThread()));
+    }
+    {
+      // Scan all threads to see which are waiting on this particular monitor.
+      art::MutexLock tll(self, *art::Locks::thread_list_lock_);
+      for (art::Thread* thd : art::Runtime::Current()->GetThreadList()->GetList()) {
+        if (thd != info.owner_ && target.Ptr() == thd->GetMonitorEnterObject()) {
+          wait.push_back(jni->AddLocalReference<jthread>(thd->GetPeerFromOtherThread()));
+        }
+      }
+    }
+  }
+  usage->waiter_count = wait.size();
+  usage->notify_waiter_count = notify_wait.size();
+  jvmtiError ret = CopyDataIntoJvmtiBuffer(env,
+                                           reinterpret_cast<const unsigned char*>(wait.data()),
+                                           wait.size() * sizeof(jthread),
+                                           reinterpret_cast<unsigned char**>(&usage->waiters));
+  if (ret != OK) {
+    return ret;
+  }
+  return CopyDataIntoJvmtiBuffer(env,
+                                 reinterpret_cast<const unsigned char*>(notify_wait.data()),
+                                 notify_wait.size() * sizeof(jthread),
+                                 reinterpret_cast<unsigned char**>(&usage->notify_waiters));
+}
+
 }  // namespace openjdkjvmti
diff --git a/openjdkjvmti/ti_object.h b/openjdkjvmti/ti_object.h
index fa3bd0f..977ec39 100644
--- a/openjdkjvmti/ti_object.h
+++ b/openjdkjvmti/ti_object.h
@@ -42,6 +42,8 @@
   static jvmtiError GetObjectSize(jvmtiEnv* env, jobject object, jlong* size_ptr);
 
   static jvmtiError GetObjectHashCode(jvmtiEnv* env, jobject object, jint* hash_code_ptr);
+
+  static jvmtiError GetObjectMonitorUsage(jvmtiEnv* env, jobject object, jvmtiMonitorUsage* usage);
 };
 
 }  // namespace openjdkjvmti
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 69c615d..ab9ca84 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1843,55 +1843,53 @@
     mov   r12, r0        @ r12 holds reference to code
     ldr   r0, [sp, #4]   @ restore r0
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    adr   lr, art_quick_instrumentation_exit + /* thumb mode */ 1
+                         @ load art_quick_instrumentation_exit into lr in thumb mode
     REFRESH_MARKING_REGISTER
-    blx   r12            @ call method with lr set to art_quick_instrumentation_exit
-@ Deliberate fall-through into art_quick_instrumentation_exit.
-    .type art_quick_instrumentation_exit, #function
-    .global art_quick_instrumentation_exit
-art_quick_instrumentation_exit:
-    mov   lr, #0         @ link register is to here, so clobber with 0 for later checks
-    SETUP_SAVE_REFS_ONLY_FRAME r2  @ set up frame knowing r2 and r3 must be dead on exit
-    mov   r12, sp        @ remember bottom of caller's frame
-    push  {r0-r1}        @ save return value
-    .cfi_adjust_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset r1, 4
-    mov   r2, sp         @ store gpr_res pointer.
-    vpush {d0}           @ save fp return value
-    .cfi_adjust_cfa_offset 8
-    mov   r3, sp         @ store fpr_res pointer
-    mov   r1, r12        @ pass SP
-    mov   r0, r9         @ pass Thread::Current
-    blx   artInstrumentationMethodExitFromCode  @ (Thread*, SP, gpr_res*, fpr_res*)
-
-    mov   r2, r0         @ link register saved by instrumentation
-    mov   lr, r1         @ r1 is holding link register if we're to bounce to deoptimize
-    vpop  {d0}           @ restore fp return value
-    .cfi_adjust_cfa_offset -8
-    pop   {r0, r1}       @ restore return value
-    .cfi_adjust_cfa_offset -8
-    .cfi_restore r0
-    .cfi_restore r1
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    REFRESH_MARKING_REGISTER
-    cbz   r2, .Ldo_deliver_instrumentation_exception
-                         @ Deliver exception if we got nullptr as function.
-    bx    r2             @ Otherwise, return
+    bx    r12            @ call method with lr set to art_quick_instrumentation_exit
 .Ldeliver_instrumentation_entry_exception:
     @ Deliver exception for art_quick_instrumentation_entry placed after
     @ art_quick_instrumentation_exit so that the fallthrough works.
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
-.Ldo_deliver_instrumentation_exception:
     DELIVER_PENDING_EXCEPTION
 END art_quick_instrumentation_entry
 
+ENTRY art_quick_instrumentation_exit
+    mov   lr, #0         @ link register is to here, so clobber with 0 for later checks
+    SETUP_SAVE_EVERYTHING_FRAME r2
+
+    add   r3, sp, #8     @ store fpr_res pointer, in kSaveEverything frame
+    add   r2, sp, #136   @ store gpr_res pointer, in kSaveEverything frame
+    mov   r1, sp         @ pass SP
+    mov   r0, r9         @ pass Thread::Current
+    blx   artInstrumentationMethodExitFromCode  @ (Thread*, SP, gpr_res*, fpr_res*)
+
+    cbz   r0, .Ldo_deliver_instrumentation_exception
+                         @ Deliver exception if we got nullptr as function.
+    cbnz  r1, .Ldeoptimize
+    // Normal return.
+    str   r0, [sp, #FRAME_SIZE_SAVE_EVERYTHING - 4]
+                         @ Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    REFRESH_MARKING_REGISTER
+    bx lr
+.Ldo_deliver_instrumentation_exception:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+.Ldeoptimize:
+    str   r1, [sp, #FRAME_SIZE_SAVE_EVERYTHING - 4]
+                         @ Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    // Jump to art_quick_deoptimize.
+    b     art_quick_deoptimize
+END art_quick_instrumentation_exit
+
     /*
      * Instrumentation has requested that we deoptimize into the interpreter. The deoptimization
      * will long jump to the upcall with a special exception of -1.
      */
     .extern artDeoptimize
 ENTRY art_quick_deoptimize
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0
+    SETUP_SAVE_EVERYTHING_FRAME r0
     mov    r0, r9         @ pass Thread::Current
     blx    artDeoptimize  @ (Thread*)
 END art_quick_deoptimize
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 802cf5e..adfc88f 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2365,32 +2365,31 @@
     .extern artInstrumentationMethodExitFromCode
 ENTRY art_quick_instrumentation_exit
     mov   xLR, #0             // Clobber LR for later checks.
+    SETUP_SAVE_EVERYTHING_FRAME
 
-    SETUP_SAVE_REFS_ONLY_FRAME
-
-    str x0, [sp, #-16]!       // Save integer result.
-    .cfi_adjust_cfa_offset 16
-    str d0, [sp, #8]          // Save floating-point result.
-
-    add   x3, sp, #8          // Pass floating-point result pointer.
-    mov   x2, sp              // Pass integer result pointer.
-    add   x1, sp, #16         // Pass SP.
+    add   x3, sp, #8          // Pass floating-point result pointer, in kSaveEverything frame.
+    add   x2, sp, #264        // Pass integer result pointer, in kSaveEverything frame.
+    mov   x1, sp              // Pass SP.
     mov   x0, xSELF           // Pass Thread.
     bl   artInstrumentationMethodExitFromCode    // (Thread*, SP, gpr_res*, fpr_res*)
 
-    mov   xIP0, x0            // Return address from instrumentation call.
-    mov   xLR, x1             // r1 is holding link register if we're to bounce to deoptimize
-
-    ldr   d0, [sp, #8]        // Restore floating-point result.
-    ldr   x0, [sp], #16       // Restore integer result, and drop stack area.
-    .cfi_adjust_cfa_offset -16
-
-    RESTORE_SAVE_REFS_ONLY_FRAME
+    cbz   x0, .Ldo_deliver_instrumentation_exception
+                              // Handle error
+    cbnz  x1, .Ldeoptimize
+    // Normal return.
+    str   x0, [sp, #FRAME_SIZE_SAVE_EVERYTHING - 8]
+                              // Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
     REFRESH_MARKING_REGISTER
-    cbz   xIP0, 1f            // Handle error
-    br    xIP0                // Tail-call out.
-1:
-    DELIVER_PENDING_EXCEPTION
+    br    lr
+.Ldo_deliver_instrumentation_exception:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+.Ldeoptimize:
+    str   x1, [sp, #FRAME_SIZE_SAVE_EVERYTHING - 8]
+                              // Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    // Jump to art_quick_deoptimize.
+    b     art_quick_deoptimize
 END art_quick_instrumentation_exit
 
     /*
@@ -2399,7 +2398,7 @@
      */
     .extern artDeoptimize
 ENTRY art_quick_deoptimize
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+    SETUP_SAVE_EVERYTHING_FRAME
     mov    x0, xSELF          // Pass thread.
     bl     artDeoptimize      // (Thread*)
     brk 0
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index f08c7fe..eecca58 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -2063,42 +2063,43 @@
 DEFINE_FUNCTION_CUSTOM_CFA art_quick_instrumentation_exit, 0
     pushl LITERAL(0)              // Push a fake return PC as there will be none on the stack.
     CFI_ADJUST_CFA_OFFSET(4)
-    SETUP_SAVE_REFS_ONLY_FRAME ebx, ebx
-    mov  %esp, %ecx               // Remember SP
-    subl LITERAL(8), %esp         // Save float return value.
+    SETUP_SAVE_EVERYTHING_FRAME ebx, ebx
+
+    movl %esp, %ecx               // Remember SP
+    subl LITERAL(8), %esp         // Align stack.
     CFI_ADJUST_CFA_OFFSET(8)
-    movq %xmm0, (%esp)
-    PUSH edx                      // Save gpr return value.
+    PUSH edx                      // Save gpr return value. edx and eax need to be together,
+                                  // which isn't the case in kSaveEverything frame.
     PUSH eax
-    leal 8(%esp), %eax            // Get pointer to fpr_result
+    leal 32(%esp), %eax           // Get pointer to fpr_result, in kSaveEverything frame
     movl %esp, %edx               // Get pointer to gpr_result
     PUSH eax                      // Pass fpr_result
     PUSH edx                      // Pass gpr_result
-    PUSH ecx                      // Pass SP.
+    PUSH ecx                      // Pass SP
     pushl %fs:THREAD_SELF_OFFSET  // Pass Thread::Current.
     CFI_ADJUST_CFA_OFFSET(4)
+
     call SYMBOL(artInstrumentationMethodExitFromCode)  // (Thread*, SP, gpr_result*, fpr_result*)
-    testl %eax, %eax              // Check if we returned error.
-    jz 1f
-    mov   %eax, %ecx              // Move returned link register.
-    addl LITERAL(16), %esp        // Pop arguments.
-    CFI_ADJUST_CFA_OFFSET(-16)
-    movl %edx, %ebx               // Move returned link register for deopt
-                                  // (ebx is pretending to be our LR).
-    POP eax                       // Restore gpr return value.
-    POP edx
-    movq (%esp), %xmm0            // Restore fpr return value.
-    addl LITERAL(8), %esp
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    addl LITERAL(4), %esp         // Remove fake return pc.
-    CFI_ADJUST_CFA_OFFSET(-4)
-    jmp   *%ecx                   // Return.
-1:
-    addl LITERAL(32), %esp
+    // Return result could have been changed if it's a reference.
+    movl 16(%esp), %ecx
+    movl %ecx, (80+32)(%esp)
+    addl LITERAL(32), %esp        // Pop arguments and gpr_result.
     CFI_ADJUST_CFA_OFFSET(-32)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    DELIVER_PENDING_EXCEPTION
+
+    testl %eax, %eax              // Check if we returned error.
+    jz .Ldo_deliver_instrumentation_exception
+    testl %edx, %edx
+    jnz .Ldeoptimize
+    // Normal return.
+    movl %eax, FRAME_SIZE_SAVE_EVERYTHING-4(%esp)   // Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    ret
+.Ldeoptimize:
+    mov %edx, (FRAME_SIZE_SAVE_EVERYTHING-4)(%esp)  // Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    jmp SYMBOL(art_quick_deoptimize)
+.Ldo_deliver_instrumentation_exception:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END_FUNCTION art_quick_instrumentation_exit
 
     /*
@@ -2106,8 +2107,7 @@
      * will long jump to the upcall with a special exception of -1.
      */
 DEFINE_FUNCTION art_quick_deoptimize
-    PUSH ebx                      // Entry point for a jump. Fake that we were called.
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx, ebx
+    SETUP_SAVE_EVERYTHING_FRAME ebx, ebx
     subl LITERAL(12), %esp        // Align stack.
     CFI_ADJUST_CFA_OFFSET(12)
     pushl %fs:THREAD_SELF_OFFSET  // Pass Thread::Current().
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index b70abaa..2c3da90 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2026,45 +2026,31 @@
     pushq LITERAL(0)          // Push a fake return PC as there will be none on the stack.
     CFI_ADJUST_CFA_OFFSET(8)
 
-    SETUP_SAVE_REFS_ONLY_FRAME
+    SETUP_SAVE_EVERYTHING_FRAME
 
-    // We need to save rax and xmm0. We could use a callee-save from SETUP_REF_ONLY, but then
-    // we would need to fully restore it. As there are a good number of callee-save registers, it
-    // seems easier to have an extra small stack area. But this should be revisited.
-
-    movq  %rsp, %rsi                          // Pass SP.
-
-    PUSH rax                  // Save integer result.
-    movq %rsp, %rdx           // Pass integer result pointer.
-
-    subq LITERAL(8), %rsp     // Save floating-point result.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq %xmm0, (%rsp)
-    movq %rsp, %rcx           // Pass floating-point result pointer.
-
-    movq  %gs:THREAD_SELF_OFFSET, %rdi        // Pass Thread.
+    leaq 16(%rsp), %rcx       // Pass floating-point result pointer, in kSaveEverything frame.
+    leaq 144(%rsp), %rdx      // Pass integer result pointer, in kSaveEverything frame.
+    movq %rsp, %rsi           // Pass SP.
+    movq %gs:THREAD_SELF_OFFSET, %rdi  // Pass Thread.
 
     call SYMBOL(artInstrumentationMethodExitFromCode)   // (Thread*, SP, gpr_res*, fpr_res*)
 
-    movq  %rax, %rdi          // Store return PC
-    movq  %rdx, %rsi          // Store second return PC in hidden arg.
-
-    movq (%rsp), %xmm0        // Restore floating-point result.
-    addq LITERAL(8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-8)
-    POP rax                   // Restore integer result.
-
-    RESTORE_SAVE_REFS_ONLY_FRAME
-
-    testq %rdi, %rdi          // Check if we have a return-pc to go to. If we don't then there was
+    testq %rax, %rax          // Check if we have a return-pc to go to. If we don't then there was
                               // an exception
-    jz 1f
-
-    addq LITERAL(8), %rsp     // Drop fake return pc.
-
-    jmp   *%rdi               // Return.
-1:
-    DELIVER_PENDING_EXCEPTION
+    jz .Ldo_deliver_instrumentation_exception
+    testq %rdx, %rdx
+    jnz .Ldeoptimize
+    // Normal return.
+    movq %rax, FRAME_SIZE_SAVE_EVERYTHING-8(%rsp)  // Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    ret
+.Ldeoptimize:
+    movq %rdx, FRAME_SIZE_SAVE_EVERYTHING-8(%rsp)  // Set return pc.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    // Jump to art_quick_deoptimize.
+    jmp SYMBOL(art_quick_deoptimize)
+.Ldo_deliver_instrumentation_exception:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END_FUNCTION art_quick_instrumentation_exit
 
     /*
@@ -2072,10 +2058,7 @@
      * will long jump to the upcall with a special exception of -1.
      */
 DEFINE_FUNCTION art_quick_deoptimize
-    pushq %rsi                         // Entry point for a jump. Fake that we were called.
-                                       // Use hidden arg.
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
-                                       // Stack should be aligned now.
+    SETUP_SAVE_EVERYTHING_FRAME        // Stack should be aligned now.
     movq %gs:THREAD_SELF_OFFSET, %rdi  // Pass Thread.
     call SYMBOL(artDeoptimize)         // (Thread*)
     UNREACHABLE
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index 148ef86..8738adf 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -73,6 +73,7 @@
   "BCE          ",
   "DCE          ",
   "LSE          ",
+  "CFRE         ",
   "LICM         ",
   "LoopOpt      ",
   "SsaLiveness  ",
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index 0b1a3ba..212edfb 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -80,6 +80,7 @@
   kArenaAllocBoundsCheckElimination,
   kArenaAllocDCE,
   kArenaAllocLSE,
+  kArenaAllocCFRE,
   kArenaAllocLICM,
   kArenaAllocLoopOptimization,
   kArenaAllocSsaLiveness,
diff --git a/runtime/base/bit_utils.h b/runtime/base/bit_utils.h
index 0844678..87dac02 100644
--- a/runtime/base/bit_utils.h
+++ b/runtime/base/bit_utils.h
@@ -127,6 +127,14 @@
   return (x < 2u) ? x : static_cast<T>(1u) << (std::numeric_limits<T>::digits - CLZ(x - 1u));
 }
 
+// Return highest possible N - a power of two - such that val >= N.
+template <typename T>
+constexpr T TruncToPowerOfTwo(T val) {
+  static_assert(std::is_integral<T>::value, "T must be integral");
+  static_assert(std::is_unsigned<T>::value, "T must be unsigned");
+  return (val != 0) ? static_cast<T>(1u) << (BitSizeOf<T>() - CLZ(val) - 1u) : 0;
+}
+
 template<typename T>
 constexpr bool IsPowerOfTwo(T x) {
   static_assert(std::is_integral<T>::value, "T must be integral");
diff --git a/runtime/base/bit_utils_test.cc b/runtime/base/bit_utils_test.cc
index 9f22fb4..c96c6dc 100644
--- a/runtime/base/bit_utils_test.cc
+++ b/runtime/base/bit_utils_test.cc
@@ -122,6 +122,32 @@
 static_assert(33u == MinimumBitsToStore<uint64_t>(UINT64_C(0x1FFFFFFFF)), "TestMinBits2Store64#10");
 static_assert(64u == MinimumBitsToStore<uint64_t>(~UINT64_C(0)), "TestMinBits2Store64#11");
 
+static_assert(0 == TruncToPowerOfTwo<uint32_t>(0u), "TestTruncToPowerOfTwo32#1");
+static_assert(1 == TruncToPowerOfTwo<uint32_t>(1u), "TestTruncToPowerOfTwo32#2");
+static_assert(2 == TruncToPowerOfTwo<uint32_t>(2u), "TestTruncToPowerOfTwo32#3");
+static_assert(2 == TruncToPowerOfTwo<uint32_t>(3u), "TestTruncToPowerOfTwo32#4");
+static_assert(4 == TruncToPowerOfTwo<uint32_t>(7u), "TestTruncToPowerOfTwo32#5");
+static_assert(0x20000u == TruncToPowerOfTwo<uint32_t>(0x3aaaau),
+              "TestTruncToPowerOfTwo32#6");
+static_assert(0x40000000u == TruncToPowerOfTwo<uint32_t>(0x40000001u),
+              "TestTruncToPowerOfTwo32#7");
+static_assert(0x80000000u == TruncToPowerOfTwo<uint32_t>(0x80000000u),
+              "TestTruncToPowerOfTwo32#8");
+
+static_assert(0 == TruncToPowerOfTwo<uint64_t>(UINT64_C(0)), "TestTruncToPowerOfTwo64#1");
+static_assert(1 == TruncToPowerOfTwo<uint64_t>(UINT64_C(1)), "TestTruncToPowerOfTwo64#2");
+static_assert(2 == TruncToPowerOfTwo<uint64_t>(UINT64_C(2)), "TestTruncToPowerOfTwo64#3");
+static_assert(2 == TruncToPowerOfTwo<uint64_t>(UINT64_C(3)), "TestTruncToPowerOfTwo64#4");
+static_assert(4 == TruncToPowerOfTwo<uint64_t>(UINT64_C(7)), "TestTruncToPowerOfTwo64#5");
+static_assert(UINT64_C(0x20000) == TruncToPowerOfTwo<uint64_t>(UINT64_C(0x3aaaa)),
+              "TestTruncToPowerOfTwo64#6");
+static_assert(
+    UINT64_C(0x4000000000000000) == TruncToPowerOfTwo<uint64_t>(UINT64_C(0x4000000000000001)),
+    "TestTruncToPowerOfTwo64#7");
+static_assert(
+    UINT64_C(0x8000000000000000) == TruncToPowerOfTwo<uint64_t>(UINT64_C(0x8000000000000000)),
+    "TestTruncToPowerOfTwo64#8");
+
 static_assert(0 == RoundUpToPowerOfTwo<uint32_t>(0u), "TestRoundUpPowerOfTwo32#1");
 static_assert(1 == RoundUpToPowerOfTwo<uint32_t>(1u), "TestRoundUpPowerOfTwo32#2");
 static_assert(2 == RoundUpToPowerOfTwo<uint32_t>(2u), "TestRoundUpPowerOfTwo32#3");
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 77ce39c..4161754 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -7763,7 +7763,8 @@
     // We have a valid method from the DexCache but we need to perform ICCE and IAE checks.
     DCHECK(resolved->GetDeclaringClassUnchecked() != nullptr) << resolved->GetDexMethodIndex();
     klass = LookupResolvedType(dex_file, method_id.class_idx_, dex_cache.Get(), class_loader.Get());
-    DCHECK(klass != nullptr);
+    CHECK(klass != nullptr) << resolved->PrettyMethod() << " " << resolved << " "
+                            << resolved->GetAccessFlags();
   } else {
     // The method was not in the DexCache, resolve the declaring class.
     klass = ResolveType(dex_file, method_id.class_idx_, dex_cache, class_loader);
diff --git a/runtime/class_loader_context.cc b/runtime/class_loader_context.cc
index 07afedf..56573f5 100644
--- a/runtime/class_loader_context.cc
+++ b/runtime/class_loader_context.cc
@@ -632,6 +632,10 @@
   }
 }
 
+static bool IsAbsoluteLocation(const std::string& location) {
+  return !location.empty() && location[0] == '/';
+}
+
 bool ClassLoaderContext::VerifyClassLoaderContextMatch(const std::string& context_spec) const {
   ClassLoaderContext expected_context;
   if (!expected_context.Parse(context_spec, /*parse_checksums*/ true)) {
@@ -673,18 +677,52 @@
     DCHECK_EQ(expected_info.classpath.size(), expected_info.checksums.size());
 
     for (size_t k = 0; k < info.classpath.size(); k++) {
-      if (info.classpath[k] != expected_info.classpath[k]) {
+      // Compute the dex location that must be compared.
+      // We shouldn't do a naive comparison `info.classpath[k] == expected_info.classpath[k]`
+      // because even if they refer to the same file, one could be encoded as a relative location
+      // and the other as an absolute one.
+      bool is_dex_name_absolute = IsAbsoluteLocation(info.classpath[k]);
+      bool is_expected_dex_name_absolute = IsAbsoluteLocation(expected_info.classpath[k]);
+      std::string dex_name;
+      std::string expected_dex_name;
+
+      if (is_dex_name_absolute == is_expected_dex_name_absolute) {
+        // If both locations are absolute or relative then compare them as they are.
+        // This is usually the case for: shared libraries and secondary dex files.
+        dex_name = info.classpath[k];
+        expected_dex_name = expected_info.classpath[k];
+      } else if (is_dex_name_absolute) {
+        // The runtime name is absolute but the compiled name (the expected one) is relative.
+        // This is the case for split apks which depend on base or on other splits.
+        dex_name = info.classpath[k];
+        expected_dex_name = OatFile::ResolveRelativeEncodedDexLocation(
+            info.classpath[k].c_str(), expected_info.classpath[k]);
+      } else {
+        // The runtime name is relative but the compiled name is absolute.
+        // There is no expected use case that would end up here as dex files are always loaded
+        // with their absolute location. However, be tolerant and do the best effort (in case
+        // there are unexpected new use cases...).
+        DCHECK(is_expected_dex_name_absolute);
+        dex_name = OatFile::ResolveRelativeEncodedDexLocation(
+            expected_info.classpath[k].c_str(), info.classpath[k]);
+        expected_dex_name = expected_info.classpath[k];
+      }
+
+      // Compare the locations.
+      if (dex_name != expected_dex_name) {
         LOG(WARNING) << "ClassLoaderContext classpath element mismatch for position " << i
             << ". expected=" << expected_info.classpath[k]
             << ", found=" << info.classpath[k]
             << " (" << context_spec << " | " << EncodeContextForOatFile("") << ")";
         return false;
       }
+
+      // Compare the checksums.
       if (info.checksums[k] != expected_info.checksums[k]) {
         LOG(WARNING) << "ClassLoaderContext classpath element checksum mismatch for position " << i
-            << ". expected=" << expected_info.checksums[k]
-            << ", found=" << info.checksums[k]
-            << " (" << context_spec << " | " << EncodeContextForOatFile("") << ")";
+                     << ". expected=" << expected_info.checksums[k]
+                     << ", found=" << info.checksums[k]
+                     << " (" << context_spec << " | " << EncodeContextForOatFile("") << ")";
         return false;
       }
     }
diff --git a/runtime/class_loader_context_test.cc b/runtime/class_loader_context_test.cc
index ddbb73b..1847274 100644
--- a/runtime/class_loader_context_test.cc
+++ b/runtime/class_loader_context_test.cc
@@ -697,7 +697,17 @@
 
   std::unique_ptr<ClassLoaderContext> context = CreateContextForClassLoader(class_loader_d);
 
-  ASSERT_TRUE(context->VerifyClassLoaderContextMatch(context->EncodeContextForOatFile("")));
+  std::string context_with_no_base_dir = context->EncodeContextForOatFile("");
+  ASSERT_TRUE(context->VerifyClassLoaderContextMatch(context_with_no_base_dir));
+
+  std::string dex_location = GetTestDexFileName("ForClassLoaderA");
+  size_t pos = dex_location.rfind('/');
+  ASSERT_NE(std::string::npos, pos);
+  std::string parent = dex_location.substr(0, pos);
+
+  std::string context_with_base_dir = context->EncodeContextForOatFile(parent);
+  ASSERT_NE(context_with_base_dir, context_with_no_base_dir);
+  ASSERT_TRUE(context->VerifyClassLoaderContextMatch(context_with_base_dir));
 }
 
 TEST_F(ClassLoaderContextTest, VerifyClassLoaderContextMatchAfterEncodingMultidex) {
diff --git a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
index 53f0727..5f40711 100644
--- a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
@@ -73,7 +73,11 @@
   // Before deoptimizing to interpreter, we must push the deoptimization context.
   JValue return_value;
   return_value.SetJ(0);  // we never deoptimize from compiled code with an invoke result.
-  self->PushDeoptimizationContext(return_value, false, /* from_code */ true, self->GetException());
+  self->PushDeoptimizationContext(return_value,
+                                  false /* is_reference */,
+                                  self->GetException(),
+                                  true /* from_code */,
+                                  DeoptimizationMethodType::kDefault);
   artDeoptimizeImpl(self, kind, true);
 }
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index c6abd28..7b83f20 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -744,7 +744,11 @@
 
     ObjPtr<mirror::Throwable> pending_exception;
     bool from_code = false;
-    self->PopDeoptimizationContext(&result, &pending_exception, /* out */ &from_code);
+    DeoptimizationMethodType method_type;
+    self->PopDeoptimizationContext(/* out */ &result,
+                                   /* out */ &pending_exception,
+                                   /* out */ &from_code,
+                                   /* out */ &method_type);
 
     // Push a transition back into managed code onto the linked list in thread.
     self->PushManagedStackFragment(&fragment);
@@ -771,7 +775,11 @@
     if (pending_exception != nullptr) {
       self->SetException(pending_exception);
     }
-    interpreter::EnterInterpreterFromDeoptimize(self, deopt_frame, from_code, &result);
+    interpreter::EnterInterpreterFromDeoptimize(self,
+                                                deopt_frame,
+                                                &result,
+                                                from_code,
+                                                DeoptimizationMethodType::kDefault);
   } else {
     const char* old_cause = self->StartAssertNoThreadSuspension(
         "Building interpreter shadow frame");
@@ -823,7 +831,11 @@
       // Push the context of the deoptimization stack so we can restore the return value and the
       // exception before executing the deoptimized frames.
       self->PushDeoptimizationContext(
-          result, shorty[0] == 'L', /* from_code */ false, self->GetException());
+          result,
+          shorty[0] == 'L' || shorty[0] == '[',  /* class or array */
+          self->GetException(),
+          false /* from_code */,
+          DeoptimizationMethodType::kDefault);
 
       // Set special exception to cause deoptimization.
       self->SetException(Thread::GetDeoptimizationException());
@@ -1041,7 +1053,8 @@
   CHECK(!self->IsExceptionPending()) << "Enter instrumentation exit stub with pending exception "
                                      << self->GetException()->Dump();
   // Compute address of return PC and sanity check that it currently holds 0.
-  size_t return_pc_offset = GetCalleeSaveReturnPcOffset(kRuntimeISA, CalleeSaveType::kSaveRefsOnly);
+  size_t return_pc_offset = GetCalleeSaveReturnPcOffset(kRuntimeISA,
+                                                        CalleeSaveType::kSaveEverything);
   uintptr_t* return_pc = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(sp) +
                                                       return_pc_offset);
   CHECK_EQ(*return_pc, 0U);
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 6e457a4..4d8c687 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -26,11 +26,13 @@
 #include "class_linker.h"
 #include "debugger.h"
 #include "dex_file-inl.h"
+#include "dex_instruction-inl.h"
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "gc_root-inl.h"
 #include "interpreter/interpreter.h"
+#include "interpreter/interpreter_common.h"
 #include "jit/jit.h"
 #include "jit/jit_code_cache.h"
 #include "jvalue-inl.h"
@@ -229,39 +231,32 @@
         return true;  // Continue.
       }
       uintptr_t return_pc = GetReturnPc();
-      if (m->IsRuntimeMethod()) {
-        if (return_pc == instrumentation_exit_pc_) {
-          if (kVerboseInstrumentation) {
-            LOG(INFO) << "  Handling quick to interpreter transition. Frame " << GetFrameId();
-          }
-          CHECK_LT(instrumentation_stack_depth_, instrumentation_stack_->size());
-          const InstrumentationStackFrame& frame =
-              instrumentation_stack_->at(instrumentation_stack_depth_);
-          CHECK(frame.interpreter_entry_);
-          // This is an interpreter frame so method enter event must have been reported. However we
-          // need to push a DEX pc into the dex_pcs_ list to match size of instrumentation stack.
-          // Since we won't report method entry here, we can safely push any DEX pc.
-          dex_pcs_.push_back(0);
-          last_return_pc_ = frame.return_pc_;
-          ++instrumentation_stack_depth_;
-          return true;
-        } else {
-          if (kVerboseInstrumentation) {
-            LOG(INFO) << "  Skipping runtime method. Frame " << GetFrameId();
-          }
-          last_return_pc_ = GetReturnPc();
-          return true;  // Ignore unresolved methods since they will be instrumented after resolution.
-        }
-      }
       if (kVerboseInstrumentation) {
         LOG(INFO) << "  Installing exit stub in " << DescribeLocation();
       }
       if (return_pc == instrumentation_exit_pc_) {
+        CHECK_LT(instrumentation_stack_depth_, instrumentation_stack_->size());
+
+        if (m->IsRuntimeMethod()) {
+          const InstrumentationStackFrame& frame =
+              instrumentation_stack_->at(instrumentation_stack_depth_);
+          if (frame.interpreter_entry_) {
+            // This instrumentation frame is for an interpreter bridge and is
+            // pushed when executing the instrumented interpreter bridge. So method
+            // enter event must have been reported. However we need to push a DEX pc
+            // into the dex_pcs_ list to match size of instrumentation stack.
+            uint32_t dex_pc = DexFile::kDexNoIndex;
+            dex_pcs_.push_back(dex_pc);
+            last_return_pc_ = frame.return_pc_;
+            ++instrumentation_stack_depth_;
+            return true;
+          }
+        }
+
         // We've reached a frame which has already been installed with instrumentation exit stub.
         // We should have already installed instrumentation on previous frames.
         reached_existing_instrumentation_frames_ = true;
 
-        CHECK_LT(instrumentation_stack_depth_, instrumentation_stack_->size());
         const InstrumentationStackFrame& frame =
             instrumentation_stack_->at(instrumentation_stack_depth_);
         CHECK_EQ(m, frame.method_) << "Expected " << ArtMethod::PrettyMethod(m)
@@ -273,8 +268,12 @@
       } else {
         CHECK_NE(return_pc, 0U);
         CHECK(!reached_existing_instrumentation_frames_);
-        InstrumentationStackFrame instrumentation_frame(GetThisObject(), m, return_pc, GetFrameId(),
-                                                        false);
+        InstrumentationStackFrame instrumentation_frame(
+            m->IsRuntimeMethod() ? nullptr : GetThisObject(),
+            m,
+            return_pc,
+            GetFrameId(),    // A runtime method still gets a frame id.
+            false);
         if (kVerboseInstrumentation) {
           LOG(INFO) << "Pushing frame " << instrumentation_frame.Dump();
         }
@@ -291,9 +290,12 @@
         instrumentation_stack_->insert(it, instrumentation_frame);
         SetReturnPc(instrumentation_exit_pc_);
       }
-      dex_pcs_.push_back((GetCurrentOatQuickMethodHeader() == nullptr)
-          ? DexFile::kDexNoIndex
-          : GetCurrentOatQuickMethodHeader()->ToDexPc(m, last_return_pc_));
+      uint32_t dex_pc = DexFile::kDexNoIndex;
+      if (last_return_pc_ != 0 &&
+          GetCurrentOatQuickMethodHeader() != nullptr) {
+        dex_pc = GetCurrentOatQuickMethodHeader()->ToDexPc(m, last_return_pc_);
+      }
+      dex_pcs_.push_back(dex_pc);
       last_return_pc_ = return_pc;
       ++instrumentation_stack_depth_;
       return true;  // Continue.
@@ -391,7 +393,8 @@
             CHECK(m == instrumentation_frame.method_) << ArtMethod::PrettyMethod(m);
           }
           SetReturnPc(instrumentation_frame.return_pc_);
-          if (instrumentation_->ShouldNotifyMethodEnterExitEvents()) {
+          if (instrumentation_->ShouldNotifyMethodEnterExitEvents() &&
+              !m->IsRuntimeMethod()) {
             // Create the method exit events. As the methods didn't really exit the result is 0.
             // We only do this if no debugger is attached to prevent from posting events twice.
             instrumentation_->MethodExitEvent(thread_, instrumentation_frame.this_object_, m,
@@ -969,6 +972,7 @@
                                            ObjPtr<mirror::Object> this_object,
                                            ArtMethod* method,
                                            uint32_t dex_pc) const {
+  DCHECK(!method->IsRuntimeMethod());
   if (HasMethodEntryListeners()) {
     Thread* self = Thread::Current();
     StackHandleScope<1> hs(self);
@@ -1199,6 +1203,66 @@
   stack->push_front(instrumentation_frame);
 }
 
+DeoptimizationMethodType Instrumentation::GetDeoptimizationMethodType(ArtMethod* method) {
+  if (method->IsRuntimeMethod()) {
+    // Certain methods have strict requirement on whether the dex instruction
+    // should be re-executed upon deoptimization.
+    if (method == Runtime::Current()->GetCalleeSaveMethod(
+        CalleeSaveType::kSaveEverythingForClinit)) {
+      return DeoptimizationMethodType::kKeepDexPc;
+    }
+    if (method == Runtime::Current()->GetCalleeSaveMethod(
+        CalleeSaveType::kSaveEverythingForSuspendCheck)) {
+      return DeoptimizationMethodType::kKeepDexPc;
+    }
+  }
+  return DeoptimizationMethodType::kDefault;
+}
+
+// Try to get the shorty of a runtime method if it's an invocation stub.
+struct RuntimeMethodShortyVisitor : public StackVisitor {
+  explicit RuntimeMethodShortyVisitor(Thread* thread)
+      : StackVisitor(thread, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
+        shorty('V') {}
+
+  bool VisitFrame() REQUIRES_SHARED(Locks::mutator_lock_) {
+    ArtMethod* m = GetMethod();
+    if (m != nullptr && !m->IsRuntimeMethod()) {
+      // The first Java method.
+      if (m->IsNative()) {
+        // Use JNI method's shorty for the jni stub.
+        shorty = m->GetShorty()[0];
+        return false;
+      }
+      if (m->IsProxyMethod()) {
+        // Proxy method just invokes its proxied method via
+        // art_quick_proxy_invoke_handler.
+        shorty = m->GetInterfaceMethodIfProxy(kRuntimePointerSize)->GetShorty()[0];
+        return false;
+      }
+      const DexFile::CodeItem* code_item = m->GetCodeItem();
+      const Instruction* instr = Instruction::At(&code_item->insns_[GetDexPc()]);
+      if (instr->IsInvoke()) {
+        const DexFile* dex_file = m->GetDexFile();
+        if (interpreter::IsStringInit(dex_file, instr->VRegB())) {
+          // Invoking string init constructor is turned into invoking
+          // StringFactory.newStringFromChars() which returns a string.
+          shorty = 'L';
+          return false;
+        }
+        // A regular invoke, use callee's shorty.
+        uint32_t method_idx = instr->VRegB();
+        shorty = dex_file->GetMethodShorty(method_idx)[0];
+      }
+      // Stop stack walking since we've seen a Java frame.
+      return false;
+    }
+    return true;
+  }
+
+  char shorty;
+};
+
 TwoWordReturn Instrumentation::PopInstrumentationStackFrame(Thread* self,
                                                             uintptr_t* return_pc,
                                                             uint64_t* gpr_result,
@@ -1219,7 +1283,36 @@
   ArtMethod* method = instrumentation_frame.method_;
   uint32_t length;
   const PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
-  char return_shorty = method->GetInterfaceMethodIfProxy(pointer_size)->GetShorty(&length)[0];
+  char return_shorty;
+
+  // A runtime method does not call into MethodExitEvent() so there should not
+  // be a suspension point below.
+  ScopedAssertNoThreadSuspension ants(__FUNCTION__, method->IsRuntimeMethod());
+  if (method->IsRuntimeMethod()) {
+    if (method != Runtime::Current()->GetCalleeSaveMethod(
+        CalleeSaveType::kSaveEverythingForClinit)) {
+      // If the caller is at an invocation point and the runtime method is not
+      // for clinit, we need to pass return results to the caller.
+      // We need the correct shorty to decide whether we need to pass the return
+      // result for deoptimization below.
+      RuntimeMethodShortyVisitor visitor(self);
+      visitor.WalkStack();
+      return_shorty = visitor.shorty;
+    } else {
+      // Some runtime methods such as allocations, unresolved field getters, etc.
+      // have return values. We don't need to set return_value since MethodExitEvent()
+      // below isn't called for runtime methods. Deoptimization doesn't need the
+      // value either since the dex instruction will be re-executed by the
+      // interpreter, except these two cases:
+      // (1) For an invoke, which is handled above to get the correct shorty.
+      // (2) For MONITOR_ENTER/EXIT, which cannot be re-executed since it's not
+      //     idempotent. However there is no return value for it anyway.
+      return_shorty = 'V';
+    }
+  } else {
+    return_shorty = method->GetInterfaceMethodIfProxy(pointer_size)->GetShorty(&length)[0];
+  }
+
   bool is_ref = return_shorty == '[' || return_shorty == 'L';
   StackHandleScope<1> hs(self);
   MutableHandle<mirror::Object> res(hs.NewHandle<mirror::Object>(nullptr));
@@ -1239,7 +1332,7 @@
   //       return_pc.
   uint32_t dex_pc = DexFile::kDexNoIndex;
   mirror::Object* this_object = instrumentation_frame.this_object_;
-  if (!instrumentation_frame.interpreter_entry_) {
+  if (!method->IsRuntimeMethod() && !instrumentation_frame.interpreter_entry_) {
     MethodExitEvent(self, this_object, instrumentation_frame.method_, dex_pc, return_value);
   }
 
@@ -1265,10 +1358,12 @@
                 << " in "
                 << *self;
     }
+    DeoptimizationMethodType deopt_method_type = GetDeoptimizationMethodType(method);
     self->PushDeoptimizationContext(return_value,
-                                    return_shorty == 'L',
+                                    return_shorty == 'L' || return_shorty == '[',
+                                    nullptr /* no pending exception */,
                                     false /* from_code */,
-                                    nullptr /* no pending exception */);
+                                    deopt_method_type);
     return GetTwoWordSuccessValue(*return_pc,
                                   reinterpret_cast<uintptr_t>(GetQuickDeoptimizationEntryPoint()));
   } else {
@@ -1305,7 +1400,9 @@
     // TODO: improve the dex pc information here, requires knowledge of current PC as opposed to
     //       return_pc.
     uint32_t dex_pc = DexFile::kDexNoIndex;
-    MethodUnwindEvent(self, instrumentation_frame.this_object_, method, dex_pc);
+    if (!method->IsRuntimeMethod()) {
+      MethodUnwindEvent(self, instrumentation_frame.this_object_, method, dex_pc);
+    }
   }
   // TODO: bring back CheckStackDepth(self, instrumentation_frame, 2);
   CHECK_EQ(stack->size(), idx);
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index fec027e..5763a41 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -40,6 +40,7 @@
 union JValue;
 class ShadowFrame;
 class Thread;
+enum class DeoptimizationMethodType;
 
 namespace instrumentation {
 
@@ -474,6 +475,9 @@
                                      bool interpreter_entry)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  DeoptimizationMethodType GetDeoptimizationMethodType(ArtMethod* method)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Called when an instrumented method is exited. Removes the pushed instrumentation frame
   // returning the intended link register. Generates method exit events. The gpr_result and
   // fpr_result pointers are pointers to the locations where the integer/pointer and floating point
@@ -711,9 +715,15 @@
 
 // An element in the instrumentation side stack maintained in art::Thread.
 struct InstrumentationStackFrame {
-  InstrumentationStackFrame(mirror::Object* this_object, ArtMethod* method,
-                            uintptr_t return_pc, size_t frame_id, bool interpreter_entry)
-      : this_object_(this_object), method_(method), return_pc_(return_pc), frame_id_(frame_id),
+  InstrumentationStackFrame(mirror::Object* this_object,
+                            ArtMethod* method,
+                            uintptr_t return_pc,
+                            size_t frame_id,
+                            bool interpreter_entry)
+      : this_object_(this_object),
+        method_(method),
+        return_pc_(return_pc),
+        frame_id_(frame_id),
         interpreter_entry_(interpreter_entry) {
   }
 
diff --git a/runtime/instrumentation_test.cc b/runtime/instrumentation_test.cc
index 9b77d12..89baa35 100644
--- a/runtime/instrumentation_test.cc
+++ b/runtime/instrumentation_test.cc
@@ -514,7 +514,23 @@
 
 // Test instrumentation listeners for each event.
 TEST_F(InstrumentationTest, MethodEntryEvent) {
-  TestEvent(instrumentation::Instrumentation::kMethodEntered);
+  ScopedObjectAccess soa(Thread::Current());
+  jobject class_loader = LoadDex("Instrumentation");
+  Runtime* const runtime = Runtime::Current();
+  ClassLinker* class_linker = runtime->GetClassLinker();
+  StackHandleScope<1> hs(soa.Self());
+  Handle<mirror::ClassLoader> loader(hs.NewHandle(soa.Decode<mirror::ClassLoader>(class_loader)));
+  mirror::Class* klass = class_linker->FindClass(soa.Self(), "LInstrumentation;", loader);
+  ASSERT_TRUE(klass != nullptr);
+  ArtMethod* method =
+      klass->FindClassMethod("returnReference", "()Ljava/lang/Object;", kRuntimePointerSize);
+  ASSERT_TRUE(method != nullptr);
+  ASSERT_TRUE(method->IsDirect());
+  ASSERT_TRUE(method->GetDeclaringClass() == klass);
+  TestEvent(instrumentation::Instrumentation::kMethodEntered,
+            /*event_method*/ method,
+            /*event_field*/ nullptr,
+            /*with_object*/ true);
 }
 
 TEST_F(InstrumentationTest, MethodExitObjectEvent) {
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index 3349833..a1f2123 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -467,29 +467,6 @@
   self->PopShadowFrame();
 }
 
-static bool IsStringInit(const Instruction* instr, ArtMethod* caller)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  if (instr->Opcode() == Instruction::INVOKE_DIRECT ||
-      instr->Opcode() == Instruction::INVOKE_DIRECT_RANGE) {
-    // Instead of calling ResolveMethod() which has suspend point and can trigger
-    // GC, look up the callee method symbolically.
-    uint16_t callee_method_idx = (instr->Opcode() == Instruction::INVOKE_DIRECT_RANGE) ?
-        instr->VRegB_3rc() : instr->VRegB_35c();
-    const DexFile* dex_file = caller->GetDexFile();
-    const DexFile::MethodId& method_id = dex_file->GetMethodId(callee_method_idx);
-    const char* class_name = dex_file->StringByTypeIdx(method_id.class_idx_);
-    const char* method_name = dex_file->GetMethodName(method_id);
-    // Compare method's class name and method name against string init.
-    // It's ok since it's not allowed to create your own java/lang/String.
-    // TODO: verify that assumption.
-    if ((strcmp(class_name, "Ljava/lang/String;") == 0) &&
-        (strcmp(method_name, "<init>") == 0)) {
-      return true;
-    }
-  }
-  return false;
-}
-
 static int16_t GetReceiverRegisterForStringInit(const Instruction* instr) {
   DCHECK(instr->Opcode() == Instruction::INVOKE_DIRECT_RANGE ||
          instr->Opcode() == Instruction::INVOKE_DIRECT);
@@ -499,8 +476,9 @@
 
 void EnterInterpreterFromDeoptimize(Thread* self,
                                     ShadowFrame* shadow_frame,
+                                    JValue* ret_val,
                                     bool from_code,
-                                    JValue* ret_val)
+                                    DeoptimizationMethodType deopt_method_type)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JValue value;
   // Set value to last known result in case the shadow frame chain is empty.
@@ -525,11 +503,27 @@
       new_dex_pc = MoveToExceptionHandler(
           self, *shadow_frame, instrumentation) ? shadow_frame->GetDexPC() : DexFile::kDexNoIndex;
     } else if (!from_code) {
-      // For the debugger and full deoptimization stack, we must go past the invoke
-      // instruction, as it already executed.
-      // TODO: should be tested more once b/17586779 is fixed.
+      // Deoptimization is not called from code directly.
       const Instruction* instr = Instruction::At(&code_item->insns_[dex_pc]);
-      if (instr->IsInvoke()) {
+      if (deopt_method_type == DeoptimizationMethodType::kKeepDexPc) {
+        DCHECK(first);
+        // Need to re-execute the dex instruction.
+        // (1) An invocation might be split into class initialization and invoke.
+        //     In this case, the invoke should not be skipped.
+        // (2) A suspend check should also execute the dex instruction at the
+        //     corresponding dex pc.
+        DCHECK_EQ(new_dex_pc, dex_pc);
+      } else if (instr->Opcode() == Instruction::MONITOR_ENTER ||
+                 instr->Opcode() == Instruction::MONITOR_EXIT) {
+        DCHECK(deopt_method_type == DeoptimizationMethodType::kDefault);
+        DCHECK(first);
+        // Non-idempotent dex instruction should not be re-executed.
+        // On the other hand, if a MONITOR_ENTER is at the dex_pc of a suspend
+        // check, that MONITOR_ENTER should be executed. That case is handled
+        // above.
+        new_dex_pc = dex_pc + instr->SizeInCodeUnits();
+      } else if (instr->IsInvoke()) {
+        DCHECK(deopt_method_type == DeoptimizationMethodType::kDefault);
         if (IsStringInit(instr, shadow_frame->GetMethod())) {
           uint16_t this_obj_vreg = GetReceiverRegisterForStringInit(instr);
           // Move the StringFactory.newStringFromChars() result into the register representing
@@ -542,30 +536,27 @@
         }
         new_dex_pc = dex_pc + instr->SizeInCodeUnits();
       } else if (instr->Opcode() == Instruction::NEW_INSTANCE) {
-        // It's possible to deoptimize at a NEW_INSTANCE dex instruciton that's for a
-        // java string, which is turned into a call into StringFactory.newEmptyString();
-        // Move the StringFactory.newEmptyString() result into the destination register.
-        DCHECK(value.GetL()->IsString());
-        shadow_frame->SetVRegReference(instr->VRegA_21c(), value.GetL());
-        // new-instance doesn't generate a result value.
-        value.SetJ(0);
-        // Skip the dex instruction since we essentially come back from an invocation.
-        new_dex_pc = dex_pc + instr->SizeInCodeUnits();
-        if (kIsDebugBuild) {
-          ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
-          // This is a suspend point. But it's ok since value has been set into shadow_frame.
-          ObjPtr<mirror::Class> klass = class_linker->ResolveType(
-              dex::TypeIndex(instr->VRegB_21c()), shadow_frame->GetMethod());
-          DCHECK(klass->IsStringClass());
-        }
+        // A NEW_INSTANCE is simply re-executed, including
+        // "new-instance String" which is compiled into a call into
+        // StringFactory.newEmptyString().
+        DCHECK_EQ(new_dex_pc, dex_pc);
       } else {
-        CHECK(false) << "Unexpected instruction opcode " << instr->Opcode()
-                     << " at dex_pc " << dex_pc
-                     << " of method: " << ArtMethod::PrettyMethod(shadow_frame->GetMethod(), false);
+        DCHECK(deopt_method_type == DeoptimizationMethodType::kDefault);
+        DCHECK(first);
+        // By default, we re-execute the dex instruction if it is not an
+        // invoke, so that we don't have to decode the dex instruction to move
+        // the result into the right vreg. All slow paths have been audited to be
+        // idempotent except monitor-enter/exit and invocation stubs.
+        // TODO: move result and advance dex pc. That also requires that we
+        // can tell the return type of a runtime method, possibly by decoding
+        // the dex instruction at the caller.
+        DCHECK_EQ(new_dex_pc, dex_pc);
       }
     } else {
       // Nothing to do, the dex_pc is the one at which the code requested
       // the deoptimization.
+      DCHECK(first);
+      DCHECK_EQ(new_dex_pc, dex_pc);
     }
     if (new_dex_pc != DexFile::kDexNoIndex) {
       shadow_frame->SetDexPC(new_dex_pc);
@@ -574,8 +565,10 @@
     ShadowFrame* old_frame = shadow_frame;
     shadow_frame = shadow_frame->GetLink();
     ShadowFrame::DeleteDeoptimizedFrame(old_frame);
-    // Following deoptimizations of shadow frames must pass the invoke instruction.
+    // Following deoptimizations of shadow frames must be at invocation point
+    // and should advance dex pc past the invoke instruction.
     from_code = false;
+    deopt_method_type = DeoptimizationMethodType::kDefault;
     first = false;
   }
   ret_val->SetJ(value.GetJ());
diff --git a/runtime/interpreter/interpreter.h b/runtime/interpreter/interpreter.h
index 65cfade..df8568e 100644
--- a/runtime/interpreter/interpreter.h
+++ b/runtime/interpreter/interpreter.h
@@ -30,6 +30,7 @@
 union JValue;
 class ShadowFrame;
 class Thread;
+enum class DeoptimizationMethodType;
 
 namespace interpreter {
 
@@ -44,8 +45,11 @@
     REQUIRES_SHARED(Locks::mutator_lock_);
 
 // 'from_code' denotes whether the deoptimization was explicitly triggered by compiled code.
-extern void EnterInterpreterFromDeoptimize(Thread* self, ShadowFrame* shadow_frame, bool from_code,
-                                           JValue* ret_val)
+extern void EnterInterpreterFromDeoptimize(Thread* self,
+                                           ShadowFrame* shadow_frame,
+                                           JValue* ret_val,
+                                           bool from_code,
+                                           DeoptimizationMethodType method_type)
     REQUIRES_SHARED(Locks::mutator_lock_);
 
 extern JValue EnterInterpreterFromEntryPoint(Thread* self, const DexFile::CodeItem* code_item,
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 82e12f5..3ccab85 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -537,6 +537,34 @@
                                         uint16_t arg_offset,
                                         JValue* result);
 
+static inline bool IsStringInit(const DexFile* dex_file, uint32_t method_idx)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const DexFile::MethodId& method_id = dex_file->GetMethodId(method_idx);
+  const char* class_name = dex_file->StringByTypeIdx(method_id.class_idx_);
+  const char* method_name = dex_file->GetMethodName(method_id);
+  // Instead of calling ResolveMethod() which has suspend point and can trigger
+  // GC, look up the method symbolically.
+  // Compare method's class name and method name against string init.
+  // It's ok since it's not allowed to create your own java/lang/String.
+  // TODO: verify that assumption.
+  if ((strcmp(class_name, "Ljava/lang/String;") == 0) &&
+      (strcmp(method_name, "<init>") == 0)) {
+    return true;
+  }
+  return false;
+}
+
+static inline bool IsStringInit(const Instruction* instr, ArtMethod* caller)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  if (instr->Opcode() == Instruction::INVOKE_DIRECT ||
+      instr->Opcode() == Instruction::INVOKE_DIRECT_RANGE) {
+    uint16_t callee_method_idx = (instr->Opcode() == Instruction::INVOKE_DIRECT_RANGE) ?
+        instr->VRegB_3rc() : instr->VRegB_35c();
+    return IsStringInit(caller->GetDexFile(), callee_method_idx);
+  }
+  return false;
+}
+
 // Set string value created from StringFactory.newStringFromXXX() into all aliases of
 // StringFactory.newEmptyString().
 void SetStringInitValueToAllAliases(ShadowFrame* shadow_frame,
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 5c63dca..80e6ad3 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1498,13 +1498,21 @@
       break;
     case LockWord::kThinLocked:
       owner_ = Runtime::Current()->GetThreadList()->FindThreadByThreadId(lock_word.ThinLockOwner());
+      DCHECK(owner_ != nullptr) << "Thin-locked without owner!";
       entry_count_ = 1 + lock_word.ThinLockCount();
       // Thin locks have no waiters.
       break;
     case LockWord::kFatLocked: {
       Monitor* mon = lock_word.FatLockMonitor();
       owner_ = mon->owner_;
-      entry_count_ = 1 + mon->lock_count_;
+      // Here it is okay for the owner to be null since we don't reset the LockWord back to
+      // kUnlocked until we get a GC. In cases where this hasn't happened yet we will have a fat
+      // lock without an owner.
+      if (owner_ != nullptr) {
+        entry_count_ = 1 + mon->lock_count_;
+      } else {
+        DCHECK_EQ(mon->lock_count_, 0) << "Monitor is fat-locked without any owner!";
+      }
       for (Thread* waiter = mon->wait_set_; waiter != nullptr; waiter = waiter->GetWaitNext()) {
         waiters_.push_back(waiter);
       }
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a8ccf89..a67a6aa 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -417,6 +417,12 @@
       return;
     }
     Thread* self = Thread::Current();
+
+    // Dump all threads first and then the aborting thread. While this is counter the logical flow,
+    // it improves the chance of relevant data surviving in the Android logs.
+
+    DumpAllThreads(os, self);
+
     if (self == nullptr) {
       os << "(Aborting thread was not attached to runtime!)\n";
       DumpKernelStack(os, GetTid(), "  kernel: ", false);
@@ -432,7 +438,6 @@
         }
       }
     }
-    DumpAllThreads(os, self);
   }
 
   // No thread-safety analysis as we do explicitly test for holding the mutator lock.
diff --git a/runtime/runtime_common.cc b/runtime/runtime_common.cc
index 940e461..f8e9442 100644
--- a/runtime/runtime_common.cc
+++ b/runtime/runtime_common.cc
@@ -416,8 +416,19 @@
            << "Cmdline: " << cmd_line << std::endl
            << "Thread: " << tid << " \"" << thread_name << "\"" << std::endl
            << "Registers:\n" << Dumpable<UContext>(thread_context) << std::endl
-           << "Backtrace:\n" << Dumpable<Backtrace>(thread_backtrace) << std::endl;
-    stream << std::flush;
+           << "Backtrace:\n" << Dumpable<Backtrace>(thread_backtrace);
+    if (signal_number == SIGILL) {
+      // Note the view we present is from the d-cache, which should
+      // match the i-cache if all is well.
+      static const size_t kCodeSnippetBytes = 16;
+      stream << "Code:\n\t" << info->si_addr << ":";
+      uintptr_t start = reinterpret_cast<uintptr_t>(info->si_addr);
+      uintptr_t end = std::min(start + kCodeSnippetBytes, RoundUp(start, kPageSize));
+      for (uintptr_t addr = start; addr != end; ++addr) {
+        stream << StringPrintf(" %02x", *(reinterpret_cast<const uint8_t*>(addr)));
+      }
+    }
+    stream << std::endl << std::flush;
   };
 
   if (dump_on_stderr) {
diff --git a/runtime/runtime_common.h b/runtime/runtime_common.h
index 06d6627..1248fe0 100644
--- a/runtime/runtime_common.h
+++ b/runtime/runtime_common.h
@@ -30,6 +30,7 @@
 
 #include <iomanip>
 
+#include "base/bit_utils.h"
 #include "base/dumpable.h"
 #include "native_stack_dump.h"
 #include "utils.h"
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3f23926..57b3a75 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -166,11 +166,13 @@
                               bool is_reference,
                               bool from_code,
                               ObjPtr<mirror::Throwable> pending_exception,
+                              DeoptimizationMethodType method_type,
                               DeoptimizationContextRecord* link)
       : ret_val_(ret_val),
         is_reference_(is_reference),
         from_code_(from_code),
         pending_exception_(pending_exception.Ptr()),
+        deopt_method_type_(method_type),
         link_(link) {}
 
   JValue GetReturnValue() const { return ret_val_; }
@@ -185,6 +187,9 @@
   mirror::Object** GetPendingExceptionAsGCRoot() {
     return reinterpret_cast<mirror::Object**>(&pending_exception_);
   }
+  DeoptimizationMethodType GetDeoptimizationMethodType() const {
+    return deopt_method_type_;
+  }
 
  private:
   // The value returned by the method at the top of the stack before deoptimization.
@@ -200,6 +205,9 @@
   // exception).
   mirror::Throwable* pending_exception_;
 
+  // Whether the context was created for an (idempotent) runtime method.
+  const DeoptimizationMethodType deopt_method_type_;
+
   // A link to the previous DeoptimizationContextRecord.
   DeoptimizationContextRecord* const link_;
 
@@ -229,26 +237,30 @@
 
 void Thread::PushDeoptimizationContext(const JValue& return_value,
                                        bool is_reference,
+                                       ObjPtr<mirror::Throwable> exception,
                                        bool from_code,
-                                       ObjPtr<mirror::Throwable> exception) {
+                                       DeoptimizationMethodType method_type) {
   DeoptimizationContextRecord* record = new DeoptimizationContextRecord(
       return_value,
       is_reference,
       from_code,
       exception,
+      method_type,
       tlsPtr_.deoptimization_context_stack);
   tlsPtr_.deoptimization_context_stack = record;
 }
 
 void Thread::PopDeoptimizationContext(JValue* result,
                                       ObjPtr<mirror::Throwable>* exception,
-                                      bool* from_code) {
+                                      bool* from_code,
+                                      DeoptimizationMethodType* method_type) {
   AssertHasDeoptimizationContext();
   DeoptimizationContextRecord* record = tlsPtr_.deoptimization_context_stack;
   tlsPtr_.deoptimization_context_stack = record->GetLink();
   result->SetJ(record->GetReturnValue().GetJ());
   *exception = record->GetPendingException();
   *from_code = record->GetFromCode();
+  *method_type = record->GetDeoptimizationMethodType();
   delete record;
 }
 
@@ -3084,10 +3096,16 @@
     NthCallerVisitor visitor(this, 0, false);
     visitor.WalkStack();
     if (Runtime::Current()->IsAsyncDeoptimizeable(visitor.caller_pc)) {
+      // method_type shouldn't matter due to exception handling.
+      const DeoptimizationMethodType method_type = DeoptimizationMethodType::kDefault;
       // Save the exception into the deoptimization context so it can be restored
       // before entering the interpreter.
       PushDeoptimizationContext(
-          JValue(), /*is_reference */ false, /* from_code */ false, exception);
+          JValue(),
+          false /* is_reference */,
+          exception,
+          false /* from_code */,
+          method_type);
       artDeoptimize(this);
       UNREACHABLE();
     } else {
@@ -3647,7 +3665,8 @@
       PopStackedShadowFrame(StackedShadowFrameType::kDeoptimizationShadowFrame);
   ObjPtr<mirror::Throwable> pending_exception;
   bool from_code = false;
-  PopDeoptimizationContext(result, &pending_exception, &from_code);
+  DeoptimizationMethodType method_type;
+  PopDeoptimizationContext(result, &pending_exception, &from_code, &method_type);
   SetTopOfStack(nullptr);
   SetTopOfShadowStack(shadow_frame);
 
@@ -3656,7 +3675,11 @@
   if (pending_exception != nullptr) {
     SetException(pending_exception);
   }
-  interpreter::EnterInterpreterFromDeoptimize(this, shadow_frame, from_code, result);
+  interpreter::EnterInterpreterFromDeoptimize(this,
+                                              shadow_frame,
+                                              result,
+                                              from_code,
+                                              method_type);
 }
 
 void Thread::SetException(ObjPtr<mirror::Throwable> new_exception) {
diff --git a/runtime/thread.h b/runtime/thread.h
index 7540fd2..ad4506e 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -117,6 +117,13 @@
   kDeoptimizationShadowFrame,
 };
 
+// The type of method that triggers deoptimization. It contains info on whether
+// the deoptimized method should advance dex_pc.
+enum class DeoptimizationMethodType {
+  kKeepDexPc,  // dex pc is required to be kept upon deoptimization.
+  kDefault     // dex pc may or may not advance depending on other conditions.
+};
+
 // This should match RosAlloc::kNumThreadLocalSizeBrackets.
 static constexpr size_t kNumRosAllocThreadLocalSizeBracketsInThread = 16;
 
@@ -960,14 +967,18 @@
   // values on stacks.
   // 'from_code' denotes whether the deoptimization was explicitly made from
   // compiled code.
+  // 'method_type' contains info on whether deoptimization should advance
+  // dex_pc.
   void PushDeoptimizationContext(const JValue& return_value,
                                  bool is_reference,
+                                 ObjPtr<mirror::Throwable> exception,
                                  bool from_code,
-                                 ObjPtr<mirror::Throwable> exception)
+                                 DeoptimizationMethodType method_type)
       REQUIRES_SHARED(Locks::mutator_lock_);
   void PopDeoptimizationContext(JValue* result,
                                 ObjPtr<mirror::Throwable>* exception,
-                                bool* from_code)
+                                bool* from_code,
+                                DeoptimizationMethodType* method_type)
       REQUIRES_SHARED(Locks::mutator_lock_);
   void AssertHasDeoptimizationContext()
       REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/test/088-monitor-verification/src/Main.java b/test/088-monitor-verification/src/Main.java
index 13a96c7..f5cbc2a 100644
--- a/test/088-monitor-verification/src/Main.java
+++ b/test/088-monitor-verification/src/Main.java
@@ -39,6 +39,7 @@
         ensureJitCompiled(Main.class, "constantLock");
         ensureJitCompiled(Main.class, "notExcessiveNesting");
         ensureJitCompiled(Main.class, "notNested");
+        ensureJitCompiled(TwoPath.class, "twoPath");
 
         Main m = new Main();
 
diff --git a/test/1930-monitor-info/expected.txt b/test/1930-monitor-info/expected.txt
new file mode 100644
index 0000000..b43f1b2
--- /dev/null
+++ b/test/1930-monitor-info/expected.txt
@@ -0,0 +1,31 @@
+Running with single thread.
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testSingleThread], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testSingleThread], owner: main, entryCount: 1, waiters: [], notify_waiters: [] }
+Running with single thread in native.
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testSingleThread], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testSingleThread], owner: main, entryCount: 1, waiters: [], notify_waiters: [] }
+Lock twice
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwice], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwice], owner: main, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwice], owner: main, entryCount: 2, waiters: [], notify_waiters: [] }
+Lock twice native
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceNative], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceNative], owner: main, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceNative], owner: main, entryCount: 2, waiters: [], notify_waiters: [] }
+Lock twice Java then native
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceJN], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceJN], owner: main, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceJN], owner: main, entryCount: 2, waiters: [], notify_waiters: [] }
+Lock twice native then Java
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceNJ], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Pre-lock[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceNJ], owner: main, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockedTwiceNJ], owner: main, entryCount: 2, waiters: [], notify_waiters: [] }
+lock with wait
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockWait], owner: main, entryCount: 1, waiters: [Test1930 Thread - testLockWait], notify_waiters: [] }
+Thread[Test1930 Thread - testLockWait]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockWait], owner: Test1930 Thread - testLockWait, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testLockWait], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
+Wait for notify.
+Thread[Test1930 Thread - testLockWait]: MonitorUsage{ monitor: NamedLock[Test1930 - testNotifyWait], owner: Test1930 Thread - testLockWait, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testNotifyWait], owner: main, entryCount: 1, waiters: [Test1930 Thread - testLockWait], notify_waiters: [Test1930 Thread - testLockWait] }
+Thread[Test1930 Thread - testLockWait]: MonitorUsage{ monitor: NamedLock[Test1930 - testNotifyWait], owner: Test1930 Thread - testLockWait, entryCount: 1, waiters: [], notify_waiters: [] }
+Thread[main]: MonitorUsage{ monitor: NamedLock[Test1930 - testNotifyWait], owner: <NULL>, entryCount: 0, waiters: [], notify_waiters: [] }
diff --git a/test/1930-monitor-info/info.txt b/test/1930-monitor-info/info.txt
new file mode 100644
index 0000000..8e19edc
--- /dev/null
+++ b/test/1930-monitor-info/info.txt
@@ -0,0 +1,3 @@
+Tests basic functions in the jvmti plugin.
+
+Tests that the GetObjectMonitorUsage function works correctly.
diff --git a/test/1930-monitor-info/monitor.cc b/test/1930-monitor-info/monitor.cc
new file mode 100644
index 0000000..7f97c05
--- /dev/null
+++ b/test/1930-monitor-info/monitor.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <pthread.h>
+
+#include <cstdio>
+#include <iostream>
+#include <vector>
+
+#include "android-base/logging.h"
+#include "jni.h"
+#include "jvmti.h"
+
+#include "scoped_local_ref.h"
+#include "scoped_primitive_array.h"
+
+// Test infrastructure
+#include "jvmti_helper.h"
+#include "test_env.h"
+
+namespace art {
+namespace Test1930MonitorInfo {
+
+extern "C" JNIEXPORT void JNICALL Java_art_Test1930_executeLockedNative(JNIEnv* env,
+                                                                        jclass klass,
+                                                                        jobject run,
+                                                                        jobject l) {
+  ScopedLocalRef<jclass> runnable(env, env->FindClass("java/lang/Runnable"));
+  if (env->ExceptionCheck()) {
+    return;
+  }
+  jmethodID method = env->GetMethodID(runnable.get(), "run", "()V");
+
+  if (env->ExceptionCheck()) {
+    return;
+  }
+  jmethodID printMethod = env->GetStaticMethodID(klass, "printPreLock", "(Ljava/lang/Object;)V");
+  if (env->ExceptionCheck()) {
+    return;
+  }
+
+  env->CallStaticVoidMethod(klass, printMethod, l);
+  if (env->ExceptionCheck()) {
+    return;
+  }
+  if (env->MonitorEnter(l) != 0) {
+    return;
+  }
+  env->CallVoidMethod(run, method);
+  env->MonitorExit(l);
+}
+
+}  // namespace Test1930MonitorInfo
+}  // namespace art
diff --git a/test/1930-monitor-info/run b/test/1930-monitor-info/run
new file mode 100755
index 0000000..e92b873
--- /dev/null
+++ b/test/1930-monitor-info/run
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Copyright 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+./default-run "$@" --jvmti
diff --git a/test/1930-monitor-info/src/Main.java b/test/1930-monitor-info/src/Main.java
new file mode 100644
index 0000000..3328461
--- /dev/null
+++ b/test/1930-monitor-info/src/Main.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) throws Exception {
+    art.Test1930.run();
+  }
+}
diff --git a/test/1930-monitor-info/src/art/Monitors.java b/test/1930-monitor-info/src/art/Monitors.java
new file mode 100644
index 0000000..26f7718
--- /dev/null
+++ b/test/1930-monitor-info/src/art/Monitors.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package art;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.Function;
+import java.util.stream.Stream;
+
+public class Monitors {
+  public static class NamedLock {
+    public final String name;
+    public NamedLock(String name) {
+      this.name = name;
+    }
+    public String toString() {
+      return String.format("NamedLock[%s]", name);
+    }
+  }
+
+  public static final class MonitorUsage {
+    public final Object monitor;
+    public final Thread owner;
+    public final int entryCount;
+    public final Thread[] waiters;
+    public final Thread[] notifyWaiters;
+
+    public MonitorUsage(
+        Object monitor,
+        Thread owner,
+        int entryCount,
+        Thread[] waiters,
+        Thread[] notifyWaiters) {
+      this.monitor = monitor;
+      this.entryCount = entryCount;
+      this.owner = owner;
+      this.waiters = waiters;
+      this.notifyWaiters = notifyWaiters;
+    }
+
+    private static String toNameList(Thread[] ts) {
+      return Arrays.toString(Arrays.stream(ts).map((Thread t) -> t.getName()).toArray());
+    }
+
+    public String toString() {
+      return String.format(
+          "MonitorUsage{ monitor: %s, owner: %s, entryCount: %d, waiters: %s, notify_waiters: %s }",
+          monitor,
+          (owner != null) ? owner.getName() : "<NULL>",
+          entryCount,
+          toNameList(waiters),
+          toNameList(notifyWaiters));
+    }
+  }
+
+  public static native MonitorUsage getObjectMonitorUsage(Object monitor);
+}
+
diff --git a/test/1930-monitor-info/src/art/Test1930.java b/test/1930-monitor-info/src/art/Test1930.java
new file mode 100644
index 0000000..a7fa1c7
--- /dev/null
+++ b/test/1930-monitor-info/src/art/Test1930.java
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package art;
+
+import java.util.concurrent.Semaphore;
+import java.util.Arrays;
+
+public class Test1930 {
+  public static final int NUM_RETRY = 100;
+  private static void testSingleThread() {
+    Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testSingleThread");
+    executeLocked(() -> { printMonitorUsage(lk); }, lk);
+  }
+  private static void testSingleThreadNative() {
+    Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testSingleThread");
+    executeLockedNative(() -> { printMonitorUsage(lk); }, lk);
+  }
+
+  private static void testLockedTwice() {
+    final Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testLockedTwice");
+    executeLocked(() -> { executeLocked(() -> { printMonitorUsage(lk); }, lk); }, lk);
+  }
+
+  private static void testLockedTwiceNJ() {
+    final Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testLockedTwiceNJ");
+    executeLockedNative(() -> { executeLocked(() -> { printMonitorUsage(lk); }, lk); }, lk);
+  }
+
+  private static void testLockedTwiceJN() {
+    final Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testLockedTwiceJN");
+    executeLocked(() -> { executeLockedNative(() -> { printMonitorUsage(lk); }, lk); }, lk);
+  }
+
+  private static void testLockedTwiceNative() {
+    final Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testLockedTwiceNative");
+    executeLockedNative(() -> { executeLockedNative(() -> { printMonitorUsage(lk); }, lk); }, lk);
+  }
+
+  public final static class ThreadSignaler {
+    public volatile boolean signal = false;
+  }
+
+  private static void testLockWait() throws Exception {
+    final Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testLockWait");
+    final Semaphore sem = new Semaphore(0);
+    final Thread t = new Thread(() -> {
+      sem.release();
+      synchronized (lk) {
+        printMonitorUsage(lk);
+      }
+    }, "Test1930 Thread - testLockWait");
+    synchronized (lk) {
+      t.start();
+      // Wait for the other thread to actually start.
+      sem.acquire();
+      // Wait for the other thread to go to sleep trying to get the mutex. This might take a (short)
+      // time since we try spinning first for better performance.
+      boolean found_wait = false;
+      for (long i = 0; i < NUM_RETRY; i++) {
+        if (Arrays.asList(Monitors.getObjectMonitorUsage(lk).waiters).contains(t)) {
+          found_wait = true;
+          break;
+        } else {
+          Thread.sleep(500);
+          Thread.yield();
+        }
+      }
+      if (!found_wait) {
+        System.out.println("other thread doesn't seem to be waiting.");
+      }
+      printMonitorUsage(lk);
+    }
+    t.join();
+    printMonitorUsage(lk);
+  }
+
+  private static void testNotifyWait() throws Exception {
+    final Monitors.NamedLock lk = new Monitors.NamedLock("Test1930 - testNotifyWait");
+    final Semaphore sem = new Semaphore(0);
+    Thread t = new Thread(() -> {
+      synchronized (lk) {
+        printMonitorUsage(lk);
+        sem.release();
+        try {
+          lk.wait();
+        } catch (Exception e) {
+          throw new Error("Error waiting!", e);
+        }
+        printMonitorUsage(lk);
+      }
+    }, "Test1930 Thread - testLockWait");
+    t.start();
+    sem.acquire();
+    synchronized (lk) {
+      printMonitorUsage(lk);
+      lk.notifyAll();
+    }
+    t.join();
+    printMonitorUsage(lk);
+  }
+
+  public static void run() throws Exception {
+    // Single threaded tests.
+    System.out.println("Running with single thread.");
+    testSingleThread();
+    System.out.println("Running with single thread in native.");
+    testSingleThreadNative();
+    System.out.println("Lock twice");
+    testLockedTwice();
+    System.out.println("Lock twice native");
+    testLockedTwiceNative();
+    System.out.println("Lock twice Java then native");
+    testLockedTwiceJN();
+    System.out.println("Lock twice native then Java");
+    testLockedTwiceNJ();
+
+    // Multi threaded tests.
+    System.out.println("lock with wait");
+    testLockWait();
+    System.out.println("Wait for notify.");
+    testNotifyWait();
+  }
+
+  public static void printPreLock(Object lock) {
+    System.out.println(String.format("Pre-lock[%s]: %s",
+          Thread.currentThread().getName(), Monitors.getObjectMonitorUsage(lock)));
+  }
+
+  public static void executeLocked(Runnable r, Object lock) {
+    printPreLock(lock);
+    synchronized (lock) {
+      r.run();
+    }
+  }
+
+  public native static void executeLockedNative(Runnable r, Object m);
+  public static void printMonitorUsage(Object m) {
+    System.out.println(String.format("Thread[%s]: %s",
+          Thread.currentThread().getName(), Monitors.getObjectMonitorUsage(m)));
+  }
+}
diff --git a/test/476-checker-ctor-fence-redun-elim/expected.txt b/test/476-checker-ctor-fence-redun-elim/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/476-checker-ctor-fence-redun-elim/expected.txt
diff --git a/test/476-checker-ctor-fence-redun-elim/info.txt b/test/476-checker-ctor-fence-redun-elim/info.txt
new file mode 100644
index 0000000..46d62f7
--- /dev/null
+++ b/test/476-checker-ctor-fence-redun-elim/info.txt
@@ -0,0 +1,2 @@
+Tests to ensure constructor fences (after new-instance, new-array, or final fields) are properly
+merged together by the compiler when they are redundant.
diff --git a/test/476-checker-ctor-fence-redun-elim/src/Main.java b/test/476-checker-ctor-fence-redun-elim/src/Main.java
new file mode 100644
index 0000000..05f2f7c
--- /dev/null
+++ b/test/476-checker-ctor-fence-redun-elim/src/Main.java
@@ -0,0 +1,844 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Array;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.lang.reflect.Modifier;
+
+// Baseline class. This has no final fields, so there are no additional freezes
+// in its constructor.
+//
+// The new-instance itself always has 1 freeze for the happens-before on the object header
+// write (i.e. [obj.class = X] happens-before any access to obj).
+//
+// Total freezes for "new Base()": 1.
+class Base {
+  int w0;
+  int w1;
+  int w2;
+  int w3;
+
+  @Override
+  public String toString() {
+    return getClass().getName() + "(" + baseString() + ")";
+  }
+
+  protected String baseString() {
+    return String.format("w0: %d, w1: %d, w2: %d, w3: %d", w0, w1, w2, w3);
+  }
+}
+
+// This has a final field in its constructor, so there must be a field freeze
+// at the end of <init>.
+//
+// Total freezes for "new OneFinal()": 2.
+class OneFinal extends Base {
+  final int x;
+  OneFinal(int x) {
+    this.x = x;
+  }
+
+  @Override
+  protected String baseString() {
+    return String.format("%s, x: %d", super.baseString(), x);
+  }
+}
+
+class Assert {
+  public static void stringEquals(String expected, Object actual) {
+    stringEquals$noinline$(expected, actual);
+  }
+
+  // Forbid compiler from inlining this to avoid overly clever optimizations.
+  private static void stringEquals$noinline$(String expected, Object actual) {
+    String actualStr = Main.valueToString(actual);
+    if (!expected.equals(actualStr)) {
+      throw new AssertionError("Expected: " + expected + ", actual: " + actualStr);
+    }
+  }
+}
+
+interface Test {
+  public void exercise();
+  public void check();
+}
+
+class TestOneFinal implements Test {
+  // Initialize at least once before actual test.
+  public static Object external;
+
+  /// CHECK-START: void TestOneFinal.exercise() constructor_fence_redundancy_elimination (before)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+
+  /// CHECK-START: void TestOneFinal.exercise() constructor_fence_redundancy_elimination (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  @Override
+  public void exercise() {
+      Base b = new OneFinal(1);
+      // 1 store, 2 freezes.
+
+      // Stores to 'b' do not escape b.
+      b.w0 = 1;
+      b.w1 = 2;
+      b.w2 = 3;
+
+      // Publish the result to a global so that it is not LSE-eliminated.
+      external = b;
+  }
+
+  @Override
+  public void check() {
+    Assert.stringEquals("OneFinal(w0: 1, w1: 2, w2: 3, w3: 0, x: 1)", external);
+  }
+}
+
+// This has a final field in its constructor, so there must be a field freeze
+// at the end of <init>. The previous base class's freezes accumulate on top
+// of this one.
+//
+// Total freezes for "new TwoFinal()": 3.
+class TwoFinal extends OneFinal {
+  final int y;
+  TwoFinal(int x, int y) {
+    super(x);
+    this.y = y;
+  }
+
+  @Override
+  protected String baseString() {
+    return String.format("%s, y: %d", super.baseString(), y);
+  }
+}
+
+// This has a final field in its constructor, so there must be a field freeze
+// at the end of <init>. The previous base class's freezes accumulate on top
+// of this one.
+//
+// Total freezes for "new ThreeFinal()": 4.
+class ThreeFinal extends TwoFinal {
+  final int z;
+  ThreeFinal(int x, int y, int z) {
+    super(x, y);
+    this.z = z;
+  }
+
+  @Override
+  protected String baseString() {
+    return String.format("%s, z: %d", super.baseString(), z);
+  }
+}
+
+class TestThreeFinal implements Test {
+  // Initialize at least once before actual test.
+  public static Object external;
+
+  /// CHECK-START: void TestThreeFinal.exercise() constructor_fence_redundancy_elimination (before)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+
+  /// CHECK-START: void TestThreeFinal.exercise() constructor_fence_redundancy_elimination (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  @Override
+  public void exercise() {
+    Base b = new ThreeFinal(1, 1, 2);
+    // 3 store, 4 freezes.
+
+    // Stores to 'b' do not escape b.
+    b.w0 = 3;
+
+    // Publish the result to a global so that it is not LSE-eliminated.
+    external = b;
+  }
+
+  @Override
+  public void check() {
+    Assert.stringEquals("ThreeFinal(w0: 3, w1: 0, w2: 0, w3: 0, x: 1, y: 1, z: 2)", external);
+  }
+}
+
+// Ensure "freezes" between multiple new-instances are optimized out.
+class TestMultiAlloc implements Test {
+  public static Object external;
+  public static Object external2;
+
+  /// CHECK-START: void TestMultiAlloc.exercise() constructor_fence_redundancy_elimination (before)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  /// CHECK-DAG:                      StaticFieldSet [<<External2:l\d+>>,<<NewInstance2>>]
+
+  /// CHECK-START: void TestMultiAlloc.exercise() constructor_fence_redundancy_elimination (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>,<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  /// CHECK-DAG:                      StaticFieldSet [<<External2:l\d+>>,<<NewInstance2>>]
+  @Override
+  public void exercise() {
+    // 1 freeze
+    Base b = new Base();
+    // 1 freeze
+    Base b2 = new Base();
+
+    // Merge 2 freezes above into 1 constructor fence.
+    external = b;
+    external2 = b2;
+  }
+
+  @Override
+  public void check() {
+    Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external);
+    Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external2);
+  }
+}
+
+// Ensure "freezes" between multiple new-instances are optimized out.
+class TestThreeFinalTwice implements Test {
+  // Initialize at least once before actual test.
+  public static Object external;
+  public static Object external2;
+
+  /// CHECK-START: void TestThreeFinalTwice.exercise() constructor_fence_redundancy_elimination (before)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance>>]
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  /// CHECK-DAG:                      StaticFieldSet [<<External2:l\d+>>,<<NewInstance2>>]
+
+  /// CHECK-START: void TestThreeFinalTwice.exercise() constructor_fence_redundancy_elimination (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>,<<NewInstance>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  /// CHECK-DAG:                      StaticFieldSet [<<External2:l\d+>>,<<NewInstance2>>]
+  @Override
+  public void exercise() {
+    Base b = new ThreeFinal(1, 1, 2);
+    // 3 stores, 4 freezes.
+
+    // Stores to 'b' do not escape b.
+    b.w0 = 3;
+
+    Base b2 = new ThreeFinal(4, 5, 6);
+    // 3 stores, 4 freezes.
+
+    // Stores to 'b2' do not escape b2.
+    b2.w0 = 7;
+
+    // Publish the result to a global so that it is not LSE-eliminated.
+    // Publishing is done at the end to give freezes above a chance to merge.
+    external = b;
+    external2 = b2;
+  }
+
+  @Override
+  public void check() {
+    Assert.stringEquals("ThreeFinal(w0: 3, w1: 0, w2: 0, w3: 0, x: 1, y: 1, z: 2)", external);
+    Assert.stringEquals("ThreeFinal(w0: 7, w1: 0, w2: 0, w3: 0, x: 4, y: 5, z: 6)", external2);
+  }
+}
+
+class TestNonEscaping {
+  // Prevent constant folding.
+  static boolean test;
+
+  static Object external;
+  static Object external2;
+  static Object external3;
+  static Object external4;
+
+  static class Invoke implements Test {
+    /// CHECK-START: void TestNonEscaping$Invoke.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK:                          InvokeStaticOrDirect
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestNonEscaping$Invoke.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          InvokeStaticOrDirect
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>,<<NewInstance>>]
+    /// CHECK-NOT:                      ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      // b cannot possibly escape into this invoke because it hasn't escaped onto the heap earlier,
+      // and the invoke doesn't take it as a parameter.
+      noEscape$noinline$();
+
+      // Remove the Constructor Fence for b, merging into the fence for b2.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external = b;
+      external2 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external2);
+    }
+  }
+
+  public static int[] array = new int[1];
+  static Base base = new Base();
+
+  static class Store implements Test {
+    /// CHECK-START: void TestNonEscaping$Store.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                      ArraySet
+    /// CHECK-DAG:                      StaticFieldSet
+    /// CHECK-DAG:                      InstanceFieldSet
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestNonEscaping$Store.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<NewInstance:l\d+>>   NewInstance
+    /// CHECK-DAG: <<NewInstance2:l\d+>>  NewInstance
+    /// CHECK-DAG:                        ConstructorFence [<<NewInstance2>>,<<NewInstance>>]
+    /// CHECK-NOT:                        ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      // Stores of inputs other than the fence target do not publish 'b'.
+      array[0] = b.w0;  // aput
+      external = array; // sput
+      base.w0 = b.w0;   // iput
+
+      // Remove the Constructor Fence for b, merging into the fence for b2.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external3 = b;
+      external4 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("[0]", array);
+      Assert.stringEquals("[0]", external);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", base);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external4);
+    }
+  }
+
+  private static void noEscape$noinline$() {
+  }
+}
+
+class TestDontOptimizeAcrossBlocks implements Test {
+  // Prevent constant folding.
+  static boolean test;
+
+  static Object external;
+  static Object external3;
+
+  /// CHECK-START: void TestDontOptimizeAcrossBlocks.exercise() constructor_fence_redundancy_elimination (before)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK:                          ConstructorFence [<<NewInstance>>]
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  /// CHECK-DAG:                      StaticFieldSet [<<External2:l\d+>>,<<NewInstance2>>]
+
+  /// CHECK-START: void TestDontOptimizeAcrossBlocks.exercise() constructor_fence_redundancy_elimination (after)
+  /// CHECK: <<NewInstance:l\d+>>     NewInstance
+  /// CHECK:                          ConstructorFence [<<NewInstance>>]
+  /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+  /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+  /// CHECK-NOT:                      ConstructorFence
+  /// CHECK-DAG:                      StaticFieldSet [<<External:l\d+>>,<<NewInstance>>]
+  /// CHECK-DAG:                      StaticFieldSet [<<External2:l\d+>>,<<NewInstance2>>]
+  @Override
+  public void exercise() {
+    Base b = new Base();
+
+    // Do not move constructor fence across this block, even though 'b' is not published yet.
+    if (test) {
+      external = null;
+    }
+
+    Base b2 = new Base();
+    external = b2;
+    external3 = b;
+  }
+
+  @Override
+  public void check() {
+    Assert.stringEquals("false", test);
+    Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external);
+    Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+  }
+}
+
+class TestDontOptimizeAcrossEscape {
+  // Prevent constant folding.
+  static boolean test;
+
+  static Object external;
+  static Object external2;
+  static Object external3;
+  static Object external4;
+
+  static class Invoke implements Test {
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$Invoke.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK:                          InvokeStaticOrDirect
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$Invoke.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK:                          InvokeStaticOrDirect
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+      // Do not optimize across invokes into which the fence target escapes.
+      invoke$noinline$(b);
+
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external = b;
+      external2 = b2;
+    }
+
+    private static void invoke$noinline$(Object b) {
+      // Even though 'b' does not escape this method, we conservatively assume all parameters
+      // of an invoke escape.
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external2);
+    }
+  }
+
+  public static Object[] array = new Object[3];
+  static Base base = new Base();
+
+  static class InstanceEscaper {
+    public Object holder;
+
+    @Override
+    public String toString() {
+      return getClass().getName() + "(" + baseString() + ")";
+    }
+
+    protected String baseString() {
+      return String.format("holder: %s", Main.valueToString(holder));
+    }
+  }
+
+  static InstanceEscaper instanceEscaper = new InstanceEscaper();
+
+  static class StoreIput implements Test {
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$StoreIput.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                      InstanceFieldSet
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$StoreIput.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<NewInstance:l\d+>>   NewInstance
+    /// CHECK:                            ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG: <<NewInstance2:l\d+>>  NewInstance
+    /// CHECK-DAG:                        ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                        ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      // A store of 'b' into another instance will publish 'b'.
+      instanceEscaper.holder = b;
+
+      // Do not remove any constructor fences above.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external3 = b;
+      external4 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals(
+          "TestDontOptimizeAcrossEscape$InstanceEscaper(holder: Base(w0: 0, w1: 0, w2: 0, w3: 0))",
+          instanceEscaper);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external4);
+    }
+  }
+
+  static class StoreAput implements Test {
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$StoreAput.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                      ArraySet
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$StoreAput.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<NewInstance:l\d+>>   NewInstance
+    /// CHECK:                            ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG: <<NewInstance2:l\d+>>  NewInstance
+    /// CHECK-DAG:                        ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                        ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      // A store of 'b' into another array will publish 'b'.
+      array[0] = b;  // aput
+
+      // Do not remove any constructor fences above.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external3 = b;
+      external4 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("[Base(w0: 0, w1: 0, w2: 0, w3: 0),<null>,<null>]", array);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external4);
+    }
+  }
+
+  static class StoreSput implements Test {
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$StoreSput.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                      StaticFieldSet
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$StoreSput.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<NewInstance:l\d+>>   NewInstance
+    /// CHECK:                            ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG: <<NewInstance2:l\d+>>  NewInstance
+    /// CHECK-DAG:                        ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                        ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      // A store of 'b' into a static will publish 'b'.
+      external = b;
+
+      // Do not remove any constructor fences above.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external3 = b;
+      external4 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external4);
+    }
+  }
+
+  static class Deopt implements Test {
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$Deopt.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                      Deoptimize
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$Deopt.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<NewInstance:l\d+>>   NewInstance
+    /// CHECK:                            ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG: <<NewInstance2:l\d+>>  NewInstance
+    /// CHECK-DAG:                        ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                        ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      // An array access generates a Deopt to avoid doing a bounds check.
+      array[0] = external;  // aput
+      array[1] = external;  // aput
+      array[2] = external;  // aput
+
+      // Do not remove any constructor fences above.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external3 = b;
+      external4 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("[Base(w0: 0, w1: 0, w2: 0, w3: 0),"
+              + "Base(w0: 0, w1: 0, w2: 0, w3: 0),"
+              + "Base(w0: 0, w1: 0, w2: 0, w3: 0)]",
+          array);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external4);
+    }
+  }
+
+  static class Select implements Test {
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$Select.exercise() constructor_fence_redundancy_elimination (before)
+    /// CHECK: <<NewInstance:l\d+>>     NewInstance
+    /// CHECK:                          ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                      Select
+    /// CHECK: <<NewInstance2:l\d+>>    NewInstance
+    /// CHECK-DAG:                      ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                      ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$Select.exercise() constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<NewInstance:l\d+>>   NewInstance
+    /// CHECK:                            ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG: <<NewInstance2:l\d+>>  NewInstance
+    /// CHECK-DAG:                        ConstructorFence [<<NewInstance2>>]
+    /// CHECK-NOT:                        ConstructorFence
+    @Override
+    public void exercise() {
+      Base b = new Base();
+
+      boolean localTest = test;
+      Object localExternal = external3;
+
+      // Selecting 'b' creates an alias, which we conservatively assume escapes immediately.
+      external = localTest ? b : localExternal;
+
+      // Do not remove any constructor fences above.
+      Base b2 = new Base();
+
+      // Do not LSE-eliminate b,b2
+      external3 = b;
+      external4 = b2;
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external3);
+      Assert.stringEquals("Base(w0: 0, w1: 0, w2: 0, w3: 0)", external4);
+    }
+  }
+
+  static class MakeBoundTypeTest implements Test {
+    public static Object makeBoundType;
+    public static Object makeBoundTypeSub;
+
+    @Override
+    public void exercise() {
+      // Note: MakeBoundType is special and we have to call the constructor directly
+      // to prevent inlining it.
+      try {
+        makeBoundType = exerciseNewInstance(MakeBoundType.class, 123);
+        makeBoundTypeSub = exerciseNewInstance(MakeBoundTypeSub.class, 123);
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    @Override
+    public void check() {
+      Assert.stringEquals(
+          "TestDontOptimizeAcrossEscape$MakeBoundTypeTest$MakeBoundType(abcdefgh: 123, x: 2)",
+          makeBoundType);
+      Assert.stringEquals(
+          "TestDontOptimizeAcrossEscape$MakeBoundTypeTest$MakeBoundTypeSub(abcdefgh: 123, x: 1)",
+          makeBoundTypeSub);
+    }
+
+    // Make a new instance of 'klass'.
+    private static <T> T exerciseNewInstance(Class<T> klass, int params) throws Exception {
+      return klass.cast(klass.getDeclaredConstructor(int.class).newInstance(params));
+    }
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$MakeBoundTypeTest$MakeBoundType.<init>(int) constructor_fence_redundancy_elimination (before)
+    /// CHECK-DAG: <<This:l\d+>>         ParameterValue
+    /// CHECK-DAG: <<NewInstance:l\d+>>  NewInstance
+    /// CHECK:                           ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                       BoundType
+    /// CHECK-DAG:                       ConstructorFence [<<This>>]
+    /// CHECK-NOT:                       ConstructorFence
+
+    /// CHECK-START: void TestDontOptimizeAcrossEscape$MakeBoundTypeTest$MakeBoundType.<init>(int) constructor_fence_redundancy_elimination (after)
+    /// CHECK-DAG: <<This:l\d+>>         ParameterValue
+    /// CHECK-DAG: <<NewInstance:l\d+>>  NewInstance
+    /// CHECK:                           ConstructorFence [<<NewInstance>>]
+    /// CHECK-DAG:                       BoundType
+    /// CHECK-DAG:                       ConstructorFence [<<This>>]
+    /// CHECK-NOT:                       ConstructorFence
+    static class MakeBoundType {
+      final int abcdefgh;
+      int x;
+
+      MakeBoundType(int param) {
+        abcdefgh = param;
+
+        Base b = new Base();
+        // constructor-fence(b)
+
+        if (this instanceof MakeBoundTypeSub) {
+          // Create a "BoundType(this)" which prevents
+          // a merged constructor-fence(this, b)
+          x = 1;
+        } else {
+          x = 2;
+        }
+
+        // publish(b).
+        external = b;
+
+        // constructor-fence(this)
+      }
+
+      @Override
+      public String toString() {
+        return getClass().getName() + "(" + baseString() + ")";
+      }
+
+      protected String baseString() {
+        return String.format("abcdefgh: %d, x: %d", abcdefgh, x);
+      }
+    }
+
+    static class MakeBoundTypeSub extends MakeBoundType {
+      MakeBoundTypeSub(int xyz) {
+        super(xyz);
+      }
+    }
+  }
+}
+
+public class Main {
+  public static void main(String[] args) throws Exception {
+    // Ensure that all of this code does not get optimized out into a no-op
+    // by actually running the code with reflection, then validating
+    // the result by asserting it against a string.
+    Class<? extends Test>[] testClasses = new Class[] {
+      TestOneFinal.class,
+      TestThreeFinal.class,
+      TestMultiAlloc.class,
+      TestThreeFinalTwice.class,
+      TestNonEscaping.Invoke.class,
+      TestNonEscaping.Store.class,
+      TestDontOptimizeAcrossBlocks.class,
+      TestDontOptimizeAcrossEscape.Invoke.class,
+      TestDontOptimizeAcrossEscape.StoreIput.class,
+      TestDontOptimizeAcrossEscape.StoreAput.class,
+      TestDontOptimizeAcrossEscape.StoreSput.class,
+      TestDontOptimizeAcrossEscape.Deopt.class,
+      TestDontOptimizeAcrossEscape.Select.class,
+      TestDontOptimizeAcrossEscape.MakeBoundTypeTest.class,
+    };
+
+    for (Class<? extends Test> klass : testClasses) {
+      exerciseTestClass(klass);
+    }
+  }
+
+  /**
+   * Invoke Test#exercise(), then Test#check().
+   * @throws AssertionError if test fails.
+   */
+  private static void exerciseTestClass(Class<? extends Test> klass) throws Exception {
+    Test instance = klass.cast(klass.getDeclaredConstructor().newInstance());
+
+    // Use reflection as a best-effort to avoid compiler optimizations (e.g. inlining).
+    instance.getClass().getDeclaredMethod("exercise").invoke(instance);
+    instance.getClass().getDeclaredMethod("check").invoke(instance);
+  }
+
+  // Print an object, with special handling for array and null.
+  public static String valueToString(Object val) {
+    if (val == null) {
+      return "<null>";
+    }
+    if (val.getClass().isArray()) {
+      String fmt = "[";
+      int length = Array.getLength(val);
+      for (int i = 0; i < length; ++i) {
+        Object arrayElement = Array.get(val, i);
+        fmt += valueToString(arrayElement);
+
+        if (i != length - 1) {
+          fmt += ",";
+        }
+      }
+      fmt += "]";
+
+      return fmt;
+    }
+
+    return val.toString();
+  }
+}
diff --git a/test/530-checker-lse/src/Main.java b/test/530-checker-lse/src/Main.java
index 6632503..7ae873a 100644
--- a/test/530-checker-lse/src/Main.java
+++ b/test/530-checker-lse/src/Main.java
@@ -881,10 +881,10 @@
   /// CHECK: ArrayGet
   private static int testAllocationEliminationOfArray2() {
     // Cannot eliminate array allocation since array is accessed with non-constant
-    // index.
-    int[] array = new int[4];
-    array[2] = 4;
-    array[3] = 7;
+    // index (only 3 elements to prevent vectorization of the reduction).
+    int[] array = new int[3];
+    array[1] = 4;
+    array[2] = 7;
     int sum = 0;
     for (int e : array) {
       sum += e;
diff --git a/test/597-deopt-busy-loop/expected.txt b/test/597-deopt-busy-loop/expected.txt
new file mode 100644
index 0000000..f993efc
--- /dev/null
+++ b/test/597-deopt-busy-loop/expected.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+Finishing
diff --git a/test/597-deopt-busy-loop/info.txt b/test/597-deopt-busy-loop/info.txt
new file mode 100644
index 0000000..2c50dbb
--- /dev/null
+++ b/test/597-deopt-busy-loop/info.txt
@@ -0,0 +1 @@
+Test deoptimizing when returning from suspend-check runtime method.
diff --git a/test/597-deopt-busy-loop/run b/test/597-deopt-busy-loop/run
new file mode 100644
index 0000000..bc04498
--- /dev/null
+++ b/test/597-deopt-busy-loop/run
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# We want to run in debuggable mode and compiled.
+exec ${RUN} --jit -Xcompiler-option --debuggable "${@}"
diff --git a/test/597-deopt-busy-loop/src/Main.java b/test/597-deopt-busy-loop/src/Main.java
new file mode 100644
index 0000000..46b6bbf
--- /dev/null
+++ b/test/597-deopt-busy-loop/src/Main.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main implements Runnable {
+    static final int numberOfThreads = 2;
+    volatile static boolean sExitFlag = false;
+    volatile static boolean sEntered = false;
+    int threadIndex;
+
+    private static native void deoptimizeAll();
+    private static native void assertIsInterpreted();
+    private static native void assertIsManaged();
+    private static native void ensureJitCompiled(Class<?> cls, String methodName);
+
+    Main(int index) {
+        threadIndex = index;
+    }
+
+    public static void main(String[] args) throws Exception {
+        System.loadLibrary(args[0]);
+
+        final Thread[] threads = new Thread[numberOfThreads];
+        for (int t = 0; t < threads.length; t++) {
+            threads[t] = new Thread(new Main(t));
+            threads[t].start();
+        }
+        for (Thread t : threads) {
+            t.join();
+        }
+        System.out.println("Finishing");
+    }
+
+    public void $noinline$busyLoop() {
+        assertIsManaged();
+        sEntered = true;
+        for (;;) {
+            if (sExitFlag) {
+                break;
+            }
+        }
+        assertIsInterpreted();
+    }
+
+    public void run() {
+        if (threadIndex == 0) {
+            while (!sEntered) {
+              Thread.yield();
+            }
+            deoptimizeAll();
+            sExitFlag = true;
+        } else {
+            ensureJitCompiled(Main.class, "$noinline$busyLoop");
+            $noinline$busyLoop();
+        }
+    }
+}
diff --git a/test/597-deopt-invoke-stub/expected.txt b/test/597-deopt-invoke-stub/expected.txt
new file mode 100644
index 0000000..f993efc
--- /dev/null
+++ b/test/597-deopt-invoke-stub/expected.txt
@@ -0,0 +1,2 @@
+JNI_OnLoad called
+Finishing
diff --git a/test/597-deopt-invoke-stub/info.txt b/test/597-deopt-invoke-stub/info.txt
new file mode 100644
index 0000000..31960a9
--- /dev/null
+++ b/test/597-deopt-invoke-stub/info.txt
@@ -0,0 +1 @@
+Test deoptimizing when returning from a quick-to-interpreter bridge.
diff --git a/test/597-deopt-invoke-stub/run b/test/597-deopt-invoke-stub/run
new file mode 100644
index 0000000..bc04498
--- /dev/null
+++ b/test/597-deopt-invoke-stub/run
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# We want to run in debuggable mode and compiled.
+exec ${RUN} --jit -Xcompiler-option --debuggable "${@}"
diff --git a/test/597-deopt-invoke-stub/src/Main.java b/test/597-deopt-invoke-stub/src/Main.java
new file mode 100644
index 0000000..0751783
--- /dev/null
+++ b/test/597-deopt-invoke-stub/src/Main.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main implements Runnable {
+    static final int numberOfThreads = 2;
+    volatile static boolean sExitFlag = false;
+    volatile static boolean sEntered = false;
+    int threadIndex;
+
+    private static native void deoptimizeAll();
+    private static native void assertIsInterpreted();
+    private static native void assertIsManaged();
+    private static native void ensureJitCompiled(Class<?> cls, String methodName);
+
+    Main(int index) {
+        threadIndex = index;
+    }
+
+    public static void main(String[] args) throws Exception {
+        System.loadLibrary(args[0]);
+
+        final Thread[] threads = new Thread[numberOfThreads];
+        for (int t = 0; t < threads.length; t++) {
+            threads[t] = new Thread(new Main(t));
+            threads[t].start();
+        }
+        for (Thread t : threads) {
+            t.join();
+        }
+        System.out.println("Finishing");
+    }
+
+    private static int $noinline$bar() {
+        // Should be entered via the quick-to-interpreter bridge.
+        assertIsInterpreted();
+        sEntered = true;
+        while (!sExitFlag) {}
+        assertIsInterpreted();
+        return 0x1234;
+    }
+
+    public void $noinline$foo() {
+        assertIsManaged();
+        if ($noinline$bar() != 0x1234) {
+            System.out.println("Bad return value");
+        }
+        assertIsInterpreted();
+    }
+
+    public void run() {
+        if (threadIndex == 0) {
+            while (!sEntered) {
+              Thread.yield();
+            }
+            deoptimizeAll();
+            sExitFlag = true;
+        } else {
+            ensureJitCompiled(Main.class, "$noinline$foo");
+            $noinline$foo();
+        }
+    }
+}
diff --git a/test/623-checker-loop-regressions/src/Main.java b/test/623-checker-loop-regressions/src/Main.java
index 056ed91..9229d81 100644
--- a/test/623-checker-loop-regressions/src/Main.java
+++ b/test/623-checker-loop-regressions/src/Main.java
@@ -473,6 +473,18 @@
     return y;
   }
 
+  // b/65478356: sum up 2-dim array.
+  static int sum(int[][] a) {
+    int sum = 0;
+    for (int y = 0; y < a.length; y++) {
+      int[] aa = a[y];
+      for (int x = 0; x < aa.length; x++) {
+        sum += aa[x];
+      }
+    }
+    return sum;
+  }
+
   public static void main(String[] args) {
     expectEquals(10, earlyExitFirst(-1));
     for (int i = 0; i <= 10; i++) {
@@ -613,6 +625,14 @@
     }
     expectEquals(2, verify);
 
+    int[][] x = new int[128][128];
+    for (int i = 0; i < 128; i++) {
+      for (int j = 0; j < 128; j++) {
+        x[i][j] = -i - j;
+      }
+    }
+    expectEquals(-2080768, sum(x));
+
     System.out.println("passed");
   }
 
diff --git a/test/661-checker-simd-reduc/src/Main.java b/test/661-checker-simd-reduc/src/Main.java
index 741b5fa..8208a9e 100644
--- a/test/661-checker-simd-reduc/src/Main.java
+++ b/test/661-checker-simd-reduc/src/Main.java
@@ -51,6 +51,26 @@
     return sum;
   }
 
+  /// CHECK-START: int Main.reductionInt(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Get>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.reductionInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionInt(int[] x) {
     int sum = 0;
     for (int i = 0; i < x.length; i++) {
@@ -59,6 +79,28 @@
     return sum;
   }
 
+  /// CHECK-START: long Main.reductionLong(long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Long0:j\d+>>  LongConstant 0                loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<Long0>>,{{j\d+}}]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Get>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.reductionLong(long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Long0:j\d+>>  LongConstant 0                loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Long0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:j\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static long reductionLong(long[] x) {
     long sum = 0;
     for (int i = 0; i < x.length; i++) {
@@ -67,6 +109,90 @@
     return sum;
   }
 
+  private static byte reductionByteM1(byte[] x) {
+    byte sum = -1;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static short reductionShortM1(short[] x) {
+    short sum = -1;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  private static char reductionCharM1(char[] x) {
+    char sum = 0xffff;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int Main.reductionIntM1(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<ConsM1:i\d+>> IntConstant -1                loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<ConsM1>>,{{i\d+}}]     loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Get>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.reductionIntM1(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<ConsM1:i\d+>> IntConstant -1                loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsM1>>]    loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  private static int reductionIntM1(int[] x) {
+    int sum = -1;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
+  /// CHECK-START: long Main.reductionLongM1(long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<LongM1:j\d+>> LongConstant -1               loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<LongM1>>,{{j\d+}}]     loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Get>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.reductionLongM1(long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<LongM1:j\d+>> LongConstant -1               loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<LongM1>>]    loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:j\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  private static long reductionLongM1(long[] x) {
+    long sum = -1L;
+    for (int i = 0; i < x.length; i++) {
+      sum += x[i];
+    }
+    return sum;
+  }
+
   private static byte reductionMinusByte(byte[] x) {
     byte sum = 0;
     for (int i = 0; i < x.length; i++) {
@@ -91,6 +217,26 @@
     return sum;
   }
 
+  /// CHECK-START: int Main.reductionMinusInt(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Sub [<<Phi2>>,<<Get>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.reductionMinusInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecSub [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionMinusInt(int[] x) {
     int sum = 0;
     for (int i = 0; i < x.length; i++) {
@@ -99,6 +245,28 @@
     return sum;
   }
 
+  /// CHECK-START: long Main.reductionMinusLong(long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Long0:j\d+>>  LongConstant 0                loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<Long0>>,{{j\d+}}]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:j\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Sub [<<Phi2>>,<<Get>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.reductionMinusLong(long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Long0:j\d+>>  LongConstant 0                loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Long0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecSub [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:j\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static long reductionMinusLong(long[] x) {
     long sum = 0;
     for (int i = 0; i < x.length; i++) {
@@ -131,6 +299,28 @@
     return min;
   }
 
+  /// CHECK-START: int Main.reductionMinInt(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant 2147483647        loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<ConsM>>,{{i\d+}}]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 InvokeStaticOrDirect [<<Phi2>>,<<Get>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.reductionMinInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant 2147483647        loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsM>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecMin [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionMinInt(int[] x) {
     int min = Integer.MAX_VALUE;
     for (int i = 0; i < x.length; i++) {
@@ -171,6 +361,28 @@
     return max;
   }
 
+  /// CHECK-START: int Main.reductionMaxInt(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant -2147483648       loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<ConsM>>,{{i\d+}}]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 InvokeStaticOrDirect [<<Phi2>>,<<Get>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.reductionMaxInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<ConsM:i\d+>>  IntConstant -2147483648       loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsM>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 VecMax [<<Phi2>>,<<Load>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionMaxInt(int[] x) {
     int max = Integer.MIN_VALUE;
     for (int i = 0; i < x.length; i++) {
@@ -253,6 +465,11 @@
     expectEquals(38070, reductionChar(xc));
     expectEquals(365750, reductionInt(xi));
     expectEquals(365750L, reductionLong(xl));
+    expectEquals(-75, reductionByteM1(xb));
+    expectEquals(-27467, reductionShortM1(xs));
+    expectEquals(38069, reductionCharM1(xc));
+    expectEquals(365749, reductionIntM1(xi));
+    expectEquals(365749L, reductionLongM1(xl));
     expectEquals(74, reductionMinusByte(xb));
     expectEquals(27466, reductionMinusShort(xs));
     expectEquals(27466, reductionMinusChar(xc));
diff --git a/test/665-checker-simd-zero/expected.txt b/test/665-checker-simd-zero/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/665-checker-simd-zero/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/665-checker-simd-zero/info.txt b/test/665-checker-simd-zero/info.txt
new file mode 100644
index 0000000..55eca88
--- /dev/null
+++ b/test/665-checker-simd-zero/info.txt
@@ -0,0 +1 @@
+Functional tests on zero-out SIMD vectorization.
diff --git a/test/665-checker-simd-zero/src/Main.java b/test/665-checker-simd-zero/src/Main.java
new file mode 100644
index 0000000..66eea64
--- /dev/null
+++ b/test/665-checker-simd-zero/src/Main.java
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for zero vectorization.
+ */
+public class Main {
+
+  /// CHECK-START: void Main.zeroz(boolean[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zeroz(boolean[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zeroz(boolean[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = false;
+    }
+  }
+
+  /// CHECK-START: void Main.zerob(byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zerob(byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zerob(byte[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  /// CHECK-START: void Main.zeroc(char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zeroc(char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zeroc(char[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  /// CHECK-START: void Main.zeros(short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zeros(short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zeros(short[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  /// CHECK-START: void Main.zeroi(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zeroi(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zeroi(int[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  /// CHECK-START: void Main.zerol(long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:j\d+>> LongConstant 0                       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zerol(long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:j\d+>> LongConstant 0                       loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zerol(long[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  /// CHECK-START: void Main.zerof(float[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:f\d+>> FloatConstant 0                      loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zerof(float[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:f\d+>> FloatConstant 0                      loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zerof(float[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  /// CHECK-START: void Main.zerod(double[]) loop_optimization (before)
+  /// CHECK-DAG: <<Zero:d\d+>> DoubleConstant 0                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.zerod(double[]) loop_optimization (after)
+  /// CHECK-DAG: <<Zero:d\d+>> DoubleConstant 0                     loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
+  private static void zerod(double[] x) {
+    for (int i = 0; i < x.length; i++) {
+      x[i] = 0;
+    }
+  }
+
+  public static void main(String[] args) {
+    int total = 1111;
+
+    boolean[] xz = new boolean[total];
+    byte[]    xb = new byte[total];
+    char[]    xc = new char[total];
+    short[]   xs = new short[total];
+    int[]     xi = new int[total];
+    long[]    xl = new long[total];
+    float[]   xf = new float[total];
+    double[]  xd = new double[total];
+
+    for (int i = 0; i < total; i++) {
+      xz[i] = true;
+      xb[i] = 1;
+      xc[i] = 1;
+      xs[i] = 1;
+      xi[i] = 1;
+      xl[i] = 1;
+      xf[i] = 1;
+      xd[i] = 1;
+    }
+
+    for (int i = 0; i < total; i++) {
+      expectEquals(true, xz[i]);
+      expectEquals(1, xb[i]);
+      expectEquals(1, xc[i]);
+      expectEquals(1, xs[i]);
+      expectEquals(1, xi[i]);
+      expectEquals(1, xl[i]);
+      expectEquals(1, xf[i]);
+      expectEquals(1, xd[i]);
+    }
+
+    zeroz(xz);
+    zerob(xb);
+    zeroc(xc);
+    zeros(xs);
+    zeroi(xi);
+    zerol(xl);
+    zerof(xf);
+    zerod(xd);
+
+    for (int i = 0; i < total; i++) {
+      expectEquals(false, xz[i]);
+      expectEquals(0, xb[i]);
+      expectEquals(0, xc[i]);
+      expectEquals(0, xs[i]);
+      expectEquals(0, xi[i]);
+      expectEquals(0, xl[i]);
+      expectEquals(0, xf[i]);
+      expectEquals(0, xd[i]);
+    }
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(boolean expected, boolean result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(float expected, float result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(double expected, double result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/Android.bp b/test/Android.bp
index 2a88af1..2f23056 100644
--- a/test/Android.bp
+++ b/test/Android.bp
@@ -250,6 +250,7 @@
         "ti-agent/common_helper.cc",
         "ti-agent/frame_pop_helper.cc",
         "ti-agent/locals_helper.cc",
+        "ti-agent/monitors_helper.cc",
         "ti-agent/redefinition_helper.cc",
         "ti-agent/suspension_helper.cc",
         "ti-agent/stack_trace_helper.cc",
@@ -299,7 +300,8 @@
         "1922-owned-monitors-info/owned_monitors.cc",
         "1924-frame-pop-toggle/frame_pop_toggle.cc",
         "1926-missed-frame-pop/frame_pop_missed.cc",
-        "1927-exception-event/exception_event.cc"
+        "1927-exception-event/exception_event.cc",
+        "1930-monitor-info/monitor.cc",
     ],
     shared_libs: [
         "libbase",
@@ -349,6 +351,7 @@
     ],
     shared_libs: [
         "libbase",
+        "slicer",
     ],
     header_libs: ["libopenjdkjvmti_headers"],
 }
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index 90e2600..c16c487 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -422,10 +422,7 @@
   if [[ "$JVMTI_REDEFINE_STRESS" = "y" ]]; then
     # We really cannot do this on RI so don't both passing it in that case.
     if [[ "$USE_JVM" = "n" ]]; then
-      file_1=$(mktemp --tmpdir=${DEX_LOCATION})
-      file_2=$(mktemp --tmpdir=${DEX_LOCATION})
-      # TODO Remove need for DEXTER_BINARY!
-      agent_args="${agent_args},redefine,${DEXTER_BINARY},${file_1},${file_2}"
+      agent_args="${agent_args},redefine"
     fi
   fi
   if [[ "$JVMTI_FIELD_STRESS" = "y" ]]; then
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 315476a..84758c9 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -216,6 +216,18 @@
                         "suppressed when tracing."]
     },
     {
+        "tests": "597-deopt-busy-loop",
+        "variant": "interp-ac | interpreter | trace | stream",
+        "description": ["This test expects JIT compilation, which is",
+                        "suppressed when tracing."]
+    },
+    {
+        "tests": "597-deopt-invoke-stub",
+        "variant": "interp-ac | interpreter | optimizing | trace | stream",
+        "description": ["This test expects JIT compilation and no AOT for",
+                        "testing deoptimizing at quick-to-interpreter bridge."]
+    },
+    {
         "tests": "137-cfi",
         "description": ["CFI unwinding expects managed frames, and the test",
                         "does not iterate enough to even compile. JIT also",
diff --git a/test/ti-agent/jvmti_helper.cc b/test/ti-agent/jvmti_helper.cc
index 7280102..c290e9b 100644
--- a/test/ti-agent/jvmti_helper.cc
+++ b/test/ti-agent/jvmti_helper.cc
@@ -50,7 +50,7 @@
     .can_get_synthetic_attribute                     = 1,
     .can_get_owned_monitor_info                      = 0,
     .can_get_current_contended_monitor               = 0,
-    .can_get_monitor_info                            = 0,
+    .can_get_monitor_info                            = 1,
     .can_pop_frame                                   = 0,
     .can_redefine_classes                            = 1,
     .can_signal_thread                               = 0,
diff --git a/test/ti-agent/monitors_helper.cc b/test/ti-agent/monitors_helper.cc
new file mode 100644
index 0000000..7c28ede
--- /dev/null
+++ b/test/ti-agent/monitors_helper.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "jni.h"
+#include "jvmti.h"
+#include <vector>
+#include "jvmti_helper.h"
+#include "jni_helper.h"
+#include "test_env.h"
+#include "scoped_local_ref.h"
+namespace art {
+namespace common_monitors {
+
+extern "C" JNIEXPORT jobject JNICALL Java_art_Monitors_getObjectMonitorUsage(
+    JNIEnv* env, jclass, jobject obj) {
+  ScopedLocalRef<jclass> klass(env, env->FindClass("art/Monitors$MonitorUsage"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jmethodID constructor = env->GetMethodID(
+      klass.get(),
+      "<init>",
+      "(Ljava/lang/Object;Ljava/lang/Thread;I[Ljava/lang/Thread;[Ljava/lang/Thread;)V");
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jvmtiMonitorUsage usage;
+  if (JvmtiErrorToException(env, jvmti_env, jvmti_env->GetObjectMonitorUsage(obj, &usage))) {
+    return nullptr;
+  }
+  jobjectArray wait = CreateObjectArray(env, usage.waiter_count, "java/lang/Thread",
+                                        [&](jint i) { return usage.waiters[i]; });
+  if (env->ExceptionCheck()) {
+    jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(usage.waiters));
+    jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(usage.notify_waiters));
+    return nullptr;
+  }
+  jobjectArray notify_wait = CreateObjectArray(env, usage.notify_waiter_count, "java/lang/Thread",
+                                               [&](jint i) { return usage.notify_waiters[i]; });
+  if (env->ExceptionCheck()) {
+    jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(usage.waiters));
+    jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(usage.notify_waiters));
+    return nullptr;
+  }
+  return env->NewObject(klass.get(), constructor,
+                        obj, usage.owner, usage.entry_count, wait, notify_wait);
+}
+
+}  // namespace common_monitors
+}  // namespace art
diff --git a/test/ti-stress/stress.cc b/test/ti-stress/stress.cc
index 5d7c2f3..6e29e36 100644
--- a/test/ti-stress/stress.cc
+++ b/test/ti-stress/stress.cc
@@ -28,15 +28,31 @@
 #include "jvmti.h"
 #include "utils.h"
 
+#pragma clang diagnostic push
+// slicer defines its own CHECK. b/65422458
+#pragma push_macro("CHECK")
+#undef CHECK
+
+// Slicer's headers have code that triggers these warnings. b/65298177
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#include "code_ir.h"
+#include "control_flow_graph.h"
+#include "dex_ir.h"
+#include "dex_ir_builder.h"
+#include "instrumentation.h"
+#include "reader.h"
+#include "writer.h"
+
+#pragma pop_macro("CHECK")
+#pragma clang diagnostic pop
+
 namespace art {
 
 // Should we do a 'full_rewrite' with this test?
 static constexpr bool kDoFullRewrite = true;
 
 struct StressData {
-  std::string dexter_cmd;
-  std::string out_temp_dex;
-  std::string in_temp_dex;
   bool vm_class_loader_initialized;
   bool trace_stress;
   bool redefine_stress;
@@ -44,51 +60,60 @@
   bool step_stress;
 };
 
-static void WriteToFile(const std::string& fname, jint data_len, const unsigned char* data) {
-  std::ofstream file(fname, std::ios::binary | std::ios::out | std::ios::trunc);
-  file.write(reinterpret_cast<const char*>(data), data_len);
-  file.flush();
-}
-
-static bool ReadIntoBuffer(const std::string& fname, /*out*/std::vector<unsigned char>* data) {
-  std::ifstream file(fname, std::ios::binary | std::ios::in);
-  file.seekg(0, std::ios::end);
-  size_t len = file.tellg();
-  data->resize(len);
-  file.seekg(0);
-  file.read(reinterpret_cast<char*>(data->data()), len);
-  return len != 0;
-}
-
-// TODO rewrite later.
-static bool DoExtractClassFromData(StressData* data,
-                                   const std::string& class_name,
+static bool DoExtractClassFromData(jvmtiEnv* env,
+                                   const std::string& descriptor,
                                    jint in_len,
                                    const unsigned char* in_data,
-                                   /*out*/std::vector<unsigned char>* dex) {
-  // Write the dex file into a temporary file.
-  WriteToFile(data->in_temp_dex, in_len, in_data);
-  // Clear out file so even if something suppresses the exit value we will still detect dexter
-  // failure.
-  WriteToFile(data->out_temp_dex, 0, nullptr);
-  // Have dexter do the extraction.
-  std::vector<std::string> args;
-  args.push_back(data->dexter_cmd);
-  if (kDoFullRewrite) {
-    args.push_back("-x");
-    args.push_back("full_rewrite");
-  }
-  args.push_back("-e");
-  args.push_back(class_name);
-  args.push_back("-o");
-  args.push_back(data->out_temp_dex);
-  args.push_back(data->in_temp_dex);
-  std::string error;
-  if (ExecAndReturnCode(args, &error) != 0) {
-    LOG(ERROR) << "unable to execute dexter: " << error;
+                                   /*out*/jint* out_len,
+                                   /*out*/unsigned char** out_data) {
+  dex::Reader reader(in_data, in_len);
+  dex::u4 class_idx = reader.FindClassIndex(descriptor.c_str());
+  if (class_idx != dex::kNoIndex) {
+    reader.CreateClassIr(class_idx);
+  } else {
+    LOG(ERROR) << "ERROR: Can't find class " << descriptor;
     return false;
   }
-  return ReadIntoBuffer(data->out_temp_dex, dex);
+  auto dex_ir = reader.GetIr();
+
+  if (kDoFullRewrite) {
+    for (auto& ir_method : dex_ir->encoded_methods) {
+      if (ir_method->code != nullptr) {
+        lir::CodeIr code_ir(ir_method.get(), dex_ir);
+        lir::ControlFlowGraph cfg_compact(&code_ir, false);
+        lir::ControlFlowGraph cfg_verbose(&code_ir, true);
+        code_ir.Assemble();
+      }
+    }
+  }
+  dex::Writer writer(dex_ir);
+
+  struct Allocator : public dex::Writer::Allocator {
+    explicit Allocator(jvmtiEnv* jvmti_env) : jvmti_env_(jvmti_env) {}
+    virtual void* Allocate(size_t size) {
+      unsigned char* out = nullptr;
+      if (JVMTI_ERROR_NONE != jvmti_env_->Allocate(size, &out)) {
+        return nullptr;
+      } else {
+        return out;
+      }
+    }
+    virtual void Free(void* ptr) {
+      jvmti_env_->Deallocate(reinterpret_cast<unsigned char*>(ptr));
+    }
+   private:
+    jvmtiEnv* jvmti_env_;
+  };
+  Allocator alloc(env);
+  size_t res_len;
+  unsigned char* res = writer.CreateImage(&alloc, &res_len);
+  if (res != nullptr) {
+    *out_data = res;
+    *out_len = res_len;
+    return true;
+  } else {
+    return false;
+  }
 }
 
 class ScopedThreadInfo {
@@ -615,10 +640,10 @@
                                          jint* new_class_data_len,
                                          unsigned char** new_class_data) {
   std::vector<unsigned char> out;
-  std::string name_str(name);
-  // Make the jvmti semi-descriptor into the java style descriptor (though with $ for inner
-  // classes).
-  std::replace(name_str.begin(), name_str.end(), '/', '.');
+  // Make the jvmti semi-descriptor into the full descriptor.
+  std::string name_str("L");
+  name_str += name;
+  name_str += ";";
   StressData* data = nullptr;
   CHECK_EQ(jvmti->GetEnvironmentLocalStorage(reinterpret_cast<void**>(&data)),
            JVMTI_ERROR_NONE);
@@ -626,15 +651,11 @@
     LOG(WARNING) << "Ignoring load of class " << name << " because VMClassLoader is not yet "
                  << "initialized. Transforming this class could cause spurious test failures.";
     return;
-  } else if (DoExtractClassFromData(data, name_str, class_data_len, class_data, /*out*/ &out)) {
+  } else if (DoExtractClassFromData(jvmti, name_str, class_data_len, class_data,
+                                    /*out*/ new_class_data_len, /*out*/ new_class_data)) {
     LOG(INFO) << "Extracted class: " << name;
-    unsigned char* new_data;
-    CHECK_EQ(JVMTI_ERROR_NONE, jvmti->Allocate(out.size(), &new_data));
-    memcpy(new_data, out.data(), out.size());
-    *new_class_data_len = static_cast<jint>(out.size());
-    *new_class_data = new_data;
   } else {
-    std::cerr << "Unable to extract class " << name_str << std::endl;
+    std::cerr << "Unable to extract class " << name << std::endl;
     *new_class_data_len = 0;
     *new_class_data = nullptr;
   }
@@ -653,7 +674,7 @@
 }
 
 // Options are
-// jvmti-stress,[redefine,${DEXTER_BINARY},${TEMP_FILE_1},${TEMP_FILE_2},][trace,][field]
+// jvmti-stress,[redefine,][trace,][field]
 static void ReadOptions(StressData* data, char* options) {
   std::string ops(options);
   CHECK_EQ(GetOption(ops), "jvmti-stress") << "Options should start with jvmti-stress";
@@ -668,12 +689,6 @@
       data->field_stress = true;
     } else if (cur == "redefine") {
       data->redefine_stress = true;
-      ops = AdvanceOption(ops);
-      data->dexter_cmd = GetOption(ops);
-      ops = AdvanceOption(ops);
-      data->in_temp_dex = GetOption(ops);
-      ops = AdvanceOption(ops);
-      data->out_temp_dex = GetOption(ops);
     } else {
       LOG(FATAL) << "Unknown option: " << GetOption(ops);
     }
diff --git a/tools/libcore_gcstress_debug_failures.txt b/tools/libcore_gcstress_debug_failures.txt
index 5806b61..d27b8fc 100644
--- a/tools/libcore_gcstress_debug_failures.txt
+++ b/tools/libcore_gcstress_debug_failures.txt
@@ -11,9 +11,11 @@
   names: ["jsr166.CompletableFutureTest#testCompleteOnTimeout_completed",
           "libcore.icu.TransliteratorTest#testAll",
           "libcore.icu.RelativeDateTimeFormatterTest#test_bug25821045",
+          "libcore.icu.RelativeDateTimeFormatterTest#test_bug25883157",
           "libcore.java.lang.ref.ReferenceQueueTest#testRemoveWithDelayedResultAndTimeout",
           "libcore.java.lang.ref.ReferenceQueueTest#testRemoveWithDelayedResultAndNoTimeout",
           "libcore.java.util.TimeZoneTest#testSetDefaultDeadlock",
+          "libcore.javax.crypto.CipherBasicsTest#testBasicEncryption",
           "org.apache.harmony.tests.java.util.TimerTest#testThrowingTaskKillsTimerThread"]
 }
 ]