Revert "Revert "Implement on-stack replacement for arm/arm64/x86/x86_64.""

This reverts commit bd89a5c556324062b7d841843b039392e84cfaf4.

Change-Id: I08d190431520baa7fcec8fbdb444519f25ac8d44
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 631b784..b3a2979 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -429,6 +429,56 @@
 END art_quick_invoke_stub_internal
 
     /*
+     * On stack replacement stub.
+     * On entry:
+     *   r0 = stack to copy
+     *   r1 = size of stack
+     *   r2 = pc to call
+     *   r3 = JValue* result
+     *   [sp] = shorty
+     *   [sp + 4] = thread
+     */
+ENTRY art_quick_osr_stub
+    SPILL_ALL_CALLEE_SAVE_GPRS             @ Spill regs (9), so the stack arguments move up by 36 bytes
+    mov    r11, sp                         @ Save the stack pointer
+    mov    r10, r1                         @ Save size of stack
+    ldr    r9, [r11, #40]                  @ Move managed thread pointer into r9
+    mov    r8, r2                          @ Save the pc to call
+    sub    r7, sp, #12                     @ Reserve space for stack pointer, JValue result, and ArtMethod* slot
+    and    r7, #0xFFFFFFF0                 @ Align stack pointer
+    mov    sp, r7                          @ Update stack pointer
+    str    r11, [sp, #4]                   @ Save old stack pointer
+    str    r3, [sp, #8]                    @ Save JValue result
+    mov    ip, #0
+    str    ip, [sp]                        @ Store null for ArtMethod* at bottom of frame
+    sub    sp, sp, r1                      @ Reserve space for callee stack
+    mov    r2, r1                          @ memcpy length = size of stack
+    mov    r1, r0                          @ memcpy source = stack to copy
+    mov    r0, sp                          @ memcpy destination = reserved callee frame
+    bl     memcpy                          @ memcpy (dest r0, src r1, bytes r2)
+    bl     .Losr_entry                     @ Call the method
+    ldr    r11, [sp, #4]                   @ Restore saved stack pointer
+    ldr    r10, [sp, #8]                   @ Restore JValue result
+    mov    sp, r11                         @ Restore stack pointer.
+    ldr    r4, [sp, #36]                   @ load shorty
+    ldrb   r4, [r4, #0]                    @ load return type: first shorty char only (byte load, not word)
+    cmp    r4, #68                         @ Test if result type char == 'D'.
+    beq    .Losr_fp_result
+    cmp    r4, #70                         @ Test if result type char == 'F'.
+    beq    .Losr_fp_result
+    strd r0, [r10]                         @ Store r0/r1 into result pointer
+    b    .Losr_exit
+.Losr_fp_result:
+    vstr d0, [r10]                         @ Store s0-s1/d0 into result pointer
+.Losr_exit:
+    pop    {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+.Losr_entry:
+    sub r10, r10, #4                      @ r10 = offset of the last word of the copied frame
+    str lr, [sp, r10]                     @ Store link register per the compiler ABI
+    bx r8
+END art_quick_osr_stub
+
+    /*
      * On entry r0 is uint32_t* gprs_ and r1 is uint32_t* fprs_
      */
 ARM_ENTRY art_quick_do_long_jump
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 9ccabad..e848008 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -915,6 +915,105 @@
 
 
 
+/*  extern"C" void art_quick_osr_stub(void** stack,                x0
+ *                                    size_t stack_size_in_bytes,  x1
+ *                                    const uint8_t* native_pc,    x2
+ *                                    JValue *result,              x3
+ *                                    char   *shorty,              x4
+ *                                    Thread *self)                x5
+ */
+ENTRY art_quick_osr_stub
+SAVE_SIZE=15*8   // x3, x4, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, SP, LR, FP saved.
+    mov x9, sp                             // Save stack pointer.
+    .cfi_register sp,x9
+
+    sub x10, sp, # SAVE_SIZE               // Reserve the save area below the incoming SP.
+    and x10, x10, # ~0xf                   // Enforce 16 byte stack alignment.
+    mov sp, x10                            // Set new SP.
+
+    str x28, [sp, #112]                   // Save x28; x20-x27 follow in pairs.
+    stp x26, x27, [sp, #96]
+    stp x24, x25, [sp, #80]
+    stp x22, x23, [sp, #64]
+    stp x20, x21, [sp, #48]
+    stp x9, x19, [sp, #32]                // Save old stack pointer and x19.
+    stp x3, x4, [sp, #16]                 // Save result and shorty addresses.
+    stp xFP, xLR, [sp]                    // Store LR & FP.
+    mov xSELF, x5                         // Move thread pointer into SELF register.
+
+    sub sp, sp, #16                       // 16 bytes for the ArtMethod* slot keeps SP 16-byte aligned.
+    str xzr, [sp]                         // Store null for ArtMethod* slot
+    // Branch to stub.
+    bl .Losr_entry
+    add sp, sp, #16                       // Drop the ArtMethod* slot reserved above.
+
+    // Restore return value address and shorty address.
+    ldp x3,x4, [sp, #16]
+    ldr x28, [sp, #112]
+    ldp x26, x27, [sp, #96]
+    ldp x24, x25, [sp, #80]
+    ldp x22, x23, [sp, #64]
+    ldp x20, x21, [sp, #48]
+
+    // Store result (w0/x0/s0/d0) appropriately, depending on resultType.
+    ldrb w10, [x4]                        // w10 = first character of the shorty (the return type).
+
+    // Check the return type and store the correct register into the jvalue in memory.
+
+    // Don't set anything for a void type.
+    cmp w10, #'V'
+    beq .Losr_exit
+
+    // Is it a double?
+    cmp w10, #'D'
+    bne .Lno_double
+    str d0, [x3]
+    b .Losr_exit
+
+.Lno_double:  // Is it a float?
+    cmp w10, #'F'
+    bne .Lno_float
+    str s0, [x3]
+    b .Losr_exit
+
+.Lno_float:  // Just store x0. Doesn't matter if it is 64 or 32 bits.
+    str x0, [x3]
+
+.Losr_exit:  // Finish up.
+    ldp x2, x19, [sp, #32]   // Restore stack pointer and x19.
+    ldp xFP, xLR, [sp]    // Restore old frame pointer and link register.
+    mov sp, x2
+    ret
+
+.Losr_entry:
+    // Update stack pointer for the callee
+    sub sp, sp, x1                        // NOTE(review): reads all 64 bits of x1, but jit.cc declares the size as uint32_t; confirm the upper half is zero.
+
+    // Update link register slot expected by the callee.
+    sub w1, w1, #8                        // w1 = offset of the frame's last 8-byte slot.
+    str lr, [sp, x1]                      // The copy loop below stops short of this slot.
+
+    // Copy arguments into stack frame.
+    // Use simple copy routine for now.
+    // 4 bytes per slot.
+    // X0 - source address
+    // W1 - args length
+    // SP - destination address.
+    // W10 - temporary
+.Losr_loop_entry:
+    cmp w1, #0
+    beq .Losr_loop_exit
+    sub w1, w1, #4                        // Copy from the highest offset down to offset 0.
+    ldr w10, [x0, x1]
+    str w10, [sp, x1]
+    b .Losr_loop_entry
+
+.Losr_loop_exit:
+    // Branch to the OSR entry point.
+    br x2
+
+END art_quick_osr_stub
+
     /*
      * On entry x0 is uintptr_t* gprs_ and x1 is uint64_t* fprs_
      */
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index da30331..fbee5d7 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1712,5 +1712,65 @@
     ret
 END_FUNCTION art_quick_read_barrier_for_root_slow
 
+  /*
+     * On stack replacement stub.
+     * On entry:
+     *   [sp] = return address
+     *   [sp + 4] = stack to copy
+     *   [sp + 8] = size of stack
+     *   [sp + 12] = pc to call
+     *   [sp + 16] = JValue* result
+     *   [sp + 20] = shorty
+     *   [sp + 24] = thread
+     */
+DEFINE_FUNCTION art_quick_osr_stub
+    // Save native callee saves (4 pushes = 16 bytes, hence the +16 in the argument offsets below).
+    PUSH ebp
+    PUSH ebx
+    PUSH esi
+    PUSH edi
+    mov 4+16(%esp), %esi           // ESI = stack to copy (source for rep movsb)
+    mov 8+16(%esp), %ecx           // ECX = size of stack (byte count for rep movsb)
+    mov 12+16(%esp), %ebx          // EBX = pc to call
+    mov %esp, %ebp                 // Save stack pointer
+    andl LITERAL(0xFFFFFFF0), %esp // Align stack
+    PUSH ebp                       // Save old stack pointer
+    subl LITERAL(12), %esp         // Pad: 4 (pushed ebp) + 12 = 16 bytes, stack stays aligned
+    movl LITERAL(0), (%esp)        // Store null for ArtMethod* slot
+    call .Losr_entry
+
+    // Restore stack pointer.
+    addl LITERAL(12), %esp
+    POP ebp
+    mov %ebp, %esp
+
+    // Restore callee saves.
+    POP edi
+    POP esi
+    POP ebx
+    POP ebp
+    mov 16(%esp), %ecx            // Get JValue result
+    mov %eax, (%ecx)              // Store the result assuming it is a long, int or Object*
+    mov %edx, 4(%ecx)             // Store the other half of the result
+    mov 20(%esp), %edx            // Get the shorty
+    cmpb LITERAL(68), (%edx)      // Test if result type char == 'D'
+    je .Losr_return_double_quick
+    cmpb LITERAL(70), (%edx)      // Test if result type char == 'F'
+    je .Losr_return_float_quick
+    ret
+.Losr_return_double_quick:
+    movsd %xmm0, (%ecx)           // Overwrite with the double result (8 bytes)
+    ret
+.Losr_return_float_quick:
+    movss %xmm0, (%ecx)           // Overwrite the low 4 bytes with the float result
+    ret
+.Losr_entry:
+    subl LITERAL(4), %ecx         // The given size includes the 4-byte slot `call` just pushed; subtract it.
+    subl %ecx, %esp
+    mov %esp, %edi                // EDI = beginning of stack
+    rep movsb                     // while (ecx--) { *edi++ = *esi++ }
+    jmp *%ebx
+END_FUNCTION art_quick_osr_stub
+
     // TODO: implement these!
 UNIMPLEMENTED art_quick_memcmp16
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 883da96..d6e0f1c 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1744,3 +1744,62 @@
     RESTORE_FP_CALLEE_SAVE_FRAME
     ret
 END_FUNCTION art_quick_read_barrier_for_root_slow
+
+    /*
+     * On stack replacement stub.
+     * On entry:
+     *   [sp] = return address
+     *   rdi = stack to copy
+     *   rsi = size of stack
+     *   rdx = pc to call
+     *   rcx = JValue* result
+     *   r8 = shorty
+     *   r9 = thread
+     */
+DEFINE_FUNCTION art_quick_osr_stub
+    // Save rbp plus the two arguments needed again after the call.
+    PUSH rbp                      // Save rbp.
+    PUSH rcx                      // Save rcx/result*.
+    PUSH r8                       // Save r8/shorty*.
+
+    // Save callee saves.
+    PUSH rbx
+    PUSH r12
+    PUSH r13
+    PUSH r14
+    PUSH r15
+
+    pushq LITERAL(0)              // Push null for ArtMethod*.
+    movl %esi, %ecx               // rcx := size of stack (32-bit move zero-extends into rcx)
+    movq %rdi, %rsi               // rsi := stack to copy
+    call .Losr_entry
+
+    // Restore stack and callee-saves.
+    addq LITERAL(8), %rsp         // Drop the null ArtMethod* slot pushed above.
+    POP r15
+    POP r14
+    POP r13
+    POP r12
+    POP rbx
+    POP r8
+    POP rcx
+    POP rbp
+    cmpb LITERAL(68), (%r8)        // Test if result type char == 'D'.
+    je .Losr_return_double_quick
+    cmpb LITERAL(70), (%r8)        // Test if result type char == 'F'.
+    je .Losr_return_float_quick
+    movq %rax, (%rcx)              // Store the result assuming it is a long, int or Object*
+    ret
+.Losr_return_double_quick:
+    movsd %xmm0, (%rcx)            // Store the double floating point result.
+    ret
+.Losr_return_float_quick:
+    movss %xmm0, (%rcx)            // Store the floating point result.
+    ret
+.Losr_entry:
+    subl LITERAL(8), %ecx         // The given size includes the 8-byte slot `call` just pushed; subtract it.
+    subq %rcx, %rsp
+    movq %rsp, %rdi               // rdi := beginning of stack
+    rep movsb                     // while (rcx--) { *rdi++ = *rsi++ }
+    jmp *%rdx
+END_FUNCTION art_quick_osr_stub
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index 6f36016..cd38e16 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -292,22 +292,7 @@
         // Unusual case where we were running generated code and an
         // exception was thrown to force the activations to be removed from the
         // stack. Continue execution in the interpreter.
-        self->ClearException();
-        ShadowFrame* shadow_frame =
-            self->PopStackedShadowFrame(StackedShadowFrameType::kDeoptimizationShadowFrame);
-        mirror::Throwable* pending_exception = nullptr;
-        bool from_code = false;
-        self->PopDeoptimizationContext(result, &pending_exception, &from_code);
-        CHECK(!from_code);
-        self->SetTopOfStack(nullptr);
-        self->SetTopOfShadowStack(shadow_frame);
-
-        // Restore the exception that was pending before deoptimization then interpret the
-        // deoptimized frames.
-        if (pending_exception != nullptr) {
-          self->SetException(pending_exception);
-        }
-        interpreter::EnterInterpreterFromDeoptimize(self, shadow_frame, from_code, result);
+        self->DeoptimizeWithDeoptimizationException(result);
       }
       if (kLogInvocationStartAndReturn) {
         LOG(INFO) << StringPrintf("Returned '%s' quick code=%p", PrettyMethod(this).c_str(),
diff --git a/runtime/entrypoints/entrypoint_utils.cc b/runtime/entrypoints/entrypoint_utils.cc
index b5a55bf..3dfad76 100644
--- a/runtime/entrypoints/entrypoint_utils.cc
+++ b/runtime/entrypoints/entrypoint_utils.cc
@@ -273,15 +273,15 @@
     if (outer_method != nullptr) {
       const OatQuickMethodHeader* current_code = outer_method->GetOatQuickMethodHeader(caller_pc);
       if (current_code->IsOptimized()) {
-          uintptr_t native_pc_offset = current_code->NativeQuickPcOffset(caller_pc);
-          CodeInfo code_info = current_code->GetOptimizedCodeInfo();
-          StackMapEncoding encoding = code_info.ExtractEncoding();
-          StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset, encoding);
-          DCHECK(stack_map.IsValid());
-          if (stack_map.HasInlineInfo(encoding)) {
-            InlineInfo inline_info = code_info.GetInlineInfoOf(stack_map, encoding);
-            caller = GetResolvedMethod(outer_method, inline_info, inline_info.GetDepth() - 1);
-          }
+        uintptr_t native_pc_offset = current_code->NativeQuickPcOffset(caller_pc);
+        CodeInfo code_info = current_code->GetOptimizedCodeInfo();
+        StackMapEncoding encoding = code_info.ExtractEncoding();
+        StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset, encoding);
+        DCHECK(stack_map.IsValid());
+        if (stack_map.HasInlineInfo(encoding)) {
+          InlineInfo inline_info = code_info.GetInlineInfoOf(stack_map, encoding);
+          caller = GetResolvedMethod(outer_method, inline_info, inline_info.GetDepth() - 1);
+        }
       }
     }
     if (kIsDebugBuild && do_caller_check) {
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 940d344..ca8598e 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -21,6 +21,7 @@
 #include "base/stl_util.h"  // MakeUnique
 #include "experimental_flags.h"
 #include "interpreter_common.h"
+#include "jit/jit.h"
 #include "safe_math.h"
 
 #include <memory>  // std::unique_ptr
@@ -63,10 +64,15 @@
   currentHandlersTable = handlersTable[ \
       Runtime::Current()->GetInstrumentation()->GetInterpreterHandlerTable()]
 
-#define BRANCH_INSTRUMENTATION(offset) \
-  do { \
+#define BRANCH_INSTRUMENTATION(offset)                                                            \
+  do {                                                                                            \
+    ArtMethod* method = shadow_frame.GetMethod();                                                 \
     instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation(); \
-    instrumentation->Branch(self, shadow_frame.GetMethod(), dex_pc, offset); \
+    instrumentation->Branch(self, method, dex_pc, offset);                                        \
+    JValue result;                                                                                \
+    if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {             \
+      return result;                                                                              \
+    }                                                                                             \
   } while (false)
 
 #define UNREACHABLE_CODE_CHECK()                \
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index f606978..25dbab2 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -17,6 +17,7 @@
 #include "base/stl_util.h"  // MakeUnique
 #include "experimental_flags.h"
 #include "interpreter_common.h"
+#include "jit/jit.h"
 #include "safe_math.h"
 
 #include <memory>  // std::unique_ptr
@@ -69,9 +70,14 @@
     }                                                                                           \
   } while (false)
 
-#define BRANCH_INSTRUMENTATION(offset) \
-  do { \
-    instrumentation->Branch(self, shadow_frame.GetMethod(), dex_pc, offset); \
+#define BRANCH_INSTRUMENTATION(offset)                                                         \
+  do {                                                                                         \
+    ArtMethod* method = shadow_frame.GetMethod();                                              \
+    instrumentation->Branch(self, method, dex_pc, offset);                                     \
+    JValue result;                                                                             \
+    if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {          \
+      return result;                                                                           \
+    }                                                                                          \
   } while (false)
 
 static bool IsExperimentalInstructionEnabled(const Instruction *inst) {
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index fa5c41d..3e152e1 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -25,10 +25,12 @@
 #include "jit_code_cache.h"
 #include "jit_instrumentation.h"
 #include "oat_file_manager.h"
+#include "oat_quick_method_header.h"
 #include "offline_profiling_info.h"
 #include "profile_saver.h"
 #include "runtime.h"
 #include "runtime_options.h"
+#include "stack_map.h"
 #include "utils.h"
 
 namespace art {
@@ -43,6 +45,8 @@
       options.GetOrDefault(RuntimeArgumentMap::JITCodeCacheMaxCapacity);
   jit_options->compile_threshold_ =
       options.GetOrDefault(RuntimeArgumentMap::JITCompileThreshold);
+  // TODO(ngeoffray): Make this a proper option.
+  jit_options->osr_threshold_ = jit_options->compile_threshold_ * 2;
   jit_options->warmup_threshold_ =
       options.GetOrDefault(RuntimeArgumentMap::JITWarmupThreshold);
   jit_options->dump_info_on_shutdown_ =
@@ -121,7 +125,7 @@
     *error_msg = "JIT couldn't find jit_unload entry point";
     return false;
   }
-  jit_compile_method_ = reinterpret_cast<bool (*)(void*, ArtMethod*, Thread*)>(
+  jit_compile_method_ = reinterpret_cast<bool (*)(void*, ArtMethod*, Thread*, bool)>(
       dlsym(jit_library_handle_, "jit_compile_method"));
   if (jit_compile_method_ == nullptr) {
     dlclose(jit_library_handle_);
@@ -156,7 +160,7 @@
   return true;
 }
 
-bool Jit::CompileMethod(ArtMethod* method, Thread* self) {
+bool Jit::CompileMethod(ArtMethod* method, Thread* self, bool osr) {
   DCHECK(!method->IsRuntimeMethod());
   // Don't compile the method if it has breakpoints.
   if (Dbg::IsDebuggerActive() && Dbg::MethodHasAnyBreakpoints(method)) {
@@ -171,10 +175,11 @@
     return false;
   }
 
-  if (!code_cache_->NotifyCompilationOf(method, self)) {
+  if (!code_cache_->NotifyCompilationOf(method, self, osr)) {
+    VLOG(jit) << "JIT not compiling " << PrettyMethod(method) << " due to code cache";
     return false;
   }
-  bool success = jit_compile_method_(jit_compiler_handle_, method, self);
+  bool success = jit_compile_method_(jit_compiler_handle_, method, self, osr);
   code_cache_->DoneCompiling(method, self);
   return success;
 }
@@ -224,9 +229,11 @@
   }
 }
 
-void Jit::CreateInstrumentationCache(size_t compile_threshold, size_t warmup_threshold) {
+void Jit::CreateInstrumentationCache(size_t compile_threshold,
+                                     size_t warmup_threshold,
+                                     size_t osr_threshold) {
   instrumentation_cache_.reset(
-      new jit::JitInstrumentationCache(compile_threshold, warmup_threshold));
+      new jit::JitInstrumentationCache(compile_threshold, warmup_threshold, osr_threshold));
 }
 
 void Jit::NewTypeLoadedIfUsingJit(mirror::Class* type) {
@@ -255,5 +262,120 @@
   }
 }
 
+extern "C" void art_quick_osr_stub(void** stack,
+                                   uint32_t stack_size_in_bytes,
+                                   const uint8_t* native_pc,
+                                   JValue* result,
+                                   const char* shorty,
+                                   Thread* self);
+
+bool Jit::MaybeDoOnStackReplacement(Thread* thread,
+                                    ArtMethod* method,
+                                    uint32_t dex_pc,
+                                    int32_t dex_pc_offset,
+                                    JValue* result) {
+  Jit* jit = Runtime::Current()->GetJit();
+  if (jit == nullptr) {
+    return false;
+  }
+
+  if (kRuntimeISA == kMips || kRuntimeISA == kMips64) {
+    VLOG(jit) << "OSR not supported on this platform";
+    return false;
+  }
+
+  // Cheap check if the method has been compiled already. That's an indicator that we should
+  // osr into it.
+  if (!jit->GetCodeCache()->ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
+    return false;
+  }
+
+  const OatQuickMethodHeader* osr_method = jit->GetCodeCache()->LookupOsrMethodHeader(method);
+  if (osr_method == nullptr) {
+    // No osr method yet, just return to the interpreter.
+    return false;
+  }
+
+  const size_t number_of_vregs = method->GetCodeItem()->registers_size_;
+  CodeInfo code_info = osr_method->GetOptimizedCodeInfo();
+  StackMapEncoding encoding = code_info.ExtractEncoding();
+
+  // Find stack map starting at the target dex_pc.
+  StackMap stack_map = code_info.GetOsrStackMapForDexPc(dex_pc + dex_pc_offset, encoding);
+  if (!stack_map.IsValid()) {
+    // There is no OSR stack map for this dex pc offset. Just return to the interpreter in the
+    // hope that the next branch has one.
+    return false;
+  }
+
+  // We found a stack map, now fill the frame with dex register values from the interpreter's
+  // shadow frame.
+  DexRegisterMap vreg_map =
+      code_info.GetDexRegisterMapOf(stack_map, encoding, number_of_vregs);
+
+  ShadowFrame* shadow_frame = thread->PopShadowFrame();  // Pushed back once the OSR code returns.
+
+  size_t frame_size = osr_method->GetFrameSizeInBytes();
+  void** memory = reinterpret_cast<void**>(calloc(1, frame_size));  // Zero-filled frame image.
+  CHECK(memory != nullptr) << "OSR frame allocation of " << frame_size << " bytes failed";
+
+  // Art ABI: ArtMethod is at the bottom of the stack.
+  memory[0] = method;
+
+  if (!vreg_map.IsValid()) {
+    // If we don't have a dex register map, then there are no live dex registers at
+    // this dex pc.
+  } else {
+    for (uint16_t vreg = 0; vreg < number_of_vregs; ++vreg) {
+      DexRegisterLocation::Kind location =
+          vreg_map.GetLocationKind(vreg, number_of_vregs, code_info, encoding);
+      if (location == DexRegisterLocation::Kind::kNone) {
+        // Dex register is dead or uninitialized.
+        continue;
+      }
+
+      if (location == DexRegisterLocation::Kind::kConstant) {
+        // We skip constants because the compiled code knows how to handle them.
+        continue;
+      }
+
+      DCHECK(location == DexRegisterLocation::Kind::kInStack);
+
+      int32_t vreg_value = shadow_frame->GetVReg(vreg);
+      int32_t slot_offset = vreg_map.GetStackOffsetInBytes(vreg,
+                                                           number_of_vregs,
+                                                           code_info,
+                                                           encoding);
+      DCHECK_LT(slot_offset, static_cast<int32_t>(frame_size));
+      DCHECK_GT(slot_offset, 0);
+      (reinterpret_cast<int32_t*>(memory))[slot_offset / sizeof(int32_t)] = vreg_value;
+    }
+  }
+
+  const uint8_t* native_pc = stack_map.GetNativePcOffset(encoding) + osr_method->GetEntryPoint();
+  VLOG(jit) << "Jumping to "
+            << PrettyMethod(method)
+            << "@"
+            << std::hex << reinterpret_cast<uintptr_t>(native_pc);
+  {
+    ManagedStack fragment;
+    thread->PushManagedStackFragment(&fragment);
+    (*art_quick_osr_stub)(memory,
+                          frame_size,
+                          native_pc,
+                          result,
+                          method->GetInterfaceMethodIfProxy(sizeof(void*))->GetShorty(),
+                          thread);
+    if (UNLIKELY(thread->GetException() == Thread::GetDeoptimizationException())) {
+      thread->DeoptimizeWithDeoptimizationException(result);
+    }
+    thread->PopManagedStackFragment(fragment);
+  }
+  free(memory);  // The stub copied this image onto the native stack before running the code.
+  thread->PushShadowFrame(shadow_frame);
+  VLOG(jit) << "Done running OSR code for " << PrettyMethod(method);
+  return true;
+}
+
 }  // namespace jit
 }  // namespace art
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index a80f51f..042da92 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -49,9 +49,11 @@
 
   virtual ~Jit();
   static Jit* Create(JitOptions* options, std::string* error_msg);
-  bool CompileMethod(ArtMethod* method, Thread* self)
+  bool CompileMethod(ArtMethod* method, Thread* self, bool osr)
       SHARED_REQUIRES(Locks::mutator_lock_);
-  void CreateInstrumentationCache(size_t compile_threshold, size_t warmup_threshold);
+  void CreateInstrumentationCache(size_t compile_threshold,
+                                  size_t warmup_threshold,
+                                  size_t osr_threshold);
   void CreateThreadPool();
   CompilerCallbacks* GetCompilerCallbacks() {
     return compiler_callbacks_;
@@ -88,6 +90,17 @@
 
   bool JitAtFirstUse();
 
+  // If an OSR compiled version is available for `method`,
+  // and `dex_pc + dex_pc_offset` is an entry point of that compiled
+  // version, this method will jump to the compiled code, let it run,
+  // and return true afterwards. Return false otherwise.
+  static bool MaybeDoOnStackReplacement(Thread* thread,
+                                        ArtMethod* method,
+                                        uint32_t dex_pc,
+                                        int32_t dex_pc_offset,
+                                        JValue* result)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
  private:
   Jit();
   bool LoadCompiler(std::string* error_msg);
@@ -97,7 +110,7 @@
   void* jit_compiler_handle_;
   void* (*jit_load_)(CompilerCallbacks**, bool*);
   void (*jit_unload_)(void*);
-  bool (*jit_compile_method_)(void*, ArtMethod*, Thread*);
+  bool (*jit_compile_method_)(void*, ArtMethod*, Thread*, bool);
   void (*jit_types_loaded_)(void*, mirror::Class**, size_t count);
 
   // Performance monitoring.
@@ -123,6 +136,9 @@
   size_t GetWarmupThreshold() const {
     return warmup_threshold_;
   }
+  size_t GetOsrThreshold() const {
+    return osr_threshold_;  // jit.cc currently sets this to compile_threshold_ * 2.
+  }
   size_t GetCodeCacheInitialCapacity() const {
     return code_cache_initial_capacity_;
   }
@@ -155,6 +171,7 @@
   size_t code_cache_max_capacity_;
   size_t compile_threshold_;
   size_t warmup_threshold_;
+  size_t osr_threshold_;
   bool dump_info_on_shutdown_;
   bool save_profiling_info_;
 
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index f325949..464c441 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -184,7 +184,8 @@
                                   size_t core_spill_mask,
                                   size_t fp_spill_mask,
                                   const uint8_t* code,
-                                  size_t code_size) {
+                                  size_t code_size,
+                                  bool osr) {
   uint8_t* result = CommitCodeInternal(self,
                                        method,
                                        mapping_table,
@@ -194,7 +195,8 @@
                                        core_spill_mask,
                                        fp_spill_mask,
                                        code,
-                                       code_size);
+                                       code_size,
+                                       osr);
   if (result == nullptr) {
     // Retry.
     GarbageCollectCache(self);
@@ -207,7 +209,8 @@
                                 core_spill_mask,
                                 fp_spill_mask,
                                 code,
-                                code_size);
+                                code_size,
+                                osr);
   }
   return result;
 }
@@ -287,7 +290,8 @@
                                           size_t core_spill_mask,
                                           size_t fp_spill_mask,
                                           const uint8_t* code,
-                                          size_t code_size) {
+                                          size_t code_size,
+                                          bool osr) {
   size_t alignment = GetInstructionSetAlignment(kRuntimeISA);
   // Ensure the header ends up at expected instruction alignment.
   size_t header_size = RoundUp(sizeof(OatQuickMethodHeader), alignment);
@@ -329,8 +333,12 @@
   {
     MutexLock mu(self, lock_);
     method_code_map_.Put(code_ptr, method);
-    Runtime::Current()->GetInstrumentation()->UpdateMethodsCode(
-        method, method_header->GetEntryPoint());
+    if (osr) {
+      osr_code_map_.Put(method, code_ptr);
+    } else {
+      Runtime::Current()->GetInstrumentation()->UpdateMethodsCode(
+          method, method_header->GetEntryPoint());
+    }
     if (collection_in_progress_) {
       // We need to update the live bitmap if there is a GC to ensure it sees this new
       // code.
@@ -338,7 +346,7 @@
     }
     last_update_time_ns_.StoreRelease(NanoTime());
     VLOG(jit)
-        << "JIT added "
+        << "JIT added (osr = " << std::boolalpha << osr << std::noboolalpha << ") "
         << PrettyMethod(method) << "@" << method
         << " ccache_size=" << PrettySize(CodeCacheSizeLocked()) << ": "
         << " dcache_size=" << PrettySize(DataCacheSizeLocked()) << ": "
@@ -569,6 +577,10 @@
         info->GetMethod()->SetProfilingInfo(nullptr);
       }
     }
+
+    // Empty osr method map, as osr compiled code will be deleted (except the ones
+    // on thread stacks).
+    osr_code_map_.clear();
   }
 
   // Run a checkpoint on all threads to mark the JIT compiled code they are running.
@@ -662,6 +674,15 @@
   return method_header;
 }
 
+OatQuickMethodHeader* JitCodeCache::LookupOsrMethodHeader(ArtMethod* method) {
+  // Header of the OSR code recorded for `method`, or null when none is registered.
+  MutexLock mu(Thread::Current(), lock_);  // osr_code_map_ is GUARDED_BY(lock_).
+  const auto entry = osr_code_map_.find(method);
+  return (entry != osr_code_map_.end())
+      ? OatQuickMethodHeader::FromCodePointer(entry->second)
+      : nullptr;
+}
+
 ProfilingInfo* JitCodeCache::AddProfilingInfo(Thread* self,
                                               ArtMethod* method,
                                               const std::vector<uint32_t>& entries,
@@ -733,12 +754,15 @@
   return last_update_time_ns_.LoadAcquire();
 }
 
-bool JitCodeCache::NotifyCompilationOf(ArtMethod* method, Thread* self) {
-  if (ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
+bool JitCodeCache::NotifyCompilationOf(ArtMethod* method, Thread* self, bool osr) {
+  if (!osr && ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
     return false;
   }
 
   MutexLock mu(self, lock_);
+  if (osr && (osr_code_map_.find(method) != osr_code_map_.end())) {
+    return false;
+  }
   ProfilingInfo* info = method->GetProfilingInfo(sizeof(void*));
   if (info == nullptr || info->IsMethodBeingCompiled()) {
     return false;
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index 69fc553..048f8d0 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -71,7 +71,7 @@
   // Number of compilations done throughout the lifetime of the JIT.
   size_t NumberOfCompilations() REQUIRES(!lock_);
 
-  bool NotifyCompilationOf(ArtMethod* method, Thread* self)
+  bool NotifyCompilationOf(ArtMethod* method, Thread* self, bool osr)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!lock_);
 
@@ -89,7 +89,8 @@
                       size_t core_spill_mask,
                       size_t fp_spill_mask,
                       const uint8_t* code,
-                      size_t code_size)
+                      size_t code_size,
+                      bool osr)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!lock_);
 
@@ -131,6 +132,10 @@
       REQUIRES(!lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  OatQuickMethodHeader* LookupOsrMethodHeader(ArtMethod* method)
+      REQUIRES(!lock_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   // Remove all methods in our cache that were allocated by 'alloc'.
   void RemoveMethodsIn(Thread* self, const LinearAlloc& alloc)
       REQUIRES(!lock_)
@@ -187,7 +192,8 @@
                               size_t core_spill_mask,
                               size_t fp_spill_mask,
                               const uint8_t* code,
-                              size_t code_size)
+                              size_t code_size,
+                              bool osr)
       REQUIRES(!lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
@@ -237,8 +243,10 @@
   void* data_mspace_ GUARDED_BY(lock_);
   // Bitmap for collecting code and data.
   std::unique_ptr<CodeCacheBitmap> live_bitmap_;
-  // This map holds compiled code associated to the ArtMethod.
+  // Holds compiled code associated to the ArtMethod.
   SafeMap<const void*, ArtMethod*> method_code_map_ GUARDED_BY(lock_);
+  // Holds osr compiled code associated to the ArtMethod.
+  SafeMap<ArtMethod*, const void*> osr_code_map_ GUARDED_BY(lock_);
   // ProfilingInfo objects we have allocated.
   std::vector<ProfilingInfo*> profiling_infos_ GUARDED_BY(lock_);
 
diff --git a/runtime/jit/jit_instrumentation.cc b/runtime/jit/jit_instrumentation.cc
index d597b36..a4e40ad 100644
--- a/runtime/jit/jit_instrumentation.cc
+++ b/runtime/jit/jit_instrumentation.cc
@@ -29,7 +29,8 @@
  public:
   enum TaskKind {
     kAllocateProfile,
-    kCompile
+    kCompile,     // Run() calls CompileMethod() with osr == false.
+    kCompileOsr   // Run() calls CompileMethod() with osr == true.
   };
 
   JitCompileTask(ArtMethod* method, TaskKind kind) : method_(method), kind_(kind) {
@@ -48,9 +49,14 @@
     ScopedObjectAccess soa(self);
     if (kind_ == kCompile) {
       VLOG(jit) << "JitCompileTask compiling method " << PrettyMethod(method_);
-      if (!Runtime::Current()->GetJit()->CompileMethod(method_, self)) {
+      if (!Runtime::Current()->GetJit()->CompileMethod(method_, self, /* osr */ false)) {
         VLOG(jit) << "Failed to compile method " << PrettyMethod(method_);
       }
+    } else if (kind_ == kCompileOsr) {
+      VLOG(jit) << "JitCompileTask compiling method osr " << PrettyMethod(method_);
+      if (!Runtime::Current()->GetJit()->CompileMethod(method_, self, /* osr */ true)) {
+        VLOG(jit) << "Failed to compile method osr " << PrettyMethod(method_);
+      }
     } else {
       DCHECK(kind_ == kAllocateProfile);
       if (ProfilingInfo::Create(self, method_, /* retry_allocation */ true)) {
@@ -72,9 +78,11 @@
 };
 
 JitInstrumentationCache::JitInstrumentationCache(size_t hot_method_threshold,
-                                                 size_t warm_method_threshold)
+                                                 size_t warm_method_threshold,
+                                                 size_t osr_method_threshold)
     : hot_method_threshold_(hot_method_threshold),
       warm_method_threshold_(warm_method_threshold),
+      osr_method_threshold_(osr_method_threshold),  // Sample count that queues a kCompileOsr task.
       listener_(this) {
 }
 
@@ -151,6 +159,11 @@
     DCHECK(thread_pool_ != nullptr);
     thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompile));
   }
+
+  if (sample_count == osr_method_threshold_) {
+    DCHECK(thread_pool_ != nullptr);
+    thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompileOsr));
+  }
 }
 
 JitInstrumentationListener::JitInstrumentationListener(JitInstrumentationCache* cache)
diff --git a/runtime/jit/jit_instrumentation.h b/runtime/jit/jit_instrumentation.h
index 06559ad..d1c5c44 100644
--- a/runtime/jit/jit_instrumentation.h
+++ b/runtime/jit/jit_instrumentation.h
@@ -96,7 +96,9 @@
 // Keeps track of which methods are hot.
 class JitInstrumentationCache {
  public:
-  JitInstrumentationCache(size_t hot_method_threshold, size_t warm_method_threshold);
+  JitInstrumentationCache(size_t hot_method_threshold,
+                          size_t warm_method_threshold,
+                          size_t osr_method_threshold);
   void AddSamples(Thread* self, ArtMethod* method, size_t samples)
       SHARED_REQUIRES(Locks::mutator_lock_);
   void CreateThreadPool();
@@ -112,6 +114,7 @@
  private:
   size_t hot_method_threshold_;
   size_t warm_method_threshold_;
+  size_t osr_method_threshold_;
   JitInstrumentationListener listener_;
   std::unique_ptr<ThreadPool> thread_pool_;
 
diff --git a/runtime/oat_quick_method_header.h b/runtime/oat_quick_method_header.h
index 5643739..2b7eca2 100644
--- a/runtime/oat_quick_method_header.h
+++ b/runtime/oat_quick_method_header.h
@@ -108,7 +108,7 @@
   }
 
   template <bool kCheckFrameSize = true>
-  uint32_t GetFrameSizeInBytes() {
+  uint32_t GetFrameSizeInBytes() const {
     uint32_t result = frame_info_.FrameSizeInBytes();
     if (kCheckFrameSize) {
       DCHECK_LE(static_cast<size_t>(kStackAlignment), result);
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index b1b7473..1b59c6f 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1887,7 +1887,8 @@
   if (jit_.get() != nullptr) {
     compiler_callbacks_ = jit_->GetCompilerCallbacks();
     jit_->CreateInstrumentationCache(jit_options_->GetCompileThreshold(),
-                                     jit_options_->GetWarmupThreshold());
+                                     jit_options_->GetWarmupThreshold(),
+                                     jit_options_->GetOsrThreshold());
     jit_->CreateThreadPool();
 
     // Notify native debugger about the classes already loaded before the creation of the jit.
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 5faff93..1e82860 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -40,7 +40,7 @@
 
 namespace art {
 
-static constexpr bool kDebugStackWalk = false;
+static constexpr bool kDebugStackWalk = false;  // Keep off by default; enable locally to debug.
 
 mirror::Object* ShadowFrame::GetThisObject() const {
   ArtMethod* m = GetMethod();
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 84185ce..97eb805 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -1195,6 +1195,35 @@
     return StackMap();
   }
 
+  StackMap GetOsrStackMapForDexPc(uint32_t dex_pc, const StackMapEncoding& encoding) const {
+    // Returns the first stack map of a pair of consecutive stack maps that share both
+    // `dex_pc` and native pc offset (such a pair marks a stack map suitable for OSR), or a
+    // default-constructed StackMap when no such pair exists.
+    const size_t num_stack_maps = GetNumberOfStackMaps();
+    // Starting at index 1 means zero or one stack maps yield no pair, with no unsigned wrap.
+    for (size_t next = 1; next < num_stack_maps; ++next) {
+      StackMap stack_map = GetStackMapAt(next - 1, encoding);
+      if (stack_map.GetDexPc(encoding) != dex_pc) {
+        continue;
+      }
+      StackMap other = GetStackMapAt(next, encoding);
+      if (other.GetDexPc(encoding) != dex_pc ||
+          other.GetNativePcOffset(encoding) != stack_map.GetNativePcOffset(encoding)) {
+        continue;
+      }
+      DCHECK_EQ(other.GetDexRegisterMapOffset(encoding),
+                stack_map.GetDexRegisterMapOffset(encoding));
+      DCHECK(!stack_map.HasInlineInfo(encoding));
+      if (next + 1 < num_stack_maps) {
+        // Make sure there are not three identical stack maps following each other.
+        DCHECK_NE(stack_map.GetNativePcOffset(encoding),
+                  GetStackMapAt(next + 1, encoding).GetNativePcOffset(encoding));
+      }
+      return stack_map;
+    }
+    return StackMap();
+  }
+
   StackMap GetStackMapForNativePcOffset(uint32_t native_pc_offset,
                                         const StackMapEncoding& encoding) const {
     // TODO: Safepoint stack maps are sorted by native_pc_offset but catch stack
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 2abcd67..c0fb0cd 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3012,4 +3012,25 @@
   return count;
 }
 
+// Requires the pending exception to be the deoptimization one; interprets the deoptimized frames.
+void Thread::DeoptimizeWithDeoptimizationException(JValue* result) {
+  DCHECK_EQ(GetException(), Thread::GetDeoptimizationException());
+  ClearException();
+  ShadowFrame* shadow_frame =
+      PopStackedShadowFrame(StackedShadowFrameType::kDeoptimizationShadowFrame);
+  mirror::Throwable* pending_exception = nullptr;
+  bool from_code = false;  // Passed by address, like pending_exception, to the call below.
+  PopDeoptimizationContext(result, &pending_exception, &from_code);
+  CHECK(!from_code) << "Deoptimizing from code should be done with single frame deoptimization";
+  SetTopOfStack(nullptr);
+  SetTopOfShadowStack(shadow_frame);
+
+  // Restore the exception (if any) that was pending before deoptimization, then interpret
+  // the deoptimized frames starting from `shadow_frame`, passing `result` along.
+  if (pending_exception != nullptr) {
+    SetException(pending_exception);
+  }
+  interpreter::EnterInterpreterFromDeoptimize(this, shadow_frame, from_code, result);
+}
+
 }  // namespace art
diff --git a/runtime/thread.h b/runtime/thread.h
index d7887ca..0660cd7 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -552,6 +552,9 @@
         OFFSETOF_MEMBER(tls_32bit_sized_values, is_gc_marking));
   }
 
+  // Deoptimize the Java stack.
+  void DeoptimizeWithDeoptimizationException(JValue* result) SHARED_REQUIRES(Locks::mutator_lock_);
+
  private:
   template<size_t pointer_size>
   static ThreadOffset<pointer_size> ThreadOffsetFromTlsPtr(size_t tls_ptr_offset) {