am c26b4512: Merge "Revert "Revert "Fix deoptimization with pending exception"""

* commit 'c26b4512a01d46756683a4f5e186a0b7f397f251':
  Revert "Revert "Fix deoptimization with pending exception""
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index be9af98..1599025 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -167,7 +167,8 @@
   qpoints->pThrowNullPointer = art_quick_throw_null_pointer_exception;
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index d6396c1..e45d828 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1141,6 +1141,17 @@
 END art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME r0, r1
+    mov    r0, r9                         @ Set up args.
+    blx    artDeoptimizeFromCompiledCode  @ artDeoptimizeFromCompiledCode(Thread*)
+END art_quick_deoptimize_from_compiled_code
+
+    /*
      * Signed 64-bit integer multiply.
      *
      * Consider WXxYZ (r1r0 x r3r2) with a long multiply:
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 0f06727..e9c816f 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -150,8 +150,8 @@
   qpoints->pThrowNullPointer = art_quick_throw_null_pointer_exception;
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
-  // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index bfef0fa..169bc38 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1739,6 +1739,18 @@
     brk 0
 END art_quick_deoptimize
 
+    /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    mov    x0, xSELF                      // Pass thread.
+    bl     artDeoptimizeFromCompiledCode  // artDeoptimizeFromCompiledCode(Thread*)
+    brk 0
+END art_quick_deoptimize_from_compiled_code
+
 
     /*
      * String's indexOf.
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index 4e4b91f..6721e54 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -267,8 +267,8 @@
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
   static_assert(!IsDirectEntrypoint(kQuickThrowStackOverflow), "Non-direct C stub marked direct.");
 
-  // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
   static_assert(!IsDirectEntrypoint(kQuickDeoptimize), "Non-direct C stub marked direct.");
 
   // Atomic 64-bit load/store
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index cb49cf5..ba58c3f 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1542,6 +1542,18 @@
 END art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    jal      artDeoptimizeFromCompiledCode  # artDeoptimizeFromCompiledCode(Thread*)
+                                            # Returns caller method's frame size.
+    move     $a0, rSELF                     # pass Thread::current
+END art_quick_deoptimize_from_compiled_code
+
+    /*
      * Long integer shift.  This is different from the generic 32/64-bit
      * binary operations because vAA/vBB are 64-bit but vCC (the shift
      * distance) is 32-bit.  Also, Dalvik requires us to ignore all but the low
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index ec02d5a..9f1f0e0 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -176,8 +176,8 @@
   qpoints->pThrowNullPointer = art_quick_throw_null_pointer_exception;
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
-  // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // TODO - use lld/scd instructions for Mips64
   // Atomic 64-bit load/store
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 4bc049c..1b50b2e 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1603,5 +1603,17 @@
     move     $a0, rSELF        # pass Thread::current
 END art_quick_deoptimize
 
+    /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    jal      artDeoptimizeFromCompiledCode    # artDeoptimizeFromCompiledCode(Thread*, SP)
+                                              # Returns caller method's frame size.
+    move     $a0, rSELF                       # pass Thread::current
+END art_quick_deoptimize_from_compiled_code
+
 UNIMPLEMENTED art_quick_indexof
 UNIMPLEMENTED art_quick_string_compareto
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index e2632c1..10fc281 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -140,7 +140,7 @@
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
   // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_slow_path;
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 9b2d59d..029a296 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1684,9 +1684,6 @@
      */
 DEFINE_FUNCTION art_quick_deoptimize
     PUSH ebx                      // Entry point for a jump. Fake that we were called.
-.globl SYMBOL(art_quick_deoptimize_from_compiled_slow_path)  // Entry point for real calls
-                                                             // from compiled slow paths.
-SYMBOL(art_quick_deoptimize_from_compiled_slow_path):
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME ebx, ebx
     subl LITERAL(12), %esp        // Align stack.
     CFI_ADJUST_CFA_OFFSET(12)
@@ -1697,6 +1694,20 @@
 END_FUNCTION art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME ebx, ebx
+    subl LITERAL(12), %esp                      // Align stack.
+    CFI_ADJUST_CFA_OFFSET(12)
+    pushl %fs:THREAD_SELF_OFFSET                // Pass Thread::Current().
+    CFI_ADJUST_CFA_OFFSET(4)
+    call SYMBOL(artDeoptimizeFromCompiledCode)  // artDeoptimizeFromCompiledCode(Thread*)
+    UNREACHABLE
+END_FUNCTION art_quick_deoptimize_from_compiled_code
+
+    /*
      * String's compareTo.
      *
      * On entry:
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index ef1bb5f..5cc72e3 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -144,7 +144,7 @@
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
   // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_slow_path;
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 88270d9..1498a4b 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1728,9 +1728,6 @@
 DEFINE_FUNCTION art_quick_deoptimize
     pushq %rsi                     // Entry point for a jump. Fake that we were called.
                                    // Use hidden arg.
-.globl SYMBOL(art_quick_deoptimize_from_compiled_slow_path)  // Entry point for real calls
-                                                             // from compiled slow paths.
-SYMBOL(art_quick_deoptimize_from_compiled_slow_path):
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
                                    // Stack should be aligned now.
     movq %gs:THREAD_SELF_OFFSET, %rdi         // Pass Thread.
@@ -1739,6 +1736,18 @@
 END_FUNCTION art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+                                                // Stack should be aligned now.
+    movq %gs:THREAD_SELF_OFFSET, %rdi           // Pass Thread.
+    call SYMBOL(artDeoptimizeFromCompiledCode)  // artDeoptimizeFromCompiledCode(Thread*)
+    UNREACHABLE
+END_FUNCTION art_quick_deoptimize_from_compiled_code
+
+    /*
      * String's compareTo.
      *
      * On entry:
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index 56f7b35..e46402d 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -427,9 +427,16 @@
         self->ClearException();
         ShadowFrame* shadow_frame =
             self->PopStackedShadowFrame(StackedShadowFrameType::kDeoptimizationShadowFrame);
-        result->SetJ(self->PopDeoptimizationReturnValue().GetJ());
+        mirror::Throwable* pending_exception = nullptr;
+        self->PopDeoptimizationContext(result, &pending_exception);
         self->SetTopOfStack(nullptr);
         self->SetTopOfShadowStack(shadow_frame);
+
+        // Restore the exception that was pending before deoptimization then interpret the
+        // deoptimized frames.
+        if (pending_exception != nullptr) {
+          self->SetException(pending_exception);
+        }
         interpreter::EnterInterpreterFromDeoptimize(self, shadow_frame, result);
       }
       if (kLogInvocationStartAndReturn) {
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 084c88e..5c1922e 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -89,7 +89,7 @@
             art::Thread::ThinLockIdOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offset of field Thread::tlsPtr_.card_table.
-#define THREAD_CARD_TABLE_OFFSET 136
+#define THREAD_CARD_TABLE_OFFSET 128
 ADD_TEST_EQ(THREAD_CARD_TABLE_OFFSET,
             art::Thread::CardTableOffset<__SIZEOF_POINTER__>().Int32Value())
 
diff --git a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
index a4feac1..d749664 100644
--- a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
@@ -28,17 +28,30 @@
 
 namespace art {
 
-extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
-  ScopedQuickEntrypointChecks sqec(self);
-
+NO_RETURN static void artDeoptimizeImpl(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
   if (VLOG_IS_ON(deopt)) {
     LOG(INFO) << "Deopting:";
     self->Dump(LOG(INFO));
   }
 
-  self->PushAndClearDeoptimizationReturnValue();
+  self->AssertHasDeoptimizationContext();
   self->SetException(Thread::GetDeoptimizationException());
   self->QuickDeliverException();
 }
 
+extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
+  ScopedQuickEntrypointChecks sqec(self);
+  artDeoptimizeImpl(self);
+}
+
+extern "C" NO_RETURN void artDeoptimizeFromCompiledCode(Thread* self)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  ScopedQuickEntrypointChecks sqec(self);
+  // Before deoptimizing to interpreter, we must push the deoptimization context.
+  JValue return_value;
+  return_value.SetJ(0);  // we never deoptimize from compiled code with an invoke result.
+  self->PushDeoptimizationContext(return_value, false, self->GetException());
+  artDeoptimizeImpl(self);
+}
+
 }  // namespace art
diff --git a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
index ad5ee84..8e660a2 100644
--- a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
@@ -51,6 +51,9 @@
                                                               uint64_t gpr_result,
                                                               uint64_t fpr_result)
     SHARED_REQUIRES(Locks::mutator_lock_) {
+  // Instrumentation exit stub must not be entered with a pending exception.
+  CHECK(!self->IsExceptionPending()) << "Enter instrumentation exit stub with pending exception "
+                                     << self->GetException()->Dump();
   // Compute address of return PC and sanity check that it currently holds 0.
   size_t return_pc_offset = GetCalleeSaveReturnPcOffset(kRuntimeISA, Runtime::kRefsOnly);
   uintptr_t* return_pc = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(sp) +
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index aa35ec1..0c7caf3 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -688,8 +688,12 @@
     // Request a stack deoptimization if needed
     ArtMethod* caller = QuickArgumentVisitor::GetCallingMethod(sp);
     if (UNLIKELY(Dbg::IsForcedInterpreterNeededForUpcall(self, caller))) {
+      // Push the context of the deoptimization stack so we can restore the return value and the
+      // exception before executing the deoptimized frames.
+      self->PushDeoptimizationContext(result, shorty[0] == 'L', self->GetException());
+
+      // Set special exception to cause deoptimization.
       self->SetException(Thread::GetDeoptimizationException());
-      self->SetDeoptimizationReturnValue(result, shorty[0] == 'L');
     }
 
     // No need to restore the args since the method has already been run by the interpreter.
diff --git a/runtime/entrypoints/runtime_asm_entrypoints.h b/runtime/entrypoints/runtime_asm_entrypoints.h
index 8209dc8..2842c5a 100644
--- a/runtime/entrypoints/runtime_asm_entrypoints.h
+++ b/runtime/entrypoints/runtime_asm_entrypoints.h
@@ -70,7 +70,8 @@
   return reinterpret_cast<const void*>(art_quick_instrumentation_entry);
 }
 
-extern "C" void art_quick_deoptimize_from_compiled_slow_path();
+// Stub to deoptimize from compiled code.
+extern "C" void art_quick_deoptimize_from_compiled_code();
 
 // The return_pc of instrumentation exit stub.
 extern "C" void art_quick_instrumentation_exit();
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index f7a3cd5..7db8888 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -72,15 +72,12 @@
     EXPECT_OFFSET_DIFFP(Thread, tls32_, throwing_OutOfMemoryError, no_thread_suspension, 4);
     EXPECT_OFFSET_DIFFP(Thread, tls32_, no_thread_suspension, thread_exit_check_count, 4);
     EXPECT_OFFSET_DIFFP(Thread, tls32_, thread_exit_check_count, handling_signal_, 4);
-    EXPECT_OFFSET_DIFFP(Thread, tls32_, handling_signal_,
-                        deoptimization_return_value_is_reference, 4);
 
     // TODO: Better connection. Take alignment into account.
     EXPECT_OFFSET_DIFF_GT3(Thread, tls32_.thread_exit_check_count, tls64_.trace_clock_base, 4,
                            thread_tls32_to_tls64);
 
-    EXPECT_OFFSET_DIFFP(Thread, tls64_, trace_clock_base, deoptimization_return_value, 8);
-    EXPECT_OFFSET_DIFFP(Thread, tls64_, deoptimization_return_value, stats, 8);
+    EXPECT_OFFSET_DIFFP(Thread, tls64_, trace_clock_base, stats, 8);
 
     // TODO: Better connection. Take alignment into account.
     EXPECT_OFFSET_DIFF_GT3(Thread, tls64_.stats, tlsPtr_.card_table, 8, thread_tls64_to_tlsptr);
@@ -108,8 +105,8 @@
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, single_step_control, stacked_shadow_frame_record,
                         sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, stacked_shadow_frame_record,
-                        deoptimization_return_value_stack, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, deoptimization_return_value_stack, name, sizeof(void*));
+                        deoptimization_context_stack, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, deoptimization_context_stack, name, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, name, pthread_self, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, pthread_self, last_no_thread_suspension_cause,
                         sizeof(void*));
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index e28d578..63c02ed 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -1016,7 +1016,8 @@
                                 PrettyMethod(method).c_str(),
                                 return_value.GetJ()) << *self;
     }
-    self->SetDeoptimizationReturnValue(return_value, return_shorty == 'L');
+    self->PushDeoptimizationContext(return_value, return_shorty == 'L',
+                                    nullptr /* no pending exception */);
     return GetTwoWordSuccessValue(*return_pc,
                                   reinterpret_cast<uintptr_t>(GetQuickDeoptimizationEntryPoint()));
   } else {
diff --git a/runtime/oat.h b/runtime/oat.h
index 29dd76c..1520a9b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '6', '8', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '6', '9', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/thread.cc b/runtime/thread.cc
index a33e150..63534b1 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -162,27 +162,41 @@
   ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
 }
 
-class DeoptimizationReturnValueRecord {
+class DeoptimizationContextRecord {
  public:
-  DeoptimizationReturnValueRecord(const JValue& ret_val,
-                                  bool is_reference,
-                                  DeoptimizationReturnValueRecord* link)
-      : ret_val_(ret_val), is_reference_(is_reference), link_(link) {}
+  DeoptimizationContextRecord(const JValue& ret_val, bool is_reference,
+                              mirror::Throwable* pending_exception,
+                              DeoptimizationContextRecord* link)
+      : ret_val_(ret_val), is_reference_(is_reference), pending_exception_(pending_exception),
+        link_(link) {}
 
   JValue GetReturnValue() const { return ret_val_; }
   bool IsReference() const { return is_reference_; }
-  DeoptimizationReturnValueRecord* GetLink() const { return link_; }
-  mirror::Object** GetGCRoot() {
+  mirror::Throwable* GetPendingException() const { return pending_exception_; }
+  DeoptimizationContextRecord* GetLink() const { return link_; }
+  mirror::Object** GetReturnValueAsGCRoot() {
     DCHECK(is_reference_);
     return ret_val_.GetGCRoot();
   }
+  mirror::Object** GetPendingExceptionAsGCRoot() {
+    return reinterpret_cast<mirror::Object**>(&pending_exception_);
+  }
 
  private:
+  // The value returned by the method at the top of the stack before deoptimization.
   JValue ret_val_;
-  const bool is_reference_;
-  DeoptimizationReturnValueRecord* const link_;
 
-  DISALLOW_COPY_AND_ASSIGN(DeoptimizationReturnValueRecord);
+  // Indicates whether the returned value is a reference. If so, the GC will visit it.
+  const bool is_reference_;
+
+  // The exception that was pending before deoptimization (or null if there was no pending
+  // exception).
+  mirror::Throwable* pending_exception_;
+
+  // A link to the previous DeoptimizationContextRecord.
+  DeoptimizationContextRecord* const link_;
+
+  DISALLOW_COPY_AND_ASSIGN(DeoptimizationContextRecord);
 };
 
 class StackedShadowFrameRecord {
@@ -206,22 +220,28 @@
   DISALLOW_COPY_AND_ASSIGN(StackedShadowFrameRecord);
 };
 
-void Thread::PushAndClearDeoptimizationReturnValue() {
-  DeoptimizationReturnValueRecord* record = new DeoptimizationReturnValueRecord(
-      tls64_.deoptimization_return_value,
-      tls32_.deoptimization_return_value_is_reference,
-      tlsPtr_.deoptimization_return_value_stack);
-  tlsPtr_.deoptimization_return_value_stack = record;
-  ClearDeoptimizationReturnValue();
+void Thread::PushDeoptimizationContext(const JValue& return_value, bool is_reference,
+                                       mirror::Throwable* exception) {
+  DeoptimizationContextRecord* record = new DeoptimizationContextRecord(
+      return_value,
+      is_reference,
+      exception,
+      tlsPtr_.deoptimization_context_stack);
+  tlsPtr_.deoptimization_context_stack = record;
 }
 
-JValue Thread::PopDeoptimizationReturnValue() {
-  DeoptimizationReturnValueRecord* record = tlsPtr_.deoptimization_return_value_stack;
-  DCHECK(record != nullptr);
-  tlsPtr_.deoptimization_return_value_stack = record->GetLink();
-  JValue ret_val(record->GetReturnValue());
+void Thread::PopDeoptimizationContext(JValue* result, mirror::Throwable** exception) {
+  AssertHasDeoptimizationContext();
+  DeoptimizationContextRecord* record = tlsPtr_.deoptimization_context_stack;
+  tlsPtr_.deoptimization_context_stack = record->GetLink();
+  result->SetJ(record->GetReturnValue().GetJ());
+  *exception = record->GetPendingException();
   delete record;
-  return ret_val;
+}
+
+void Thread::AssertHasDeoptimizationContext() {
+  CHECK(tlsPtr_.deoptimization_context_stack != nullptr)
+      << "No deoptimization context for thread " << *this;
 }
 
 void Thread::PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type) {
@@ -1575,6 +1595,9 @@
   CHECK(tlsPtr_.flip_function == nullptr);
   CHECK_EQ(tls32_.suspended_at_suspend_check, false);
 
+  // Make sure we processed all deoptimization requests.
+  CHECK(tlsPtr_.deoptimization_context_stack == nullptr) << "Missed deoptimization";
+
   // We may be deleting a still born thread.
   SetStateUnsafe(kTerminated);
 
@@ -2593,7 +2616,7 @@
   visitor->VisitRootIfNonNull(&tlsPtr_.opeer, RootInfo(kRootThreadObject, thread_id));
   if (tlsPtr_.exception != nullptr && tlsPtr_.exception != GetDeoptimizationException()) {
     visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception),
-                   RootInfo(kRootNativeStack, thread_id));
+                       RootInfo(kRootNativeStack, thread_id));
   }
   visitor->VisitRootIfNonNull(&tlsPtr_.monitor_enter_object, RootInfo(kRootNativeStack, thread_id));
   tlsPtr_.jni_env->locals.VisitRoots(visitor, RootInfo(kRootJNILocal, thread_id));
@@ -2602,6 +2625,7 @@
   if (tlsPtr_.debug_invoke_req != nullptr) {
     tlsPtr_.debug_invoke_req->VisitRoots(visitor, RootInfo(kRootDebugger, thread_id));
   }
+  // Visit roots for deoptimization.
   if (tlsPtr_.stacked_shadow_frame_record != nullptr) {
     RootCallbackVisitor visitor_to_callback(visitor, thread_id);
     ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
@@ -2615,14 +2639,16 @@
       }
     }
   }
-  if (tlsPtr_.deoptimization_return_value_stack != nullptr) {
-    for (DeoptimizationReturnValueRecord* record = tlsPtr_.deoptimization_return_value_stack;
+  if (tlsPtr_.deoptimization_context_stack != nullptr) {
+    for (DeoptimizationContextRecord* record = tlsPtr_.deoptimization_context_stack;
          record != nullptr;
          record = record->GetLink()) {
       if (record->IsReference()) {
-        visitor->VisitRootIfNonNull(record->GetGCRoot(),
-            RootInfo(kRootThreadObject, thread_id));
+        visitor->VisitRootIfNonNull(record->GetReturnValueAsGCRoot(),
+                                    RootInfo(kRootThreadObject, thread_id));
       }
+      visitor->VisitRootIfNonNull(record->GetPendingExceptionAsGCRoot(),
+                                  RootInfo(kRootThreadObject, thread_id));
     }
   }
   for (auto* verifier = tlsPtr_.method_verifier; verifier != nullptr; verifier = verifier->link_) {
diff --git a/runtime/thread.h b/runtime/thread.h
index 9bb57bf..2d450f5 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -77,7 +77,7 @@
 class Closure;
 class Context;
 struct DebugInvokeReq;
-class DeoptimizationReturnValueRecord;
+class DeoptimizationContextRecord;
 class DexFile;
 class JavaVMExt;
 struct JNIEnvExt;
@@ -830,19 +830,13 @@
   // and execute Java code, so there might be nested deoptimizations happening.
   // We need to save the ongoing deoptimization shadow frames and return
   // values on stacks.
-  void SetDeoptimizationReturnValue(const JValue& ret_val, bool is_reference) {
-    tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
-    tls32_.deoptimization_return_value_is_reference = is_reference;
-  }
-  bool IsDeoptimizationReturnValueReference() {
-    return tls32_.deoptimization_return_value_is_reference;
-  }
-  void ClearDeoptimizationReturnValue() {
-    tls64_.deoptimization_return_value.SetJ(0);
-    tls32_.deoptimization_return_value_is_reference = false;
-  }
-  void PushAndClearDeoptimizationReturnValue();
-  JValue PopDeoptimizationReturnValue();
+  void PushDeoptimizationContext(const JValue& return_value, bool is_reference,
+                                 mirror::Throwable* exception)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+  void PopDeoptimizationContext(JValue* result, mirror::Throwable** exception)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+  void AssertHasDeoptimizationContext()
+      SHARED_REQUIRES(Locks::mutator_lock_);
   void PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type);
   ShadowFrame* PopStackedShadowFrame(StackedShadowFrameType type);
 
@@ -1102,9 +1096,8 @@
       suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
       thread_exit_check_count(0), handling_signal_(false),
-      deoptimization_return_value_is_reference(false), suspended_at_suspend_check(false),
-      ready_for_debug_invoke(false), debug_method_entry_(false), is_gc_marking(false),
-      weak_ref_access_enabled(true) {
+      suspended_at_suspend_check(false), ready_for_debug_invoke(false),
+      debug_method_entry_(false), is_gc_marking(false), weak_ref_access_enabled(true) {
     }
 
     union StateAndFlags state_and_flags;
@@ -1144,10 +1137,6 @@
     // True if signal is being handled by this thread.
     bool32_t handling_signal_;
 
-    // True if the return value for interpreter after deoptimization is a reference.
-    // For gc purpose.
-    bool32_t deoptimization_return_value_is_reference;
-
     // True if the thread is suspended in FullSuspendCheck(). This is
     // used to distinguish runnable threads that are suspended due to
     // a normal suspend check from other threads.
@@ -1178,15 +1167,12 @@
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {
-    tls_64bit_sized_values() : trace_clock_base(0), deoptimization_return_value() {
+    tls_64bit_sized_values() : trace_clock_base(0) {
     }
 
     // The clock base used for tracing.
     uint64_t trace_clock_base;
 
-    // Return value used by deoptimization.
-    JValue deoptimization_return_value;
-
     RuntimeStats stats;
   } tls64_;
 
@@ -1197,7 +1183,7 @@
       stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
       top_handle_scope(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
       instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
-      stacked_shadow_frame_record(nullptr), deoptimization_return_value_stack(nullptr),
+      stacked_shadow_frame_record(nullptr), deoptimization_context_stack(nullptr),
       name(nullptr), pthread_self(0),
       last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
       thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
@@ -1282,7 +1268,7 @@
     StackedShadowFrameRecord* stacked_shadow_frame_record;
 
     // Deoptimization return value record stack.
-    DeoptimizationReturnValueRecord* deoptimization_return_value_stack;
+    DeoptimizationContextRecord* deoptimization_context_stack;
 
     // A cached copy of the java.lang.Thread's name.
     std::string* name;