Change pResolveString entrypoint to kSaveEverything.

Test: Run ART test suite including gcstress on host and Nexus 9.
Test: Run ART test suite including gcstress with baker CC on host and Nexus 9.
Bug: 20323084
Change-Id: I63c21a7d3be8ff7a5765b5003c85b5317635efe6
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index 3b77880..4a9de7f 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -214,7 +214,7 @@
         DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative) << patch.GetType();
       } else {
-        // With the read barrier (non-baker) enabled, it could be kDexCacheArray in the
+        // With the read barrier (non-Baker) enabled, it could be kDexCacheArray in the
         // HLoadString::LoadKind::kDexCachePcRelative case of VisitLoadString().
         DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative ||
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 9870876..80b4907 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -429,34 +429,50 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
+    HLoadString* load = instruction_->AsLoadString();
+    const uint32_t string_index = load->GetStringIndex();
+    Register out = locations->Out().AsRegister<Register>();
+    Register temp = locations->GetTemp(0).AsRegister<Register>();
+    constexpr bool call_saves_everything_except_r0 = (!kUseReadBarrier || kUseBakerReadBarrier);
 
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
-    HLoadString* load = instruction_->AsLoadString();
-    const uint32_t string_index = load->GetStringIndex();
+    // In the unlucky case that the `temp` is R0, we preserve the address in `out` across
+    // the kSaveEverything call (or use `out` for the address after non-kSaveEverything call).
+    bool temp_is_r0 = (temp == calling_convention.GetRegisterAt(0));
+    Register entry_address = temp_is_r0 ? out : temp;
+    DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0));
+    if (call_saves_everything_except_r0 && temp_is_r0) {
+      __ mov(entry_address, ShifterOperand(temp));
+    }
+
     __ LoadImmediate(calling_convention.GetRegisterAt(0), string_index);
     arm_codegen->InvokeRuntime(kQuickResolveString, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
+
+    // Store the resolved String to the .bss entry.
+    if (call_saves_everything_except_r0) {
+      // The string entry address was preserved in `entry_address` thanks to kSaveEverything.
+      __ str(R0, Address(entry_address));
+    } else {
+      // For non-Baker read barrier, we need to re-calculate the address of the string entry.
+      CodeGeneratorARM::PcRelativePatchInfo* labels =
+          arm_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
+      __ BindTrackedLabel(&labels->movw_label);
+      __ movw(entry_address, /* placeholder */ 0u);
+      __ BindTrackedLabel(&labels->movt_label);
+      __ movt(entry_address, /* placeholder */ 0u);
+      __ BindTrackedLabel(&labels->add_pc_label);
+      __ add(entry_address, entry_address, ShifterOperand(PC));
+      __ str(R0, Address(entry_address));
+    }
+
     arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
-
     RestoreLiveRegisters(codegen, locations);
 
-    // Store the resolved String to the BSS entry.
-    // TODO: Change art_quick_resolve_string to kSaveEverything and use a temporary for the
-    // .bss entry address in the fast path, so that we can avoid another calculation here.
-    CodeGeneratorARM::PcRelativePatchInfo* labels =
-        arm_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
-    __ BindTrackedLabel(&labels->movw_label);
-    __ movw(IP, /* placeholder */ 0u);
-    __ BindTrackedLabel(&labels->movt_label);
-    __ movt(IP, /* placeholder */ 0u);
-    __ BindTrackedLabel(&labels->add_pc_label);
-    __ add(IP, IP, ShifterOperand(PC));
-    __ str(locations->Out().AsRegister<Register>(), Address(IP));
-
     __ b(GetExitLabel());
   }
 
@@ -5704,10 +5720,25 @@
 
   HLoadString::LoadKind load_kind = load->GetLoadKind();
   if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod) {
-    locations->SetInAt(0, Location::RequiresRegister());
     locations->SetOut(Location::RegisterLocation(R0));
   } else {
     locations->SetOut(Location::RequiresRegister());
+    if (load_kind == HLoadString::LoadKind::kBssEntry) {
+      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+        // Rely on the pResolveString and/or marking to save everything, including temps.
+        // Note that IP may theoretically be clobbered by saving/restoring the live register
+        // (only one thanks to the custom calling convention), so we request a different temp.
+        locations->AddTemp(Location::RequiresRegister());
+        RegisterSet caller_saves = RegisterSet::Empty();
+        InvokeRuntimeCallingConvention calling_convention;
+        caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+        // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
+        // that the kPrimNot result register is the same as the first argument register.
+        locations->SetCustomSlowPathCallerSaves(caller_saves);
+      } else {
+        // For non-Baker read barrier we have a temp-clobbering call.
+      }
+    }
   }
 }
 
@@ -5743,15 +5774,16 @@
     }
     case HLoadString::LoadKind::kBssEntry: {
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
+      Register temp = locations->GetTemp(0).AsRegister<Register>();
       CodeGeneratorARM::PcRelativePatchInfo* labels =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
       __ BindTrackedLabel(&labels->movw_label);
-      __ movw(out, /* placeholder */ 0u);
+      __ movw(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->movt_label);
-      __ movt(out, /* placeholder */ 0u);
+      __ movt(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->add_pc_label);
-      __ add(out, out, ShifterOperand(PC));
-      GenerateGcRootFieldLoad(load, out_loc, out, 0);
+      __ add(temp, temp, ShifterOperand(PC));
+      GenerateGcRootFieldLoad(load, out_loc, temp, 0);
       SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load);
       codegen_->AddSlowPath(slow_path);
       __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
@@ -5765,6 +5797,7 @@
   // TODO: Consider re-adding the compiler code to do string dex cache lookup again.
   DCHECK(load_kind == HLoadString::LoadKind::kDexCacheViaMethod);
   InvokeRuntimeCallingConvention calling_convention;
+  DCHECK_EQ(calling_convention.GetRegisterAt(0), out);
   __ LoadImmediate(calling_convention.GetRegisterAt(0), load->GetStringIndex());
   codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc());
   CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 969d653..8197787 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -331,13 +331,20 @@
 
 class LoadStringSlowPathARM64 : public SlowPathCodeARM64 {
  public:
-  explicit LoadStringSlowPathARM64(HLoadString* instruction) : SlowPathCodeARM64(instruction) {}
+  LoadStringSlowPathARM64(HLoadString* instruction, Register temp, vixl::aarch64::Label* adrp_label)
+      : SlowPathCodeARM64(instruction),
+        temp_(temp),
+        adrp_label_(adrp_label) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
 
+    // temp_ is a scratch register. Make sure it's not used for saving/restoring registers.
+    UseScratchRegisterScope temps(arm64_codegen->GetVIXLAssembler());
+    temps.Exclude(temp_);
+
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
@@ -352,21 +359,21 @@
     RestoreLiveRegisters(codegen, locations);
 
     // Store the resolved String to the BSS entry.
-    UseScratchRegisterScope temps(arm64_codegen->GetVIXLAssembler());
-    Register temp = temps.AcquireX();
     const DexFile& dex_file = instruction_->AsLoadString()->GetDexFile();
-    // TODO: Change art_quick_resolve_string to kSaveEverything and use a temporary
-    // for the ADRP in the fast path, so that we can avoid the ADRP here.
-    vixl::aarch64::Label* adrp_label =
-        arm64_codegen->NewPcRelativeStringPatch(dex_file, string_index);
-    arm64_codegen->EmitAdrpPlaceholder(adrp_label, temp);
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // The string entry page address was preserved in temp_ thanks to kSaveEverything.
+    } else {
+      // For non-Baker read barrier, we need to re-calculate the address of the string entry page.
+      adrp_label_ = arm64_codegen->NewPcRelativeStringPatch(dex_file, string_index);
+      arm64_codegen->EmitAdrpPlaceholder(adrp_label_, temp_);
+    }
     vixl::aarch64::Label* strp_label =
-        arm64_codegen->NewPcRelativeStringPatch(dex_file, string_index, adrp_label);
+        arm64_codegen->NewPcRelativeStringPatch(dex_file, string_index, adrp_label_);
     {
       SingleEmissionCheckScope guard(arm64_codegen->GetVIXLAssembler());
       __ Bind(strp_label);
       __ str(RegisterFrom(locations->Out(), Primitive::kPrimNot),
-             MemOperand(temp, /* offset placeholder */ 0));
+             MemOperand(temp_, /* offset placeholder */ 0));
     }
 
     __ B(GetExitLabel());
@@ -375,6 +382,9 @@
   const char* GetDescription() const OVERRIDE { return "LoadStringSlowPathARM64"; }
 
  private:
+  const Register temp_;
+  vixl::aarch64::Label* adrp_label_;
+
   DISALLOW_COPY_AND_ASSIGN(LoadStringSlowPathARM64);
 };
 
@@ -4238,11 +4248,24 @@
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) {
-    locations->SetInAt(0, Location::RequiresRegister());
     InvokeRuntimeCallingConvention calling_convention;
     locations->SetOut(calling_convention.GetReturnLocation(load->GetType()));
   } else {
     locations->SetOut(Location::RequiresRegister());
+    if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) {
+      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+        // Rely on the pResolveString and/or marking to save everything, including temps.
+        RegisterSet caller_saves = RegisterSet::Empty();
+        InvokeRuntimeCallingConvention calling_convention;
+        caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode()));
+        DCHECK_EQ(calling_convention.GetRegisterAt(0).GetCode(),
+                  RegisterFrom(calling_convention.GetReturnLocation(Primitive::kPrimNot),
+                               Primitive::kPrimNot).GetCode());
+        locations->SetCustomSlowPathCallerSaves(caller_saves);
+      } else {
+        // For non-Baker read barrier we have a temp-clobbering call.
+      }
+    }
   }
 }
 
@@ -4277,18 +4300,21 @@
       const DexFile& dex_file = load->GetDexFile();
       uint32_t string_index = load->GetStringIndex();
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
+      UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
+      Register temp = temps.AcquireX();
       vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index);
-      codegen_->EmitAdrpPlaceholder(adrp_label, out.X());
+      codegen_->EmitAdrpPlaceholder(adrp_label, temp);
       // Add LDR with its PC-relative String patch.
       vixl::aarch64::Label* ldr_label =
           codegen_->NewPcRelativeStringPatch(dex_file, string_index, adrp_label);
       // /* GcRoot<mirror::Class> */ out = *(base_address + offset)  /* PC-relative */
       GenerateGcRootFieldLoad(load,
                               load->GetLocations()->Out(),
-                              out.X(),
+                              temp,
                               /* placeholder */ 0u,
                               ldr_label);
-      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load);
+      SlowPathCodeARM64* slow_path =
+          new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load, temp, adrp_label);
       codegen_->AddSlowPath(slow_path);
       __ Cbz(out.X(), slow_path->GetEntryLabel());
       __ Bind(slow_path->GetExitLabel());
@@ -4300,6 +4326,7 @@
 
   // TODO: Re-add the compiler code to do string dex cache lookup again.
   InvokeRuntimeCallingConvention calling_convention;
+  DCHECK_EQ(calling_convention.GetRegisterAt(0).GetCode(), out.GetCode());
   __ Mov(calling_convention.GetRegisterAt(0).W(), load->GetStringIndex());
   codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc());
   CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 0b23599..ab60671 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6059,8 +6059,7 @@
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   HLoadString::LoadKind load_kind = load->GetLoadKind();
-  if (load_kind == HLoadString::LoadKind::kDexCacheViaMethod ||
-      load_kind == HLoadString::LoadKind::kBootImageLinkTimePcRelative ||
+  if (load_kind == HLoadString::LoadKind::kBootImageLinkTimePcRelative ||
       load_kind == HLoadString::LoadKind::kBssEntry) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
@@ -6068,6 +6067,17 @@
     locations->SetOut(Location::RegisterLocation(EAX));
   } else {
     locations->SetOut(Location::RequiresRegister());
+    if (load_kind == HLoadString::LoadKind::kBssEntry) {
+      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+        // Rely on the pResolveString and/or marking to save everything.
+        RegisterSet caller_saves = RegisterSet::Empty();
+        InvokeRuntimeCallingConvention calling_convention;
+        caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+        locations->SetCustomSlowPathCallerSaves(caller_saves);
+      } else {
+        // For non-Baker read barrier we have a temp-clobbering call.
+      }
+    }
   }
 }
 
@@ -6114,6 +6124,7 @@
 
   // TODO: Re-add the compiler code to do string dex cache lookup again.
   InvokeRuntimeCallingConvention calling_convention;
+  DCHECK_EQ(calling_convention.GetRegisterAt(0), out);
   __ movl(calling_convention.GetRegisterAt(0), Immediate(load->GetStringIndex()));
   codegen_->InvokeRuntime(kQuickResolveString, load, load->GetDexPc());
   CheckEntrypointTypes<kQuickResolveString, void*, uint32_t>();
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 28638d7..6518c32 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -299,9 +299,9 @@
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
-    InvokeRuntimeCallingConvention calling_convention;
     const uint32_t string_index = instruction_->AsLoadString()->GetStringIndex();
-    __ movl(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(string_index));
+    // Custom calling convention: RAX serves as both input and output.
+    __ movl(CpuRegister(RAX), Immediate(string_index));
     x86_64_codegen->InvokeRuntime(kQuickResolveString,
                                   instruction_,
                                   instruction_->GetDexPc(),
@@ -5450,10 +5450,20 @@
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(load, call_kind);
   if (load->GetLoadKind() == HLoadString::LoadKind::kDexCacheViaMethod) {
-    locations->SetInAt(0, Location::RequiresRegister());
     locations->SetOut(Location::RegisterLocation(RAX));
   } else {
     locations->SetOut(Location::RequiresRegister());
+    if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) {
+      if (!kUseReadBarrier || kUseBakerReadBarrier) {
+        // Rely on the pResolveString and/or marking to save everything.
+        // Custom calling convention: RAX serves as both input and output.
+        RegisterSet caller_saves = RegisterSet::Empty();
+        caller_saves.Add(Location::RegisterLocation(RAX));
+        locations->SetCustomSlowPathCallerSaves(caller_saves);
+      } else {
+        // For non-Baker read barrier we have a temp-clobbering call.
+      }
+    }
   }
 }
 
@@ -5493,9 +5503,8 @@
   }
 
   // TODO: Re-add the compiler code to do string dex cache lookup again.
-  InvokeRuntimeCallingConvention calling_convention;
-  __ movl(CpuRegister(calling_convention.GetRegisterAt(0)),
-          Immediate(load->GetStringIndex()));
+  // Custom calling convention: RAX serves as both input and output.
+  __ movl(CpuRegister(RAX), Immediate(load->GetStringIndex()));
   codegen_->InvokeRuntime(kQuickResolveString,
                           load,
                           load->GetDexPc());
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index cdb4c25..bf70c55 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -239,6 +239,31 @@
     .cfi_adjust_cfa_offset -56
 .endm
 
+.macro RESTORE_SAVE_EVERYTHING_FRAME_KEEP_R0
+    add  sp, #8                         @ rewind sp
+    .cfi_adjust_cfa_offset -8
+    vpop {d0-d15}
+    .cfi_adjust_cfa_offset -128
+    add  sp, #4                         @ skip r0
+    .cfi_adjust_cfa_offset -4
+    .cfi_restore r0                     @ debugger can no longer restore caller's r0
+    pop {r1-r12, lr}                    @ 13 words of callee saves
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r4
+    .cfi_restore r5
+    .cfi_restore r6
+    .cfi_restore r7
+    .cfi_restore r8
+    .cfi_restore r9
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore r12
+    .cfi_restore lr
+    .cfi_adjust_cfa_offset -52
+.endm
+
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz   r0, 1f              @ result non-zero branch over
     bx     lr                  @ return
@@ -252,17 +276,23 @@
 .endm
 
     /*
-     * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
      */
-.macro DELIVER_PENDING_EXCEPTION
-    .fnend
-    .fnstart
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0       @ save callee saves for throw
+.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
     mov    r0, r9                              @ pass Thread::Current
     bl     artDeliverPendingExceptionFromCode  @ artDeliverPendingExceptionFromCode(Thread*)
 .endm
 
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
+     */
+.macro DELIVER_PENDING_EXCEPTION
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0       @ save callee saves for throw
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+.endm
+
 .macro NO_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
@@ -1078,41 +1108,71 @@
      */
 
 ENTRY art_quick_resolve_string
-    ldr    r1, [sp]                                              @ load referrer
-    ldr    r1, [r1, #ART_METHOD_DECLARING_CLASS_OFFSET]          @ load declaring class
-    ldr    r1, [r1, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]   @ load string dex cache
-    ubfx   r2, r0, #0, #STRING_DEX_CACHE_HASH_BITS
-    add    r1, r1, r2, LSL #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT
-    ldrd   r2, r3, [r1]                                    @ load index into r3 and pointer into r2
-    cmp    r0, r3
+    push   {r10-r12, lr}
+    .cfi_adjust_cfa_offset 16
+    .cfi_rel_offset r10, 0
+    .cfi_rel_offset r11, 4
+    .cfi_rel_offset ip, 8
+    .cfi_rel_offset lr, 12
+    ldr    r10, [sp, #16]                                        @ load referrer
+    ldr    r10, [r10, #ART_METHOD_DECLARING_CLASS_OFFSET]        @ load declaring class
+    ldr    r10, [r10, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET] @ load string dex cache
+    ubfx   r11, r0, #0, #STRING_DEX_CACHE_HASH_BITS
+    add    r10, r10, r11, LSL #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT
+    ldrd   r10, r11, [r10]                               @ load index into r11 and pointer into r10
+    cmp    r0, r11
     bne    .Lart_quick_resolve_string_slow_path
 #ifdef USE_READ_BARRIER
-    ldr    r3, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
-    cbnz   r3, .Lart_quick_resolve_string_marking
+    ldr    r0, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
+    cbnz   r0, .Lart_quick_resolve_string_marking
+.Lart_quick_resolve_string_no_rb:
 #endif
-    mov    r0, r2
-    bx     lr
-// Slow path case, the index did not match
-.Lart_quick_resolve_string_slow_path:
-    SETUP_SAVE_REFS_ONLY_FRAME r2                    @ save callee saves in case of GC
-    mov    r1, r9                                    @ pass Thread::Current
-    mov    r3, sp
-    bl     artResolveStringFromCode                  @ (uint32_t type_idx, Method* method, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+    mov    r0, r10
+    pop    {r10-r12, pc}
+
+#ifdef USE_READ_BARRIER
 // GC is marking case, need to check the mark bit.
 .Lart_quick_resolve_string_marking:
-    ldr    r3, [r2, MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    tst    r3, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
-    mov    r0, r2
-    bne    .Lart_quick_resolve_string_no_rb
-    push   {r1, r2, r3, lr}                          @ Save x1, LR
-    .cfi_adjust_cfa_offset 16
-    bl     artReadBarrierMark                        @ Get the marked string back.
-    pop    {r1, r2, r3, lr}                          @ Restore registers.
+    ldr    r0, [r10, MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    lsrs   r0, #(LOCK_WORD_MARK_BIT_SHIFT + 1)
+    bcs    .Lart_quick_resolve_string_no_rb
+    mov    r0, r10
+    .cfi_remember_state
+    pop    {r10-r12, lr}
     .cfi_adjust_cfa_offset -16
-.Lart_quick_resolve_string_no_rb:
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore r12
+    .cfi_restore lr
+    // Note: art_quick_read_barrier_mark_reg00 clobbers IP but the .Lslow_rb_* does not.
+    b      .Lslow_rb_art_quick_read_barrier_mark_reg00  @ Get the marked string back.
+    .cfi_restore_state
+#endif
+
+// Slow path case, the index did not match
+.Lart_quick_resolve_string_slow_path:
+    push {r0-r9}                  @ 10 words of callee saves and args; {r10-r12, lr} already saved.
+    .cfi_adjust_cfa_offset 40
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r4, 16
+    .cfi_rel_offset r5, 20
+    .cfi_rel_offset r6, 24
+    .cfi_rel_offset r7, 28
+    .cfi_rel_offset r8, 32
+    .cfi_rel_offset r9, 36
+    SETUP_SAVE_EVERYTHING_FRAME_CORE_REGS_SAVED r1   @ save callee saves in case of GC
+    mov    r1, r9                                    @ pass Thread::Current
+    bl     artResolveStringFromCode                  @ (uint32_t string_idx, Thread*)
+    cbz    r0, 1f                                    @ If result is null, deliver the OOME.
+    .cfi_remember_state
+    RESTORE_SAVE_EVERYTHING_FRAME_KEEP_R0
     bx     lr
+    .cfi_restore_state
+1:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
@@ -1920,6 +1980,8 @@
      * getting its argument and returning its result through register
      * `reg`, saving and restoring all caller-save registers.
      *
+     * IP is clobbered; `reg` must not be IP.
+     *
      * If `reg` is different from `r0`, the generated function follows a
      * non-standard runtime calling convention:
      * - register `reg` is used to pass the (sole) argument of this
@@ -1936,36 +1998,71 @@
     SMART_CBZ \reg, .Lret_rb_\name
     // Check lock word for mark bit, if marked return. Use IP for scratch since it is blocked.
     ldr ip, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    ands ip, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    tst ip, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
     beq .Lslow_rb_\name
     // Already marked, return right away.
+.Lret_rb_\name:
     bx lr
 
 .Lslow_rb_\name:
-    push  {r0-r5, r9, lr}               @ save return address and core caller-save registers
-                                        @ also save callee save r5 for 16 byte alignment
+    // Save IP: the kSaveEverything entrypoint art_quick_resolve_string makes a tail call here.
+    push  {r0-r4, r9, ip, lr}           @ save return address, core caller-save registers and ip
     .cfi_adjust_cfa_offset 32
     .cfi_rel_offset r0, 0
     .cfi_rel_offset r1, 4
     .cfi_rel_offset r2, 8
     .cfi_rel_offset r3, 12
     .cfi_rel_offset r4, 16
-    .cfi_rel_offset r5, 20
-    .cfi_rel_offset r9, 24
+    .cfi_rel_offset r9, 20
+    .cfi_rel_offset ip, 24
     .cfi_rel_offset lr, 28
-    vpush {s0-s15}                      @ save floating-point caller-save registers
-    .cfi_adjust_cfa_offset 64
 
     .ifnc \reg, r0
       mov   r0, \reg                    @ pass arg1 - obj from `reg`
     .endif
+
+    vpush {s0-s15}                      @ save floating-point caller-save registers
+    .cfi_adjust_cfa_offset 64
     bl    artReadBarrierMark            @ r0 <- artReadBarrierMark(obj)
-    mov ip, r0                          @ Save result in IP
     vpop {s0-s15}                       @ restore floating-point registers
     .cfi_adjust_cfa_offset -64
-    pop   {r0-r5, r9, lr}               @ restore caller-save registers
-    mov \reg, ip                        @ copy result to reg
-.Lret_rb_\name:
+
+    .ifc \reg, r0                       @ Save result to the stack slot or destination register.
+      str r0, [sp, #0]
+    .else
+      .ifc \reg, r1
+        str r0, [sp, #4]
+      .else
+        .ifc \reg, r2
+          str r0, [sp, #8]
+        .else
+          .ifc \reg, r3
+            str r0, [sp, #12]
+          .else
+            .ifc \reg, r4
+              str r0, [sp, #16]
+            .else
+              .ifc \reg, r9
+                str r0, [sp, #20]
+              .else
+                mov \reg, r0
+              .endif
+            .endif
+          .endif
+        .endif
+      .endif
+    .endif
+
+    pop   {r0-r4, r9, ip, lr}           @ restore caller-save registers
+    .cfi_adjust_cfa_offset -32
+    .cfi_restore r0
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r4
+    .cfi_restore r9
+    .cfi_restore ip
+    .cfi_restore lr
     bx lr
 END \name
 .endm
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 04a3cc6..483cee3 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -337,7 +337,7 @@
     SETUP_SAVE_EVERYTHING_FRAME_DECREMENTED_SP_SKIP_X29_LR
 .endm
 
-.macro RESTORE_SAVE_EVERYTHING_FRAME
+.macro RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0
     // Restore FP registers.
     // For better performance, load d0 and d31 separately, so that all LDPs are 16-byte aligned.
     ldr d0,       [sp, #8]
@@ -359,7 +359,6 @@
     ldr d31,      [sp, #256]
 
     // Restore core registers.
-    RESTORE_REG            x0, 264
     RESTORE_TWO_REGS  x1,  x2, 272
     RESTORE_TWO_REGS  x3,  x4, 288
     RESTORE_TWO_REGS  x5,  x6, 304
@@ -379,6 +378,11 @@
     DECREASE_FRAME 512
 .endm
 
+.macro RESTORE_SAVE_EVERYTHING_FRAME
+    RESTORE_REG            x0, 264
+    RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0
+.endm
+
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz x0, 1f                // result non-zero branch over
     ret                        // return
@@ -392,11 +396,10 @@
 .endm
 
     /*
-     * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
      */
-.macro DELIVER_PENDING_EXCEPTION
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
     mov x0, xSELF
 
     // Point of no return.
@@ -404,6 +407,15 @@
     brk 0  // Unreached
 .endm
 
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
+     */
+.macro DELIVER_PENDING_EXCEPTION
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+.endm
+
 .macro RETURN_OR_DELIVER_PENDING_EXCEPTION_REG reg
     ldr \reg, [xSELF, # THREAD_EXCEPTION_OFFSET]   // Get exception field.
     cbnz \reg, 1f
@@ -1638,40 +1650,54 @@
      */
 
 ENTRY art_quick_resolve_string
-    ldr   x1, [sp]                                               // load referrer
-    ldr   w2, [x1, #ART_METHOD_DECLARING_CLASS_OFFSET]           // load declaring class
-    ldr   x1, [x2, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]    // load string dex cache
-    ubfx  x2, x0, #0, #STRING_DEX_CACHE_HASH_BITS                // get masked string index into x2
-    ldr   x2, [x1, x2, lsl #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT]  // load dex cache pair into x2
-    cmp   x0, x2, lsr #32                                         // compare against upper 32 bits
+    SAVE_TWO_REGS_INCREASE_FRAME x29, xLR, 2 * __SIZEOF_POINTER__
+    ldr   x29, [sp, #(2 * __SIZEOF_POINTER__)]                   // load referrer
+    ldr   w29, [x29, #ART_METHOD_DECLARING_CLASS_OFFSET]         // load declaring class
+    ldr   x29, [x29, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]  // load string dex cache
+    ubfx  lr, x0, #0, #STRING_DEX_CACHE_HASH_BITS                // get masked string index into LR
+    ldr   x29, [x29, lr, lsl #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT]  // load dex cache pair into x29
+    cmp   x0, x29, lsr #32                                       // compare against upper 32 bits
     bne   .Lart_quick_resolve_string_slow_path
-    ubfx  x0, x2, #0, #32                                        // extract lower 32 bits into x0
+    ubfx  x0, x29, #0, #32                                       // extract lower 32 bits into x0
 #ifdef USE_READ_BARRIER
     // Most common case: GC is not marking.
-    ldr    w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
-    cbnz   x3, .Lart_quick_resolve_string_marking
+    ldr    w29, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
+    cbnz   x29, .Lart_quick_resolve_string_marking
+.Lart_quick_resolve_string_no_rb:
 #endif
+    .cfi_remember_state
+    RESTORE_TWO_REGS_DECREASE_FRAME x29, xLR, 2 * __SIZEOF_POINTER__
     ret
+    .cfi_restore_state
+    .cfi_def_cfa_offset 16                          // workaround for clang bug: 31975598
+
+#ifdef USE_READ_BARRIER
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    ldr   x29, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbnz  x29, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_resolve_string_no_rb
+    .cfi_remember_state
+    RESTORE_TWO_REGS_DECREASE_FRAME x29, xLR, 2 * __SIZEOF_POINTER__
+    // Note: art_quick_read_barrier_mark_reg00 clobbers IP0 but the .Lslow_rb_* entry point does not.
+    b     .Lslow_rb_art_quick_read_barrier_mark_reg00  // Get the marked string back.
+    .cfi_restore_state
+    .cfi_def_cfa_offset 16                          // workaround for clang bug: 31975598
+#endif
 
 // Slow path case, the index did not match.
 .Lart_quick_resolve_string_slow_path:
-    SETUP_SAVE_REFS_ONLY_FRAME                      // save callee saves in case of GC
+    INCREASE_FRAME (FRAME_SIZE_SAVE_EVERYTHING - 2 * __SIZEOF_POINTER__)
+    SETUP_SAVE_EVERYTHING_FRAME_DECREMENTED_SP_SKIP_X29_LR  // save callee saves in case of GC
     mov   x1, xSELF                                 // pass Thread::Current
     bl    artResolveStringFromCode                  // (int32_t string_idx, Thread* self)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-
-// GC is marking case, need to check the mark bit.
-.Lart_quick_resolve_string_marking:
-    ldr   x3, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    tbnz  x3, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_resolve_string_no_rb
-    // Save LR so that we can return, also x1 for alignment purposes.
-    SAVE_TWO_REGS_INCREASE_FRAME x1, xLR, 16        // Save x1, LR.
-    bl     artReadBarrierMark                       // Get the marked string back.
-    RESTORE_TWO_REGS_DECREASE_FRAME x1, xLR, 16     // Restore registers.
-.Lart_quick_resolve_string_no_rb:
-    ret
-
+    cbz   w0, 1f                                    // If result is null, deliver the OOME.
+    .cfi_remember_state
+    RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0
+    ret                        // return
+    .cfi_restore_state
+    .cfi_def_cfa_offset FRAME_SIZE_SAVE_EVERYTHING  // workaround for clang bug: 31975598
+1:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
@@ -2513,9 +2539,10 @@
      */
     // Use wIP0 as temp and check the mark bit of the reference. wIP0 is not used by the compiler.
     ldr   wIP0, [\xreg, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    tbz   wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_path_rb_\name
+    tbz   wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_rb_\name
+.Lret_rb_\name:
     ret
-.Lslow_path_rb_\name:
+.Lslow_rb_\name:
     // Save all potentially live caller-save core registers.
     SAVE_TWO_REGS_INCREASE_FRAME x0, x1, 368
     SAVE_TWO_REGS  x2,  x3, 16
@@ -2580,7 +2607,6 @@
     // Restore return address and remove padding.
     RESTORE_REG xLR, 360
     DECREASE_FRAME 368
-.Lret_rb_\name:
     ret
 END \name
 .endm
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 7bb59ef..f4f9a68 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -224,12 +224,11 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
-     * when EDI is already saved.
+     * when EDI and ESI are already saved.
      */
-MACRO2(SETUP_SAVE_EVERYTHING_FRAME_EDI_SAVED, got_reg, temp_reg)
+MACRO2(SETUP_SAVE_EVERYTHING_FRAME_EDI_ESI_SAVED, got_reg, temp_reg)
     // Save core registers from highest to lowest to agree with core spills bitmap.
-    // EDI, or at least a placeholder for it, is already on the stack.
-    PUSH esi
+    // EDI and ESI, or at least placeholders for them, are already on the stack.
     PUSH ebp
     PUSH ebx
     PUSH edx
@@ -268,13 +267,25 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
+     * when EDI is already saved.
+     */
+MACRO2(SETUP_SAVE_EVERYTHING_FRAME_EDI_SAVED, got_reg, temp_reg)
+    // Save core registers from highest to lowest to agree with core spills bitmap.
+    // EDI, or at least a placeholder for it, is already on the stack.
+    PUSH esi
+    SETUP_SAVE_EVERYTHING_FRAME_EDI_ESI_SAVED RAW_VAR(got_reg), RAW_VAR(temp_reg)
+END_MACRO
+
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything)
      */
 MACRO2(SETUP_SAVE_EVERYTHING_FRAME, got_reg, temp_reg)
     PUSH edi
     SETUP_SAVE_EVERYTHING_FRAME_EDI_SAVED RAW_VAR(got_reg), RAW_VAR(temp_reg)
 END_MACRO
 
-MACRO0(RESTORE_SAVE_EVERYTHING_FRAME)
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME_FPRS)
     // Restore FPRs. Method and padding is still on the stack.
     movsd 16(%esp), %xmm0
     movsd 24(%esp), %xmm1
@@ -284,13 +295,10 @@
     movsd 56(%esp), %xmm5
     movsd 64(%esp), %xmm6
     movsd 72(%esp), %xmm7
+END_MACRO
 
-    // Remove save everything callee save method, stack alignment padding and FPRs.
-    addl MACRO_LITERAL(16 + 8 * 8), %esp
-    CFI_ADJUST_CFA_OFFSET(-(16 + 8 * 8))
-
-    // Restore core registers.
-    POP eax
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME_GPRS_EXCEPT_EAX)
+    // Restore core registers (except eax).
     POP ecx
     POP edx
     POP ebx
@@ -299,12 +307,32 @@
     POP edi
 END_MACRO
 
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME)
+    RESTORE_SAVE_EVERYTHING_FRAME_FPRS
+
+    // Remove save everything callee save method, stack alignment padding and FPRs.
+    addl MACRO_LITERAL(16 + 8 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(-(16 + 8 * 8))
+
+    POP eax
+    RESTORE_SAVE_EVERYTHING_FRAME_GPRS_EXCEPT_EAX
+END_MACRO
+
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME_KEEP_EAX)
+    RESTORE_SAVE_EVERYTHING_FRAME_FPRS
+
+    // Remove save everything callee save method, stack alignment padding and FPRs, skip EAX.
+    addl MACRO_LITERAL(16 + 8 * 8 + 4), %esp
+    CFI_ADJUST_CFA_OFFSET(-(16 + 8 * 8 + 4))
+
+    RESTORE_SAVE_EVERYTHING_FRAME_GPRS_EXCEPT_EAX
+END_MACRO
+
     /*
-     * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_.
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
      */
-MACRO0(DELIVER_PENDING_EXCEPTION)
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx, ebx // save callee saves for throw
+MACRO0(DELIVER_PENDING_EXCEPTION_FRAME_READY)
     // Outgoing argument set up
     subl MACRO_LITERAL(12), %esp               // alignment padding
     CFI_ADJUST_CFA_OFFSET(12)
@@ -314,6 +342,15 @@
     UNREACHABLE
 END_MACRO
 
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
+     */
+MACRO0(DELIVER_PENDING_EXCEPTION)
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx, ebx // save callee saves for throw
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END_MACRO
+
 MACRO2(NO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
     DEFINE_FUNCTION VAR(c_name)
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx, ebx // save all registers as basis for long jump context
@@ -1114,26 +1151,42 @@
 END_FUNCTION art_quick_alloc_object_region_tlab
 
 DEFINE_FUNCTION art_quick_resolve_string
-    movl 4(%esp), %ecx                                           // get referrer
-    movl ART_METHOD_DECLARING_CLASS_OFFSET(%ecx), %ecx           // get declaring class
-    movl DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%ecx), %ecx    // get string dex cache
-    movl LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %edx
-    andl %eax, %edx
-    movlps (%ecx, %edx, STRING_DEX_CACHE_ELEMENT_SIZE), %xmm0    // load string idx and ptr to xmm0
-    movd %xmm0, %ecx                                             // extract pointer
+    PUSH edi
+    PUSH esi
+    // Save xmm0 at an aligned address on the stack.
+    subl MACRO_LITERAL(12), %esp
+    CFI_ADJUST_CFA_OFFSET(12)
+    movsd %xmm0, 0(%esp)
+    movl 24(%esp), %edi                                          // get referrer
+    movl ART_METHOD_DECLARING_CLASS_OFFSET(%edi), %edi           // get declaring class
+    movl DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%edi), %edi    // get string dex cache
+    movl LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %esi
+    andl %eax, %esi
+    movlps (%edi, %esi, STRING_DEX_CACHE_ELEMENT_SIZE), %xmm0    // load string idx and ptr to xmm0
+    movd %xmm0, %edi                                             // extract pointer
     pshufd LITERAL(0x55), %xmm0, %xmm0                           // shuffle index into lowest bits
-    movd %xmm0, %edx                                             // extract index
-    cmp %edx, %eax
+    movd %xmm0, %esi                                             // extract index
+    // Restore xmm0 and remove it together with padding from the stack.
+    movsd 0(%esp), %xmm0
+    addl MACRO_LITERAL(12), %esp
+    CFI_ADJUST_CFA_OFFSET(-12)
+    cmp %esi, %eax
     jne .Lart_quick_resolve_string_slow_path
-    movl %ecx, %eax
+    movl %edi, %eax
+    CFI_REMEMBER_STATE
+    POP esi
+    POP edi
 #ifdef USE_READ_BARRIER
     cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_resolve_string_marking
+    jne .Lnot_null_art_quick_read_barrier_mark_reg00
 #endif
     ret
+    CFI_RESTORE_STATE
+    CFI_DEF_CFA(esp, 12)                          // workaround for clang bug: 31975598
+
 .Lart_quick_resolve_string_slow_path:
     // Outgoing argument set up
-    SETUP_SAVE_REFS_ONLY_FRAME  ebx, ebx
+    SETUP_SAVE_EVERYTHING_FRAME_EDI_ESI_SAVED ebx, ebx
     subl LITERAL(8), %esp                                        // push padding
     CFI_ADJUST_CFA_OFFSET(8)
     pushl %fs:THREAD_SELF_OFFSET                                 // pass Thread::Current()
@@ -1142,21 +1195,15 @@
     call SYMBOL(artResolveStringFromCode)
     addl LITERAL(16), %esp                                       // pop arguments
     CFI_ADJUST_CFA_OFFSET(-16)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-.Lart_quick_resolve_string_marking:
-    SETUP_SAVE_REFS_ONLY_FRAME  ebx, ebx
-    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
-    jnz .Lart_quick_resolve_string_no_rb
-    subl LITERAL(12), %esp                                   // alignment padding
-    CFI_ADJUST_CFA_OFFSET(12)
-    PUSH eax                                                 // Pass the string as the first param.
-    call SYMBOL(artReadBarrierMark)
-    addl LITERAL(16), %esp
-    CFI_ADJUST_CFA_OFFSET(-16)
-.Lart_quick_resolve_string_no_rb:
-    RESTORE_SAVE_REFS_ONLY_FRAME
+    testl %eax, %eax                                        // If result is null, deliver the OOME.
+    jz 1f
+    CFI_REMEMBER_STATE
+    RESTORE_SAVE_EVERYTHING_FRAME_KEEP_EAX
     ret
+    CFI_RESTORE_STATE
+    CFI_DEF_CFA(esp, FRAME_SIZE_SAVE_EVERYTHING)  // workaround for clang bug: 31975598
+1:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END_FUNCTION art_quick_resolve_string
 
 ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
@@ -2102,6 +2149,7 @@
     // Null check so that we can load the lock word.
     test REG_VAR(reg), REG_VAR(reg)
     jz .Lret_rb_\name
+.Lnot_null_\name:
     // Check the mark bit, if it is 1 return.
     testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
     jz .Lslow_rb_\name
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index af4a6c4..28018c5 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -76,6 +76,8 @@
     #define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
     #define CFI_RESTORE(reg) .cfi_restore reg
     #define CFI_REL_OFFSET(reg,size) .cfi_rel_offset reg,size
+    #define CFI_RESTORE_STATE .cfi_restore_state
+    #define CFI_REMEMBER_STATE .cfi_remember_state
 #else
     // Mac OS' doesn't like cfi_* directives.
     #define CFI_STARTPROC
@@ -85,6 +87,8 @@
     #define CFI_DEF_CFA_REGISTER(reg)
     #define CFI_RESTORE(reg)
     #define CFI_REL_OFFSET(reg,size)
+    #define CFI_RESTORE_STATE
+    #define CFI_REMEMBER_STATE
 #endif
 
     // Symbols.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 54e52e5..92273bf 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -263,16 +263,15 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
-     * when R15 is already saved.
+     * when R14 and R15 are already saved.
      */
-MACRO0(SETUP_SAVE_EVERYTHING_FRAME_R15_SAVED)
+MACRO0(SETUP_SAVE_EVERYTHING_FRAME_R14_R15_SAVED)
 #if defined(__APPLE__)
     int3
     int3
 #else
     // Save core registers from highest to lowest to agree with core spills bitmap.
-    // R15, or at least a placeholder for it, is already on the stack.
-    PUSH r14
+    // R14 and R15, or at least placeholders for them, are already on the stack.
     PUSH r13
     PUSH r12
     PUSH r11
@@ -326,13 +325,23 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
+     * when R15 is already saved.
+     */
+MACRO0(SETUP_SAVE_EVERYTHING_FRAME_R15_SAVED)
+    PUSH r14
+    SETUP_SAVE_EVERYTHING_FRAME_R14_R15_SAVED
+END_MACRO
+
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything)
      */
 MACRO0(SETUP_SAVE_EVERYTHING_FRAME)
     PUSH r15
     SETUP_SAVE_EVERYTHING_FRAME_R15_SAVED
 END_MACRO
 
-MACRO0(RESTORE_SAVE_EVERYTHING_FRAME)
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME_FPRS)
     // Restore FPRs. Method and padding is still on the stack.
     movq 16(%rsp), %xmm0
     movq 24(%rsp), %xmm1
@@ -350,12 +359,10 @@
     movq 120(%rsp), %xmm13
     movq 128(%rsp), %xmm14
     movq 136(%rsp), %xmm15
+END_MACRO
 
-    // Remove save everything callee save method, stack alignment padding and FPRs.
-    addq MACRO_LITERAL(16 + 16 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-(16 + 16 * 8))
-    // Restore callee and GPR args, mixed together to agree with core spills bitmap.
-    POP rax
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME_GPRS_EXCEPT_RAX)
+    // Restore callee and GPR args (except RAX), mixed together to agree with core spills bitmap.
     POP rcx
     POP rdx
     POP rbx
@@ -372,19 +379,47 @@
     POP r15
 END_MACRO
 
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME)
+    RESTORE_SAVE_EVERYTHING_FRAME_FPRS
+
+    // Remove save everything callee save method, stack alignment padding and FPRs.
+    addq MACRO_LITERAL(16 + 16 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(16 + 16 * 8))
+
+    POP rax
+    RESTORE_SAVE_EVERYTHING_FRAME_GPRS_EXCEPT_RAX
+END_MACRO
+
+MACRO0(RESTORE_SAVE_EVERYTHING_FRAME_KEEP_RAX)
+    RESTORE_SAVE_EVERYTHING_FRAME_FPRS
+
+    // Remove save everything callee save method, stack alignment padding and FPRs, skip RAX.
+    addq MACRO_LITERAL(16 + 16 * 8 + 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(16 + 16 * 8 + 8))
+
+    RESTORE_SAVE_EVERYTHING_FRAME_GPRS_EXCEPT_RAX
+END_MACRO
 
     /*
-     * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_.
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
      */
-MACRO0(DELIVER_PENDING_EXCEPTION)
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME        // save callee saves for throw
+MACRO0(DELIVER_PENDING_EXCEPTION_FRAME_READY)
     // (Thread*) setup
     movq %gs:THREAD_SELF_OFFSET, %rdi
     call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*)
     UNREACHABLE
 END_MACRO
 
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
+     */
+MACRO0(DELIVER_PENDING_EXCEPTION)
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME        // save callee saves for throw
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END_MACRO
+
 MACRO2(NO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
     DEFINE_FUNCTION VAR(c_name)
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME  // save all registers as basis for long jump context
@@ -1295,45 +1330,48 @@
 END_FUNCTION art_quick_alloc_object_initialized_region_tlab
 
 DEFINE_FUNCTION art_quick_resolve_string
-    movq 8(%rsp), %rcx                                         // get referrer
-    movl ART_METHOD_DECLARING_CLASS_OFFSET(%rcx), %ecx         // get declaring class
-    movq DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%ecx), %rcx  // get string dex cache
-    movq LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %rdx
-    andq %rdi, %rdx
-    movq (%rcx, %rdx, STRING_DEX_CACHE_ELEMENT_SIZE), %rdx
-    movl %edx, %eax
-    shrq LITERAL(32), %rdx
-    cmp %rdx, %rdi
+    // Custom calling convention: RAX serves as both input and output.
+    PUSH r15
+    PUSH r14
+    movq 24(%rsp), %r15                                         // get referrer
+    movl ART_METHOD_DECLARING_CLASS_OFFSET(%r15), %r15d         // get declaring class
+    movq DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%r15d), %r15  // get string dex cache
+    movl LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %r14d
+    andl %eax, %r14d
+    movq (%r15, %r14, STRING_DEX_CACHE_ELEMENT_SIZE), %r14
+    movl %r14d, %r15d
+    shrq LITERAL(32), %r14
+    cmpl %r14d, %eax
     jne .Lart_quick_resolve_string_slow_path
+    movl %r15d, %eax
+    CFI_REMEMBER_STATE
+    POP r14
+    POP r15
 #ifdef USE_READ_BARRIER
     cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_resolve_string_marking
+    jne .Lnot_null_art_quick_read_barrier_mark_reg00
 #endif
     ret
-// Slow path, the index did not match
+    CFI_RESTORE_STATE
+    CFI_DEF_CFA(rsp, 24)                        // workaround for clang bug: 31975598
+
+// Slow path, the index did not match.
 .Lart_quick_resolve_string_slow_path:
-    SETUP_SAVE_REFS_ONLY_FRAME
-    movq %rcx, %rax
+    SETUP_SAVE_EVERYTHING_FRAME_R14_R15_SAVED
     // Outgoing argument set up
+    movl %eax, %edi                             // pass string index
     movq %gs:THREAD_SELF_OFFSET, %rsi           // pass Thread::Current()
     call SYMBOL(artResolveStringFromCode)       // artResolveStringFromCode(arg0, referrer, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME                // restore frame up to return address
-    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-// GC is marking case, need to check the mark bit.
-.Lart_quick_resolve_string_marking:
-    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%rax)
-    jnz .Lart_quick_resolve_string_no_rb
-    // Save LR so that we can return, also x1 for alignment purposes
-    PUSH rdi
-    PUSH rsi
-    subq LITERAL(8), %rsp                         // 16 byte alignment
-    movq %rax, %rdi
-    call SYMBOL(artReadBarrierMark)
-    addq LITERAL(8), %rsp
-    POP  rsi
-    POP  rdi
-.Lart_quick_resolve_string_no_rb:
+
+    testl %eax, %eax                            // If result is null, deliver the OOME.
+    jz 1f
+    CFI_REMEMBER_STATE
+    RESTORE_SAVE_EVERYTHING_FRAME_KEEP_RAX      // restore frame up to return address
     ret
+    CFI_RESTORE_STATE
+    CFI_DEF_CFA(rsp, FRAME_SIZE_SAVE_EVERYTHING)  // workaround for clang bug: 31975598
+1:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END_FUNCTION art_quick_resolve_string
 
 ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
@@ -2228,6 +2266,7 @@
     // Null check so that we can load the lock word.
     testq REG_VAR(reg), REG_VAR(reg)
     jz .Lret_rb_\name
+.Lnot_null_\name:
     // Check the mark bit, if it is 1 return.
     testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
     jz .Lslow_rb_\name
diff --git a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
index 4311d19..2a3ffab 100644
--- a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
@@ -60,7 +60,11 @@
 extern "C" mirror::String* artResolveStringFromCode(int32_t string_idx, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedQuickEntrypointChecks sqec(self);
-  auto* caller = GetCalleeSaveMethodCaller(self, Runtime::kSaveRefsOnly);
+  auto* caller = GetCalleeSaveMethodCaller(
+      self,
+      // TODO: Change art_quick_resolve_string on MIPS and MIPS64 to kSaveEverything.
+      (kRuntimeISA == kMips || kRuntimeISA == kMips64) ? Runtime::kSaveRefsOnly
+                                                       : Runtime::kSaveEverything);
   mirror::String* result = ResolveStringFromCode(caller, string_idx);
   if (LIKELY(result != nullptr)) {
     // For AOT code, we need a write barrier for the dex cache that holds the GC roots in the .bss.