Baseline JIT: update inline caches in compiled code.

This is a step towards removing profiling from the interpreter, in
order to speed up interpreter performance.
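
The baseline compiler now emits a check against the inline cache's first
entry (the fast path for a monomorphic cache) before virtual and interface
calls, and calls the new pUpdateInlineCache entrypoint on a miss. Roughly,
the update stubs implement the following (a C++ sketch of the assembly,
not actual runtime code; TryClaimSlot stands in for the ldrex/strex or
lock cmpxchg sequence):

  void UpdateInlineCache(InlineCache* cache, mirror::Class* cls) {
    // The first kIndividualCacheSize - 1 entries are filled atomically.
    for (size_t i = 0; i + 1u < InlineCache::kIndividualCacheSize; ++i) {
      while (true) {
        mirror::Class* entry = cache->classes_[i].Read();
        if (entry == cls) {
          return;  // Class already recorded in this slot.
        }
        if (entry != nullptr) {
          break;  // Slot holds another class; try the next one.
        }
        if (TryClaimSlot(cache, i, cls)) {
          return;  // We installed the class in the empty slot.
        }
        // Lost the race to another thread; re-examine the slot.
      }
    }
    // All other slots are taken: store unconditionally, the cache is
    // megamorphic from now on.
    cache->classes_[InlineCache::kIndividualCacheSize - 1u] =
        GcRoot<mirror::Class>(cls);
  }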

Bug: 119800099
Test: test.py --baseline
Change-Id: Ica1fa41a889b31262d9f5691b30a31fbcec01b34
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 10397e8..0162311 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -18,7 +18,7 @@
 
 #include "arch/arm64/asm_support_arm64.h"
 #include "arch/arm64/instruction_set_features_arm64.h"
-#include "art_method.h"
+#include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
 #include "class_table.h"
@@ -4041,6 +4041,26 @@
   HandleInvoke(invoke);
 }
 
+void CodeGeneratorARM64::MaybeGenerateInlineCacheCheck(HInstruction* instruction,
+                                                       Register klass) {
+  DCHECK_EQ(klass.GetCode(), 0u);
+  if (GetCompilerOptions().IsBaseline() && !Runtime::Current()->IsAotCompiler()) {
+    DCHECK(!instruction->GetEnvironment()->IsFromInlinedInvoke());
+    ScopedObjectAccess soa(Thread::Current());
+    ProfilingInfo* info = GetGraph()->GetArtMethod()->GetProfilingInfo(kRuntimePointerSize);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint64_t address = reinterpret_cast64<uint64_t>(cache);
+    vixl::aarch64::Label done;
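+    // art_quick_update_inline_cache expects the class in x0 and the inline
+    // cache in x8, and may clobber x9-x15 (see quick_entrypoints_arm64.S).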
+    __ Mov(x8, address);
+    __ Ldr(x9, MemOperand(x8, InlineCache::ClassesOffset().Int32Value()));
+    // Fast path for a monomorphic cache.
+    __ Cmp(klass, x9);
+    __ B(eq, &done);
+    InvokeRuntime(kQuickUpdateInlineCache, instruction, instruction->GetDexPc());
+    __ Bind(&done);
+  }
+}
+
 void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
   LocationSummary* locations = invoke->GetLocations();
@@ -4049,13 +4069,6 @@
   Offset class_offset = mirror::Object::ClassOffset();
   Offset entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize);
 
-  // The register ip1 is required to be used for the hidden argument in
-  // art_quick_imt_conflict_trampoline, so prevent VIXL from using it.
-  MacroAssembler* masm = GetVIXLAssembler();
-  UseScratchRegisterScope scratch_scope(masm);
-  scratch_scope.Exclude(ip1);
-  __ Mov(ip1, invoke->GetDexMethodIndex());
-
   // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted.
   if (receiver.IsStackSlot()) {
     __ Ldr(temp.W(), StackOperandFrom(receiver));
@@ -4080,6 +4093,17 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   GetAssembler()->MaybeUnpoisonHeapReference(temp.W());
+
+  // If we're compiling baseline, update the inline cache.
+  codegen_->MaybeGenerateInlineCacheCheck(invoke, temp);
+
+  // The register ip1 is required to be used for the hidden argument in
+  // art_quick_imt_conflict_trampoline, so prevent VIXL from using it.
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);
+  scratch_scope.Exclude(ip1);
+  __ Mov(ip1, invoke->GetDexMethodIndex());
+
   __ Ldr(temp,
       MemOperand(temp, mirror::Class::ImtPtrOffset(kArm64PointerSize).Uint32Value()));
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
@@ -4260,6 +4284,10 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   GetAssembler()->MaybeUnpoisonHeapReference(temp.W());
+
+  // If we're compiling baseline, update the inline cache.
+  MaybeGenerateInlineCacheCheck(invoke, temp);
+
   // temp = temp->GetMethodAt(method_offset);
   __ Ldr(temp, MemOperand(temp, method_offset));
   // lr = temp->GetEntryPoint();
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index a669094..253e915 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -786,6 +786,8 @@
     CodeGenerator::MaybeRecordImplicitNullCheck(instr);
   }
 
+  void MaybeGenerateInlineCacheCheck(HInstruction* instruction, vixl::aarch64::Register klass);
+
  private:
   // Encoding of thunk type and data for link-time generated thunks for Baker read barriers.
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 1c69dd6..68e2dfa 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -18,7 +18,7 @@
 
 #include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
-#include "art_method.h"
+#include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
 #include "class_table.h"
@@ -34,6 +34,7 @@
 #include "linker/linker_patch.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utils/arm/assembler_arm_vixl.h"
 #include "utils/arm/managed_register_arm.h"
@@ -3297,6 +3298,28 @@
   invoke->GetLocations()->AddTemp(LocationFrom(r12));
 }
 
+void CodeGeneratorARMVIXL::MaybeGenerateInlineCacheCheck(HInstruction* instruction,
+                                                         vixl32::Register klass) {
+  DCHECK_EQ(r0.GetCode(), klass.GetCode());
+  if (GetCompilerOptions().IsBaseline() && !Runtime::Current()->IsAotCompiler()) {
+    DCHECK(!instruction->GetEnvironment()->IsFromInlinedInvoke());
+    ScopedObjectAccess soa(Thread::Current());
+    ProfilingInfo* info = GetGraph()->GetArtMethod()->GetProfilingInfo(kRuntimePointerSize);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint32_t address = reinterpret_cast32<uint32_t>(cache);
+    vixl32::Label done;
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    temps.Exclude(ip);
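+    // art_quick_update_inline_cache expects the class in r0 and the inline
+    // cache in r4, and uses ip as a temporary (see quick_entrypoints_arm.S).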
+    __ Mov(r4, address);
+    __ Ldr(ip, MemOperand(r4, InlineCache::ClassesOffset().Int32Value()));
+    // Fast path for a monomorphic cache.
+    __ Cmp(klass, ip);
+    __ B(eq, &done, /* is_far_target= */ false);
+    InvokeRuntime(kQuickUpdateInlineCache, instruction, instruction->GetDexPc());
+    __ Bind(&done);
+  }
+}
+
 void InstructionCodeGeneratorARMVIXL::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
   LocationSummary* locations = invoke->GetLocations();
@@ -3324,10 +3347,15 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   GetAssembler()->MaybeUnpoisonHeapReference(temp);
+
+  // If we're compiling baseline, update the inline cache.
+  codegen_->MaybeGenerateInlineCacheCheck(invoke, temp);
+
   GetAssembler()->LoadFromOffset(kLoadWord,
                                  temp,
                                  temp,
                                  mirror::Class::ImtPtrOffset(kArmPointerSize).Uint32Value());
+
   uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement(
       invoke->GetImtIndex(), kArmPointerSize));
   // temp = temp->GetImtEntryAt(method_offset);
@@ -8906,6 +8934,9 @@
   // concurrent copying collector may not in the future).
   GetAssembler()->MaybeUnpoisonHeapReference(temp);
 
+  // If we're compiling baseline, update the inline cache.
+  MaybeGenerateInlineCacheCheck(invoke, temp);
+
   // temp = temp->GetMethodAt(method_offset);
   uint32_t entry_point = ArtMethod::EntryPointFromQuickCompiledCodeOffset(
       kArmPointerSize).Int32Value();
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index fae615d..3d4c231 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -756,6 +756,8 @@
     CodeGenerator::MaybeRecordImplicitNullCheck(instr);
   }
 
+  void MaybeGenerateInlineCacheCheck(HInstruction* instruction, vixl32::Register klass);
+
  private:
   // Encoding of thunk type and data for link-time generated thunks for Baker read barriers.
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 797fe32..ac36ce3 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -16,7 +16,7 @@
 
 #include "code_generator_x86.h"
 
-#include "art_method.h"
+#include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
 #include "compiled_method.h"
@@ -27,10 +27,12 @@
 #include "heap_poisoning.h"
 #include "intrinsics.h"
 #include "intrinsics_x86.h"
+#include "jit/profiling_info.h"
 #include "linker/linker_patch.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utils/assembler.h"
 #include "utils/stack_checks.h"
@@ -2260,6 +2262,10 @@
   }
 
   HandleInvoke(invoke);
+  if (codegen_->GetCompilerOptions().IsBaseline() && !Runtime::Current()->IsAotCompiler()) {
+    // Add one temporary for inline cache update.
+    invoke->GetLocations()->AddTemp(Location::RegisterLocation(EBP));
+  }
 }
 
 void LocationsBuilderX86::HandleInvoke(HInvoke* invoke) {
@@ -2283,6 +2289,34 @@
   HandleInvoke(invoke);
   // Add the hidden argument.
   invoke->GetLocations()->AddTemp(Location::FpuRegisterLocation(XMM7));
+
+  if (codegen_->GetCompilerOptions().IsBaseline() && !Runtime::Current()->IsAotCompiler()) {
+    // Add one temporary for inline cache update.
+    invoke->GetLocations()->AddTemp(Location::RegisterLocation(EBP));
+  }
+}
+
+void CodeGeneratorX86::MaybeGenerateInlineCacheCheck(HInstruction* instruction, Register klass) {
+  DCHECK_EQ(EAX, klass);
+  if (GetCompilerOptions().IsBaseline() && !Runtime::Current()->IsAotCompiler()) {
+    DCHECK(!instruction->GetEnvironment()->IsFromInlinedInvoke());
+    ScopedObjectAccess soa(Thread::Current());
+    ProfilingInfo* info = GetGraph()->GetArtMethod()->GetProfilingInfo(kRuntimePointerSize);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint32_t address = reinterpret_cast32<uint32_t>(cache);
+    if (kIsDebugBuild) {
+      uint32_t temp_index = instruction->GetLocations()->GetTempCount() - 1u;
+      CHECK_EQ(EBP, instruction->GetLocations()->GetTemp(temp_index).AsRegister<Register>());
+    }
+    Register temp = EBP;
+    NearLabel done;
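+    // art_quick_update_inline_cache expects the class in EAX and the inline
+    // cache in EBP (see quick_entrypoints_x86.S).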
+    __ movl(temp, Immediate(address));
+    // Fast path for a monomorphic cache.
+    __ cmpl(klass, Address(temp, InlineCache::ClassesOffset().Int32Value()));
+    __ j(kEqual, &done);
+    GenerateInvokeRuntime(GetThreadOffset<kX86PointerSize>(kQuickUpdateInlineCache).Int32Value());
+    __ Bind(&done);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) {
@@ -2316,6 +2350,9 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
+
+  codegen_->MaybeGenerateInlineCacheCheck(invoke, temp);
+
   // temp = temp->GetAddressOfIMT()
   __ movl(temp,
       Address(temp, mirror::Class::ImtPtrOffset(kX86PointerSize).Uint32Value()));
@@ -4939,6 +4976,9 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
+
+  MaybeGenerateInlineCacheCheck(invoke, temp);
+
   // temp = temp->GetMethodAt(method_offset);
   __ movl(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 6bf6b0b..e305b50 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -624,6 +624,8 @@
   void GenerateImplicitNullCheck(HNullCheck* instruction) override;
   void GenerateExplicitNullCheck(HNullCheck* instruction) override;
 
+  void MaybeGenerateInlineCacheCheck(HInstruction* instruction, Register klass);
+
   // When we don't know the proper offset for the value, we use kDummy32BitOffset.
   // The correct value will be inserted when processing Assembler fixups.
   static constexpr int32_t kDummy32BitOffset = 256;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index cd8c609..100a86b 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -16,7 +16,7 @@
 
 #include "code_generator_x86_64.h"
 
-#include "art_method.h"
+#include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
 #include "compiled_method.h"
@@ -26,11 +26,13 @@
 #include "heap_poisoning.h"
 #include "intrinsics.h"
 #include "intrinsics_x86_64.h"
+#include "jit/profiling_info.h"
 #include "linker/linker_patch.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/object_reference.h"
+#include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utils/assembler.h"
 #include "utils/stack_checks.h"
@@ -1068,6 +1070,9 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
+
+  MaybeGenerateInlineCacheCheck(invoke->GetDexPc(), temp);
+
   // temp = temp->GetMethodAt(method_offset);
   __ movq(temp, Address(temp, method_offset));
   // call temp->GetEntryPoint();
@@ -2520,6 +2525,24 @@
   invoke->GetLocations()->AddTemp(Location::RegisterLocation(RAX));
 }
 
+void CodeGeneratorX86_64::MaybeGenerateInlineCacheCheck(uint32_t dex_pc, CpuRegister klass) {
+  DCHECK_EQ(RDI, klass.AsRegister());
+  if (GetCompilerOptions().IsBaseline() && !Runtime::Current()->IsAotCompiler()) {
+    ScopedObjectAccess soa(Thread::Current());
+    ProfilingInfo* info = GetGraph()->GetArtMethod()->GetProfilingInfo(kRuntimePointerSize);
+    InlineCache* cache = info->GetInlineCache(dex_pc);
+    uint64_t address = reinterpret_cast64<uint64_t>(cache);
+    NearLabel done;
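+    // art_quick_update_inline_cache expects the class in EDI and the inline
+    // cache in R11 (TMP); RAX and R10 may be clobbered
+    // (see quick_entrypoints_x86_64.S).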
+    __ movq(CpuRegister(TMP), Immediate(address));
+    // Fast path for a monomorphic cache.
+    __ cmpl(Address(CpuRegister(TMP), InlineCache::ClassesOffset().Int32Value()), klass);
+    __ j(kEqual, &done);
+    GenerateInvokeRuntime(
+        GetThreadOffset<kX86_64PointerSize>(kQuickUpdateInlineCache).Int32Value());
+    __ Bind(&done);
+  }
+}
+
 void InstructionCodeGeneratorX86_64::VisitInvokeInterface(HInvokeInterface* invoke) {
   // TODO: b/18116999, our IMTs can miss an IncompatibleClassChangeError.
   LocationSummary* locations = invoke->GetLocations();
@@ -2528,11 +2551,6 @@
   Location receiver = locations->InAt(0);
   size_t class_offset = mirror::Object::ClassOffset().SizeValue();
 
-  // Set the hidden argument. This is safe to do this here, as RAX
-  // won't be modified thereafter, before the `call` instruction.
-  DCHECK_EQ(RAX, hidden_reg.AsRegister());
-  codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
-
   if (receiver.IsStackSlot()) {
     __ movl(temp, Address(CpuRegister(RSP), receiver.GetStackIndex()));
     // /* HeapReference<Class> */ temp = temp->klass_
@@ -2550,6 +2568,15 @@
   // intact/accessible until the end of the marking phase (the
   // concurrent copying collector may not in the future).
   __ MaybeUnpoisonHeapReference(temp);
+
+  codegen_->MaybeGenerateInlineCacheCheck(invoke->GetDexPc(), temp);
+
+  // Set the hidden argument. This is safe to do here, as RAX
+  // won't be modified thereafter, before the `call` instruction.
+  // We also do it after MaybeGenerateInlineCacheCheck, which may use RAX.
+  DCHECK_EQ(RAX, hidden_reg.AsRegister());
+  codegen_->Load64BitValue(hidden_reg, invoke->GetDexMethodIndex());
+
   // temp = temp->GetAddressOfIMT()
   __ movq(temp,
       Address(temp, mirror::Class::ImtPtrOffset(kX86_64PointerSize).Uint32Value()));
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index ef8f5ac..20db423 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -600,6 +600,8 @@
   void GenerateNop() override;
   void GenerateImplicitNullCheck(HNullCheck* instruction) override;
   void GenerateExplicitNullCheck(HNullCheck* instruction) override;
+  void MaybeGenerateInlineCacheCheck(uint32_t dex_pc, CpuRegister cls);
+
 
   // When we don't know the proper offset for the value, we use kDummy32BitOffset.
   // We will fix this up in the linker later to have the right value.
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index 11600a8..319e359 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -471,7 +471,7 @@
   EXPECT_EQ(56U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(8U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(167 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(168 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 7ac9147..6b9393a 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -2742,3 +2742,67 @@
     blx r1                                        // Call the wrapped method.
     pop {r4, pc}
 END ExecuteSwitchImplAsm
+
+// r0 contains the class, r4 contains the inline cache. We can use ip as temporary.
+ENTRY art_quick_update_inline_cache
+#if (INLINE_CACHE_SIZE != 5)
+#error "INLINE_CACHE_SIZE not as expected."
+#endif
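+    // Scan the first four entries: a hit returns immediately, an empty slot is
+    // claimed with ldrex/strex, and a slot holding another class moves on to
+    // the next entry. The fifth entry is stored unconditionally once the cache
+    // is megamorphic.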
+.Lentry1:
+    ldr ip, [r4, #INLINE_CACHE_CLASSES_OFFSET]
+    cmp ip, r0
+    beq .Ldone
+    cmp ip, #0
+    bne .Lentry2
+    ldrex ip, [r4, #INLINE_CACHE_CLASSES_OFFSET]
+    cmp ip, #0
+    bne .Lentry1
+    strex  ip, r0, [r4, #INLINE_CACHE_CLASSES_OFFSET]
+    cmp ip, #0
+    bne .Ldone
+    b .Lentry1
+.Lentry2:
+    ldr ip, [r4, #INLINE_CACHE_CLASSES_OFFSET+4]
+    cmp ip, r0
+    beq .Ldone
+    cmp ip, #0
+    bne .Lentry3
+    ldrex ip, [r4, #INLINE_CACHE_CLASSES_OFFSET+4]
+    cmp ip, #0
+    bne .Lentry2
+    strex  ip, r0, [r4, #INLINE_CACHE_CLASSES_OFFSET+4]
+    cmp ip, #0
+    bne .Ldone
+    b .Lentry2
+.Lentry3:
+    ldr ip, [r4, #INLINE_CACHE_CLASSES_OFFSET+8]
+    cmp ip, r0
+    beq .Ldone
+    cmp ip, #0
+    bne .Lentry4
+    ldrex ip, [r4, #INLINE_CACHE_CLASSES_OFFSET+8]
+    cmp ip, #0
+    bne .Lentry3
+    strex  ip, r0, [r4, #INLINE_CACHE_CLASSES_OFFSET+8]
+    cmp ip, #0
+    bne .Ldone
+    b .Lentry3
+.Lentry4:
+    ldr ip, [r4, #INLINE_CACHE_CLASSES_OFFSET+12]
+    cmp ip, r0
+    beq .Ldone
+    cmp ip, #0
+    bne .Lentry5
+    ldrex ip, [r4, #INLINE_CACHE_CLASSES_OFFSET+12]
+    cmp ip, #0
+    bne .Lentry4
+    strex  ip, r0, [r4, #INLINE_CACHE_CLASSES_OFFSET+12]
+    cmp ip, #0
+    bne .Ldone
+    b .Lentry4
+.Lentry5:
+    // Unconditionally store, the inline cache is megamorphic.
+    str  r0, [r4, #INLINE_CACHE_CLASSES_OFFSET+16]
+.Ldone:
+    blx lr
+END art_quick_update_inline_cache
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 00a9aa1..5665e18 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2874,3 +2874,59 @@
     RESTORE_TWO_REGS_DECREASE_FRAME x19, xLR, 16
     ret
 END ExecuteSwitchImplAsm
+
+// x0 contains the class, x8 contains the inline cache. x9-x15 can be used.
+ENTRY art_quick_update_inline_cache
+#if (INLINE_CACHE_SIZE != 5)
+#error "INLINE_CACHE_SIZE not as expected."
+#endif
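+    // Same scheme as the arm32 stub: scan the first four entries (hit: return,
+    // empty: claim with ldxr/stxr, otherwise: next entry); the fifth entry is
+    // stored unconditionally once the cache is megamorphic.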
+.Lentry1:
+    ldr w9, [x8, #INLINE_CACHE_CLASSES_OFFSET]
+    cmp w9, w0
+    beq .Ldone
+    cbnz w9, .Lentry2
+    add x10, x8, #INLINE_CACHE_CLASSES_OFFSET
+    ldxr w9, [x10]
+    cbnz w9, .Lentry1
+    stxr  w9, w0, [x10]
+    cbz   w9, .Ldone
+    b .Lentry1
+.Lentry2:
+    ldr w9, [x8, #INLINE_CACHE_CLASSES_OFFSET+4]
+    cmp w9, w0
+    beq .Ldone
+    cbnz w9, .Lentry3
+    add x10, x8, #INLINE_CACHE_CLASSES_OFFSET+4
+    ldxr w9, [x10]
+    cbnz w9, .Lentry2
+    stxr  w9, w0, [x10]
+    cbz   w9, .Ldone
+    b .Lentry2
+.Lentry3:
+    ldr w9, [x8, #INLINE_CACHE_CLASSES_OFFSET+8]
+    cmp w9, w0
+    beq .Ldone
+    cbnz w9, .Lentry4
+    add x10, x8, #INLINE_CACHE_CLASSES_OFFSET+8
+    ldxr w9, [x10]
+    cbnz w9, .Lentry3
+    stxr  w9, w0, [x10]
+    cbz   w9, .Ldone
+    b .Lentry3
+.Lentry4:
+    ldr w9, [x8, #INLINE_CACHE_CLASSES_OFFSET+12]
+    cmp w9, w0
+    beq .Ldone
+    cbnz w9, .Lentry5
+    add x10, x8, #INLINE_CACHE_CLASSES_OFFSET+12
+    ldxr w9, [x10]
+    cbnz w9, .Lentry4
+    stxr  w9, w0, [x10]
+    cbz   w9, .Ldone
+    b .Lentry4
+.Lentry5:
+    // Unconditionally store, the inline cache is megamorphic.
+    str  w0, [x8, #INLINE_CACHE_CLASSES_OFFSET+16]
+.Ldone:
+    ret
+END art_quick_update_inline_cache
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 1819f57..61d0aad 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -2519,5 +2519,58 @@
     ret
 END_FUNCTION ExecuteSwitchImplAsm
 
+// On entry: eax is the class, ebp is the inline cache.
+DEFINE_FUNCTION art_quick_update_inline_cache
+#if (INLINE_CACHE_SIZE != 5)
+#error "INLINE_CACHE_SIZE not as expected."
+#endif
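+    // Scan the first four entries: a hit returns, an empty slot is claimed
+    // with lock cmpxchg, and a slot holding another class moves on to the next
+    // entry. The fifth entry is stored unconditionally once the cache is
+    // megamorphic.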
+    PUSH ecx
+    movl %eax, %ecx // eax will be used for cmpxchg
+.Lentry1:
+    movl INLINE_CACHE_CLASSES_OFFSET(%ebp), %eax
+    cmpl %ecx, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry2
+    lock cmpxchg %ecx, INLINE_CACHE_CLASSES_OFFSET(%ebp)
+    jz .Ldone
+    jmp .Lentry1
+.Lentry2:
+    movl (INLINE_CACHE_CLASSES_OFFSET+4)(%ebp), %eax
+    cmpl %ecx, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry3
+    lock cmpxchg %ecx, (INLINE_CACHE_CLASSES_OFFSET+4)(%ebp)
+    jz .Ldone
+    jmp .Lentry2
+.Lentry3:
+    movl (INLINE_CACHE_CLASSES_OFFSET+8)(%ebp), %eax
+    cmpl %ecx, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry4
+    lock cmpxchg %ecx, (INLINE_CACHE_CLASSES_OFFSET+8)(%ebp)
+    jz .Ldone
+    jmp .Lentry3
+.Lentry4:
+    movl (INLINE_CACHE_CLASSES_OFFSET+12)(%ebp), %eax
+    cmpl %ecx, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry5
+    lock cmpxchg %ecx, (INLINE_CACHE_CLASSES_OFFSET+12)(%ebp)
+    jz .Ldone
+    jmp .Lentry4
+.Lentry5:
+    // Unconditionally store, the cache is megamorphic.
+    movl %ecx, (INLINE_CACHE_CLASSES_OFFSET+16)(%ebp)
+.Ldone:
+    // Restore registers
+    movl %ecx, %eax
+    POP ecx
+    ret
+END_FUNCTION art_quick_update_inline_cache
+
     // TODO: implement these!
 UNIMPLEMENTED art_quick_memcmp16
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index e1e9dcc..e37ed42 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2467,3 +2467,51 @@
     POP rbx                  // Restore RBX
     ret
 END_FUNCTION ExecuteSwitchImplAsm
+
+// On entry: edi is the class, r11 is the inline cache. r10 and rax are available.
+DEFINE_FUNCTION art_quick_update_inline_cache
+#if (INLINE_CACHE_SIZE != 5)
+#error "INLINE_CACHE_SIZE not as expected."
+#endif
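+    // Same scheme as the x86 stub: hit: return, empty slot: claim with
+    // lock cmpxchg, otherwise: next entry; the fifth entry is stored
+    // unconditionally once the cache is megamorphic.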
+.Lentry1:
+    movl INLINE_CACHE_CLASSES_OFFSET(%r11), %eax
+    cmpl %edi, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry2
+    lock cmpxchg %edi, INLINE_CACHE_CLASSES_OFFSET(%r11)
+    jz .Ldone
+    jmp .Lentry1
+.Lentry2:
+    movl (INLINE_CACHE_CLASSES_OFFSET+4)(%r11), %eax
+    cmpl %edi, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry3
+    lock cmpxchg %edi, (INLINE_CACHE_CLASSES_OFFSET+4)(%r11)
+    jz .Ldone
+    jmp .Lentry2
+.Lentry3:
+    movl (INLINE_CACHE_CLASSES_OFFSET+8)(%r11), %eax
+    cmpl %edi, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry4
+    lock cmpxchg %edi, (INLINE_CACHE_CLASSES_OFFSET+8)(%r11)
+    jz .Ldone
+    jmp .Lentry3
+.Lentry4:
+    movl (INLINE_CACHE_CLASSES_OFFSET+12)(%r11), %eax
+    cmpl %edi, %eax
+    je .Ldone
+    cmpl LITERAL(0), %eax
+    jne .Lentry5
+    lock cmpxchg %edi, (INLINE_CACHE_CLASSES_OFFSET+12)(%r11)
+    jz .Ldone
+    jmp .Lentry4
+.Lentry5:
+    // Unconditionally store, the cache is megamorphic.
+    movl %edi, (INLINE_CACHE_CLASSES_OFFSET+16)(%r11)
+.Ldone:
+    ret
+END_FUNCTION art_quick_update_inline_cache
diff --git a/runtime/entrypoints/quick/quick_default_externs.h b/runtime/entrypoints/quick/quick_default_externs.h
index aa32113..42f962e 100644
--- a/runtime/entrypoints/quick/quick_default_externs.h
+++ b/runtime/entrypoints/quick/quick_default_externs.h
@@ -130,4 +130,7 @@
 extern "C" void art_quick_throw_stack_overflow(void*);
 extern "C" void art_quick_throw_string_bounds(int32_t index, int32_t limit);
 
+// Inline cache.
+extern "C" void art_quick_update_inline_cache();
+
 #endif  // ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_DEFAULT_EXTERNS_H_
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index f5bb5a3..d41f9a0 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -124,6 +124,9 @@
 
   // StringBuilder append
   qpoints->pStringBuilderAppend = art_quick_string_builder_append;
+
+  // InlineCache update
+  qpoints->pUpdateInlineCache = art_quick_update_inline_cache;
 }
 
 }  // namespace art
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index 21e248c..efab7c2 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -171,6 +171,8 @@
 \
   V(StringBuilderAppend, void*, uint32_t) \
 \
+  V(UpdateInlineCache, void, void) \
+\
   V(ReadBarrierJni, void, mirror::CompressedReference<mirror::Object>*, Thread*) \
   V(ReadBarrierMarkReg00, mirror::Object*, mirror::Object*) \
   V(ReadBarrierMarkReg01, mirror::Object*, mirror::Object*) \
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 5235f65..210d851 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -336,9 +336,12 @@
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pNewStringFromStringBuilder, pStringBuilderAppend,
                          sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pStringBuilderAppend, pReadBarrierJni,
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pStringBuilderAppend, pUpdateInlineCache,
                          sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierJni, pReadBarrierMarkReg00, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pUpdateInlineCache, pReadBarrierJni,
+                         sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierJni, pReadBarrierMarkReg00,
+                         sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierMarkReg00, pReadBarrierMarkReg01,
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierMarkReg01, pReadBarrierMarkReg02,
diff --git a/runtime/jit/profiling_info.h b/runtime/jit/profiling_info.h
index f6139bb..d4dc498 100644
--- a/runtime/jit/profiling_info.h
+++ b/runtime/jit/profiling_info.h
@@ -21,6 +21,7 @@
 
 #include "base/macros.h"
 #include "gc_root.h"
+#include "offsets.h"
 
 namespace art {
 
@@ -39,8 +40,13 @@
 // Once the classes_ array is full, we consider the INVOKE to be megamorphic.
 class InlineCache {
  public:
+  // This is hard coded in the assembly stub art_quick_update_inline_cache.
   static constexpr uint8_t kIndividualCacheSize = 5;
 
+  static constexpr MemberOffset ClassesOffset() {
+    return MemberOffset(OFFSETOF_MEMBER(InlineCache, classes_));
+  }
+
  private:
   uint32_t dex_pc_;
   GcRoot<mirror::Class> classes_[kIndividualCacheSize];
@@ -99,15 +105,6 @@
     return saved_entry_point_;
   }
 
-  void ClearGcRootsInInlineCaches() {
-    for (size_t i = 0; i < number_of_inline_caches_; ++i) {
-      InlineCache* cache = &cache_[i];
-      memset(&cache->classes_[0],
-             0,
-             InlineCache::kIndividualCacheSize * sizeof(GcRoot<mirror::Class>));
-    }
-  }
-
   // Increments the number of times this method is currently being inlined.
   // Returns whether it was successful, that is it could increment without
   // overflowing.
diff --git a/runtime/oat.h b/runtime/oat.h
index a3f8722..3b20ea1 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Revert^4 Boot image extension.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '6', '\0' } };
+  // Last oat version changed reason: pUpdateInlineCache entrypoint.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '7', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/tools/cpp-define-generator/asm_defines.def b/tools/cpp-define-generator/asm_defines.def
index 7a77e8e..9aad8a4 100644
--- a/tools/cpp-define-generator/asm_defines.def
+++ b/tools/cpp-define-generator/asm_defines.def
@@ -26,6 +26,7 @@
 #include "mirror_dex_cache.def"
 #include "mirror_object.def"
 #include "mirror_string.def"
+#include "profiling_info.def"
 #include "rosalloc.def"
 #include "runtime.def"
 #include "shadow_frame.def"
diff --git a/tools/cpp-define-generator/profiling_info.def b/tools/cpp-define-generator/profiling_info.def
new file mode 100644
index 0000000..6d77b9d
--- /dev/null
+++ b/tools/cpp-define-generator/profiling_info.def
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if ASM_DEFINE_INCLUDE_DEPENDENCIES
+#include "jit/profiling_info.h"
+#endif
+
+ASM_DEFINE(INLINE_CACHE_SIZE, art::InlineCache::kIndividualCacheSize);
+ASM_DEFINE(INLINE_CACHE_CLASSES_OFFSET, art::InlineCache::ClassesOffset().Int32Value());