Check for forwarding address in READ_BARRIER_MARK_REG

When the object is in the from-space, the mark bit is not set.
In this case, we can also check the lock word for being a forwarding
address. The forwarding address case happens around 25% of the time.
This CL adds the case for forwarding address lock words to
READ_BARRIER_MARK_REG.

Reduces total read barriers reaching runtime on ritzperf:
Slow paths: 20758783 -> 15457783

Deleted the mark bit check in MarkFromReadBarrier since most of the
callers check the bit now.

Perf:
ReadBarrier::Mark: 2.59% -> 2.12%
art_quick_read_barrier_mark_reg01: 0.79% -> 0.78%
art_quick_read_barrier_mark_reg00: 0.54% -> 0.50%
art_quick_read_barrier_mark_reg02: 0.31% -> 0.25%

Only X86_64 for now, will do other archs after.

Bug: 30162165

Test: test-art-host

Change-Id: Ie7289d684d0e37a887943d77710092e380457860
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 2856766..e3f39cd 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2301,8 +2301,15 @@
     jz .Lslow_rb_\name
     ret
 .Lslow_rb_\name:
-    // Save all potentially live caller-save core registers.
     PUSH rax
+    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg)), %eax
+    addl LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW), %eax
+    // Jump if overflow, the only case where it overflows should be the forwarding address one.
+    // Taken ~25% of the time.
+    jnae .Lret_overflow\name
+
+    // Save all potentially live caller-save core registers.
+    movq 0(%rsp), %rax
     PUSH rcx
     PUSH rdx
     PUSH rsi
@@ -2367,6 +2374,12 @@
     POP_REG_NE rax, RAW_VAR(reg)
 .Lret_rb_\name:
     ret
+.Lret_overflow\name:
+    // The overflow cleared the top bits.
+    sall LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT), %eax
+    movq %rax, REG_VAR(reg)
+    POP_REG_NE rax, RAW_VAR(reg)
+    ret
     END_FUNCTION VAR(name)
 END_MACRO
 
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 76f500c..b23897b 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -155,7 +155,7 @@
   mirror::Object* ret;
   // TODO: Delete GetMarkBit check when all of the callers properly check the bit. Remaining caller
   // is array allocations.
-  if (from_ref == nullptr || from_ref->GetMarkBit()) {
+  if (from_ref == nullptr) {
     return from_ref;
   }
   // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 6c189b0..f938c9f 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -98,6 +98,10 @@
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled)))
 #define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<int32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW 0x40000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW), (static_cast<uint32_t>(art::LockWord::kStateForwardingAddressOverflow)))
+#define LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT 0x3
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT), (static_cast<uint32_t>(art::LockWord::kForwardingAddressShift)))
 #define LOCK_WORD_GC_STATE_MASK_SHIFTED 0x30000000
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShifted)))
 #define LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED 0xcfffffff
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 538b6eb..e9d06b3 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -61,7 +61,7 @@
  */
 class LockWord {
  public:
-  enum SizeShiftsAndMasks {  // private marker to avoid generate-operator-out.py from processing.
+  enum SizeShiftsAndMasks : uint32_t {  // private marker to avoid generate-operator-out.py from processing.
     // Number of bits to encode the state, currently just fat or thin/unlocked or hash code.
     kStateSize = 2,
     kReadBarrierStateSize = 1,
@@ -91,6 +91,8 @@
     kStateFat = 1,
     kStateHash = 2,
     kStateForwardingAddress = 3,
+    kStateForwardingAddressShifted = kStateForwardingAddress << kStateShift,
+    kStateForwardingAddressOverflow = (1 + kStateMask - kStateForwardingAddress) << kStateShift,
 
     // Read barrier bit.
     kReadBarrierStateShift = kThinLockCountSize + kThinLockCountShift,
@@ -140,7 +142,7 @@
 
   static LockWord FromForwardingAddress(size_t target) {
     DCHECK_ALIGNED(target, (1 << kStateSize));
-    return LockWord((target >> kForwardingAddressShift) | (kStateForwardingAddress << kStateShift));
+    return LockWord((target >> kForwardingAddressShift) | kStateForwardingAddressShifted);
   }
 
   static LockWord FromHashCode(uint32_t hash_code, uint32_t gc_state) {
@@ -256,6 +258,11 @@
   LockWord();
 
   explicit LockWord(uint32_t val) : value_(val) {
+    // Make sure adding the overflow causes an overflow.
+    constexpr uint64_t overflow = static_cast<uint64_t>(kStateForwardingAddressShifted) +
+        static_cast<uint64_t>(kStateForwardingAddressOverflow);
+    constexpr bool is_larger = overflow > static_cast<uint64_t>(0xFFFFFFFF);
+    static_assert(is_larger, "should have overflowed");
     CheckReadBarrierState();
   }
 
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index eb74fcf..f583167 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -771,7 +771,7 @@
         return false;
       }
       // Can't deflate if our lock count is too high.
-      if (monitor->lock_count_ > LockWord::kThinLockMaxCount) {
+      if (static_cast<uint32_t>(monitor->lock_count_) > LockWord::kThinLockMaxCount) {
         return false;
       }
       // Deflate to a thin lock.