ARM/ARM64: Improve lock/unlock entrypoints.
Do the same work in fewer instructions: load the thread id before the retry loop
and fold the lock word state and owner thread id checks into a single test.
Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing
Change-Id: I8003481116fd3dc6a1559b84fdc776b92dba0c68
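
For review context, here is a minimal C++ sketch of the logic the new
art_quick_lock_object fast path implements (one iteration of the LDAXR/STXR
retry loop). This is not ART code: the constants are illustrative stand-ins
for the LOCK_WORD_* macros, assuming the thin-lock layout the assembly relies
on (owner thread id in the low 16 bits, recursion count above it, GC/read
barrier and state bits at the top), and names such as LockFastPath are made
up for this sketch.

    #include <cstdint>

    // Illustrative stand-ins for the LOCK_WORD_* constants used by the assembly.
    constexpr uint32_t kGcStateMaskToggled = 0xCFFFFFFFu;   // All bits except the GC/read barrier bits.
    constexpr uint32_t kStateMaskShifted   = 0xC0000000u;   // Lock word state field.
    constexpr uint32_t kOwnerMaskShifted   = 0x0000FFFFu;   // Thin lock owner thread id.
    constexpr uint32_t kCountOne           = 1u << 16;      // One recursion count increment.
    constexpr uint32_t kCountMaskShifted   = 0xFFFu << 16;  // Recursion count field.

    enum class FastPath { kStoreAndReturn, kGoSlowPath };

    // One pass over a freshly loaded lock word; on kStoreAndReturn the caller
    // attempts the STXR with *new_word and restarts the loop if the store fails.
    FastPath LockFastPath(uint32_t lock_word, uint32_t thread_id, uint32_t* new_word) {
      uint32_t x = lock_word ^ thread_id;            // EOR: store value if unlocked, owner check otherwise.
      if ((lock_word & kGcStateMaskToggled) == 0) {  // TST: only GC bits set, i.e. unlocked.
        *new_word = x;                               // Thread id, count of 0, preserved GC bits.
        return FastPath::kStoreAndReturn;
      }
      // One TST covers both "state is thin lock" and "owner is this thread".
      if ((x & (kStateMaskShifted | kOwnerMaskShifted)) != 0) {
        return FastPath::kGoSlowPath;
      }
      uint32_t incremented = lock_word + kCountOne;  // Recursive lock: bump the count.
      if ((incremented & kCountMaskShifted) == 0) {  // Count wrapped to zero: overflow.
        return FastPath::kGoSlowPath;
      }
      *new_word = incremented;
      return FastPath::kStoreAndReturn;
    }

The point of the EOR is that one value serves two purposes: if the word was
unlocked it is already the new lock word to store, and otherwise its low
16 bits hold the owner-vs-self comparison, so the state and owner checks can
share a single TST against a combined mask.
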
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index ac5b2b8..14d0cc7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1151,45 +1151,36 @@
*/
.extern artLockObjectFromCode
ENTRY art_quick_lock_object
- cbz w0, .Lslow_lock
- add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET // exclusive load/store has no immediate anymore
+ ldr w1, [xSELF, #THREAD_ID_OFFSET]
+ cbz w0, art_quick_lock_object_no_inline
+ // Exclusive load/store instructions take no immediate offset, so compute the address first.
+ add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
.Lretry_lock:
- ldr w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
- ldaxr w1, [x4] // acquire needed only in most common case
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
- cbnz w3, .Lnot_unlocked // already thin locked
- // unlocked case - x1: original lock word that's zero except for the read barrier bits.
- orr x2, x1, x2 // x2 holds thread id with count of 0 with preserved read barrier bits
- stxr w3, w2, [x4]
- cbnz w3, .Llock_stxr_fail // store failed, retry
+ ldaxr w2, [x4] // Acquire needed only in most common case.
+ eor w3, w2, w1 // Prepare the value to store if unlocked
+ // (thread id, count of 0 and preserved read barrier bits),
+ // or prepare to compare thread id for recursive lock check
+ // (lock_word.ThreadId() ^ self->ThreadId()).
+ tst w2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // Test the non-gc bits.
+ b.ne .Lnot_unlocked // Check if unlocked.
+ // unlocked case - store w3: original lock word plus thread id, preserved read barrier bits.
+ stxr w2, w3, [x4]
+ cbnz w2, .Lretry_lock // If the store failed, retry.
ret
-.Lnot_unlocked: // x1: original lock word
- lsr w3, w1, LOCK_WORD_STATE_SHIFT
- cbnz w3, .Lslow_lock // if either of the top two bits are set, go slow path
- eor w2, w1, w2 // lock_word.ThreadId() ^ self->ThreadId()
- uxth w2, w2 // zero top 16 bits
- cbnz w2, .Lslow_lock // lock word and self thread id's match -> recursive lock
- // else contention, go to slow path
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits.
- add w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE // increment count in lock word placing in w2 to check overflow
- lsr w3, w2, #LOCK_WORD_GC_STATE_SHIFT // if the first gc state bit is set, we overflowed.
- cbnz w3, .Lslow_lock // if we overflow the count go slow path
- add w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE // increment count for real
- stxr w3, w2, [x4]
- cbnz w3, .Llock_stxr_fail // store failed, retry
+.Lnot_unlocked: // w2: original lock word, w1: thread id, w3: w2 ^ w1
+ // Check the lock word state and thread id together.
+ tst w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+ b.ne art_quick_lock_object_no_inline // Go to the slow path if not thin-locked by this thread.
+ add w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE // Increment the recursive lock count.
+ tst w3, #LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED // Test the new thin lock count.
+ b.eq art_quick_lock_object_no_inline // A zero count after the increment means overflow; go to the slow path.
+ stxr w2, w3, [x4]
+ cbnz w2, .Lretry_lock // If the store failed, retry.
ret
-.Llock_stxr_fail:
- b .Lretry_lock // retry
-.Lslow_lock:
- SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case we block
- mov x1, xSELF // pass Thread::Current
- bl artLockObjectFromCode // (Object* obj, Thread*)
- RESTORE_SAVE_REFS_ONLY_FRAME
- REFRESH_MARKING_REGISTER
- RETURN_IF_W0_IS_ZERO_OR_DELIVER
END art_quick_lock_object
ENTRY art_quick_lock_object_no_inline
+ // This is also the slow path for art_quick_lock_object.
SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case we block
mov x1, xSELF // pass Thread::Current
bl artLockObjectFromCode // (Object* obj, Thread*)
@@ -1206,54 +1197,46 @@
*/
.extern artUnlockObjectFromCode
ENTRY art_quick_unlock_object
- cbz x0, .Lslow_unlock
- add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET // exclusive load/store has no immediate anymore
+ ldr w1, [xSELF, #THREAD_ID_OFFSET]
+ cbz x0, art_quick_unlock_object_no_inline
+ // Exclusive load/store instructions take no immediate offset, so compute the address first.
+ add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
.Lretry_unlock:
#ifndef USE_READ_BARRIER
- ldr w1, [x4]
+ ldr w2, [x4]
#else
- ldxr w1, [x4] // Need to use atomic instructions for read barrier
+ ldxr w2, [x4] // Need to use atomic instructions for read barrier.
#endif
- lsr w2, w1, LOCK_WORD_STATE_SHIFT
- cbnz w2, .Lslow_unlock // if either of the top two bits are set, go slow path
- ldr w2, [xSELF, #THREAD_ID_OFFSET]
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
- eor w3, w3, w2 // lock_word.ThreadId() ^ self->ThreadId()
- uxth w3, w3 // zero top 16 bits
- cbnz w3, .Lslow_unlock // do lock word and self thread id's match?
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
- cmp w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
- bpl .Lrecursive_thin_unlock
- // transition to unlocked
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED // w3: zero except for the preserved read barrier bits
+ eor w3, w2, w1 // Prepare the value to store if simply locked
+ // (mostly 0s, and preserved read barrier bits),
+ // or prepare to compare thread id for recursive lock check
+ // (lock_word.ThreadId() ^ self->ThreadId()).
+ tst w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // Test the non-gc bits.
+ b.ne .Lnot_simply_locked // Locked recursively or by other thread?
+ // Transition to unlocked.
#ifndef USE_READ_BARRIER
stlr w3, [x4]
#else
- stlxr w2, w3, [x4] // Need to use atomic instructions for read barrier
- cbnz w2, .Lunlock_stxr_fail // store failed, retry
+ stlxr w2, w3, [x4] // Need to use atomic instructions for read barrier.
+ cbnz w2, .Lretry_unlock // If the store failed, retry.
#endif
ret
-.Lrecursive_thin_unlock: // w1: original lock word
- sub w1, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE // decrement count
+.Lnot_simply_locked:
+ // Check the lock word state and thread id together.
+ tst w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+ b.ne art_quick_unlock_object_no_inline // Go to the slow path if not thin-locked by this thread.
+ sub w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE // Decrement the recursive lock count.
#ifndef USE_READ_BARRIER
- str w1, [x4]
+ str w3, [x4]
#else
- stxr w2, w1, [x4] // Need to use atomic instructions for read barrier
- cbnz w2, .Lunlock_stxr_fail // store failed, retry
+ stxr w2, w3, [x4] // Need to use atomic instructions for read barrier.
+ cbnz w2, .Lretry_unlock // If the store failed, retry.
#endif
ret
-.Lunlock_stxr_fail:
- b .Lretry_unlock // retry
-.Lslow_unlock:
- SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case exception allocation triggers GC
- mov x1, xSELF // pass Thread::Current
- bl artUnlockObjectFromCode // (Object* obj, Thread*)
- RESTORE_SAVE_REFS_ONLY_FRAME
- REFRESH_MARKING_REGISTER
- RETURN_IF_W0_IS_ZERO_OR_DELIVER
END art_quick_unlock_object
ENTRY art_quick_unlock_object_no_inline
+ // This is also the slow path for art_quick_unlock_object.
SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case exception allocation triggers GC
mov x1, xSELF // pass Thread::Current
bl artUnlockObjectFromCode // (Object* obj, Thread*)
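
The unlock fast path uses the same EOR/TST trick. For completeness, a matching
C++ sketch of what one iteration of the new art_quick_unlock_object computes,
with the same illustrative constants and caveats as the locking sketch above:

    #include <cstdint>

    constexpr uint32_t kGcStateMaskToggled = 0xCFFFFFFFu;  // All bits except the GC/read barrier bits.
    constexpr uint32_t kStateMaskShifted   = 0xC0000000u;  // Lock word state field.
    constexpr uint32_t kOwnerMaskShifted   = 0x0000FFFFu;  // Thin lock owner thread id.
    constexpr uint32_t kCountOne           = 1u << 16;     // One recursion count step.

    enum class FastPath { kStoreAndReturn, kGoSlowPath };

    // One pass over the loaded lock word; with USE_READ_BARRIER the caller stores
    // *new_word via STLXR/STXR and restarts the loop if the store fails.
    FastPath UnlockFastPath(uint32_t lock_word, uint32_t thread_id, uint32_t* new_word) {
      uint32_t x = lock_word ^ thread_id;        // EOR: unlocked value if simply locked, owner check otherwise.
      if ((x & kGcStateMaskToggled) == 0) {      // Held exactly once by this thread: only GC bits remain.
        *new_word = x;                           // Transition to unlocked, preserving the GC bits.
        return FastPath::kStoreAndReturn;        // Stored with release semantics (STLR/STLXR) in the assembly.
      }
      // One TST covers both "state is thin lock" and "owner is this thread".
      if ((x & (kStateMaskShifted | kOwnerMaskShifted)) != 0) {
        return FastPath::kGoSlowPath;            // Inflated/forwarding state, or not the lock owner.
      }
      *new_word = lock_word - kCountOne;         // Recursive unlock: decrement the count.
      return FastPath::kStoreAndReturn;
    }
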