Improve aarch64 MonitorEntry/Exit assembly code

We make two kinds of changes:

1) We remove some redundant moves, which appear to have been copied
from an architecture with a two-address instruction format. Since
aarch64 instructions take a separate destination register, the extra
mov into the scratch register before each and is unnecessary.

2) We avoid dmb barrier instructions and instead use acquire/release
(ldaxr/stlxr/stlr) semantics on the actual lock-word loads and stores,
as sketched in the snippet below.
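
A minimal C++ sketch of the difference for reference (illustrative
only; this is not ART's LockWord code, and lock_word,
try_lock_with_fence, etc. are made-up names). With explicit fences a
compiler typically emits ldxr/stxr plus dmb; with acquire/release
orderings on the accesses themselves it emits ldaxr/stlxr/stlr, which
is what the rewritten entrypoints do by hand:

    #include <atomic>

    std::atomic<unsigned> lock_word{0};  // 0 == unlocked; nonzero == owner thread id

    // Fence-based acquisition: relaxed exclusive access followed by a
    // standalone barrier, typically lowered to ldxr/stxr plus dmb ishld.
    bool try_lock_with_fence(unsigned self_id) {
      unsigned expected = 0;
      if (!lock_word.compare_exchange_strong(expected, self_id,
                                             std::memory_order_relaxed)) {
        return false;
      }
      std::atomic_thread_fence(std::memory_order_acquire);  // dmb ishld
      return true;
    }

    // Ordering attached to the access itself: the exclusive load becomes
    // ldaxr and no separate dmb is emitted.
    bool try_lock_with_acquire(unsigned self_id) {
      unsigned expected = 0;
      return lock_word.compare_exchange_strong(expected, self_id,
                                               std::memory_order_acquire,
                                               std::memory_order_relaxed);
    }

    // Release side: a store-release (stlr), or stlxr when an exclusive
    // store is required, instead of dmb ish followed by a plain str.
    void unlock_with_release() {
      lock_word.store(0, std::memory_order_release);
    }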

(2) is a clear win on A53/A57, where there seems to be very little
additional cost to acquire/release semantics when they are attached to
"exclusive" memory operations, as they are here. On the cores used in
2016 Pixel phones the story is more mixed, but adding acquire/release
to a pair of exclusive load/store operations still appears to cost
enough less than two dmb's that, even if 10% of lock acquisitions are
nested and therefore enforce ordering unnecessarily, we come out
slightly ahead. ARM's advice for the future is also to move in this
direction.
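
For concreteness, the break-even argument above can be written out as
follows (a rough sketch; the cycle costs are placeholders, not
measurements):

    #include <cstdio>

    int main() {
      // Placeholder inputs, not measurements: relative cost of one dmb and
      // the extra cost of making an exclusive load/store pair acquire/release.
      const double nested_fraction = 0.10;  // acquisitions that are nested (ordering unneeded)
      const double dmb_cost        = 1.0;
      const double acqrel_extra    = 1.5;

      // Old scheme: two dmb's per non-nested lock/unlock pair, none when nested.
      const double old_cost = (1.0 - nested_fraction) * 2.0 * dmb_cost;
      // New scheme: the acquire/release cost is paid on every acquisition,
      // needed or not.
      const double new_cost = acqrel_extra;

      std::printf("old=%.2f new=%.2f -> %s\n", old_cost, new_cost,
                  new_cost < old_cost ? "acquire/release wins" : "dmb wins");
    }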

Test: AOSP boots. AOSP art test failures seem attributable to other
issues.

Change-Id: I2399baeab3df93196471e65612c00d95ad4e2b62
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index bfbe481..7cb50b7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1183,15 +1183,13 @@
     add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET  // exclusive load/store has no immediate anymore
 .Lretry_lock:
     ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
-    ldxr   w1, [x4]
-    mov    x3, x1
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
+    ldaxr  w1, [x4]                   // acquire needed only in most common case
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cbnz   w3, .Lnot_unlocked         // already thin locked
     // unlocked case - x1: original lock word that's zero except for the read barrier bits.
     orr    x2, x1, x2                 // x2 holds thread id with count of 0 with preserved read barrier bits
     stxr   w3, w2, [x4]
     cbnz   w3, .Llock_stxr_fail       // store failed, retry
-    dmb    ishld                      // full (LoadLoad|LoadStore) memory barrier
     ret
 .Lnot_unlocked:  // x1: original lock word
     lsr    w3, w1, LOCK_WORD_STATE_SHIFT
@@ -1200,8 +1198,7 @@
     uxth   w2, w2                     // zero top 16 bits
     cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
                                       // else contention, go to slow path
-    mov    x3, x1                     // copy the lock word to check count overflow.
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
     add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
     lsr    w3, w2, #LOCK_WORD_GC_STATE_SHIFT     // if the first gc state bit is set, we overflowed.
     cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
@@ -1246,23 +1243,19 @@
     lsr    w2, w1, LOCK_WORD_STATE_SHIFT
     cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
     ldr    w2, [xSELF, #THREAD_ID_OFFSET]
-    mov    x3, x1                     // copy lock word to check thread id equality
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w3, w3                     // zero top 16 bits
     cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
-    mov    x3, x1                     // copy lock word to detect transition to unlocked
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
     // transition to unlocked
-    mov    x3, x1
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved read barrier bits
-    dmb    ish                        // full (LoadStore|StoreStore) memory barrier
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved read barrier bits
 #ifndef USE_READ_BARRIER
-    str    w3, [x4]
+    stlr   w3, [x4]
 #else
-    stxr   w2, w3, [x4]               // Need to use atomic instructions for read barrier
+    stlxr  w2, w3, [x4]               // Need to use atomic instructions for read barrier
     cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
 #endif
     ret
@@ -1276,7 +1269,7 @@
 #endif
     ret
 .Lunlock_stxr_fail:
-    b      .Lretry_unlock               // retry
+    b      .Lretry_unlock             // retry
 .Lslow_unlock:
     SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case exception allocation triggers GC
     mov    x1, xSELF                  // pass Thread::Current