Improve aarch64 MonitorEntry/Exit assembly code

We make two kinds of changes:

1) We remove some redundant moves, which appear to have been copied
from an architecture with a two-address instruction format. Since
aarch64 instructions take a separate destination register, the extra
mov into the scratch register before each and is unnecessary.

2) We avoid dmb barrier instructions and instead use acquire/release
(ldaxr/stlxr/stlr) semantics on the actual lock-word loads and stores,
as sketched in the snippet below.
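
A minimal C++ sketch of the difference for reference (illustrative
only; this is not ART's LockWord code, and lock_word,
try_lock_with_fence, etc. are made-up names). With explicit fences a
compiler typically emits ldxr/stxr plus dmb; with acquire/release
orderings on the accesses themselves it emits ldaxr/stlxr/stlr, which
is what the rewritten entrypoints do by hand:

    #include <atomic>

    std::atomic<unsigned> lock_word{0};  // 0 == unlocked; nonzero == owner thread id

    // Fence-based acquisition: relaxed exclusive access followed by a
    // standalone barrier, typically lowered to ldxr/stxr plus dmb ishld.
    bool try_lock_with_fence(unsigned self_id) {
      unsigned expected = 0;
      if (!lock_word.compare_exchange_strong(expected, self_id,
                                             std::memory_order_relaxed)) {
        return false;
      }
      std::atomic_thread_fence(std::memory_order_acquire);  // dmb ishld
      return true;
    }

    // Ordering attached to the access itself: the exclusive load becomes
    // ldaxr and no separate dmb is emitted.
    bool try_lock_with_acquire(unsigned self_id) {
      unsigned expected = 0;
      return lock_word.compare_exchange_strong(expected, self_id,
                                               std::memory_order_acquire,
                                               std::memory_order_relaxed);
    }

    // Release side: a store-release (stlr), or stlxr when an exclusive
    // store is required, instead of dmb ish followed by a plain str.
    void unlock_with_release() {
      lock_word.store(0, std::memory_order_release);
    }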

(2) is a clear win on A53/A57, where there seems to be very little
additional cost to acquire/release semantics when they are attached to
"exclusive" memory operations, as they are here. On the cores used in
2016 Pixel phones the story is more mixed, but adding acquire/release
to a pair of exclusive load/store operations still appears to cost
enough less than two dmb's that, even if 10% of lock acquisitions are
nested and therefore enforce ordering unnecessarily, we come out
slightly ahead. ARM's advice for the future is also to move in this
direction.
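
For concreteness, the break-even argument above can be written out as
follows (a rough sketch; the cycle costs are placeholders, not
measurements):

    #include <cstdio>

    int main() {
      // Placeholder inputs, not measurements: relative cost of one dmb and
      // the extra cost of making an exclusive load/store pair acquire/release.
      const double nested_fraction = 0.10;  // acquisitions that are nested (ordering unneeded)
      const double dmb_cost        = 1.0;
      const double acqrel_extra    = 1.5;

      // Old scheme: two dmb's per non-nested lock/unlock pair, none when nested.
      const double old_cost = (1.0 - nested_fraction) * 2.0 * dmb_cost;
      // New scheme: the acquire/release cost is paid on every acquisition,
      // needed or not.
      const double new_cost = acqrel_extra;

      std::printf("old=%.2f new=%.2f -> %s\n", old_cost, new_cost,
                  new_cost < old_cost ? "acquire/release wins" : "dmb wins");
    }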

Test: AOSP boots. AOSP art test failures seem attributable to other
issues.

Change-Id: I2399baeab3df93196471e65612c00d95ad4e2b62
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index bfbe481..7cb50b7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1183,15 +1183,13 @@
     add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET  // exclusive load/store has no immediate anymore
 .Lretry_lock:
     ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
-    ldxr   w1, [x4]
-    mov    x3, x1
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
+    ldaxr  w1, [x4]                   // acquire needed only in most common case
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cbnz   w3, .Lnot_unlocked         // already thin locked
     // unlocked case - x1: original lock word that's zero except for the read barrier bits.
     orr    x2, x1, x2                 // x2 holds thread id with count of 0 with preserved read barrier bits
     stxr   w3, w2, [x4]
     cbnz   w3, .Llock_stxr_fail       // store failed, retry
-    dmb    ishld                      // full (LoadLoad|LoadStore) memory barrier
     ret
 .Lnot_unlocked:  // x1: original lock word
     lsr    w3, w1, LOCK_WORD_STATE_SHIFT
@@ -1200,8 +1198,7 @@
     uxth   w2, w2                     // zero top 16 bits
     cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
                                       // else contention, go to slow path
-    mov    x3, x1                     // copy the lock word to check count overflow.
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
     add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
     lsr    w3, w2, #LOCK_WORD_GC_STATE_SHIFT     // if the first gc state bit is set, we overflowed.
     cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
@@ -1246,23 +1243,19 @@
     lsr    w2, w1, LOCK_WORD_STATE_SHIFT
     cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
     ldr    w2, [xSELF, #THREAD_ID_OFFSET]
-    mov    x3, x1                     // copy lock word to check thread id equality
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w3, w3                     // zero top 16 bits
     cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
-    mov    x3, x1                     // copy lock word to detect transition to unlocked
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
     // transition to unlocked
-    mov    x3, x1
-    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved read barrier bits
-    dmb    ish                        // full (LoadStore|StoreStore) memory barrier
+    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved read barrier bits
 #ifndef USE_READ_BARRIER
-    str    w3, [x4]
+    stlr   w3, [x4]
 #else
-    stxr   w2, w3, [x4]               // Need to use atomic instructions for read barrier
+    stlxr  w2, w3, [x4]               // Need to use atomic instructions for read barrier
     cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
 #endif
     ret
@@ -1276,7 +1269,7 @@
 #endif
     ret
 .Lunlock_stxr_fail:
-    b      .Lretry_unlock               // retry
+    b      .Lretry_unlock             // retry
 .Lslow_unlock:
     SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case exception allocation triggers GC
     mov    x1, xSELF                  // pass Thread::Current