Inflate contended lock word by suspending owner.

Bug 6961405.
Don't inflate monitors for Notify and NotifyAll.
Tidy lock word, handle recursive lock case alongside unlocked case and move
assembly out of line (except for ARM quick). Also handle null in out-of-line
assembly as the test is quick and the enter/exit code is already a safepoint.
To gain ownership of a monitor on behalf of another thread, monitor contenders
must not hold the monitor_lock_, so they wait on a condition variable.
Reduce size of per mutex contention log.
Be consistent in calling thin lock thread ids just thread ids.
Fix potential thread death races caused by the use of FindThreadByThreadId;
it is now an invariant that returned threads are either self or suspended.

Code size reduction on ARM boot.oat 0.2%.
Old nexus 7 speedup 0.25%, new nexus 7 speedup 1.4%, nexus 10 speedup 2.24%,
nexus 4 speedup 2.09% on DeltaBlue.

Change-Id: Id52558b914f160d9c8578fdd7fc8199a9598576a
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 5b2dd6c..cb61698 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -318,22 +318,67 @@
 END art_quick_handle_fill_data
 
     /*
-     * Entry from managed code that calls artLockObjectFromCode, may block for GC.
+     * Entry from managed code that calls artLockObjectFromCode, may block for GC. r0 holds the
+     * possibly null object to lock.
      */
     .extern artLockObjectFromCode
 ENTRY art_quick_lock_object
+    cbz    r0, .Lslow_lock            @ null object: defer to the runtime slow path
+.Lretry_lock:
+    ldr    r2, [r9, #THREAD_ID_OFFSET] @ r2 = self thread id; plain load kept out of the ldrex/strex window
+    ldrex  r1, [r0, #LOCK_WORD_OFFSET] @ r1 = current lock word, loaded exclusively
+    cmp    r1, #0
+    bmi    .Lslow_lock                @ lock word contains a monitor
+    bne    .Lalready_thin             @ non-zero: already thin locked by some thread
+    @ unlocked case - r2 holds thread id with count of 0
+    strex  r3, r2, [r0, #LOCK_WORD_OFFSET]
+    cbnz   r3, .Lstrex_fail           @ store failed, retry
+    dmb    ish                        @ acquire: order the lock store before critical-section accesses
+    bx     lr
+.Lstrex_fail:
+    b      .Lretry_lock               @ cbnz only branches forward, so bounce back; reloads r1/r2
+.Lalready_thin:
+    eor    r2, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
+    uxth   r2, r2                     @ zero top 16 bits
+    cbnz   r2, .Lslow_lock            @ ids differ -> contention, go slow; ids match -> recursive lock
+    adds   r2, r1, #65536             @ increment count in lock word placing in r2 for storing
+    bmi    .Lslow_lock                @ if we overflow the count go slow
+    str    r2, [r0, #LOCK_WORD_OFFSET] @ no need for strex as we hold the lock
+    bx     lr
+.Lslow_lock:
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  @ save callee saves in case we block
     mov    r1, r9                     @ pass Thread::Current
     mov    r2, sp                     @ pass SP
     bl     artLockObjectFromCode      @ (Object* obj, Thread*, SP)
-    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_RESULT_IS_ZERO          @ zero result from artLockObjectFromCode: return to caller
+    DELIVER_PENDING_EXCEPTION         @ non-zero result: deliver the pending exception
 END art_quick_lock_object
 
     /*
      * Entry from managed code that calls artUnlockObjectFromCode and delivers exception on failure.
+     * r0 holds the possibly null object to unlock.
      */
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
+    cbz    r0, slow_unlock
+    ldr    r1, [r0, #LOCK_WORD_OFFSET]
+    ldr    r2, [r9, #THREAD_ID_OFFSET]
+    cmp    r1, #0
+    bmi    slow_unlock                @ lock word contains a monitor
+    eor    r3, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
+    uxth   r3, r3                     @ zero top 16 bits
+    cbnz   r3, slow_unlock            @ do lock word and self thread id's match?
+    cmp    r1, #65536
+    bpl    recursive_thin_unlock
+    @ transition to unlocked, r3 holds 0
+    str    r3, [r0, #LOCK_WORD_OFFSET]
+    bx     lr
+recursive_thin_unlock:
+    sub    r1, r1, #65536
+    str    r1, [r0, #LOCK_WORD_OFFSET]
+    bx     lr
+slow_unlock:
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  @ save callee saves in case exception allocation triggers GC
     mov    r1, r9                     @ pass Thread::Current
     mov    r2, sp                     @ pass SP