Performance improvements by removing a DMB and inlining.

Correct the CAS used by Mutex::Lock to have acquire rather than release
semantics.
Don't issue a memory barrier in thread transitions when there is already a
barrier associated with the mutator lock.
Force inlining of the hot thread transition and shared lock code, which is
heavily used by down calls and JNI.
Force inlining of mirror routines that are hot and used by runtime support.

Performance was measured and improved using perf and maps.

Change-Id: I012580e337143236d8b6d06c1e270183ae51083c
diff --git a/src/thread.h b/src/thread.h
index 5e424c1..58de45d 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -169,13 +169,15 @@
   // Transition from non-runnable to runnable state acquiring share on mutator_lock_.
   ThreadState TransitionFromSuspendedToRunnable()
       LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_)
-      SHARED_LOCK_FUNCTION(Locks::mutator_lock_);
+      SHARED_LOCK_FUNCTION(Locks::mutator_lock_)
+      __attribute__ ((always_inline));
 
   // Transition from runnable into a state where mutator privileges are denied. Releases share of
   // mutator lock.
   void TransitionFromRunnableToSuspended(ThreadState new_state)
       LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_)
-      UNLOCK_FUNCTION(Locks::mutator_lock_);
+      UNLOCK_FUNCTION(Locks::mutator_lock_)
+      __attribute__ ((always_inline));
 
   // Wait for a debugger suspension on the thread associated with the given peer. Returns the
   // thread on success, else NULL. If the thread should be suspended then request_suspension should
@@ -215,13 +217,7 @@
 #endif
 
 
-#ifndef NDEBUG
   void AssertThreadSuspensionIsAllowable(bool check_locks = true) const;
-#else
-  void AssertThreadSuspensionIsAllowable(bool check_locks = true) const {
-    UNUSED(check_locks);  // Keep GCC happy about unused parameters.
-  }
-#endif
 
   bool IsDaemon() const {
     return daemon_;