optimize contended normal mutex case; add int compare-and-swap atomic
diff --git a/arch/i386/atomic.h b/arch/i386/atomic.h
index e74e453..bf3c336 100644
--- a/arch/i386/atomic.h
+++ b/arch/i386/atomic.h
@@ -49,6 +49,13 @@
 	return t;
 }
 
+static inline int a_cas(volatile int *p, int t, int s) /* atomic int compare-and-swap on *p: t is the expected value, s the replacement */
+{
+	__asm__( "lock ; cmpxchg %3, %1" /* operand 1 is *p, operand 3 is s; cmpxchg writes s to *p only when *p equals the accumulator */
+		: "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); /* t occupies the accumulator as both input and output; s may be in any general register */
+	return t; /* value left in the accumulator: the prior contents of *p, equal to the caller's t exactly when the store took place */
+}
+
 static inline void *a_swap_p(void *volatile *x, void *v)
 {
 	__asm__( "xchg %0, %1" : "=r"(v), "=m"(*(void **)x) : "0"(v) : "memory" );
diff --git a/arch/x86_64/atomic.h b/arch/x86_64/atomic.h
index 7a665c1..04f6c28 100644
--- a/arch/x86_64/atomic.h
+++ b/arch/x86_64/atomic.h
@@ -48,6 +48,13 @@
 	return t;
 }
 
+static inline int a_cas(volatile int *p, int t, int s) /* atomic int compare-and-swap on *p: t is the expected value, s the replacement */
+{
+	__asm__( "lock ; cmpxchgl %3, %1" /* explicit l suffix selects the 32-bit form; operand 1 is *p, operand 3 is s */
+		: "=a"(t), "=m"(*p) : "a"(t), "r"(s) : "memory" ); /* t occupies the accumulator as both input and output; s may be in any general register */
+	return t; /* value left in the accumulator: the prior contents of *p, equal to the caller's t exactly when the store took place */
+}
+
 static inline void *a_swap_p(void *volatile *x, void *v)
 {
 	__asm__( "xchg %0, %1" : "=r"(v), "=m"(*(void **)x) : "0"(v) : "memory" );
diff --git a/src/thread/pthread_mutex_trylock.c b/src/thread/pthread_mutex_trylock.c
index af42147..6fc604f 100644
--- a/src/thread/pthread_mutex_trylock.c
+++ b/src/thread/pthread_mutex_trylock.c
@@ -5,7 +5,7 @@
 	int tid;
 
 	if (m->_m_type == PTHREAD_MUTEX_NORMAL)
-		return -a_xchg(&m->_m_lock, 1) & EBUSY;
+		return (m->_m_lock || a_swap(&m->_m_lock, 1)) ? EBUSY : 0;
 
 	tid = pthread_self()->tid;