[ARM] Emit clrex in the expanded cmpxchg fail block.

ARM counterpart to r248291:

In the comparison failure block of a cmpxchg expansion, the initial
ldrex/ldxr will not be followed by a matching strex/stxr.
On ARM/AArch64, this unnecessarily ties up the exclusive monitor,
which might have a negative performance impact on some
microarchitectures.

Instead, release the monitor in the failure block.
The clrex instruction was designed for this: use it.

Also see the ARM Architecture Reference Manual (ARMv8-A), section B2.10.2:
"Exclusive access instructions and Shareable memory locations".

Differential Revision: http://reviews.llvm.org/D13033

llvm-svn: 248294
diff --git a/llvm/test/CodeGen/ARM/cmpxchg-weak.ll b/llvm/test/CodeGen/ARM/cmpxchg-weak.ll
index 126e330..1eac9c4 100644
--- a/llvm/test/CodeGen/ARM/cmpxchg-weak.ll
+++ b/llvm/test/CodeGen/ARM/cmpxchg-weak.ll
@@ -5,16 +5,24 @@
 
   %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
   %oldval = extractvalue { i32, i1 } %pair, 0
-; CHECK:     dmb ish
-; CHECK:     ldrex   [[LOADED:r[0-9]+]], [r0]
-; CHECK:     cmp     [[LOADED]], r1
-; CHECK:     strexeq [[SUCCESS:r[0-9]+]], r2, [r0]
-; CHECK:     cmpeq   [[SUCCESS]], #0
-; CHECK:     bne     [[DONE:LBB[0-9]+_[0-9]+]]
-; CHECK:     dmb     ish
-; CHECK: [[DONE]]:
-; CHECK:     str     r3, [r0]
-; CHECK:     bx      lr
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT:     dmb ish
+; CHECK-NEXT:     ldrex   [[LOADED:r[0-9]+]], [r0]
+; CHECK-NEXT:     cmp     [[LOADED]], r1
+; CHECK-NEXT:     bne     [[LDFAILBB:LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: BB#1:
+; CHECK-NEXT:     strex   [[SUCCESS:r[0-9]+]], r2, [r0]
+; CHECK-NEXT:     cmp     [[SUCCESS]], #0
+; CHECK-NEXT:     bne     [[FAILBB:LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT:     dmb     ish
+; CHECK-NEXT:     str     r3, [r0]
+; CHECK-NEXT:     bx      lr
+; CHECK-NEXT: [[LDFAILBB]]:
+; CHECK-NEXT:     clrex
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT:     str     r3, [r0]
+; CHECK-NEXT:     bx      lr
 
   store i32 %oldval, i32* %addr
   ret void
@@ -27,17 +35,23 @@
   %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
   %success = extractvalue { i32, i1 } %pair, 1
 
-; CHECK:      dmb     ish
-; CHECK:      mov     r0, #0
-; CHECK:      ldrex   [[LOADED:r[0-9]+]], [r1]
-; CHECK:      cmp     [[LOADED]], r2
-; CHECK:      strexeq [[STATUS:r[0-9]+]], r3, [r1]
-; CHECK:      cmpeq   [[STATUS]], #0
-; CHECK:      bne     [[DONE:LBB[0-9]+_[0-9]+]]
-; CHECK:      dmb     ish
-; CHECK:      mov     r0, #1
-; CHECK: [[DONE]]:
-; CHECK:      bx      lr
+; CHECK-NEXT: BB#0:
+; CHECK-NEXT:     dmb ish
+; CHECK-NEXT:     ldrex   [[LOADED:r[0-9]+]], [r1]
+; CHECK-NEXT:     cmp     [[LOADED]], r2
+; CHECK-NEXT:     bne     [[LDFAILBB:LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: BB#1:
+; CHECK-NEXT:     strex   [[SUCCESS:r[0-9]+]], r3, [r1]
+; CHECK-NEXT:     mov     r0, #0
+; CHECK-NEXT:     cmp     [[SUCCESS]], #0
+; CHECK-NEXT:     bxne    lr
+; CHECK-NEXT:     dmb     ish
+; CHECK-NEXT:     mov     r0, #1
+; CHECK-NEXT:     bx      lr
+; CHECK-NEXT: [[LDFAILBB]]:
+; CHECK-NEXT:     clrex
+; CHECK-NEXT:     mov     r0, #0
+; CHECK-NEXT:     bx      lr
 
   ret i1 %success
 }