Add a peephole to fuse cmpxchg with a later cmp + branch.

The cmpxchg instruction already sets ZF from comparing the value it returns
(the original memory contents) against the expected value, so there is no
need to emit a separate equality compare afterward.

Lots of pexes in the wild have this pattern. Some compare against a
constant, others against a variable.
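
For illustration, a minimal sketch of the kind of compare-and-swap retry
loop that produces the pattern (the function and variable names here are
hypothetical, not taken from any particular pex); the prev == expected
check lowers to an icmp eq plus a conditional branch right after the
cmpxchg:

    #include <cstdint>

    // Under PNaCl the __sync builtin is rewritten to the atomic cmpxchg
    // intrinsic, and the prev == expected check becomes an icmp eq feeding
    // a conditional branch -- the sequence the new peephole fuses into the
    // ZF that lock cmpxchg already sets.
    uint32_t increment(volatile uint32_t *counter) {
      uint32_t expected = *counter;
      while (true) {
        uint32_t prev =
            __sync_val_compare_and_swap(counter, expected, expected + 1);
        if (prev == expected)
          break;          // compare + branch that the peephole folds away
        expected = prev;  // retry with the freshly observed value
      }
      return expected;
    }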

BUG=https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/413903002
diff --git a/crosstest/test_sync_atomic.cpp b/crosstest/test_sync_atomic.cpp
index 05d0336..9cfb963 100644
--- a/crosstest/test_sync_atomic.cpp
+++ b/crosstest/test_sync_atomic.cpp
@@ -54,9 +54,21 @@
 FOR_ALL_RMWOP_TYPES(X)
 #undef X
 
-#define X(type)                                                          \
-  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) { \
-    return __sync_val_compare_and_swap(ptr, oldval, newval);             \
+#define X(type)                                                                \
+  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) {       \
+    return __sync_val_compare_and_swap(ptr, oldval, newval);                   \
+  }                                                                            \
+  type test_val_cmp_swap_loop(volatile type *ptr, type oldval, type newval) {  \
+    type prev;                                                                 \
+    type succeeded_first_try = 1;                                              \
+    while (1) {                                                                \
+      prev = __sync_val_compare_and_swap(ptr, oldval, newval);                 \
+      if (prev == oldval)                                                      \
+        break;                                                                 \
+      succeeded_first_try = 0;                                                 \
+      oldval = prev;                                                           \
+    }                                                                          \
+    return succeeded_first_try;                                                \
   }
 
 ATOMIC_TYPE_TABLE
diff --git a/crosstest/test_sync_atomic.h b/crosstest/test_sync_atomic.h
index a88cd73..ce24a07 100644
--- a/crosstest/test_sync_atomic.h
+++ b/crosstest/test_sync_atomic.h
@@ -22,8 +22,9 @@
 FOR_ALL_RMWOP_TYPES(X)
 #undef X
 
-#define X(type)   \
-  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval);
+#define X(type)                                                                \
+  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval);        \
+  type test_val_cmp_swap_loop(volatile type *ptr, type oldval, type newval);
 
 ATOMIC_TYPE_TABLE
 #undef X
diff --git a/crosstest/test_sync_atomic_main.cpp b/crosstest/test_sync_atomic_main.cpp
index 0cae7cd..6a00f31 100644
--- a/crosstest/test_sync_atomic_main.cpp
+++ b/crosstest/test_sync_atomic_main.cpp
@@ -127,33 +127,43 @@
 template <typename Type>
 void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests,
                            size_t &Passes, size_t &Failures) {
-  for (size_t i = 0; i < NumValues; ++i) {
-    Type Value1 = static_cast<Type>(Values[i]);
-    for (size_t j = 0; j < NumValues; ++j) {
-      Type Value2 = static_cast<Type>(Values[j]);
-      for (size_t f = 0; f < 2; ++f) {
-        bool flip = f;
-        ++TotalTests;
-        *AtomicLoc = Value1;
-        Type ResultSz1 = Subzero_::test_val_cmp_swap(
-            AtomicLoc, flip ? Value2 : Value1, Value2);
-        Type ResultSz2 = *AtomicLoc;
-        *AtomicLoc = Value1;
-        Type ResultLlc1 = test_val_cmp_swap(
-            AtomicLoc, flip ? Value2 : Value1, Value2);
-        Type ResultLlc2 = *AtomicLoc;
-        if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
-          ++Passes;
-        } else {
-          ++Failures;
-          std::cout << "test_val_cmp_swap" << (CHAR_BIT * sizeof(Type)) << "("
-                    << static_cast<uint64_t>(Value1) << ", "
-                    << static_cast<uint64_t>(Value2)
-                    << "): sz1=" << static_cast<uint64_t>(ResultSz1)
-                    << " llc1=" << static_cast<uint64_t>(ResultLlc1)
-                    << " sz2=" << static_cast<uint64_t>(ResultSz2)
-                    << " llc2=" << static_cast<uint64_t>(ResultLlc2)
-                    << "\n";
+  typedef Type (*FuncType)(volatile Type *, Type, Type);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {{"val_cmp_swap", test_val_cmp_swap, Subzero_::test_val_cmp_swap},
+               {"val_cmp_swap_loop", test_val_cmp_swap_loop,
+                Subzero_::test_val_cmp_swap_loop}};
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    for (size_t i = 0; i < NumValues; ++i) {
+      Type Value1 = static_cast<Type>(Values[i]);
+      for (size_t j = 0; j < NumValues; ++j) {
+        Type Value2 = static_cast<Type>(Values[j]);
+        for (size_t k = 0; k < 2; ++k) {
+          bool flip = k;
+          ++TotalTests;
+          *AtomicLoc = Value1;
+          Type ResultSz1 =
+              Funcs[f].FuncSz(AtomicLoc, flip ? Value2 : Value1, Value2);
+          Type ResultSz2 = *AtomicLoc;
+          *AtomicLoc = Value1;
+          Type ResultLlc1 =
+              Funcs[f].FuncLlc(AtomicLoc, flip ? Value2 : Value1, Value2);
+          Type ResultLlc2 = *AtomicLoc;
+          if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
+            ++Passes;
+          } else {
+            ++Failures;
+            std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
+                      << "(" << static_cast<uint64_t>(Value1) << ", "
+                      << static_cast<uint64_t>(Value2)
+                      << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+                      << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+                      << " sz2=" << static_cast<uint64_t>(ResultSz2)
+                      << " llc2=" << static_cast<uint64_t>(ResultLlc2) << "\n";
+          }
         }
       }
     }
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 2f2897c..22915b2 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -104,7 +104,8 @@
 // added before any branch instruction, and also if the block ends
 // with a compare instruction followed by a branch instruction that we
 // may want to fuse, it's better to insert the new assignments before
-// the compare instruction.
+// the compare instruction. The tryOptimizedCmpxchgCmpBr() method
+// assumes this ordering of instructions.
 //
 // Note that this transformation takes the Phi dest variables out of
 // SSA form, as there may be assignments to the dest variable in
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 05ae121..2a9b8c4 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -38,12 +38,12 @@
   Inst->updateVars(getNode());
 }
 
-void LoweringContext::skipDeleted(InstList::iterator &I) {
+void LoweringContext::skipDeleted(InstList::iterator &I) const {
   while (I != End && (*I)->isDeleted())
     ++I;
 }
 
-void LoweringContext::advance(InstList::iterator &I) {
+void LoweringContext::advance(InstList::iterator &I) const {
   if (I != End) {
     ++I;
     skipDeleted(I);
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index da52adf..ec86cff 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -41,6 +41,12 @@
       return NULL;
     return *Next;
   }
+  Inst *getNextInst(InstList::iterator &Iter) const {
+    advance(Iter);
+    if (Iter == End)
+      return NULL;
+    return *Iter;
+  }
   CfgNode *getNode() const { return Node; }
   bool atEnd() const { return Cur == End; }
   InstList::iterator getCur() const { return Cur; }
@@ -68,8 +74,8 @@
   // End is a copy of Insts.end(), used if Next needs to be advanced.
   InstList::iterator End;
 
-  void skipDeleted(InstList::iterator &I);
-  void advance(InstList::iterator &I);
+  void skipDeleted(InstList::iterator &I) const;
+  void advance(InstList::iterator &I) const;
   LoweringContext(const LoweringContext &) LLVM_DELETED_FUNCTION;
   LoweringContext &operator=(const LoweringContext &) LLVM_DELETED_FUNCTION;
 };
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 7c60d08..2db795b 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2654,12 +2654,9 @@
     Operand *PtrToMem = Instr->getArg(0);
     Operand *Expected = Instr->getArg(1);
     Operand *Desired = Instr->getArg(2);
+    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
+      return;
     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
-    // TODO(jvoung): If we peek ahead a few instructions and see how
-    // DestPrev is used (typically via another compare and branch),
-    // we may be able to optimize. If the result truly is used by a
-    // compare + branch, and the comparison is for equality, then we can
-    // optimize out the later compare, and fuse with the later branch.
     return;
   }
   case Intrinsics::AtomicFence:
@@ -2975,6 +2972,79 @@
   _mov(DestPrev, T_eax);
 }
 
+bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
+                                           Operand *Expected,
+                                           Operand *Desired) {
+  if (Ctx->getOptLevel() == Opt_m1)
+    return false;
+  // Peek ahead a few instructions and see how Dest is used.
+  // It's very common to have:
+  //
+  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
+  // [%y_phi = ...] // list of phi stores
+  // %p = icmp eq i32 %x, %expected
+  // br i1 %p, label %l1, label %l2
+  //
+  // which we can optimize into:
+  //
+  // %x = <cmpxchg code>
+  // [%y_phi = ...] // list of phi stores
+  // br eq, %l1, %l2
+  InstList::iterator I = Context.getCur();
+  // I is currently the InstIntrinsicCall. Peek past that.
+  // This assumes that the atomic cmpxchg has not been lowered yet,
+  // so that the instructions seen in the scan from "Cur" are simple.
+  assert(llvm::isa<InstIntrinsicCall>(*I));
+  Inst *NextInst = Context.getNextInst(I);
+  if (!NextInst)
+    return false;
+  // There might be phi assignments right before the compare+branch, since this
+  // could be a backward branch for a loop. This placement of assignments is
+  // determined by placePhiStores().
+  std::vector<InstAssign *> PhiAssigns;
+  while (InstAssign *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
+    if (PhiAssign->getDest() == Dest)
+      return false;
+    PhiAssigns.push_back(PhiAssign);
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+  }
+  if (InstIcmp *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
+    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
+          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
+           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
+      return false;
+    }
+    NextInst = Context.getNextInst(I);
+    if (!NextInst)
+      return false;
+    if (InstBr *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
+      if (!NextBr->isUnconditional() &&
+          NextCmp->getDest() == NextBr->getCondition() &&
+          NextBr->isLastUse(NextCmp->getDest())) {
+        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
+        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
+          // Lower the phi assignments now, before the branch (same placement
+          // as before).
+          InstAssign *PhiAssign = PhiAssigns[i];
+          lowerAssign(PhiAssign);
+          PhiAssign->setDeleted();
+          Context.advanceNext();
+        }
+        _br(InstX8632::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
+        // Skip over the old compare and branch by deleting them.
+        NextCmp->setDeleted();
+        NextBr->setDeleted();
+        Context.advanceNext();
+        Context.advanceNext();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
                                  Operand *Ptr, Operand *Val) {
   bool NeedsCmpxchg = false;
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 6f09a90..daca0cd 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -97,8 +97,12 @@
   virtual void doAddressOptLoad();
   virtual void doAddressOptStore();
 
+  // Naive lowering of cmpxchg.
   void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
                           Operand *Desired);
+  // Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
+  bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
+                                Operand *Expected, Operand *Desired);
   void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                       Operand *Val);
   void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll b/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll
new file mode 100644
index 0000000..5972df9
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll
@@ -0,0 +1,150 @@
+; This tests the optimization of atomic cmpxchg with a following cmp + branch.
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=O2
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s --check-prefix=OM1
+; RUN: %llvm2ice -O2 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice -Om1 --verbose none %s \
+; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32)
+
+
+; Test that a cmpxchg followed by icmp eq and branch can be optimized to
+; reuse the flags set by the cmpxchg instruction itself.
+; This is only expected to work with O2, based on lightweight liveness
+; analysis (or if we had some other means of detecting the only use).
+declare void @use_value(i32);
+
+define i32 @test_atomic_cmpxchg_loop(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  br label %loop
+
+loop:
+  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
+  %succeeded_first_try = phi i32 [ 1, %entry ], [ 2, %loop ]
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
+                                                i32 %desired, i32 6, i32 6)
+  %success = icmp eq i32 %expected_loop, %old
+  br i1 %success, label %done, label %loop
+
+done:
+  call void @use_value(i32 %old)
+  ret i32 %succeeded_first_try
+}
+; O2-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}loop
+; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
+; O2-NOT: cmp
+; Make sure the phi assignment for succeeded_first_try is still there.
+; O2: mov {{.*}}, 2
+; O2-NOT: cmp
+; O2: je
+; O2-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}done
+; Make sure the call isn't accidentally deleted.
+; O2: call
+;
+; Check that the unoptimized version still has a cmp.
+; OM1-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}loop
+; OM1: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
+; OM1: cmp
+; OM1: je
+; OM1-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}done
+; OM1: call
+
+; Still works if the compare operands are flipped.
+define i32 @test_atomic_cmpxchg_loop2(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  br label %loop
+
+loop:
+  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
+                                                i32 %desired, i32 6, i32 6)
+  %success = icmp eq i32 %old, %expected_loop
+  br i1 %success, label %done, label %loop
+
+done:
+  ret i32 %old
+}
+; O2-LABEL: .Ltest_atomic_cmpxchg_loop2{{.*}}loop
+; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
+; O2-NOT: cmp
+; O2: je
+
+
+; Still works if the expected value used in the compare is a constant.
+define i32 @test_atomic_cmpxchg_loop_const(i32 %iptr, i32 %desired) {
+entry:
+  br label %loop
+
+loop:
+  %succeeded_first_try = phi i32 [ 1, %entry ], [ 0, %loop ]
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0,
+                                                i32 %desired, i32 6, i32 6)
+  %success = icmp eq i32 %old, 0
+  br i1 %success, label %done, label %loop
+
+done:
+  ret i32 %succeeded_first_try
+}
+; O2-LABEL: .Ltest_atomic_cmpxchg_loop_const{{.*}}loop
+; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
+; O2-NOT: cmp
+; O2: je
+
+; This is a case where the flags cannot be reused (compare is for some
+; other condition).
+define i32 @test_atomic_cmpxchg_no_opt(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  br label %loop
+
+loop:
+  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
+                                                i32 %desired, i32 6, i32 6)
+  %success = icmp sgt i32 %old, %expected
+  br i1 %success, label %done, label %loop
+
+done:
+  ret i32 %old
+}
+; O2-LABEL: .Ltest_atomic_cmpxchg_no_opt{{.*}}loop
+; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
+; O2: mov {{.*}}
+; O2: cmp
+; O2: jg
+
+; Another case where the flags cannot be reused (the comparison result
+; is used somewhere else).
+define i32 @test_atomic_cmpxchg_no_opt2(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+  br label %loop
+
+loop:
+  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
+  %ptr = inttoptr i32 %iptr to i32*
+  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
+                                                i32 %desired, i32 6, i32 6)
+  %success = icmp eq i32 %old, %expected
+  br i1 %success, label %done, label %loop
+
+done:
+  %r = zext i1 %success to i32
+  ret i32 %r
+}
+; O2-LABEL: .Ltest_atomic_cmpxchg_no_opt2{{.*}}loop
+; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
+; O2: mov {{.*}}
+; O2: cmp
+; O2: je
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index a852e87..2c325df 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -2,7 +2,7 @@
 ; size allowed.
 
 ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
-; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2
 ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
 ; RUN: %llvm2ice -O2 --verbose none %s \
 ; RUN:               | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
@@ -815,23 +815,6 @@
 ; CHECK-DAG: mov ebx
 ; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
 
-define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr, i32 %expected, i32 %desired) {
-entry:
-  br label %loop
-
-loop:
-  %cmp = phi i32 [ %expected, %entry ], [ %old, %loop ]
-  %ptr = inttoptr i32 %iptr to i32*
-  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
-                                                i32 %desired, i32 6, i32 6)
-  %success = icmp eq i32 %cmp, %old
-  br i1 %success, label %done, label %loop
-
-done:
-  ret i32 %old
-}
-; CHECK-LABEL: test_atomic_cmpxchg_32_loop
-
 ;;;; Fence and is-lock-free.
 
 define void @test_atomic_fence() {
@@ -879,9 +862,9 @@
 ; CHECK-LABEL: test_atomic_is_lock_free_ignored
 ; CHECK: mov {{.*}}, 0
 ; This can get optimized out, because it's side-effect-free.
-; CHECKO2REM-LABEL: test_atomic_is_lock_free_ignored
-; CHECKO2REM-NOT: mov {{.*}}, 1
-; CHECKO2REM: mov {{.*}}, 0
+; CHECKO2-LABEL: test_atomic_is_lock_free_ignored
+; CHECKO2-NOT: mov {{.*}}, 1
+; CHECKO2: mov {{.*}}, 0
 
 ; TODO(jvoung): at some point we can take advantage of the
 ; fact that nacl.atomic.is.lock.free will resolve to a constant