Fix bug when atomic load is fused with an arith op (and not in the entry BB)

Normally, the FakeUse that keeps an atomic load from being elided is
placed on the load's Dest. However, when the load is fused with an add,
the load instruction is deleted and its Dest is never defined. That
trips up the liveness analysis when the fused pair sits in a non-entry
basic block. In that case, the FakeUse should name the add's Dest
instead.
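
To make the failure concrete, here is a self-contained toy model of the
def-use property that the liveness analysis checks. ToyInst,
allUsesDefined, and the variable names are illustration-only inventions,
not Subzero's real types, and the entry-block special case is ignored:

    #include <cassert>
    #include <set>
    #include <string>
    #include <vector>

    struct ToyInst {
      std::string Def;               // variable defined ("" for a FakeUse)
      std::vector<std::string> Uses; // variables used
    };

    static bool allUsesDefined(const std::vector<ToyInst> &Block) {
      std::set<std::string> Defined;
      for (const ToyInst &I : Block) {
        for (const std::string &U : I.Uses)
          if (!Defined.count(U))
            return false; // a use with no reaching definition
        if (!I.Def.empty())
          Defined.insert(I.Def);
      }
      return true;
    }

    int main() {
      // Unfused: the load defines r, so FakeUse(r) is fine.
      std::vector<ToyInst> Unfused = {
          {"r", {}},     // r = atomic load [ptr]
          {"r2", {"r"}}, // r2 = add r, 32
          {"", {"r"}}};  // FakeUse(r)
      assert(allUsesDefined(Unfused));

      // Fused: the load is deleted, r is never defined, FakeUse(r) dangles.
      std::vector<ToyInst> Fused = {
          {"r2", {}},   // r2 = add [ptr], 32 (load folded into the add)
          {"", {"r"}}}; // FakeUse(r)  <-- the bug
      assert(!allUsesDefined(Fused));

      // The fix: put the FakeUse on the add's dest instead.
      std::vector<ToyInst> FusedFixed = {
          {"r2", {}},    // r2 = add [ptr], 32
          {"", {"r2"}}}; // FakeUse(r2)
      assert(allUsesDefined(FusedFixed));
      return 0;
    }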

We have no handle on the add at that point, so introduce a
getLastInserted() helper. A couple of ways to implement it, both
sketched below:
- modify insert() to track the last insertion explicitly
- rewind from Next one step
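
As a toy comparison of the two options (Inst is reduced to an int, and
negative values stand in for deleted instructions; these stand-ins and
ToyLoweringContext are illustration-only assumptions, while the diff
below implements the rewind variant on the real InstList):

    #include <cassert>
    #include <list>

    struct ToyLoweringContext {
      std::list<int> Insts;
      std::list<int>::iterator Next = Insts.end();
      int *LastInserted = nullptr;

      // Option 1: have insert() remember the instruction explicitly.
      void insert(int Inst) { LastInserted = &*Insts.insert(Next, Inst); }

      // Option 2 (what the diff implements): rewind one step from Next,
      // skipping instructions that were deleted after insertion.
      int rewindFromNext() const {
        std::list<int>::const_iterator I = Next;
        assert(I != Insts.begin());
        do {
          --I;
        } while (I != Insts.begin() && *I < 0);
        return *I;
      }
    };

    int main() {
      ToyLoweringContext Ctx;
      Ctx.insert(1);
      Ctx.insert(2);
      *Ctx.LastInserted = -2;            // pretend lowering later deleted it
      assert(*Ctx.LastInserted == -2);   // explicit tracking still sees it
      assert(Ctx.rewindFromNext() == 1); // rewinding skips deleted insts
      return 0;
    }

The rewind variant has to skip instructions that were deleted after
insertion, which is why the diff also adds advanceBackward() and a Begin
iterator to stop at.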

Failing that, we could simply disable the fusing for atomic loads.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/417353003
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 2a9b8c4..0034de5 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -26,11 +26,12 @@
 
 void LoweringContext::init(CfgNode *N) {
   Node = N;
-  Cur = getNode()->getInsts().begin();
+  Begin = getNode()->getInsts().begin();
+  Cur = Begin;
   End = getNode()->getInsts().end();
   skipDeleted(Cur);
   Next = Cur;
-  advance(Next);
+  advanceForward(Next);
 }
 
 void LoweringContext::insert(Inst *Inst) {
@@ -43,13 +44,26 @@
     ++I;
 }
 
-void LoweringContext::advance(InstList::iterator &I) const {
+void LoweringContext::advanceForward(InstList::iterator &I) const {
   if (I != End) {
     ++I;
     skipDeleted(I);
   }
 }
 
+void LoweringContext::advanceBackward(InstList::iterator &I) const {
+  assert(I != Begin);
+  do {
+    --I;
+  } while (I != Begin && (*I)->isDeleted());
+}
+
+Inst *LoweringContext::getLastInserted() const {
+  InstList::iterator Cursor = Next;
+  advanceBackward(Cursor);
+  return *Cursor;
+}
+
 TargetLowering *TargetLowering::createLowering(TargetArch Target, Cfg *Func) {
   // These statements can be #ifdef'd to specialize the code generator
   // to a subset of the available targets.  TODO: use CRTP.
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index ec86cff..91512a9 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -42,7 +42,7 @@
     return *Next;
   }
   Inst *getNextInst(InstList::iterator &Iter) const {
-    advance(Iter);
+    advanceForward(Iter);
     if (Iter == End)
       return NULL;
     return *Iter;
@@ -52,8 +52,9 @@
   InstList::iterator getCur() const { return Cur; }
   InstList::iterator getEnd() const { return End; }
   void insert(Inst *Inst);
+  Inst *getLastInserted() const;
   void advanceCur() { Cur = Next; }
-  void advanceNext() { advance(Next); }
+  void advanceNext() { advanceForward(Next); }
   void setInsertPoint(const InstList::iterator &Position) { Next = Position; }
 
 private:
@@ -71,11 +72,14 @@
   // insertion point", to avoid confusion when previously-deleted
   // instructions come between the two points.
   InstList::iterator Next;
+  // Begin is a copy of Insts.begin(), used if iterators are moved backward.
+  InstList::iterator Begin;
   // End is a copy of Insts.end(), used if Next needs to be advanced.
   InstList::iterator End;
 
   void skipDeleted(InstList::iterator &I) const;
-  void advance(InstList::iterator &I) const;
+  void advanceForward(InstList::iterator &I) const;
+  void advanceBackward(InstList::iterator &I) const;
   LoweringContext(const LoweringContext &) LLVM_DELETED_FUNCTION;
   LoweringContext &operator=(const LoweringContext &) LLVM_DELETED_FUNCTION;
 };
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 2db795b..00db25a 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2724,15 +2724,18 @@
       // Then cast the bits back out of the XMM register to the i64 Dest.
       InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
       lowerCast(Cast);
-      // Make sure that the atomic load isn't elided.
+      // Make sure that the atomic load isn't elided when unused.
       Context.insert(InstFakeUse::create(Func, Dest->getLo()));
       Context.insert(InstFakeUse::create(Func, Dest->getHi()));
       return;
     }
     InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
     lowerLoad(Load);
-    // Make sure the atomic load isn't elided.
-    Context.insert(InstFakeUse::create(Func, Dest));
+    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
+    // Since lowerLoad may fuse the load w/ an arithmetic instruction,
+    // insert the FakeUse on the last-inserted instruction's dest.
+    Context.insert(InstFakeUse::create(Func,
+                                       Context.getLastInserted()->getDest()));
     return;
   }
   case Intrinsics::AtomicRMW:
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index 2c325df..82f975f 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -88,6 +88,9 @@
 
 define i32 @test_atomic_load_32_with_arith(i32 %iptr) {
 entry:
+  br label %next
+
+next:
   %ptr = inttoptr i32 %iptr to i32*
   %r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
   %r2 = add i32 %r, 32
@@ -96,6 +99,11 @@
 ; CHECK-LABEL: test_atomic_load_32_with_arith
 ; CHECK: mov {{.*}}, dword
 ; The next instruction may be a separate load or folded into an add.
+;
+; In O2 mode, we know that the load and add are going to be fused.
+; CHECKO2-LABEL: test_atomic_load_32_with_arith
+; CHECKO2: mov {{.*}}, dword
+; CHECKO2: add {{.*}}, dword
 
 define i32 @test_atomic_load_32_ignored(i32 %iptr) {
 entry:
@@ -106,6 +114,9 @@
 ; CHECK-LABEL: test_atomic_load_32_ignored
 ; CHECK: mov {{.*}}, dword
 ; CHECK: mov {{.*}}, dword
+; CHECKO2-LABEL: test_atomic_load_32_ignored
+; CHECKO2: mov {{.*}}, dword
+; CHECKO2: mov {{.*}}, dword
 
 define i64 @test_atomic_load_64_ignored(i32 %iptr) {
 entry: