Subzero: Transform suitable Load/Arith/Store sequences into RMW ops.

Search for sequences of Load/Arith/Store instructions that can be transformed into single non-atomic Read-Modify-Write instructions.  Corresponding operands must match up, and it is limited to the operator/type combinations that have simple lowerings.

For suitable sequences, an RMW pseudo-instruction is added.  Extra variables are attached to the RMW instruction and the original Store instruction, to make it easy to figure out whether to retain the original Store instruction or the new RMW instruction (but never both).

The RMW instructions are similar to their non-RMW counterparts, except that the RMW instruction has no Dest variable - the Src[0] operand doubles as the memory-operand dest.

The x86-32 integrated assembler has some new forms of existing instructions added.

Note: this CL puts the machinery in place to identify, lower, and emit RMW operations only for the "add" instruction operating on i32/i16/i8 operands.  The next CL will fill in the rest of the options.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/1182603004
diff --git a/src/IceAssemblerX8632.cpp b/src/IceAssemblerX8632.cpp
index a45e7cc..48f023c 100644
--- a/src/IceAssemblerX8632.cpp
+++ b/src/IceAssemblerX8632.cpp
@@ -1759,6 +1759,29 @@
   emitComplex(Ty, 0, Operand(reg), imm);
 }
 
+void AssemblerX8632::add(Type Ty, const Address &address, GPRRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride();
+  if (isByteSizedArithType(Ty))
+    emitUint8(0x00);
+  else
+    emitUint8(0x01);
+  emitOperand(reg, address);
+}
+
+void AssemblerX8632::add(Type Ty, const Address &address,
+                         const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  if (isByteSizedArithType(Ty)) {
+    emitComplexI8(0, address, imm);
+    return;
+  }
+  if (Ty == IceType_i16)
+    emitOperandSizeOverride();
+  emitComplex(Ty, 0, address, imm);
+}
+
 void AssemblerX8632::adc(Type Ty, GPRRegister dst, GPRRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   if (Ty == IceType_i16)
diff --git a/src/IceAssemblerX8632.h b/src/IceAssemblerX8632.h
index c67f4ce..346000d 100644
--- a/src/IceAssemblerX8632.h
+++ b/src/IceAssemblerX8632.h
@@ -720,6 +720,8 @@
   void add(Type Ty, GPRRegister dst, GPRRegister src);
   void add(Type Ty, GPRRegister reg, const Address &address);
   void add(Type Ty, GPRRegister reg, const Immediate &imm);
+  void add(Type Ty, const Address &address, GPRRegister reg);
+  void add(Type Ty, const Address &address, const Immediate &imm);
 
   void adc(Type Ty, GPRRegister dst, GPRRegister src);
   void adc(Type Ty, GPRRegister dst, const Address &address);
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 12de452..2cbe571 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -433,8 +433,12 @@
       // register.  This is accomplished by extending the entry
       // block's instruction range from [2,n) to [1,n) which will
       // transform the problematic [2,2) live ranges into [1,2).
-      if (FirstInstNum == Inst::NumberInitial)
+      if (Node == getEntryNode()) {
+        // TODO(stichnot): Make it a strict requirement that the entry
+        // node gets the lowest instruction numbers, so that extending
+        // the live range for in-args is guaranteed to work.
         FirstInstNum = Inst::NumberExtended;
+      }
       Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
     }
   }
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index 5cd4a6b..d74ae82 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -211,6 +211,7 @@
         clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
         clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
         clEnumValN(Ice::IceV_Folding, "fold", "Instruction folding details"),
+        clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"),
         clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
         clEnumValN(Ice::IceV_Most, "most",
                    "Use all verbose options except 'regalloc'"),
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 71a23f6..0a1fdd1 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -173,6 +173,7 @@
   IceV_AddrOpt = 1 << 9,
   IceV_Random = 1 << 10,
   IceV_Folding = 1 << 11,
+  IceV_RMW = 1 << 12,
   IceV_All = ~IceV_None,
   IceV_Most = IceV_All & ~IceV_LinearScan
 };
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index de082bb..30d5066 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -422,9 +422,16 @@
 }
 
 InstStore::InstStore(Cfg *Func, Operand *Data, Operand *Addr)
-    : InstHighLevel(Func, Inst::Store, 2, nullptr) {
+    : InstHighLevel(Func, Inst::Store, 3, nullptr) {
   addSource(Data);
   addSource(Addr);
+  // The 3rd operand is a dummy placeholder for the RMW beacon.
+  addSource(Data);
+}
+
+void InstStore::setRmwBeacon(Variable *Beacon) {
+  Dest = llvm::dyn_cast<Variable>(getData());
+  Srcs[2] = Beacon;
 }
 
 InstSwitch::InstSwitch(Cfg *Func, SizeT NumCases, Operand *Source,
@@ -747,11 +754,18 @@
     return;
   Ostream &Str = Func->getContext()->getStrDump();
   Type Ty = getData()->getType();
+  dumpDest(Func);
+  if (Dest)
+    Str << " = ";
   Str << "store " << Ty << " ";
   getData()->dump(Func);
   Str << ", " << Ty << "* ";
   getAddr()->dump(Func);
   Str << ", align " << typeAlignInBytes(Ty);
+  if (getRmwBeacon()) {
+    Str << ", beacon ";
+    getRmwBeacon()->dump(Func);
+  }
 }
 
 void InstSwitch::dump(const Cfg *Func) const {
diff --git a/src/IceInst.h b/src/IceInst.h
index 948c5db..99a8653 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -691,13 +691,15 @@
 
 public:
   static InstStore *create(Cfg *Func, Operand *Data, Operand *Addr,
-                           uint32_t align = 1) {
+                           uint32_t Align = 1) {
     // TODO(kschimpf) Stop ignoring alignment specification.
-    (void)align;
+    (void)Align;
     return new (Func->allocate<InstStore>()) InstStore(Func, Data, Addr);
   }
   Operand *getAddr() const { return getSrc(1); }
   Operand *getData() const { return getSrc(0); }
+  Variable *getRmwBeacon() const { return llvm::dyn_cast<Variable>(getSrc(2)); }
+  void setRmwBeacon(Variable *Beacon);
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return Inst->getKind() == Store; }
 
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 1365d54..09d2ef4 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -111,6 +111,14 @@
   }
 }
 
+InstX8632FakeRMW::InstX8632FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
+                                   InstArithmetic::OpKind Op, Variable *Beacon)
+    : InstX8632(Func, InstX8632::FakeRMW, 3, nullptr), Op(Op) {
+  addSource(Data);
+  addSource(Addr);
+  addSource(Beacon);
+}
+
 InstX8632AdjustStack::InstX8632AdjustStack(Cfg *Func, SizeT Amount,
                                            Variable *Esp)
     : InstX8632(Func, InstX8632::Adjuststack, 1, Esp), Amount(Amount) {
@@ -370,6 +378,19 @@
   Inst::dump(Func);
 }
 
+void InstX8632FakeRMW::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = getData()->getType();
+  Str << "rmw " << InstArithmetic::getOpName(getOp()) << " " << Ty << " *";
+  getAddr()->dump(Func);
+  Str << ", ";
+  getData()->dump(Func);
+  Str << ", beacon=";
+  getBeacon()->dump(Func);
+}
+
 void InstX8632Label::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
@@ -589,7 +610,9 @@
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Inst->getSrcSize() == 2);
-  Variable *Dest = Inst->getDest();
+  Operand *Dest = Inst->getDest();
+  if (Dest == nullptr)
+    Dest = Inst->getSrc(0);
   assert(Dest == Inst->getSrc(0));
   Operand *Src1 = Inst->getSrc(1);
   Str << "\t" << Opcode << InstX8632::getWidthString(Dest->getType()) << "\t";
@@ -911,6 +934,7 @@
 template <> const char *InstX8632Movq::Opcode = "movq";
 // Binary ops
 template <> const char *InstX8632Add::Opcode = "add";
+template <> const char *InstX8632AddRMW::Opcode = "add";
 template <> const char *InstX8632Addps::Opcode = "addps";
 template <> const char *InstX8632Adc::Opcode = "adc";
 template <> const char *InstX8632Addss::Opcode = "addss";
@@ -994,6 +1018,9 @@
     &X8632::AssemblerX8632::add, &X8632::AssemblerX8632::add,
     &X8632::AssemblerX8632::add};
 template <>
+const X8632::AssemblerX8632::GPREmitterAddrOp InstX8632AddRMW::Emitter = {
+    &X8632::AssemblerX8632::add, &X8632::AssemblerX8632::add};
+template <>
 const X8632::AssemblerX8632::GPREmitterRegOp InstX8632Adc::Emitter = {
     &X8632::AssemblerX8632::adc, &X8632::AssemblerX8632::adc,
     &X8632::AssemblerX8632::adc};
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 840b21b..d9a8419 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -176,6 +176,7 @@
     k__Start = Inst::Target,
     Adc,
     Add,
+    AddRMW,
     Addps,
     Addss,
     Adjuststack,
@@ -195,6 +196,7 @@
     Div,
     Divps,
     Divss,
+    FakeRMW,
     Fld,
     Fstp,
     Icmp,
@@ -312,6 +314,40 @@
   }
 };
 
+// InstX8632FakeRMW represents a non-atomic read-modify-write operation on a
+// memory location.  An InstX8632FakeRMW is a "fake" instruction in that it
+// still needs to be lowered to some actual RMW instruction.
+//
+// If A is some memory address, D is some data value to apply, and OP is an
+// arithmetic operator, the instruction operates as: (*A) = (*A) OP D
+class InstX8632FakeRMW : public InstX8632 {
+  InstX8632FakeRMW() = delete;
+  InstX8632FakeRMW(const InstX8632FakeRMW &) = delete;
+  InstX8632FakeRMW &operator=(const InstX8632FakeRMW &) = delete;
+
+public:
+  static InstX8632FakeRMW *create(Cfg *Func, Operand *Data, Operand *Addr,
+                                  Variable *Beacon, InstArithmetic::OpKind Op,
+                                  uint32_t Align = 1) {
+    // TODO(stichnot): Stop ignoring alignment specification.
+    (void)Align;
+    return new (Func->allocate<InstX8632FakeRMW>())
+        InstX8632FakeRMW(Func, Data, Addr, Op, Beacon);
+  }
+  Operand *getAddr() const { return getSrc(1); }
+  Operand *getData() const { return getSrc(0); }
+  InstArithmetic::OpKind getOp() const { return Op; }
+  Variable *getBeacon() const { return llvm::cast<Variable>(getSrc(2)); }
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, FakeRMW); }
+
+private:
+  InstArithmetic::OpKind Op;
+  InstX8632FakeRMW(Cfg *Func, Operand *Data, Operand *Addr,
+                   InstArithmetic::OpKind Op, Variable *Beacon);
+  ~InstX8632FakeRMW() override {}
+};
+
 // InstX8632Label represents an intra-block label that is the target
 // of an intra-block branch.  The offset between the label and the
 // branch must be fit into one byte (considered "near").  These are
@@ -515,6 +551,9 @@
 // Emit a one-operand (GPR) instruction.
 void emitIASOpTyGPR(const Cfg *Func, Type Ty, const Operand *Var,
                     const X8632::AssemblerX8632::GPREmitterOneOp &Emitter);
+void emitIASAsAddrOpTyGPR(
+    const Cfg *Func, Type Ty, const Operand *Op0, const Operand *Op1,
+    const X8632::AssemblerX8632::GPREmitterAddrOp &Emitter);
 
 // Instructions of the form x := op(x).
 template <InstX8632::InstKindX8632 K>
@@ -765,6 +804,50 @@
   static const X8632::AssemblerX8632::GPREmitterRegOp Emitter;
 };
 
+template <InstX8632::InstKindX8632 K>
+class InstX8632BinopRMW : public InstX8632 {
+  InstX8632BinopRMW() = delete;
+  InstX8632BinopRMW(const InstX8632BinopRMW &) = delete;
+  InstX8632BinopRMW &operator=(const InstX8632BinopRMW &) = delete;
+
+public:
+  // Create an ordinary binary-op instruction like add or sub.
+  static InstX8632BinopRMW *create(Cfg *Func, OperandX8632Mem *DestSrc0,
+                                   Operand *Src1) {
+    return new (Func->allocate<InstX8632BinopRMW>())
+        InstX8632BinopRMW(Func, DestSrc0, Src1);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    const bool ShiftHack = false;
+    emitTwoAddress(Opcode, this, Func, ShiftHack);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    Type Ty = getSrc(0)->getType();
+    assert(getSrcSize() == 2);
+    emitIASAsAddrOpTyGPR(Func, Ty, getSrc(0), getSrc(1), Emitter);
+  }
+  void dump(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << Opcode << "." << getSrc(0)->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632BinopRMW(Cfg *Func, OperandX8632Mem *DestSrc0, Operand *Src1)
+      : InstX8632(Func, K, 2, nullptr) {
+    addSource(DestSrc0);
+    addSource(Src1);
+  }
+  ~InstX8632BinopRMW() override {}
+  static const char *Opcode;
+  static const X8632::AssemblerX8632::GPREmitterAddrOp Emitter;
+};
+
 template <InstX8632::InstKindX8632 K, bool NeedsElementType>
 class InstX8632BinopXmm : public InstX8632 {
   InstX8632BinopXmm() = delete;
@@ -1017,6 +1100,7 @@
 // Movq - copy between XMM registers, or mem64 and XMM registers.
 typedef InstX8632Movlike<InstX8632::Movq> InstX8632Movq;
 typedef InstX8632BinopGPR<InstX8632::Add> InstX8632Add;
+typedef InstX8632BinopRMW<InstX8632::AddRMW> InstX8632AddRMW;
 typedef InstX8632BinopXmm<InstX8632::Addps, true> InstX8632Addps;
 typedef InstX8632BinopGPR<InstX8632::Adc> InstX8632Adc;
 typedef InstX8632BinopXmm<InstX8632::Addss, false> InstX8632Addss;
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 52017ea..5355a33 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -130,10 +130,13 @@
   assert(!Context.atEnd());
   Inst *Inst = Context.getCur();
   Inst->deleteIfDead();
-  if (!Inst->isDeleted()) {
+  if (!Inst->isDeleted() && !llvm::isa<InstFakeDef>(Inst) &&
+      !llvm::isa<InstFakeUse>(Inst)) {
     // Mark the current instruction as deleted before lowering,
     // otherwise the Dest variable will likely get marked as non-SSA.
-    // See Variable::setDefinition().
+    // See Variable::setDefinition().  However, just pass-through
+    // FakeDef and FakeUse instructions that might have been inserted
+    // prior to lowering.
     Inst->setDeleted();
     switch (Inst->getKind()) {
     case Inst::Alloca:
@@ -194,15 +197,8 @@
     case Inst::Unreachable:
       lowerUnreachable(llvm::cast<InstUnreachable>(Inst));
       break;
-    case Inst::BundleLock:
-    case Inst::BundleUnlock:
-    case Inst::FakeDef:
-    case Inst::FakeUse:
-    case Inst::FakeKill:
-    case Inst::Target:
-      // These are all Target instruction types and shouldn't be
-      // encountered at this stage.
-      Func->setError("Can't lower unsupported instruction type");
+    default:
+      lowerOther(Inst);
       break;
     }
 
@@ -213,6 +209,11 @@
   Context.advanceNext();
 }
 
+void TargetLowering::lowerOther(const Inst *Instr) {
+  (void)Instr;
+  Func->setError("Can't lower unsupported instruction type");
+}
+
 // Drives register allocation, allowing all physical registers (except
 // perhaps for the frame pointer) to be allocated.  This set of
 // registers could potentially be parameterized if we want to restrict
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index fa6f5b7..8ce2c2f 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -248,6 +248,7 @@
   virtual void lowerStore(const InstStore *Inst) = 0;
   virtual void lowerSwitch(const InstSwitch *Inst) = 0;
   virtual void lowerUnreachable(const InstUnreachable *Inst) = 0;
+  virtual void lowerOther(const Inst *Instr);
 
   virtual void doAddressOptLoad() {}
   virtual void doAddressOptStore() {}
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index c57374b..ba519e8 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -465,6 +465,12 @@
   Func->getVMetadata()->init(VMK_SingleDefs);
   Func->doAddressOpt();
 
+  // Find read-modify-write opportunities.  Do this after address mode
+  // optimization so that doAddressOpt() doesn't need to be applied to RMW
+  // instructions as well.
+  findRMW();
+  Func->dump("After RMW transform");
+
   // Argument lowering
   Func->doArgLowering();
 
@@ -579,6 +585,146 @@
 
 namespace {
 
+bool canRMW(const InstArithmetic *Arith) {
+  Type Ty = Arith->getDest()->getType();
+  bool isI64 = Ty == IceType_i64;
+  bool isVector = isVectorType(Ty);
+
+  switch (Arith->getOp()) {
+  // Not handled for lack of simple lowering:
+  //   shift on i64 and vectors
+  //   mul, udiv, urem, sdiv, srem, frem
+  default:
+    return false;
+  case InstArithmetic::Add:
+    return !isI64 && !isVector; // TODO(stichnot): implement i64 and vector
+  case InstArithmetic::Sub:
+  case InstArithmetic::And:
+  case InstArithmetic::Or:
+  case InstArithmetic::Xor:
+  case InstArithmetic::Fadd:
+  case InstArithmetic::Fsub:
+  case InstArithmetic::Fmul:
+  case InstArithmetic::Fdiv:
+    return false; // TODO(stichnot): implement
+    return true;
+  case InstArithmetic::Shl:
+  case InstArithmetic::Lshr:
+  case InstArithmetic::Ashr:
+    return false; // TODO(stichnot): implement
+    return !isI64 && !isVector;
+  }
+}
+
+bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
+  if (A == B)
+    return true;
+  if (auto *MemA = llvm::dyn_cast<OperandX8632Mem>(A)) {
+    if (auto *MemB = llvm::dyn_cast<OperandX8632Mem>(B)) {
+      return MemA->getBase() == MemB->getBase() &&
+             MemA->getOffset() == MemB->getOffset() &&
+             MemA->getIndex() == MemB->getIndex() &&
+             MemA->getShift() == MemB->getShift() &&
+             MemA->getSegmentRegister() == MemB->getSegmentRegister();
+    }
+  }
+  return false;
+}
+
+} // end of anonymous namespace
+
+void TargetX8632::findRMW() {
+  Func->dump("Before RMW");
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  for (CfgNode *Node : Func->getNodes()) {
+    // Walk through the instructions, considering each sequence of 3
+    // instructions, and look for the particular RMW pattern.  Note that this
+    // search can be "broken" (false negatives) if there are intervening deleted
+    // instructions, or intervening instructions that could be safely moved out
+    // of the way to reveal an RMW pattern.
+    auto E = Node->getInsts().end();
+    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
+    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
+      // Make I3 skip over deleted instructions.
+      while (I3 != E && I3->isDeleted())
+        ++I3;
+      if (I1 == E || I2 == E || I3 == E)
+        continue;
+      assert(!I1->isDeleted());
+      assert(!I2->isDeleted());
+      assert(!I3->isDeleted());
+      if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) {
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) {
+          if (auto *Store = llvm::dyn_cast<InstStore>(I3)) {
+            // Look for:
+            //   a = Load addr
+            //   b = <op> a, other
+            //   Store b, addr
+            // Change to:
+            //   a = Load addr
+            //   b = <op> a, other
+            //   x = FakeDef
+            //   RMW <op>, addr, other, x
+            //   b = Store b, addr, x
+            // Note that inferTwoAddress() makes sure setDestNonKillable() gets
+            // called on the updated Store instruction, to avoid liveness
+            // problems later.
+            //
+            // With this transformation, the Store instruction acquires a Dest
+            // variable and is now subject to dead code elimination if there are
+            // no more uses of "b".  Variable "x" is a beacon for determining
+            // whether the Store instruction gets dead-code eliminated.  If the
+            // Store instruction is eliminated, then it must be the case that
+            // the RMW instruction ends x's live range, and therefore the RMW
+            // instruction will be retained and later lowered.  On the other
+            // hand, if the RMW instruction does not end x's live range, then
+            // the Store instruction must still be present, and therefore the
+            // RMW instruction is ignored during lowering because it is
+            // redundant with the Store instruction.
+            //
+            // Note that if "a" has further uses, the RMW transformation may
+            // still trigger, resulting in two loads and one store, which is
+            // worse than the original one load and one store.  However, this is
+            // probably rare, and caching probably keeps it just as fast.
+            if (!isSameMemAddressOperand(Load->getSourceAddress(),
+                                         Store->getAddr()))
+              continue;
+            if (false && Load->getSourceAddress() != Store->getAddr())
+              continue;
+            if (Arith->getSrc(0) != Load->getDest())
+              continue;
+            if (Arith->getDest() != Store->getData())
+              continue;
+            if (!canRMW(Arith))
+              continue;
+            if (Func->isVerbose(IceV_RMW)) {
+              Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
+              Load->dump(Func);
+              Str << "\n  ";
+              Arith->dump(Func);
+              Str << "\n  ";
+              Store->dump(Func);
+              Str << "\n";
+            }
+            Variable *Beacon = Func->makeVariable(IceType_i32);
+            Beacon->setWeight(0);
+            Store->setRmwBeacon(Beacon);
+            InstFakeDef *BeaconDef = InstFakeDef::create(Func, Beacon);
+            Node->getInsts().insert(I3, BeaconDef);
+            InstX8632FakeRMW *RMW = InstX8632FakeRMW::create(
+                Func, Arith->getSrc(1), Store->getAddr(), Beacon,
+                Arith->getOp());
+            Node->getInsts().insert(I3, RMW);
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace {
+
 // Converts a ConstantInteger32 operand into its constant value, or
 // MemoryOrderInvalid if the operand is not a ConstantInteger32.
 uint64_t getConstantMemoryOrder(Operand *Opnd) {
@@ -4394,7 +4540,10 @@
     Constant *OffsetOp = Ctx->getConstantInt32(Offset);
     Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
                                    Shift, SegmentReg);
-    Context.insert(InstStore::create(Func, Data, Addr));
+    InstStore *NewStore = InstStore::create(Func, Data, Addr);
+    if (Inst->getDest())
+      NewStore->setRmwBeacon(Inst->getRmwBeacon());
+    Context.insert(NewStore);
   }
 }
 
@@ -4497,6 +4646,46 @@
 
 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { _ud2(); }
 
+void TargetX8632::lowerRMW(const InstX8632FakeRMW *RMW) {
+  // If the beacon variable's live range does not end in this
+  // instruction, then it must end in the modified Store instruction
+  // that follows.  This means that the original Store instruction is
+  // still there, either because the value being stored is used beyond
+  // the Store instruction, or because dead code elimination did not
+  // happen.  In either case, we cancel RMW lowering (and the caller
+  // deletes the RMW instruction).
+  if (!RMW->isLastUse(RMW->getBeacon()))
+    return;
+  Operand *Src = RMW->getData();
+  Type Ty = Src->getType();
+  OperandX8632Mem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
+  if (Ty == IceType_i64) {
+    // TODO(stichnot): Implement.
+  } else if (isVectorType(Ty)) {
+    // TODO(stichnot): Implement.
+  } else {
+    // i8, i16, i32, f32, f64
+    switch (RMW->getOp()) {
+    default:
+      // TODO(stichnot): Implement other arithmetic operators.
+      break;
+    case InstArithmetic::Add:
+      Src = legalize(Src, Legal_Reg | Legal_Imm);
+      _add_rmw(Addr, Src);
+      return;
+    }
+  }
+  llvm::report_fatal_error("Couldn't lower RMW instruction");
+}
+
+void TargetX8632::lowerOther(const Inst *Instr) {
+  if (const auto *RMW = llvm::dyn_cast<InstX8632FakeRMW>(Instr)) {
+    lowerRMW(RMW);
+  } else {
+    TargetLowering::lowerOther(Instr);
+  }
+}
+
 // Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
 // preserve integrity of liveness analysis.  Undef values are also
 // turned into zeroes, since loOperand() and hiOperand() don't expect
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index b9a9e2b..8394516 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -180,6 +180,8 @@
   void lowerStore(const InstStore *Inst) override;
   void lowerSwitch(const InstSwitch *Inst) override;
   void lowerUnreachable(const InstUnreachable *Inst) override;
+  void lowerOther(const Inst *Instr) override;
+  void lowerRMW(const InstX8632FakeRMW *RMW);
   void prelowerPhis() override;
   void lowerPhiAssignments(CfgNode *Node,
                            const AssignList &Assignments) override;
@@ -265,6 +267,9 @@
   void _add(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Add::create(Func, Dest, Src0));
   }
+  void _add_rmw(OperandX8632Mem *DestSrc0, Operand *Src1) {
+    Context.insert(InstX8632AddRMW::create(Func, DestSrc0, Src1));
+  }
   void _adjust_stack(int32_t Amount) {
     Context.insert(InstX8632AdjustStack::create(
         Func, Amount, getPhysicalRegister(RegX8632::Reg_esp)));
@@ -561,6 +566,7 @@
   }
 
   bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1);
+  void findRMW();
 
   X86InstructionSet InstructionSet;
   bool IsEbpBasedFrame;
diff --git a/tests_lit/llvm2ice_tests/ias-multi-reloc.ll b/tests_lit/llvm2ice_tests/ias-multi-reloc.ll
index c5b0fc2..11a708e 100644
--- a/tests_lit/llvm2ice_tests/ias-multi-reloc.ll
+++ b/tests_lit/llvm2ice_tests/ias-multi-reloc.ll
@@ -26,6 +26,7 @@
 ; CHECK: .long p_global_char
 ; CHECK: .long global_char
 
+; Also exercises the RMW add operation.
 define internal void @add_in_place() {
 entry:
   %p_global_char.bc = bitcast [4 x i8]* @p_global_char to i32*
@@ -37,8 +38,8 @@
   ret void
 }
 ; CHECK-LABEL: add_in_place
-; CHECK: .long global_char
 ; CHECK: .long p_global_char
+; CHECK-NEXT: .long global_char
 
 define internal void @cmp_global_immediate() {
 entry:
diff --git a/tests_lit/llvm2ice_tests/rmw.ll b/tests_lit/llvm2ice_tests/rmw.ll
new file mode 100644
index 0000000..321f612
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/rmw.ll
@@ -0,0 +1,104 @@
+; This tests Read-Modify-Write (RMW) detection and lowering at the O2
+; optimization level.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+define internal void @rmw_add_i32_var(i32 %addr_arg, i32 %var) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i32*
+  %val = load i32, i32* %addr, align 1
+  %rmw = add i32 %val, %var
+  store i32 %rmw, i32* %addr, align 1
+  ret void
+}
+; Look for something like: add DWORD PTR [eax],ecx
+; CHECK-LABEL: rmw_add_i32_var
+; CHECK: add DWORD PTR [e{{ax|bx|cx|dx|bp|di|si}}],e{{ax|bx|cx|dx|bp|di|si}}
+
+define internal void @rmw_add_i32_imm(i32 %addr_arg) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i32*
+  %val = load i32, i32* %addr, align 1
+  %rmw = add i32 %val, 19
+  store i32 %rmw, i32* %addr, align 1
+  ret void
+}
+; Look for something like: add DWORD PTR [eax],0x13
+; CHECK-LABEL: rmw_add_i32_imm
+; CHECK: add DWORD PTR [e{{ax|bx|cx|dx|bp|di|si}}],0x13
+
+define internal i32 @no_rmw_add_i32_var(i32 %addr_arg, i32 %var) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i32*
+  %val = load i32, i32* %addr, align 1
+  %rmw = add i32 %val, %var
+  store i32 %rmw, i32* %addr, align 1
+  ret i32 %rmw
+}
+; CHECK-LABEL: no_rmw_add_i32_var
+; CHECK: add e{{ax|bx|cx|dx|bp|di|si}},DWORD PTR [e{{ax|bx|cx|dx|bp|di|si}}]
+
+define internal void @rmw_add_i16_var(i32 %addr_arg, i16 %var) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i16*
+  %val = load i16, i16* %addr, align 1
+  %rmw = add i16 %val, %var
+  store i16 %rmw, i16* %addr, align 1
+  ret void
+}
+; Look for something like: add WORD PTR [eax],cx
+; CHECK-LABEL: rmw_add_i16_var
+; CHECK: add WORD PTR [e{{ax|bx|cx|dx|bp|di|si}}],{{ax|bx|cx|dx|bp|di|si}}
+
+define internal void @rmw_add_i16_imm(i32 %addr_arg) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i16*
+  %val = load i16, i16* %addr, align 1
+  %rmw = add i16 %val, 19
+  store i16 %rmw, i16* %addr, align 1
+  ret void
+}
+; Look for something like: add WORD PTR [eax],0x13
+; CHECK-LABEL: rmw_add_i16_imm
+; CHECK: add WORD PTR [e{{ax|bx|cx|dx|bp|di|si}}],0x13
+
+define internal void @rmw_add_i8_var(i32 %addr_arg, i8 %var) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i8*
+  %val = load i8, i8* %addr, align 1
+  %rmw = add i8 %val, %var
+  store i8 %rmw, i8* %addr, align 1
+  ret void
+}
+; Look for something like: add BYTE PTR [eax],cl
+; CHECK-LABEL: rmw_add_i8_var
+; CHECK: add BYTE PTR [e{{ax|bx|cx|dx|bp|di|si}}],{{al|bl|cl|dl}}
+
+define internal void @rmw_add_i8_imm(i32 %addr_arg) {
+entry:
+  %addr = inttoptr i32 %addr_arg to i8*
+  %val = load i8, i8* %addr, align 1
+  %rmw = add i8 %val, 19
+  store i8 %rmw, i8* %addr, align 1
+  ret void
+}
+; Look for something like: add BYTE PTR [eax],0x13
+; CHECK-LABEL: rmw_add_i8_imm
+; CHECK: add BYTE PTR [e{{ax|bx|cx|dx|bp|di|si}}],0x13
+
+define internal void @rmw_add_i32_var_addropt(i32 %addr_arg, i32 %var) {
+entry:
+  %addr_arg_plus_12 = add i32 %addr_arg, 12
+  %var_times_4 = mul i32 %var, 4
+  %addr_base = add i32 %addr_arg_plus_12 , %var_times_4
+  %addr = inttoptr i32 %addr_base to i32*
+  %val = load i32, i32* %addr, align 1
+  %rmw = add i32 %val, %var
+  store i32 %rmw, i32* %addr, align 1
+  ret void
+}
+; Look for something like: add DWORD PTR [eax+ecx*4+12],ecx
+; CHECK-LABEL: rmw_add_i32_var_addropt
+; CHECK: add DWORD PTR [e{{..}}+e{{..}}*4+0xc],e{{ax|bx|cx|dx|bp|di|si}}