Subzero: Fold icmp into br/select lowering.

Originally, lowerIcmp() contained a peephole-style optimization that looked ahead to see whether the next instruction was a conditional branch with the right properties, and, if so, folded the icmp and br into a single lowering sequence.
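
When the fold applies, the compare's i1 result never needs to be
materialized.  A minimal sketch of the two lowerings (register names,
labels, and exact condition codes are illustrative; real output depends
on register allocation and branch inversion):

  ; %a = icmp slt i32 %b, %c
  ; br i1 %a, label %L1, label %L2

  ; unfolded lowering:
  cmp ebx, ecx
  setl al
  cmp al, 0
  jne L1

  ; folded lowering:
  cmp ebx, ecx
  jl L1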

However, extra instructions sometimes come between the icmp and br instructions, defeating the one-instruction lookahead even though the folding would still be legal.
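
For example, the lookahead misses this pattern (essentially the
fold_cmp_br_intervening_insts test added below) because of the
intervening call:

  %cmp1 = icmp slt i32 %arg1, %arg2
  call void @use_value(i32 %arg1)
  br i1 %cmp1, label %branch1, label %branch2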

One thought is to do the folding inside lowerBr() instead of lowerIcmp(), by looking backward for a suitable icmp instruction.  The problem is that by the time lowerBr() runs, the icmp has already been lowered, and its lowering may leave instructions that can't easily be dead-code eliminated, e.g. instructions lacking a dest variable.

Instead, before lowering a basic block, we do a prepass on the block to identify folding candidates.  For the icmp/br example, the prepass tentatively marks the icmp instruction as dead, and the br lowering then folds the compare into the branch.
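
For the intervening-call example above, the folded lowering re-emits
the compare at the branch point, after the call (the call clobbers the
flags, so the compare can't stay where it was).  A rough sketch of the
expected output, again with illustrative registers and branch polarity
(the new test checks for cmp followed by jge):

  call use_value
  cmp eax, ecx
  jge branch2    ; inverted condition; falls through to branch1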

This folding can also be extended to several producers:
  icmp (i32 operands), icmp (i64 operands), fcmp, trunc .. to i1
and several consumers:
  br, select, sext, zext

This CL starts with two combinations: icmp32 paired with br and with select.  Other combinations will be added in later CLs.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4162
BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/1141213004
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index a9b34fc..fb2ab39 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -516,6 +516,7 @@
   LoweringContext &Context = Target->getContext();
   // Lower the regular instructions.
   Context.init(this);
+  Target->initNodeForLowering(this);
   while (!Context.atEnd()) {
     InstList::iterator Orig = Context.getCur();
     if (llvm::isa<InstRet>(*Orig))
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index d8ce728..203b54e 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -195,6 +195,7 @@
         clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
         clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
         clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
+        clEnumValN(Ice::IceV_Folding, "fold", "Instruction folding details"),
         clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
         clEnumValN(Ice::IceV_Most, "most",
                    "Use all verbose options except 'regalloc'"),
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 1ee3d23..71a23f6 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -172,6 +172,7 @@
   IceV_Frame = 1 << 8,
   IceV_AddrOpt = 1 << 9,
   IceV_Random = 1 << 10,
+  IceV_Folding = 1 << 11,
   IceV_All = ~IceV_None,
   IceV_Most = IceV_All & ~IceV_LinearScan
 };
diff --git a/src/IceInst.h b/src/IceInst.h
index e3262c8..28fb046 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -81,6 +81,7 @@
 
   bool isDeleted() const { return Deleted; }
   void setDeleted() { Deleted = true; }
+  void setDead(bool Value = true) { Dead = Value; }
   void deleteIfDead();
 
   bool hasSideEffects() const { return HasSideEffects; }
@@ -178,7 +179,9 @@
   InstNumberT Number;
   // Deleted means irrevocably deleted.
   bool Deleted;
-  // Dead means pending deletion after liveness analysis converges.
+  // Dead means one of two things depending on context: (1) pending
+  // deletion after liveness analysis converges, or (2) marked for
+  // deletion during lowering due to a folded bool operation.
   bool Dead;
   // HasSideEffects means the instruction is something like a function
   // call or a volatile load that can't be removed even if its Dest
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index d0929a0..2fa0397 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -126,82 +126,85 @@
 void TargetLowering::lower() {
   assert(!Context.atEnd());
   Inst *Inst = Context.getCur();
-  // Mark the current instruction as deleted before lowering,
-  // otherwise the Dest variable will likely get marked as non-SSA.
-  // See Variable::setDefinition().
-  Inst->setDeleted();
-  switch (Inst->getKind()) {
-  case Inst::Alloca:
-    lowerAlloca(llvm::dyn_cast<InstAlloca>(Inst));
-    break;
-  case Inst::Arithmetic:
-    lowerArithmetic(llvm::dyn_cast<InstArithmetic>(Inst));
-    break;
-  case Inst::Assign:
-    lowerAssign(llvm::dyn_cast<InstAssign>(Inst));
-    break;
-  case Inst::Br:
-    lowerBr(llvm::dyn_cast<InstBr>(Inst));
-    break;
-  case Inst::Call:
-    lowerCall(llvm::dyn_cast<InstCall>(Inst));
-    break;
-  case Inst::Cast:
-    lowerCast(llvm::dyn_cast<InstCast>(Inst));
-    break;
-  case Inst::ExtractElement:
-    lowerExtractElement(llvm::dyn_cast<InstExtractElement>(Inst));
-    break;
-  case Inst::Fcmp:
-    lowerFcmp(llvm::dyn_cast<InstFcmp>(Inst));
-    break;
-  case Inst::Icmp:
-    lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst));
-    break;
-  case Inst::InsertElement:
-    lowerInsertElement(llvm::dyn_cast<InstInsertElement>(Inst));
-    break;
-  case Inst::IntrinsicCall: {
-    InstIntrinsicCall *Call = llvm::dyn_cast<InstIntrinsicCall>(Inst);
-    if (Call->getIntrinsicInfo().ReturnsTwice)
-      setCallsReturnsTwice(true);
-    lowerIntrinsicCall(Call);
-    break;
-  }
-  case Inst::Load:
-    lowerLoad(llvm::dyn_cast<InstLoad>(Inst));
-    break;
-  case Inst::Phi:
-    lowerPhi(llvm::dyn_cast<InstPhi>(Inst));
-    break;
-  case Inst::Ret:
-    lowerRet(llvm::dyn_cast<InstRet>(Inst));
-    break;
-  case Inst::Select:
-    lowerSelect(llvm::dyn_cast<InstSelect>(Inst));
-    break;
-  case Inst::Store:
-    lowerStore(llvm::dyn_cast<InstStore>(Inst));
-    break;
-  case Inst::Switch:
-    lowerSwitch(llvm::dyn_cast<InstSwitch>(Inst));
-    break;
-  case Inst::Unreachable:
-    lowerUnreachable(llvm::dyn_cast<InstUnreachable>(Inst));
-    break;
-  case Inst::BundleLock:
-  case Inst::BundleUnlock:
-  case Inst::FakeDef:
-  case Inst::FakeUse:
-  case Inst::FakeKill:
-  case Inst::Target:
-    // These are all Target instruction types and shouldn't be
-    // encountered at this stage.
-    Func->setError("Can't lower unsupported instruction type");
-    break;
-  }
+  Inst->deleteIfDead();
+  if (!Inst->isDeleted()) {
+    // Mark the current instruction as deleted before lowering,
+    // otherwise the Dest variable will likely get marked as non-SSA.
+    // See Variable::setDefinition().
+    Inst->setDeleted();
+    switch (Inst->getKind()) {
+    case Inst::Alloca:
+      lowerAlloca(llvm::cast<InstAlloca>(Inst));
+      break;
+    case Inst::Arithmetic:
+      lowerArithmetic(llvm::cast<InstArithmetic>(Inst));
+      break;
+    case Inst::Assign:
+      lowerAssign(llvm::cast<InstAssign>(Inst));
+      break;
+    case Inst::Br:
+      lowerBr(llvm::cast<InstBr>(Inst));
+      break;
+    case Inst::Call:
+      lowerCall(llvm::cast<InstCall>(Inst));
+      break;
+    case Inst::Cast:
+      lowerCast(llvm::cast<InstCast>(Inst));
+      break;
+    case Inst::ExtractElement:
+      lowerExtractElement(llvm::cast<InstExtractElement>(Inst));
+      break;
+    case Inst::Fcmp:
+      lowerFcmp(llvm::cast<InstFcmp>(Inst));
+      break;
+    case Inst::Icmp:
+      lowerIcmp(llvm::cast<InstIcmp>(Inst));
+      break;
+    case Inst::InsertElement:
+      lowerInsertElement(llvm::cast<InstInsertElement>(Inst));
+      break;
+    case Inst::IntrinsicCall: {
+      InstIntrinsicCall *Call = llvm::cast<InstIntrinsicCall>(Inst);
+      if (Call->getIntrinsicInfo().ReturnsTwice)
+        setCallsReturnsTwice(true);
+      lowerIntrinsicCall(Call);
+      break;
+    }
+    case Inst::Load:
+      lowerLoad(llvm::cast<InstLoad>(Inst));
+      break;
+    case Inst::Phi:
+      lowerPhi(llvm::cast<InstPhi>(Inst));
+      break;
+    case Inst::Ret:
+      lowerRet(llvm::cast<InstRet>(Inst));
+      break;
+    case Inst::Select:
+      lowerSelect(llvm::cast<InstSelect>(Inst));
+      break;
+    case Inst::Store:
+      lowerStore(llvm::cast<InstStore>(Inst));
+      break;
+    case Inst::Switch:
+      lowerSwitch(llvm::cast<InstSwitch>(Inst));
+      break;
+    case Inst::Unreachable:
+      lowerUnreachable(llvm::cast<InstUnreachable>(Inst));
+      break;
+    case Inst::BundleLock:
+    case Inst::BundleUnlock:
+    case Inst::FakeDef:
+    case Inst::FakeUse:
+    case Inst::FakeKill:
+    case Inst::Target:
+      // These are all Target instruction types and shouldn't be
+      // encountered at this stage.
+      Func->setError("Can't lower unsupported instruction type");
+      break;
+    }
 
-  postLower();
+    postLower();
+  }
 
   Context.advanceCur();
   Context.advanceNext();
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 5003574..f488930 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -221,6 +221,7 @@
   // Performs target-specific argument lowering.
   virtual void lowerArguments() = 0;
 
+  virtual void initNodeForLowering(CfgNode *) {}
   virtual void addProlog(CfgNode *Node) = 0;
   virtual void addEpilog(CfgNode *Node) = 0;
 
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 5f83bae..f9488ec 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -260,6 +260,152 @@
 
 } // end of anonymous namespace
 
+BoolFoldingEntry::BoolFoldingEntry(Inst *I)
+    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)), IsLiveOut(true),
+      NumUses(0) {}
+
+BoolFolding::BoolFoldingProducerKind
+BoolFolding::getProducerKind(const Inst *Instr) {
+  if (llvm::isa<InstIcmp>(Instr)) {
+    if (Instr->getSrc(0)->getType() != IceType_i64)
+      return PK_Icmp32;
+    return PK_None; // TODO(stichnot): actually PK_Icmp64;
+  }
+  return PK_None; // TODO(stichnot): remove this
+
+  if (llvm::isa<InstFcmp>(Instr))
+    return PK_Fcmp;
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return PK_None;
+    case InstCast::Trunc:
+      return PK_Trunc;
+    }
+  }
+  return PK_None;
+}
+
+BoolFolding::BoolFoldingConsumerKind
+BoolFolding::getConsumerKind(const Inst *Instr) {
+  if (llvm::isa<InstBr>(Instr))
+    return CK_Br;
+  if (llvm::isa<InstSelect>(Instr))
+    return CK_Select;
+  return CK_None; // TODO(stichnot): remove this
+
+  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
+    switch (Cast->getCastKind()) {
+    default:
+      return CK_None;
+    case InstCast::Sext:
+      return CK_Sext;
+    case InstCast::Zext:
+      return CK_Zext;
+    }
+  }
+  return CK_None;
+}
+
+// Returns true if the producing instruction has a "complex" lowering
+// sequence.  This generally means that its lowering sequence requires
+// more than one conditional branch, namely 64-bit integer compares
+// and some floating-point compares.  When this is true and there is
+// more than one consumer, we prefer to disable the folding
+// optimization, because folding the multi-branch sequence into each
+// consumer would increase the total number of branches.
+bool BoolFolding::hasComplexLowering(const Inst *Instr) {
+  switch (getProducerKind(Instr)) {
+  default:
+    return false;
+  case PK_Icmp64:
+    return true;
+  case PK_Fcmp:
+    return TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
+           CondX86::Br_None;
+  }
+}
+
+void BoolFolding::init(CfgNode *Node) {
+  Producers.clear();
+  for (Inst &Instr : Node->getInsts()) {
+    // Check whether Instr is a valid producer.
+    Variable *Var = Instr.getDest();
+    if (!Instr.isDeleted() // only consider non-deleted instructions
+        && Var             // only instructions with an actual dest var
+        && Var->getType() == IceType_i1          // only bool-type dest vars
+        && getProducerKind(&Instr) != PK_None) { // white-listed instructions
+      Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
+    }
+    // Check each src variable against the map.
+    for (SizeT I = 0; I < Instr.getSrcSize(); ++I) {
+      Operand *Src = Instr.getSrc(I);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        SizeT VarNum = Var->getIndex();
+        if (containsValid(VarNum)) {
+          if (I != 0 // All valid consumers use Var as the first source operand
+              || getConsumerKind(&Instr) == CK_None // must be white-listed
+              || (Producers[VarNum].IsComplex && // complex can't be multi-use
+                  Producers[VarNum].NumUses > 0)) {
+            setInvalid(VarNum);
+            continue;
+          }
+          ++Producers[VarNum].NumUses;
+          if (Instr.isLastUse(Var)) {
+            Producers[VarNum].IsLiveOut = false;
+          }
+        }
+      }
+    }
+  }
+  for (auto &I : Producers) {
+    // Ignore entries previously marked invalid.
+    if (I.second.Instr == nullptr)
+      continue;
+    // Disable the producer if its dest may be live beyond this block.
+    if (I.second.IsLiveOut) {
+      setInvalid(I.first);
+      continue;
+    }
+    // Mark as "dead" rather than outright deleting.  This is so that
+    // other peephole-style optimizations during or before lowering
+    // have access to this instruction in undeleted form.  See for
+    // example tryOptimizedCmpxchgCmpBr().
+    I.second.Instr->setDead();
+  }
+}
+
+const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
+  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
+  if (Var == nullptr)
+    return nullptr;
+  SizeT VarNum = Var->getIndex();
+  auto Element = Producers.find(VarNum);
+  if (Element == Producers.end())
+    return nullptr;
+  return Element->second.Instr;
+}
+
+void BoolFolding::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP || !Func->isVerbose(IceV_Folding))
+    return;
+  OstreamLocker L(Func->getContext());
+  Ostream &Str = Func->getContext()->getStrDump();
+  for (auto &I : Producers) {
+    if (I.second.Instr == nullptr)
+      continue;
+    Str << "Found foldable producer:\n  ";
+    I.second.Instr->dump(Func);
+    Str << "\n";
+  }
+}
+
+void TargetX8632::initNodeForLowering(CfgNode *Node) {
+  FoldingInfo.init(Node);
+  FoldingInfo.dump(Func);
+}
+
 TargetX8632::TargetX8632(Cfg *Func)
     : TargetLowering(Func),
       InstructionSet(static_cast<X86InstructionSet>(
@@ -1719,12 +1865,35 @@
 void TargetX8632::lowerBr(const InstBr *Inst) {
   if (Inst->isUnconditional()) {
     _br(Inst->getTargetUnconditional());
-  } else {
-    Operand *Src0 = legalize(Inst->getCondition(), Legal_Reg | Legal_Mem);
-    Constant *Zero = Ctx->getConstantZero(IceType_i32);
-    _cmp(Src0, Zero);
-    _br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
+    return;
   }
+  Operand *Cond = Inst->getCondition();
+
+  // Handle folding opportunities.
+  if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32: {
+      // TODO(stichnot): Refactor similarities between this block and
+      // the corresponding code in lowerIcmp().
+      auto *Cmp = llvm::cast<InstIcmp>(Producer);
+      Operand *Src0 = Producer->getSrc(0);
+      Operand *Src1 = legalize(Producer->getSrc(1));
+      Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
+      _cmp(Src0RM, Src1);
+      _br(getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(),
+          Inst->getTargetFalse());
+      return;
+    }
+    }
+  }
+
+  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(Src0, Zero);
+  _br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
 }
 
 void TargetX8632::lowerCall(const InstCall *Instr) {
@@ -2678,37 +2847,6 @@
     return;
   }
 
-  // If Src1 is an immediate, or known to be a physical register, we can
-  // allow Src0 to be a memory operand.  Otherwise, Src0 must be copied into
-  // a physical register.  (Actually, either Src0 or Src1 can be chosen for
-  // the physical register, but unfortunately we have to commit to one or
-  // the other before register allocation.)
-  bool IsSrc1ImmOrReg = false;
-  if (llvm::isa<Constant>(Src1)) {
-    IsSrc1ImmOrReg = true;
-  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
-    if (Var->hasReg())
-      IsSrc1ImmOrReg = true;
-  }
-
-  // Try to fuse a compare immediately followed by a conditional branch.  This
-  // is possible when the compare dest and the branch source operands are the
-  // same, and are their only uses.  TODO: implement this optimization for i64.
-  if (InstBr *NextBr = llvm::dyn_cast_or_null<InstBr>(Context.getNextInst())) {
-    if (Src0->getType() != IceType_i64 && !NextBr->isUnconditional() &&
-        Dest == NextBr->getSrc(0) && NextBr->isLastUse(Dest)) {
-      NextBr->setDeleted();
-      Operand *Src0RM =
-          legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
-      _cmp(Src0RM, Src1);
-      _br(getIcmp32Mapping(Inst->getCondition()), NextBr->getTargetTrue(),
-          NextBr->getTargetFalse());
-      // Skip over the following branch instruction.
-      Context.advanceNext();
-      return;
-    }
-  }
-
   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
   if (Src0->getType() == IceType_i64) {
     InstIcmp::ICond Condition = Inst->getCondition();
@@ -2737,8 +2875,7 @@
   }
 
   // cmp b, c
-  Operand *Src0RM =
-      legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
+  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
   _cmp(Src0RM, Src1);
   _setcc(Dest, getIcmp32Mapping(Inst->getCondition()));
 }
@@ -4001,7 +4138,51 @@
     return;
   }
 
-  // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1:
+  // Handle folding opportunities.
+  if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
+    assert(Producer->isDeleted());
+    switch (BoolFolding::getProducerKind(Producer)) {
+    default:
+      break;
+    case BoolFolding::PK_Icmp32: {
+      // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
+      auto *Cmp = llvm::cast<InstIcmp>(Producer);
+      InstX8632Label *Label = InstX8632Label::create(Func, this);
+      Operand *Src0 = Producer->getSrc(0);
+      Operand *Src1 = legalize(Producer->getSrc(1));
+      Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
+      _cmp(Src0RM, Src1);
+      // This is the same code as below (for both i64 and non-i64),
+      // except without the _cmp instruction and with a different
+      // branch condition.  TODO(stichnot): refactor.
+      if (Dest->getType() == IceType_i64) {
+        Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+        Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+        Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm);
+        Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm);
+        _mov(DestLo, SrcLoRI);
+        _mov(DestHi, SrcHiRI);
+        _br(getIcmp32Mapping(Cmp->getCondition()), Label);
+        Operand *SrcFLo = loOperand(SrcF);
+        Operand *SrcFHi = hiOperand(SrcF);
+        SrcLoRI = legalize(SrcFLo, Legal_Reg | Legal_Imm);
+        SrcHiRI = legalize(SrcFHi, Legal_Reg | Legal_Imm);
+        _mov_nonkillable(DestLo, SrcLoRI);
+        _mov_nonkillable(DestHi, SrcHiRI);
+      } else {
+        SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
+        _mov(Dest, SrcT);
+        _br(getIcmp32Mapping(Cmp->getCondition()), Label);
+        SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
+        _mov_nonkillable(Dest, SrcF);
+      }
+      Context.insert(Label);
+      return;
+    }
+    }
+  }
+
+  // a=d?b:c ==> cmp d,0; a=b; jne L1; a=c; L1:
   Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
   Constant *Zero = Ctx->getConstantZero(IceType_i32);
   InstX8632Label *Label = InstX8632Label::create(Func, this);
@@ -4534,6 +4715,23 @@
   return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
 }
 
+// For the cmp instruction, if Src1 is an immediate, or known to be a
+// physical register, we can allow Src0 to be a memory operand.
+// Otherwise, Src0 must be copied into a physical register.
+// (Actually, either Src0 or Src1 can be chosen for the physical
+// register, but unfortunately we have to commit to one or the other
+// before register allocation.)
+Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
+  bool IsSrc1ImmOrReg = false;
+  if (llvm::isa<Constant>(Src1)) {
+    IsSrc1ImmOrReg = true;
+  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
+    if (Var->hasReg())
+      IsSrc1ImmOrReg = true;
+  }
+  return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
+}
+
 OperandX8632Mem *TargetX8632::FormMemoryOperand(Operand *Operand, Type Ty) {
   OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
   // It may be the case that address mode optimization already creates
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index f114dba..79557ae 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -16,14 +16,81 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H
 #define SUBZERO_SRC_ICETARGETLOWERINGX8632_H
 
+#include <unordered_map>
+
 #include "assembler_ia32.h"
 #include "IceDefs.h"
+#include "IceInst.h"
 #include "IceInstX8632.h"
 #include "IceRegistersX8632.h"
 #include "IceTargetLowering.h"
 
 namespace Ice {
 
+class BoolFoldingEntry {
+  BoolFoldingEntry(const BoolFoldingEntry &) = delete;
+
+public:
+  BoolFoldingEntry()
+      : Instr(nullptr), IsComplex(false), IsLiveOut(true), NumUses(0) {}
+  explicit BoolFoldingEntry(Inst *I);
+  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
+  // Instr is the instruction producing the i1-type variable of interest.
+  Inst *Instr;
+  // IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
+  bool IsComplex;
+  // IsLiveOut is initialized conservatively to true, and is set to false when
+  // we encounter an instruction that ends Var's live range.  We disable the
+  // folding optimization when Var is live beyond this basic block.  Note that
+  // if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
+  // always be true and the folding optimization will never be performed.
+  bool IsLiveOut;
+  // NumUses counts the number of times Var is used as a source operand in the
+  // basic block.  If IsComplex is true and there is more than one use of Var,
+  // then the folding optimization is disabled for Var.
+  uint32_t NumUses;
+};
+
+class BoolFolding {
+public:
+  enum BoolFoldingProducerKind {
+    PK_None,
+    PK_Icmp32,
+    PK_Icmp64,
+    PK_Fcmp,
+    PK_Trunc
+  };
+
+  // Currently the actual enum values are not used (other than CK_None), but
+  // we go ahead and produce them anyway for symmetry with the
+  // BoolFoldingProducerKind.
+  enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
+
+private:
+  BoolFolding(const BoolFolding &) = delete;
+  BoolFolding &operator=(const BoolFolding &) = delete;
+
+public:
+  BoolFolding() {}
+  static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
+  static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
+  static bool hasComplexLowering(const Inst *Instr);
+  void init(CfgNode *Node);
+  const Inst *getProducerFor(const Operand *Opnd) const;
+  void dump(const Cfg *Func) const;
+
+private:
+  // Returns true if Producers contains a valid entry for the given VarNum.
+  bool containsValid(SizeT VarNum) const {
+    auto Element = Producers.find(VarNum);
+    return Element != Producers.end() && Element->second.Instr != nullptr;
+  }
+  void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
+  // Producers maps Variable::Number to a BoolFoldingEntry.
+  std::unordered_map<SizeT, BoolFoldingEntry> Producers;
+};
+
 class TargetX8632 : public TargetLowering {
   TargetX8632() = delete;
   TargetX8632(const TargetX8632 &) = delete;
@@ -63,6 +130,7 @@
   void emit(const ConstantDouble *C) const final;
 
   void lowerArguments() override;
+  void initNodeForLowering(CfgNode *Node) override;
   void addProlog(CfgNode *Node) override;
   void addEpilog(CfgNode *Node) override;
   // Ensure that a 64-bit Variable has been split into 2 32-bit
@@ -157,6 +225,8 @@
   Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
                     int32_t RegNum = Variable::NoRegister);
   Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
+  // Legalize the first source operand for use in the cmp instruction.
+  Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
   // Turn a pointer operand into a memory operand that can be
   // used by a real load/store operation. Legalizes the operand as well.
   // This is a nop if the operand is already a legal memory operand.
@@ -507,6 +577,7 @@
 
 private:
   ~TargetX8632() override {}
+  BoolFolding FoldingInfo;
 };
 
 class TargetDataX8632 : public TargetDataLowering {
diff --git a/tests_lit/llvm2ice_tests/8bit.pnacl.ll b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
index 4f48cf4..4987a1d 100644
--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -3,6 +3,8 @@
 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
 
+declare void @useInt(i32 %x)
+
 define internal i32 @add8Bit(i32 %a, i32 %b) {
 entry:
   %a_8 = trunc i32 %a to i8
@@ -278,6 +280,9 @@
   %cmp = icmp slt i8 %a_8, %b_8
   %ret = select i1 %cmp, i8 %a_8, i8 %b_8
   %ret_ext = zext i8 %ret to i32
+  ; Create a "fake" use of %cmp to prevent O2 bool folding.
+  %d1 = zext i1 %cmp to i32
+  call void @useInt(i32 %d1)
   ret i32 %ret_ext
 }
 ; CHECK-LABEL: selectI8Var
diff --git a/tests_lit/llvm2ice_tests/bool-folding.ll b/tests_lit/llvm2ice_tests/bool-folding.ll
new file mode 100644
index 0000000..d979927
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/bool-folding.ll
@@ -0,0 +1,206 @@
+; This tests the optimization where producers and consumers of i1 (bool)
+; variables are combined to implicitly use flags instead of explicitly using
+; stack or register variables.
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
+
+declare void @use_value(i32)
+
+; Basic cmp/branch folding.
+define i32 @fold_cmp_br(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  br i1 %cmp1, label %branch1, label %branch2
+branch1:
+  ret i32 1
+branch2:
+  ret i32 2
+}
+
+; CHECK-LABEL: fold_cmp_br
+; CHECK: cmp
+; CHECK: jge
+
+
+; Cmp/branch folding with intervening instructions.
+define i32 @fold_cmp_br_intervening_insts(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  call void @use_value(i32 %arg1)
+  br i1 %cmp1, label %branch1, label %branch2
+branch1:
+  ret i32 1
+branch2:
+  ret i32 2
+}
+
+; CHECK-LABEL: fold_cmp_br_intervening_insts
+; CHECK-NOT: cmp
+; CHECK: call
+; CHECK: cmp
+; CHECK: jge
+
+
+; Cmp/branch non-folding because of live-out.
+define i32 @no_fold_cmp_br_liveout(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  br label %next
+next:
+  br i1 %cmp1, label %branch1, label %branch2
+branch1:
+  ret i32 1
+branch2:
+  ret i32 2
+}
+
+; CHECK-LABEL: no_fold_cmp_br_liveout
+; CHECK: cmp
+; CHECK: set
+; CHECK: cmp
+; CHECK: je
+
+
+; Cmp/branch non-folding because of extra non-whitelisted uses.
+define i32 @no_fold_cmp_br_non_whitelist(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  %result = zext i1 %cmp1 to i32
+  br i1 %cmp1, label %branch1, label %branch2
+branch1:
+  ret i32 %result
+branch2:
+  ret i32 2
+}
+
+; CHECK-LABEL: no_fold_cmp_br_non_whitelist
+; CHECK: cmp
+; CHECK: set
+; CHECK: movzx
+; CHECK: cmp
+; CHECK: je
+
+
+; Basic cmp/select folding.
+define i32 @fold_cmp_select(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  %result = select i1 %cmp1, i32 %arg1, i32 %arg2
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_cmp_select
+; CHECK: cmp
+; CHECK: jl
+; CHECK: mov
+
+
+; 64-bit cmp/select folding.
+define i64 @fold_cmp_select_64(i64 %arg1, i64 %arg2) {
+entry:
+  %arg1_trunc = trunc i64 %arg1 to i32
+  %arg2_trunc = trunc i64 %arg2 to i32
+  %cmp1 = icmp slt i32 %arg1_trunc, %arg2_trunc
+  %result = select i1 %cmp1, i64 %arg1, i64 %arg2
+  ret i64 %result
+}
+
+; CHECK-LABEL: fold_cmp_select_64
+; CHECK: cmp
+; CHECK: jl
+; CHECK: mov
+; CHECK: mov
+
+
+; Cmp/select folding with intervening instructions.
+define i32 @fold_cmp_select_intervening_insts(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  call void @use_value(i32 %arg1)
+  %result = select i1 %cmp1, i32 %arg1, i32 %arg2
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_cmp_select_intervening_insts
+; CHECK-NOT: cmp
+; CHECK: call
+; CHECK: cmp
+; CHECK: jl
+; CHECK: mov
+
+
+; Cmp/multi-select folding.
+define i32 @fold_cmp_select_multi(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  %a = select i1 %cmp1, i32 %arg1, i32 %arg2
+  %b = select i1 %cmp1, i32 %arg2, i32 %arg1
+  %c = select i1 %cmp1, i32 123, i32 %arg1
+  %partial = add i32 %a, %b
+  %result = add i32 %partial, %c
+  ret i32 %result
+}
+
+; CHECK-LABEL: fold_cmp_select_multi
+; CHECK: cmp
+; CHECK: jl
+; CHECK: cmp
+; CHECK: jl
+; CHECK: cmp
+; CHECK: jl
+; CHECK: add
+; CHECK: add
+
+
+; Cmp/multi-select non-folding because of live-out.
+define i32 @no_fold_cmp_select_multi_liveout(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  %a = select i1 %cmp1, i32 %arg1, i32 %arg2
+  %b = select i1 %cmp1, i32 %arg2, i32 %arg1
+  br label %next
+next:
+  %c = select i1 %cmp1, i32 123, i32 %arg1
+  %partial = add i32 %a, %b
+  %result = add i32 %partial, %c
+  ret i32 %result
+}
+
+; CHECK-LABEL: no_fold_cmp_select_multi_liveout
+; CHECK: set
+; CHECK: cmp
+; CHECK: jne
+; CHECK: cmp
+; CHECK: jne
+; CHECK: cmp
+; CHECK: jne
+; CHECK: add
+; CHECK: add
+
+
+; Cmp/multi-select non-folding because of extra non-whitelisted uses.
+define i32 @no_fold_cmp_select_multi_non_whitelist(i32 %arg1, i32 %arg2) {
+entry:
+  %cmp1 = icmp slt i32 %arg1, %arg2
+  %a = select i1 %cmp1, i32 %arg1, i32 %arg2
+  %b = select i1 %cmp1, i32 %arg2, i32 %arg1
+  %c = select i1 %cmp1, i32 123, i32 %arg1
+  %ext = zext i1 %cmp1 to i32
+  %partial1 = add i32 %a, %b
+  %partial2 = add i32 %partial1, %c
+  %result = add i32 %partial2, %ext
+  ret i32 %result
+}
+
+; CHECK-LABEL: no_fold_cmp_select_multi_non_whitelist
+; CHECK: set
+; CHECK: cmp
+; CHECK: jne
+; CHECK: cmp
+; CHECK: jne
+; CHECK: cmp
+; CHECK: jne
+; CHECK: movzx
+; CHECK: add
+; CHECK: add
+; CHECK: add
diff --git a/tests_lit/llvm2ice_tests/select-opt.ll b/tests_lit/llvm2ice_tests/select-opt.ll
index 8e67a6d..f6d15bf 100644
--- a/tests_lit/llvm2ice_tests/select-opt.ll
+++ b/tests_lit/llvm2ice_tests/select-opt.ll
@@ -14,6 +14,11 @@
   %cmp1 = icmp sgt i32 %a, %b
   %cond2 = select i1 %cmp1, i32 10, i32 20
   tail call void @useInt(i32 %cond2)
+  ; Create "fake" uses of %cmp and %cmp1 to prevent O2 bool folding.
+  %d1 = zext i1 %cmp to i32
+  call void @useInt(i32 %d1)
+  %d2 = zext i1 %cmp1 to i32
+  call void @useInt(i32 %d2)
   ret void
 }