Subzero: Add branch optimization.

1. Unconditional branch to the next basic block is removed.

2. For a conditional branch with a "false" edge to the next basic block, remove the unconditional branch to the fallthrough block.

3. For a conditional branch with a "true" edge to the next basic block, invert the condition and do like #2.

This is enabled only for O2, particularly because inverting the branch condition is a marginally risky operation.

This decreases the instruction count by about 5-6%.

Also, --stats prints a final tally to make it easier to post-process the output.

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/580903005
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index d699ef5..4439323 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -290,6 +290,14 @@
   return Valid;
 }
 
+void Cfg::doBranchOpt() {
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    NodeList::iterator NextNode = I;
+    ++NextNode;
+    (*I)->doBranchOpt(*NextNode);
+  }
+}
+
 // ======================== Dump routines ======================== //
 
 void Cfg::emit() {
diff --git a/src/IceCfg.h b/src/IceCfg.h
index e12bc9d..a2582eb 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -94,6 +94,7 @@
   void livenessLightweight();
   void liveness(LivenessMode Mode);
   bool validateLiveness() const;
+  void doBranchOpt();
 
   // Manage the CurrentNode field, which is used for validating the
   // Variable::DefNode field during dumping/emitting.
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 4de2f57..57aba6b 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -457,6 +457,19 @@
   }
 }
 
+void CfgNode::doBranchOpt(const CfgNode *NextNode) {
+  TargetLowering *Target = Func->getTarget();
+  // Check every instruction for a branch optimization opportunity.
+  // It may be more efficient to iterate in reverse and stop after the
+  // first opportunity, unless there is some target lowering where we
+  // have the possibility of multiple such optimizations per block
+  // (currently not the case for x86 lowering).
+  for (InstList::const_iterator I = Insts.begin(), E = Insts.end(); I != E;
+       ++I) {
+    Target->doBranchOpt(*I, NextNode);
+  }
+}
+
 // ======================== Dump routines ======================== //
 
 void CfgNode::emit(Cfg *Func) const {
diff --git a/src/IceCfgNode.h b/src/IceCfgNode.h
index 954977f..eba98c0 100644
--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -65,6 +65,7 @@
   void livenessLightweight();
   bool liveness(Liveness *Liveness);
   void livenessPostprocess(LivenessMode Mode, Liveness *Liveness);
+  void doBranchOpt(const CfgNode *NextNode);
   void emit(Cfg *Func) const;
   void dump(Cfg *Func) const;
 
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 47c7aa8..319e18a 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -384,10 +384,14 @@
   llvm_unreachable("Unknown type");
 }
 
-void GlobalContext::dumpStats(const IceString &Name) {
+void GlobalContext::dumpStats(const IceString &Name, bool Final) {
   if (Flags.DumpStats) {
-    StatsFunction.dump(Name, getStrDump());
-    StatsCumulative.dump("_TOTAL_", getStrDump());
+    if (Final) {
+      StatsCumulative.dump(Name, getStrDump());
+    } else {
+      StatsFunction.dump(Name, getStrDump());
+      StatsCumulative.dump("_TOTAL_", getStrDump());
+    }
   }
 }
 
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index da4b6a1..f997b65 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -132,7 +132,7 @@
 
   // Reset stats at the beginning of a function.
   void resetStats() { StatsFunction.reset(); }
-  void dumpStats(const IceString &Name);
+  void dumpStats(const IceString &Name, bool Final = false);
   void statsUpdateEmitted(uint32_t InstCount) {
     StatsFunction.updateEmitted(InstCount);
     StatsCumulative.updateEmitted(InstCount);
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index a61805e..df28696 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -24,11 +24,12 @@
 namespace {
 
 const struct InstX8632BrAttributes_ {
+  InstX8632::BrCond Opposite;
   const char *DisplayString;
   const char *EmitString;
 } InstX8632BrAttributes[] = {
-#define X(tag, dump, emit)                                                     \
-  { dump, emit }                                                               \
+#define X(tag, opp, dump, emit)                                                \
+  { InstX8632::opp, dump, emit }                                               \
   ,
     ICEINSTX8632BR_TABLE
 #undef X
@@ -128,11 +129,52 @@
   return ".L" + Func->getFunctionName() + "$local$__" + buf;
 }
 
-InstX8632Br::InstX8632Br(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
-                         InstX8632Label *Label, InstX8632::BrCond Condition)
+InstX8632Br::InstX8632Br(Cfg *Func, const CfgNode *TargetTrue,
+                         const CfgNode *TargetFalse,
+                         const InstX8632Label *Label,
+                         InstX8632::BrCond Condition)
     : InstX8632(Func, InstX8632::Br, 0, NULL), Condition(Condition),
       TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label) {}
 
+bool InstX8632Br::optimizeBranch(const CfgNode *NextNode) {
+  // If there is no next block, then there can be no fallthrough to
+  // optimize.
+  if (NextNode == NULL)
+    return false;
+  // Intra-block conditional branches can't be optimized.
+  if (Label)
+    return false;
+  // If there is no fallthrough node, such as a non-default case label
+  // for a switch instruction, then there is no opportunity to
+  // optimize.
+  if (getTargetFalse() == NULL)
+    return false;
+
+  // Unconditional branch to the next node can be removed.
+  if (Condition == Br_None && getTargetFalse() == NextNode) {
+    assert(getTargetTrue() == NULL);
+    setDeleted();
+    return true;
+  }
+  // If the fallthrough is to the next node, set fallthrough to NULL
+  // to indicate.
+  if (getTargetFalse() == NextNode) {
+    TargetFalse = NULL;
+    return true;
+  }
+  // If TargetTrue is the next node, and TargetFalse is non-NULL
+  // (which was already tested above), then invert the branch
+  // condition, swap the targets, and set new fallthrough to NULL.
+  if (getTargetTrue() == NextNode) {
+    assert(Condition != Br_None);
+    Condition = InstX8632BrAttributes[Condition].Opposite;
+    TargetTrue = getTargetFalse();
+    TargetFalse = NULL;
+    return true;
+  }
+  return false;
+}
+
 InstX8632Call::InstX8632Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
     : InstX8632(Func, InstX8632::Call, 1, Dest) {
   HasSideEffects = true;
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index 932500c..16c6fb8 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -51,20 +51,20 @@
 //#define X(val, name)
 
 #define ICEINSTX8632BR_TABLE   \
-  /* enum value, dump, emit */ \
-  X(Br_a,        "a",  "ja")   \
-  X(Br_ae,       "ae", "jae")  \
-  X(Br_b,        "b",  "jb")   \
-  X(Br_be,       "be", "jbe")  \
-  X(Br_e,        "e",  "je")   \
-  X(Br_g,        "g",  "jg")   \
-  X(Br_ge,       "ge", "jge")  \
-  X(Br_l,        "l",  "jl")   \
-  X(Br_le,       "le", "jle")  \
-  X(Br_ne,       "ne", "jne")  \
-  X(Br_np,       "np", "jnp")  \
-  X(Br_p,        "p",  "jp")   \
-//#define X(tag, dump, emit)
+  /* enum value, opposite, dump, emit */ \
+  X(Br_a,        Br_be,    "a",  "ja")                \
+  X(Br_ae,       Br_b,     "ae", "jae")               \
+  X(Br_b,        Br_ae,    "b",  "jb")                \
+  X(Br_be,       Br_a,     "be", "jbe")               \
+  X(Br_e,        Br_ne,    "e",  "je")                \
+  X(Br_g,        Br_le,    "g",  "jg")                \
+  X(Br_ge,       Br_l,     "ge", "jge")               \
+  X(Br_l,        Br_ge,    "l",  "jl")                \
+  X(Br_le,       Br_g,     "le", "jle")               \
+  X(Br_ne,       Br_e,     "ne", "jne")               \
+  X(Br_np,       Br_p,     "np", "jnp")               \
+  X(Br_p,        Br_np,    "p",  "jp")                \
+//#define X(tag, opp, dump, emit)
 
 #define ICEINSTX8632CMPPS_TABLE \
   /* enum value, emit */        \
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index f0558db..09615ce 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -225,7 +225,7 @@
   };
 
   enum BrCond {
-#define X(tag, dump, emit) tag,
+#define X(tag, opp, dump, emit) tag,
     ICEINSTX8632BR_TABLE
 #undef X
         Br_None
@@ -309,53 +309,62 @@
   // Create a conditional branch to a node.
   static InstX8632Br *create(Cfg *Func, CfgNode *TargetTrue,
                              CfgNode *TargetFalse, BrCond Condition) {
+    const InstX8632Label *NoLabel = NULL;
     return new (Func->allocate<InstX8632Br>())
-        InstX8632Br(Func, TargetTrue, TargetFalse, NULL, Condition);
+        InstX8632Br(Func, TargetTrue, TargetFalse, NoLabel, Condition);
   }
   // Create an unconditional branch to a node.
   static InstX8632Br *create(Cfg *Func, CfgNode *Target) {
+    const CfgNode *NoCondTarget = NULL;
+    const InstX8632Label *NoLabel = NULL;
     return new (Func->allocate<InstX8632Br>())
-        InstX8632Br(Func, NULL, Target, NULL, Br_None);
+        InstX8632Br(Func, NoCondTarget, Target, NoLabel, Br_None);
   }
   // Create a non-terminator conditional branch to a node, with a
   // fallthrough to the next instruction in the current node.  This is
   // used for switch lowering.
   static InstX8632Br *create(Cfg *Func, CfgNode *Target, BrCond Condition) {
+    const CfgNode *NoUncondTarget = NULL;
+    const InstX8632Label *NoLabel = NULL;
     return new (Func->allocate<InstX8632Br>())
-        InstX8632Br(Func, Target, NULL, NULL, Condition);
+        InstX8632Br(Func, Target, NoUncondTarget, NoLabel, Condition);
   }
   // Create a conditional intra-block branch (or unconditional, if
   // Condition==Br_None) to a label in the current block.
   static InstX8632Br *create(Cfg *Func, InstX8632Label *Label,
                              BrCond Condition) {
+    const CfgNode *NoCondTarget = NULL;
+    const CfgNode *NoUncondTarget = NULL;
     return new (Func->allocate<InstX8632Br>())
-        InstX8632Br(Func, NULL, NULL, Label, Condition);
+        InstX8632Br(Func, NoCondTarget, NoUncondTarget, Label, Condition);
   }
-  CfgNode *getTargetTrue() const { return TargetTrue; }
-  CfgNode *getTargetFalse() const { return TargetFalse; }
+  const CfgNode *getTargetTrue() const { return TargetTrue; }
+  const CfgNode *getTargetFalse() const { return TargetFalse; }
+  bool optimizeBranch(const CfgNode *NextNode);
   virtual uint32_t getEmitInstCount() const {
+    uint32_t Sum = 0;
     if (Label)
-      return 1;
-    if (Condition == Br_None)
-      return 1;
+      ++Sum;
+    if (getTargetTrue())
+      ++Sum;
     if (getTargetFalse())
-      return 2;
-    return 1;
+      ++Sum;
+    return Sum;
   }
   virtual void emit(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Br); }
 
 private:
-  InstX8632Br(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
-              InstX8632Label *Label, BrCond Condition);
+  InstX8632Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
+              const InstX8632Label *Label, BrCond Condition);
   InstX8632Br(const InstX8632Br &) LLVM_DELETED_FUNCTION;
   InstX8632Br &operator=(const InstX8632Br &) LLVM_DELETED_FUNCTION;
   virtual ~InstX8632Br() {}
   BrCond Condition;
-  CfgNode *TargetTrue;
-  CfgNode *TargetFalse;
-  InstX8632Label *Label; // Intra-block branch target
+  const CfgNode *TargetTrue;
+  const CfgNode *TargetFalse;
+  const InstX8632Label *Label; // Intra-block branch target
 };
 
 // AdjustStack instruction - subtracts esp by the given amount and
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index b15c4f1..7383fff 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -125,6 +125,11 @@
   void doNopInsertion();
   // Lowers a single instruction.
   void lower();
+  // Tries to do branch optimization on a single instruction.  Returns
+  // true if some optimization was done.
+  virtual bool doBranchOpt(Inst * /*I*/, const CfgNode * /*NextNode*/) {
+    return false;
+  }
 
   // Returns a variable pre-colored to the specified physical
   // register.  This is generally used to get very direct access to
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 464a2e8..31c11d4 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -395,6 +395,14 @@
   T_genFrame.printElapsedUs(Context, "genFrame()");
   Func->dump("After stack frame mapping");
 
+  // Branch optimization.  This needs to be done just before code
+  // emission.  In particular, no transformations that insert or
+  // reorder CfgNodes should be done after branch optimization.  We go
+  // ahead and do it before nop insertion to reduce the amount of work
+  // needed for searching for opportunities.
+  Func->doBranchOpt();
+  Func->dump("After branch optimization");
+
   // Nop insertion
   if (shouldDoNopInsertion()) {
     Func->doNopInsertion();
@@ -444,6 +452,13 @@
   }
 }
 
+bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
+  if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
+    return Br->optimizeBranch(NextNode);
+  }
+  return false;
+}
+
 IceString TargetX8632::RegNames[] = {
 #define X(val, init, name, name16, name8, scratch, preserved, stackptr,        \
           frameptr, isI8, isInt, isFP)                                         \
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 99235de..a104dac 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -28,6 +28,7 @@
 
   virtual void translateOm1();
   virtual void translateO2();
+  virtual bool doBranchOpt(Inst *I, const CfgNode *NextNode);
 
   virtual Variable *getPhysicalRegister(SizeT RegNum);
   virtual IceString getRegName(SizeT RegNum, Type Ty) const;
diff --git a/src/PNaClTranslator.cpp b/src/PNaClTranslator.cpp
index 609804c..1353b07 100644
--- a/src/PNaClTranslator.cpp
+++ b/src/PNaClTranslator.cpp
@@ -2247,7 +2247,6 @@
            << "\n";
     ErrorStatus = true;
   }
-  return;
 }
 
 } // end of namespace Ice
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index 3cb13b3..f147877 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -166,10 +166,11 @@
   Ice::GlobalContext Ctx(Ls, Os, VMask, TargetArch, OptLevel, TestPrefix,
                          Flags);
 
+  int ErrorStatus = 0;
   if (BuildOnRead) {
     Ice::PNaClTranslator Translator(&Ctx, Flags);
     Translator.translate(IRFilename);
-    return Translator.getErrorStatus();
+    ErrorStatus = Translator.getErrorStatus();
   } else {
     // Parse the input LLVM IR file into a module.
     SMDiagnostic Err;
@@ -189,6 +190,9 @@
 
     Ice::Converter Converter(Mod, &Ctx, Flags);
     Converter.convertToIce();
-    return Converter.getErrorStatus();
+    ErrorStatus = Converter.getErrorStatus();
   }
+  const bool FinalStats = true;
+  Ctx.dumpStats("_FINAL_", FinalStats);
+  return ErrorStatus;
 }