Subzero: Initial O2 lowering

Includes the following:
1. Liveness analysis.
2. Linear-scan register allocation.
3. Address mode optimization.
4. Compare-branch fusing.

All of these depend on liveness analysis.  There are three versions of liveness analysis (in order of increasing cost):
1. Lightweight.  This computes last-uses for variables local to a single basic block.
2. Full.  This computes last-uses for all variables based on global dataflow analysis.
3. Full live ranges.  This computes all last-uses, plus calculates the live range intervals in terms of instruction numbers.  (The live ranges are needed for register allocation.)

For testing the full live range computation, Cfg::validateLiveness() checks every Variable of every Inst and verifies that the current Inst is contained within the Variable's live range.

The cross tests are run with O2 in addition to Om1.

Some of the lit tests (for what good they do) are updated with O2 code sequences.

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/300563003
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 3d720fa..6bd9bf7 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -16,6 +16,7 @@
 #include "IceCfgNode.h"
 #include "IceDefs.h"
 #include "IceInst.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
 
@@ -24,7 +25,7 @@
 Cfg::Cfg(GlobalContext *Ctx)
     : Ctx(Ctx), FunctionName(""), ReturnType(IceType_void),
       IsInternalLinkage(false), HasError(false), ErrorMessage(""), Entry(NULL),
-      NextInstNumber(1),
+      NextInstNumber(1), Live(NULL),
       Target(TargetLowering::createLowering(Ctx->getTargetArch(), this)),
       CurrentNode(NULL) {}
 
@@ -62,14 +63,10 @@
 bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }
 
 void Cfg::translate() {
-  Ostream &Str = Ctx->getStrDump();
   if (hasError())
     return;
 
-  if (Ctx->isVerbose()) {
-    Str << "================ Initial CFG ================\n";
-    dump();
-  }
+  dump("Initial CFG");
 
   Timer T_translate;
   // The set of translation passes and their order are determined by
@@ -77,10 +74,7 @@
   getTarget()->translate();
   T_translate.printElapsedUs(getContext(), "translate()");
 
-  if (Ctx->isVerbose()) {
-    Str << "================ Final output ================\n";
-    dump();
-  }
+  dump("Final output");
 }
 
 void Cfg::computePredecessors() {
@@ -89,6 +83,13 @@
   }
 }
 
+void Cfg::renumberInstructions() {
+  NextInstNumber = 1;
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->renumberInstructions();
+  }
+}
+
 // placePhiLoads() must be called before placePhiStores().
 void Cfg::placePhiLoads() {
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
@@ -109,6 +110,12 @@
   }
 }
 
+void Cfg::doAddressOpt() {
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->doAddressOpt();
+  }
+}
+
 void Cfg::genCode() {
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->genCode();
@@ -128,6 +135,150 @@
   }
 }
 
+// This is a lightweight version of live-range-end calculation.  Marks
+// the last use of only those variables whose definition and uses are
+// completely with a single block.  It is a quick single pass and
+// doesn't need to iterate until convergence.
+void Cfg::livenessLightweight() {
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->livenessLightweight();
+  }
+}
+
+void Cfg::liveness(LivenessMode Mode) {
+  Live.reset(new Liveness(this, Mode));
+  Live->init();
+  // Initialize with all nodes needing to be processed.
+  llvm::BitVector NeedToProcess(Nodes.size(), true);
+  while (NeedToProcess.any()) {
+    // Iterate in reverse topological order to speed up convergence.
+    for (NodeList::reverse_iterator I = Nodes.rbegin(), E = Nodes.rend();
+         I != E; ++I) {
+      CfgNode *Node = *I;
+      if (NeedToProcess[Node->getIndex()]) {
+        NeedToProcess[Node->getIndex()] = false;
+        bool Changed = Node->liveness(getLiveness());
+        if (Changed) {
+          // If the beginning-of-block liveness changed since the last
+          // iteration, mark all in-edges as needing to be processed.
+          const NodeList &InEdges = Node->getInEdges();
+          for (NodeList::const_iterator I1 = InEdges.begin(),
+                                        E1 = InEdges.end();
+               I1 != E1; ++I1) {
+            CfgNode *Pred = *I1;
+            NeedToProcess[Pred->getIndex()] = true;
+          }
+        }
+      }
+    }
+  }
+  if (Mode == Liveness_Intervals) {
+    // Reset each variable's live range.
+    for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
+         I != E; ++I) {
+      if (Variable *Var = *I)
+        Var->resetLiveRange();
+    }
+  }
+  // Collect timing for just the portion that constructs the live
+  // range intervals based on the end-of-live-range computation, for a
+  // finer breakdown of the cost.
+  Timer T_liveRange;
+  // Make a final pass over instructions to delete dead instructions
+  // and build each Variable's live range.
+  for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    (*I)->livenessPostprocess(Mode, getLiveness());
+  }
+  if (Mode == Liveness_Intervals) {
+    // Special treatment for live in-args.  Their liveness needs to
+    // extend beyond the beginning of the function, otherwise an arg
+    // whose only use is in the first instruction will end up having
+    // the trivial live range [1,1) and will *not* interfere with
+    // other arguments.  So if the first instruction of the method is
+    // "r=arg1+arg2", both args may be assigned the same register.
+    for (SizeT I = 0; I < Args.size(); ++I) {
+      Variable *Arg = Args[I];
+      if (!Live->getLiveRange(Arg).isEmpty()) {
+        // Add live range [-1,0) with weight 0.  TODO: Here and below,
+        // use better symbolic constants along the lines of
+        // Inst::NumberDeleted and Inst::NumberSentinel instead of -1
+        // and 0.
+        Live->addLiveRange(Arg, -1, 0, 0);
+      }
+      // Do the same for i64 args that may have been lowered into i32
+      // Lo and Hi components.
+      Variable *Lo = Arg->getLo();
+      if (Lo && !Live->getLiveRange(Lo).isEmpty())
+        Live->addLiveRange(Lo, -1, 0, 0);
+      Variable *Hi = Arg->getHi();
+      if (Hi && !Live->getLiveRange(Hi).isEmpty())
+        Live->addLiveRange(Hi, -1, 0, 0);
+    }
+    // Copy Liveness::LiveRanges into individual variables.  TODO:
+    // Remove Variable::LiveRange and redirect to
+    // Liveness::LiveRanges.  TODO: make sure Variable weights
+    // are applied properly.
+    SizeT NumVars = Variables.size();
+    for (SizeT i = 0; i < NumVars; ++i) {
+      Variable *Var = Variables[i];
+      Var->setLiveRange(Live->getLiveRange(Var));
+      if (Var->getWeight().isInf())
+        Var->setLiveRangeInfiniteWeight();
+    }
+    T_liveRange.printElapsedUs(getContext(), "live range construction");
+    dump();
+  }
+}
+
+// Traverse every Variable of every Inst and verify that it
+// appears within the Variable's computed live range.
+bool Cfg::validateLiveness() const {
+  bool Valid = true;
+  Ostream &Str = Ctx->getStrDump();
+  for (NodeList::const_iterator I1 = Nodes.begin(), E1 = Nodes.end(); I1 != E1;
+       ++I1) {
+    CfgNode *Node = *I1;
+    InstList &Insts = Node->getInsts();
+    for (InstList::const_iterator I2 = Insts.begin(), E2 = Insts.end();
+         I2 != E2; ++I2) {
+      Inst *Inst = *I2;
+      if (Inst->isDeleted())
+        continue;
+      if (llvm::isa<InstFakeKill>(Inst))
+        continue;
+      InstNumberT InstNumber = Inst->getNumber();
+      Variable *Dest = Inst->getDest();
+      if (Dest) {
+        // TODO: This instruction should actually begin Dest's live
+        // range, so we could probably test that this instruction is
+        // the beginning of some segment of Dest's live range.  But
+        // this wouldn't work with non-SSA temporaries during
+        // lowering.
+        if (!Dest->getLiveRange().containsValue(InstNumber)) {
+          Valid = false;
+          Str << "Liveness error: inst " << Inst->getNumber() << " dest ";
+          Dest->dump(this);
+          Str << " live range " << Dest->getLiveRange() << "\n";
+        }
+      }
+      for (SizeT I = 0; I < Inst->getSrcSize(); ++I) {
+        Operand *Src = Inst->getSrc(I);
+        SizeT NumVars = Src->getNumVars();
+        for (SizeT J = 0; J < NumVars; ++J) {
+          const Variable *Var = Src->getVar(J);
+          if (!Var->getLiveRange().containsValue(InstNumber)) {
+            Valid = false;
+            Str << "Liveness error: inst " << Inst->getNumber() << " var ";
+            Var->dump(this);
+            Str << " live range " << Var->getLiveRange() << "\n";
+          }
+        }
+      }
+    }
+  }
+  return Valid;
+}
+
 // ======================== Dump routines ======================== //
 
 void Cfg::emit() {
@@ -158,8 +309,13 @@
   T_emit.printElapsedUs(Ctx, "emit()");
 }
 
-void Cfg::dump() {
+// Dumps the IR with an optional introductory message.
+void Cfg::dump(const IceString &Message) {
+  if (!Ctx->isVerbose())
+    return;
   Ostream &Str = Ctx->getStrDump();
+  if (!Message.empty())
+    Str << "================ " << Message << " ================\n";
   setCurrentNode(getEntryNode());
   // Print function name+args
   if (getContext()->isVerbose(IceV_Instructions)) {
@@ -176,6 +332,18 @@
     Str << ") {\n";
   }
   setCurrentNode(NULL);
+  if (getContext()->isVerbose(IceV_Liveness)) {
+    // Print summary info about variables
+    for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
+         I != E; ++I) {
+      Variable *Var = *I;
+      Str << "//"
+          << " multiblock=" << Var->isMultiblockLife() << " "
+          << " weight=" << Var->getWeight() << " ";
+      Var->dump(this);
+      Str << " LIVE=" << Var->getLiveRange() << "\n";
+    }
+  }
   // Print each basic block
   for (NodeList::const_iterator I = Nodes.begin(), E = Nodes.end(); I != E;
        ++I) {
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 5f3bfd3..dbf08c6 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -58,7 +58,7 @@
   const NodeList &getNodes() const { return Nodes; }
 
   // Manage instruction numbering.
-  int32_t newInstNumber() { return NextInstNumber++; }
+  InstNumberT newInstNumber() { return NextInstNumber++; }
 
   // Manage Variables.
   Variable *makeVariable(Type Ty, const CfgNode *Node,
@@ -72,6 +72,7 @@
 
   // Miscellaneous accessors.
   TargetLowering *getTarget() const { return Target.get(); }
+  Liveness *getLiveness() const { return Live.get(); }
   bool hasComputedFrame() const;
 
   // Passes over the CFG.
@@ -80,11 +81,16 @@
   // compute the predecessor edges, in the form of
   // CfgNode::InEdges[].
   void computePredecessors();
+  void renumberInstructions();
   void placePhiLoads();
   void placePhiStores();
   void deletePhis();
+  void doAddressOpt();
   void genCode();
   void genFrame();
+  void livenessLightweight();
+  void liveness(LivenessMode Mode);
+  bool validateLiveness() const;
 
   // Manage the CurrentNode field, which is used for validating the
   // Variable::DefNode field during dumping/emitting.
@@ -92,7 +98,7 @@
   const CfgNode *getCurrentNode() const { return CurrentNode; }
 
   void emit();
-  void dump();
+  void dump(const IceString &Message = "");
 
   // Allocate data of type T using the per-Cfg allocator.
   template <typename T> T *allocate() { return Allocator.Allocate<T>(); }
@@ -136,9 +142,10 @@
   IceString ErrorMessage;
   CfgNode *Entry; // entry basic block
   NodeList Nodes; // linearized node list; Entry should be first
-  int32_t NextInstNumber;
+  InstNumberT NextInstNumber;
   VarList Variables;
   VarList Args; // subset of Variables, in argument order
+  llvm::OwningPtr<Liveness> Live;
   llvm::OwningPtr<TargetLowering> Target;
 
   // CurrentNode is maintained during dumping/emitting just for
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index c00b2c7..3fd63e2 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -15,6 +15,7 @@
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceInst.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
 
@@ -49,6 +50,22 @@
   Inst->updateVars(this);
 }
 
+// Renumbers the non-deleted instructions in the node.  This needs to
+// be done in preparation for live range analysis.  The instruction
+// numbers in a block must be monotonically increasing.  The range of
+// instruction numbers in a block, from lowest to highest, must not
+// overlap with the range of any other block.
+void CfgNode::renumberInstructions() {
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    (*I)->renumber(Func);
+  }
+  InstList::const_iterator I = Insts.begin(), E = Insts.end();
+  while (I != E) {
+    Inst *Inst = *I++;
+    Inst->renumber(Func);
+  }
+}
+
 // When a node is created, the OutEdges are immediately knows, but the
 // InEdges have to be built up incrementally.  After the CFG has been
 // constructed, the computePredecessors() pass finalizes it by
@@ -107,10 +124,25 @@
   // Every block must end in a terminator instruction.
   assert(InsertionPoint != Insts.begin());
   --InsertionPoint;
-  // Confirm via assert() that InsertionPoint is a terminator
-  // instruction.  Calling getTerminatorEdges() on a non-terminator
-  // instruction will cause an llvm_unreachable().
-  assert(((*InsertionPoint)->getTerminatorEdges(), true));
+  // Confirm that InsertionPoint is a terminator instruction.  Calling
+  // getTerminatorEdges() on a non-terminator instruction will cause
+  // an llvm_unreachable().
+  (void)(*InsertionPoint)->getTerminatorEdges();
+  // If the current insertion point is at a conditional branch
+  // instruction, and the previous instruction is a compare
+  // instruction, then we move the insertion point before the compare
+  // instruction so as not to interfere with compare/branch fusing.
+  if (InstBr *Branch = llvm::dyn_cast<InstBr>(*InsertionPoint)) {
+    if (!Branch->isUnconditional()) {
+      if (InsertionPoint != Insts.begin()) {
+        --InsertionPoint;
+        if (!llvm::isa<InstIcmp>(*InsertionPoint) &&
+            !llvm::isa<InstFcmp>(*InsertionPoint)) {
+          ++InsertionPoint;
+        }
+      }
+    }
+  }
 
   // Consider every out-edge.
   for (NodeList::const_iterator I1 = OutEdges.begin(), E1 = OutEdges.end();
@@ -145,6 +177,19 @@
   }
 }
 
+// Does address mode optimization.  Pass each instruction to the
+// TargetLowering object.  If it returns a new instruction
+// (representing the optimized address mode), then insert the new
+// instruction and delete the old.
+void CfgNode::doAddressOpt() {
+  TargetLowering *Target = Func->getTarget();
+  LoweringContext &Context = Target->getContext();
+  Context.init(this);
+  while (!Context.atEnd()) {
+    Target->doAddressOpt();
+  }
+}
+
 // Drives the target lowering.  Passes the current instruction and the
 // next non-deleted instruction for target lowering.
 void CfgNode::genCode() {
@@ -162,6 +207,189 @@
   }
 }
 
+void CfgNode::livenessLightweight() {
+  SizeT NumVars = Func->getNumVariables();
+  llvm::BitVector Live(NumVars);
+  // Process regular instructions in reverse order.
+  for (InstList::const_reverse_iterator I = Insts.rbegin(), E = Insts.rend();
+       I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    (*I)->livenessLightweight(Live);
+  }
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    (*I)->livenessLightweight(Live);
+  }
+}
+
+// Performs liveness analysis on the block.  Returns true if the
+// incoming liveness changed from before, false if it stayed the same.
+// (If it changes, the node's predecessors need to be processed
+// again.)
+bool CfgNode::liveness(Liveness *Liveness) {
+  SizeT NumVars = Liveness->getNumVarsInNode(this);
+  llvm::BitVector Live(NumVars);
+  // Mark the beginning and ending of each variable's live range
+  // with the sentinel instruction number 0.
+  std::vector<InstNumberT> &LiveBegin = Liveness->getLiveBegin(this);
+  std::vector<InstNumberT> &LiveEnd = Liveness->getLiveEnd(this);
+  LiveBegin.assign(NumVars, Inst::NumberSentinel);
+  LiveEnd.assign(NumVars, Inst::NumberSentinel);
+  // Initialize Live to be the union of all successors' LiveIn.
+  for (NodeList::const_iterator I = OutEdges.begin(), E = OutEdges.end();
+       I != E; ++I) {
+    CfgNode *Succ = *I;
+    Live |= Liveness->getLiveIn(Succ);
+    // Mark corresponding argument of phis in successor as live.
+    for (PhiList::const_iterator I1 = Succ->Phis.begin(), E1 = Succ->Phis.end();
+         I1 != E1; ++I1) {
+      (*I1)->livenessPhiOperand(Live, this, Liveness);
+    }
+  }
+  Liveness->getLiveOut(this) = Live;
+
+  // Process regular instructions in reverse order.
+  for (InstList::const_reverse_iterator I = Insts.rbegin(), E = Insts.rend();
+       I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    (*I)->liveness((*I)->getNumber(), Live, Liveness, this);
+  }
+  // Process phis in forward order so that we can override the
+  // instruction number to be that of the earliest phi instruction in
+  // the block.
+  InstNumberT FirstPhiNumber = Inst::NumberSentinel;
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    if ((*I)->isDeleted())
+      continue;
+    if (FirstPhiNumber == Inst::NumberSentinel)
+      FirstPhiNumber = (*I)->getNumber();
+    (*I)->liveness(FirstPhiNumber, Live, Liveness, this);
+  }
+
+  // When using the sparse representation, after traversing the
+  // instructions in the block, the Live bitvector should only contain
+  // set bits for global variables upon block entry.  We validate this
+  // by shrinking the Live vector and then testing it against the
+  // pre-shrunk version.  (The shrinking is required, but the
+  // validation is not.)
+  llvm::BitVector LiveOrig = Live;
+  Live.resize(Liveness->getNumGlobalVars());
+  // Non-global arguments in the entry node are allowed to be live on
+  // entry.
+  bool IsEntry = (Func->getEntryNode() == this);
+  if (!(IsEntry || Live == LiveOrig)) {
+    // This is a fatal liveness consistency error.  Print some
+    // diagnostics and abort.
+    Ostream &Str = Func->getContext()->getStrDump();
+    Func->setCurrentNode(NULL);
+    Str << "LiveOrig-Live =";
+    for (SizeT i = Live.size(); i < LiveOrig.size(); ++i) {
+      if (LiveOrig.test(i)) {
+        Str << " ";
+        Liveness->getVariable(i, this)->dump(Func);
+      }
+    }
+    Str << "\n";
+    llvm_unreachable("Fatal inconsistency in liveness analysis");
+  }
+
+  bool Changed = false;
+  llvm::BitVector &LiveIn = Liveness->getLiveIn(this);
+  // Add in current LiveIn
+  Live |= LiveIn;
+  // Check result, set LiveIn=Live
+  Changed = (Live != LiveIn);
+  if (Changed)
+    LiveIn = Live;
+  return Changed;
+}
+
+// Now that basic liveness is complete, remove dead instructions that
+// were tentatively marked as dead, and compute actual live ranges.
+// It is assumed that within a single basic block, a live range begins
+// at most once and ends at most once.  This is certainly true for
+// pure SSA form.  It is also true once phis are lowered, since each
+// assignment to the phi-based temporary is in a different basic
+// block, and there is a single read that ends the live in the basic
+// block that contained the actual phi instruction.
+void CfgNode::livenessPostprocess(LivenessMode Mode, Liveness *Liveness) {
+  InstNumberT FirstInstNum = Inst::NumberSentinel;
+  InstNumberT LastInstNum = Inst::NumberSentinel;
+  // Process phis in any order.  Process only Dest operands.
+  for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E; ++I) {
+    InstPhi *Inst = *I;
+    Inst->deleteIfDead();
+    if (Inst->isDeleted())
+      continue;
+    if (FirstInstNum == Inst::NumberSentinel)
+      FirstInstNum = Inst->getNumber();
+    assert(Inst->getNumber() > LastInstNum);
+    LastInstNum = Inst->getNumber();
+  }
+  // Process instructions
+  for (InstList::const_iterator I = Insts.begin(), E = Insts.end(); I != E;
+       ++I) {
+    Inst *Inst = *I;
+    Inst->deleteIfDead();
+    if (Inst->isDeleted())
+      continue;
+    if (FirstInstNum == Inst::NumberSentinel)
+      FirstInstNum = Inst->getNumber();
+    assert(Inst->getNumber() > LastInstNum);
+    LastInstNum = Inst->getNumber();
+    // Create fake live ranges for a Kill instruction, but only if the
+    // linked instruction is still alive.
+    if (Mode == Liveness_Intervals) {
+      if (InstFakeKill *Kill = llvm::dyn_cast<InstFakeKill>(Inst)) {
+        if (!Kill->getLinked()->isDeleted()) {
+          SizeT NumSrcs = Inst->getSrcSize();
+          for (SizeT i = 0; i < NumSrcs; ++i) {
+            Variable *Var = llvm::cast<Variable>(Inst->getSrc(i));
+            InstNumberT InstNumber = Inst->getNumber();
+            Liveness->addLiveRange(Var, InstNumber, InstNumber, 1);
+          }
+        }
+      }
+    }
+  }
+  if (Mode != Liveness_Intervals)
+    return;
+
+  SizeT NumVars = Liveness->getNumVarsInNode(this);
+  SizeT NumGlobals = Liveness->getNumGlobalVars();
+  llvm::BitVector &LiveIn = Liveness->getLiveIn(this);
+  llvm::BitVector &LiveOut = Liveness->getLiveOut(this);
+  std::vector<InstNumberT> &LiveBegin = Liveness->getLiveBegin(this);
+  std::vector<InstNumberT> &LiveEnd = Liveness->getLiveEnd(this);
+  for (SizeT i = 0; i < NumVars; ++i) {
+    // Deal with the case where the variable is both live-in and
+    // live-out, but LiveEnd comes before LiveBegin.  In this case, we
+    // need to add two segments to the live range because there is a
+    // hole in the middle.  This would typically happen as a result of
+    // phi lowering in the presence of loopback edges.
+    bool IsGlobal = (i < NumGlobals);
+    if (IsGlobal && LiveIn[i] && LiveOut[i] && LiveBegin[i] > LiveEnd[i]) {
+      Variable *Var = Liveness->getVariable(i, this);
+      Liveness->addLiveRange(Var, FirstInstNum, LiveEnd[i], 1);
+      Liveness->addLiveRange(Var, LiveBegin[i], LastInstNum + 1, 1);
+      continue;
+    }
+    InstNumberT Begin = (IsGlobal && LiveIn[i]) ? FirstInstNum : LiveBegin[i];
+    InstNumberT End = (IsGlobal && LiveOut[i]) ? LastInstNum + 1 : LiveEnd[i];
+    if (Begin == Inst::NumberSentinel && End == Inst::NumberSentinel)
+      continue;
+    if (Begin <= FirstInstNum)
+      Begin = FirstInstNum;
+    if (End == Inst::NumberSentinel)
+      End = LastInstNum + 1;
+    Variable *Var = Liveness->getVariable(i, this);
+    Liveness->addLiveRange(Var, Begin, End, 1);
+  }
+}
+
 // ======================== Dump routines ======================== //
 
 void CfgNode::emit(Cfg *Func) const {
@@ -194,6 +422,7 @@
 void CfgNode::dump(Cfg *Func) const {
   Func->setCurrentNode(this);
   Ostream &Str = Func->getContext()->getStrDump();
+  Liveness *Liveness = Func->getLiveness();
   if (Func->getContext()->isVerbose(IceV_Instructions)) {
     Str << getName() << ":\n";
   }
@@ -208,6 +437,19 @@
     }
     Str << "\n";
   }
+  // Dump the live-in variables.
+  llvm::BitVector LiveIn;
+  if (Liveness)
+    LiveIn = Liveness->getLiveIn(this);
+  if (Func->getContext()->isVerbose(IceV_Liveness) && !LiveIn.empty()) {
+    Str << "    // LiveIn:";
+    for (SizeT i = 0; i < LiveIn.size(); ++i) {
+      if (LiveIn[i]) {
+        Str << " %" << Liveness->getVariable(i, this)->getName();
+      }
+    }
+    Str << "\n";
+  }
   // Dump each instruction.
   if (Func->getContext()->isVerbose(IceV_Instructions)) {
     for (PhiList::const_iterator I = Phis.begin(), E = Phis.end(); I != E;
@@ -221,6 +463,19 @@
       Inst->dumpDecorated(Func);
     }
   }
+  // Dump the live-out variables.
+  llvm::BitVector LiveOut;
+  if (Liveness)
+    LiveOut = Liveness->getLiveOut(this);
+  if (Func->getContext()->isVerbose(IceV_Liveness) && !LiveOut.empty()) {
+    Str << "    // LiveOut:";
+    for (SizeT i = 0; i < LiveOut.size(); ++i) {
+      if (LiveOut[i]) {
+        Str << " %" << Liveness->getVariable(i, this)->getName();
+      }
+    }
+    Str << "\n";
+  }
   // Dump list of successor nodes.
   if (Func->getContext()->isVerbose(IceV_Succs)) {
     Str << "    // succs = ";
diff --git a/src/IceCfgNode.h b/src/IceCfgNode.h
index 645dc57..ea50dca 100644
--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -45,6 +45,7 @@
   // Manage the instruction list.
   InstList &getInsts() { return Insts; }
   void appendInst(Inst *Inst);
+  void renumberInstructions();
 
   // Add a predecessor edge to the InEdges list for each of this
   // node's successors.
@@ -53,7 +54,11 @@
   void placePhiLoads();
   void placePhiStores();
   void deletePhis();
+  void doAddressOpt();
   void genCode();
+  void livenessLightweight();
+  bool liveness(Liveness *Liveness);
+  void livenessPostprocess(LivenessMode Mode, Liveness *Liveness);
   void emit(Cfg *Func) const;
   void dump(Cfg *Func) const;
 
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 9870716..4c616f8 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -50,6 +50,8 @@
 class Inst;
 class InstPhi;
 class InstTarget;
+class LiveRange;
+class Liveness;
 class Operand;
 class TargetLowering;
 class Variable;
@@ -68,6 +70,23 @@
 // may be 64-bits wide) when we want to save space.
 typedef uint32_t SizeT;
 
+// InstNumberT is for holding an instruction number.  Instruction
+// numbers are used for representing Variable live ranges.
+typedef int32_t InstNumberT;
+
+enum LivenessMode {
+  // Basic version of live-range-end calculation.  Marks the last uses
+  // of variables based on dataflow analysis.  Records the set of
+  // live-in and live-out variables for each block.  Identifies and
+  // deletes dead instructions (primarily stores).
+  Liveness_Basic,
+
+  // In addition to Liveness_Basic, also calculate the complete
+  // live range for each variable in a form suitable for interference
+  // calculation and register allocation.
+  Liveness_Intervals
+};
+
 enum VerboseItem {
   IceV_None = 0,
   IceV_Instructions = 1 << 0,
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 7a21b40..d77a5dd 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <ctype.h> // isdigit()
+
 #include "IceDefs.h"
 #include "IceTypes.h"
 #include "IceCfg.h"
@@ -129,8 +131,14 @@
     return NewName;
   }
 
-  ItemsParsed = sscanf(Name.c_str(), "_Z%u%s", &BaseLength, NameBase);
-  if (ItemsParsed == 2 && BaseLength <= strlen(NameBase)) {
+  // Artificially limit BaseLength to 9 digits (less than 1 billion)
+  // because sscanf behavior is undefined on integer overflow.  If
+  // there are more than 9 digits (which we test by looking at the
+  // beginning of NameBase), then we consider this a failure to parse
+  // a namespace mangling, and fall back to the simple prefixing.
+  ItemsParsed = sscanf(Name.c_str(), "_Z%9u%s", &BaseLength, NameBase);
+  if (ItemsParsed == 2 && BaseLength <= strlen(NameBase) &&
+      !isdigit(NameBase[0])) {
     // Transform _Z3barxyz ==> _ZN6Prefix3barExyz
     //                           ^^^^^^^^    ^
     // (splice in "N6Prefix", and insert "E" after "3bar")
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 3f0c97d..12ca16c 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -15,6 +15,7 @@
 #include "IceCfg.h"
 #include "IceCfgNode.h"
 #include "IceInst.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 
 namespace Ice {
@@ -74,21 +75,140 @@
 } // end of anonymous namespace
 
 Inst::Inst(Cfg *Func, InstKind Kind, SizeT MaxSrcs, Variable *Dest)
-    : Kind(Kind), Number(Func->newInstNumber()), Deleted(false),
+    : Kind(Kind), Number(Func->newInstNumber()), Deleted(false), Dead(false),
       HasSideEffects(false), Dest(Dest), MaxSrcs(MaxSrcs), NumSrcs(0),
-      Srcs(Func->allocateArrayOf<Operand *>(MaxSrcs)) {}
+      Srcs(Func->allocateArrayOf<Operand *>(MaxSrcs)), LiveRangesEnded(0) {}
+
+// Assign the instruction a new number.
+void Inst::renumber(Cfg *Func) {
+  Number = isDeleted() ? NumberDeleted : Func->newInstNumber();
+}
+
+// Delete the instruction if its tentative Dead flag is still set
+// after liveness analysis.
+void Inst::deleteIfDead() {
+  if (Dead)
+    setDeleted();
+}
+
+// If Src is a Variable, it returns true if this instruction ends
+// Src's live range.  Otherwise, returns false.
+bool Inst::isLastUse(const Operand *TestSrc) const {
+  if (LiveRangesEnded == 0)
+    return false; // early-exit optimization
+  if (const Variable *TestVar = llvm::dyn_cast<const Variable>(TestSrc)) {
+    LREndedBits Mask = LiveRangesEnded;
+    for (SizeT I = 0; I < getSrcSize(); ++I) {
+      Operand *Src = getSrc(I);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        if (Var == TestVar) {
+          // We've found where the variable is used in the instruction.
+          return Mask & 1;
+        }
+        Mask >>= 1;
+        if (Mask == 0)
+          return false; // another early-exit optimization
+      }
+    }
+  }
+  return false;
+}
 
 void Inst::updateVars(CfgNode *Node) {
   if (Dest)
     Dest->setDefinition(this, Node);
 
+  for (SizeT I = 0; I < getSrcSize(); ++I) {
+    Operand *Src = getSrc(I);
+    SizeT NumVars = Src->getNumVars();
+    for (SizeT J = 0; J < NumVars; ++J) {
+      Variable *Var = Src->getVar(J);
+      Var->setUse(this, Node);
+    }
+  }
+}
+
+void Inst::livenessLightweight(llvm::BitVector &Live) {
+  assert(!isDeleted());
+  if (llvm::isa<InstFakeKill>(this))
+    return;
+  resetLastUses();
   SizeT VarIndex = 0;
   for (SizeT I = 0; I < getSrcSize(); ++I) {
     Operand *Src = getSrc(I);
     SizeT NumVars = Src->getNumVars();
     for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
-      Variable *Var = Src->getVar(J);
-      Var->setUse(this, Node);
+      const Variable *Var = Src->getVar(J);
+      if (Var->isMultiblockLife())
+        continue;
+      SizeT Index = Var->getIndex();
+      if (Live[Index])
+        continue;
+      Live[Index] = true;
+      setLastUse(VarIndex);
+    }
+  }
+}
+
+void Inst::liveness(InstNumberT InstNumber, llvm::BitVector &Live,
+                    Liveness *Liveness, const CfgNode *Node) {
+  assert(!isDeleted());
+  if (llvm::isa<InstFakeKill>(this))
+    return;
+
+  std::vector<InstNumberT> &LiveBegin = Liveness->getLiveBegin(Node);
+  std::vector<InstNumberT> &LiveEnd = Liveness->getLiveEnd(Node);
+  Dead = false;
+  if (Dest) {
+    SizeT VarNum = Liveness->getLiveIndex(Dest);
+    if (Live[VarNum]) {
+      Live[VarNum] = false;
+      LiveBegin[VarNum] = InstNumber;
+    } else {
+      if (!hasSideEffects())
+        Dead = true;
+    }
+  }
+  if (Dead)
+    return;
+  // Phi arguments only get added to Live in the predecessor node, but
+  // we still need to update LiveRangesEnded.
+  bool IsPhi = llvm::isa<InstPhi>(this);
+  resetLastUses();
+  SizeT VarIndex = 0;
+  for (SizeT I = 0; I < getSrcSize(); ++I) {
+    Operand *Src = getSrc(I);
+    SizeT NumVars = Src->getNumVars();
+    for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      const Variable *Var = Src->getVar(J);
+      SizeT VarNum = Liveness->getLiveIndex(Var);
+      if (!Live[VarNum]) {
+        setLastUse(VarIndex);
+        if (!IsPhi) {
+          Live[VarNum] = true;
+          // For a variable in SSA form, its live range can end at
+          // most once in a basic block.  However, after lowering to
+          // two-address instructions, we end up with sequences like
+          // "t=b;t+=c;a=t" where t's live range begins and ends
+          // twice.  ICE only allows a variable to have a single
+          // liveness interval in a basic block (except for blocks
+          // where a variable is live-in and live-out but there is a
+          // gap in the middle, and except for the special
+          // InstFakeKill instruction that can appear multiple
+          // times in the same block).  Therefore, this lowered
+          // sequence needs to represent a single conservative live
+          // range for t.  Since the instructions are being traversed
+          // backwards, we make sure LiveEnd is only set once by
+          // setting it only when LiveEnd[VarNum]==0 (sentinel value).
+          // Note that it's OK to set LiveBegin multiple times because
+          // of the backwards traversal.
+          if (LiveEnd[VarNum] == 0) {
+            LiveEnd[VarNum] = InstNumber;
+          }
+        }
+      }
     }
   }
 }
@@ -192,6 +312,28 @@
   return NULL;
 }
 
+// Updates liveness for a particular operand based on the given
+// predecessor edge.  Doesn't mark the operand as live if the Phi
+// instruction is dead or deleted.
+void InstPhi::livenessPhiOperand(llvm::BitVector &Live, CfgNode *Target,
+                                 Liveness *Liveness) {
+  if (isDeleted() || Dead)
+    return;
+  for (SizeT I = 0; I < getSrcSize(); ++I) {
+    if (Labels[I] == Target) {
+      if (Variable *Var = llvm::dyn_cast<Variable>(getSrc(I))) {
+        SizeT SrcIndex = Liveness->getLiveIndex(Var);
+        if (!Live[SrcIndex]) {
+          setLastUse(I);
+          Live[SrcIndex] = true;
+        }
+      }
+      return;
+    }
+  }
+  llvm_unreachable("Phi operand not found for specified target node");
+}
+
 // Change "a=phi(...)" to "a_phi=phi(...)" and return a new
 // instruction "a=a_phi".
 Inst *InstPhi::lower(Cfg *Func, CfgNode *Node) {
@@ -294,8 +436,8 @@
     return;
   if (Func->getContext()->isVerbose(IceV_InstNumbers)) {
     char buf[30];
-    int32_t Number = getNumber();
-    if (Number < 0)
+    InstNumberT Number = getNumber();
+    if (Number == NumberDeleted)
       snprintf(buf, llvm::array_lengthof(buf), "[XXX]");
     else
       snprintf(buf, llvm::array_lengthof(buf), "[%3d]", Number);
@@ -305,6 +447,7 @@
   if (isDeleted())
     Str << "  //";
   dump(Func);
+  dumpExtras(Func);
   Str << "\n";
 }
 
@@ -319,6 +462,32 @@
   dumpSources(Func);
 }
 
+void Inst::dumpExtras(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  bool First = true;
+  // Print "LIVEEND={a,b,c}" for all source operands whose live ranges
+  // are known to end at this instruction.
+  if (Func->getContext()->isVerbose(IceV_Liveness)) {
+    for (SizeT I = 0; I < getSrcSize(); ++I) {
+      Operand *Src = getSrc(I);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        if (isLastUse(Var)) {
+          if (First)
+            Str << " // LIVEEND={";
+          else
+            Str << ",";
+          Var->dump(Func);
+          First = false;
+        }
+      }
+    }
+    if (!First)
+      Str << "}";
+  }
+}
+
 void Inst::dumpSources(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   for (SizeT I = 0; I < getSrcSize(); ++I) {
@@ -553,4 +722,6 @@
   Inst::dump(Func);
 }
 
+void InstTarget::dumpExtras(const Cfg *Func) const { Inst::dumpExtras(Func); }
+
 } // end of namespace Ice
diff --git a/src/IceInst.h b/src/IceInst.h
index fd0eeea..a465eda 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -56,10 +56,14 @@
   };
   InstKind getKind() const { return Kind; }
 
-  int32_t getNumber() const { return Number; }
+  InstNumberT getNumber() const { return Number; }
+  void renumber(Cfg *Func);
+  static const InstNumberT NumberDeleted = -1;
+  static const InstNumberT NumberSentinel = 0;
 
   bool isDeleted() const { return Deleted; }
   void setDeleted() { Deleted = true; }
+  void deleteIfDead();
 
   bool hasSideEffects() const { return HasSideEffects; }
 
@@ -71,6 +75,8 @@
     return Srcs[I];
   }
 
+  bool isLastUse(const Operand *Src) const;
+
   // Returns a list of out-edges corresponding to a terminator
   // instruction, which is the last instruction of the block.
   virtual NodeList getTerminatorEdges() const {
@@ -88,8 +94,12 @@
   // basic blocks, i.e. used in a different block from their definition.
   void updateVars(CfgNode *Node);
 
+  void livenessLightweight(llvm::BitVector &Live);
+  void liveness(InstNumberT InstNumber, llvm::BitVector &Live,
+                Liveness *Liveness, const CfgNode *Node);
   virtual void emit(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
+  virtual void dumpExtras(const Cfg *Func) const;
   void dumpDecorated(const Cfg *Func) const;
   void emitSources(const Cfg *Func) const;
   void dumpSources(const Cfg *Func) const;
@@ -105,15 +115,22 @@
     assert(NumSrcs < MaxSrcs);
     Srcs[NumSrcs++] = Src;
   }
+  void setLastUse(SizeT VarIndex) {
+    if (VarIndex < CHAR_BIT * sizeof(LiveRangesEnded))
+      LiveRangesEnded |= (((LREndedBits)1u) << VarIndex);
+  }
+  void resetLastUses() { LiveRangesEnded = 0; }
   // The destroy() method lets the instruction cleanly release any
   // memory that was allocated via the Cfg's allocator.
   virtual void destroy(Cfg *Func) { Func->deallocateArrayOf<Operand *>(Srcs); }
 
   const InstKind Kind;
   // Number is the instruction number for describing live ranges.
-  int32_t Number;
+  InstNumberT Number;
   // Deleted means irrevocably deleted.
   bool Deleted;
+  // Dead means pending deletion after liveness analysis converges.
+  bool Dead;
   // HasSideEffects means the instruction is something like a function
   // call or a volatile load that can't be removed even if its Dest
   // variable is not live.
@@ -124,6 +141,18 @@
   SizeT NumSrcs;
   Operand **Srcs;
 
+  // LiveRangesEnded marks which Variables' live ranges end in this
+  // instruction.  An instruction can have an arbitrary number of
+  // source operands (e.g. a call instruction), and each source
+  // operand can contain 0 or 1 Variable (and target-specific operands
+  // could contain more than 1 Variable).  All the variables in an
+  // instruction are conceptually flattened and each variable is
+  // mapped to one bit position of the LiveRangesEnded bit vector.
+  // Only the first CHAR_BIT * sizeof(LREndedBits) variables are
+  // tracked this way.
+  typedef uint32_t LREndedBits; // only first 32 src operands tracked, sorry
+  LREndedBits LiveRangesEnded;
+
 private:
   Inst(const Inst &) LLVM_DELETED_FUNCTION;
   Inst &operator=(const Inst &) LLVM_DELETED_FUNCTION;
@@ -393,6 +422,8 @@
   }
   void addArgument(Operand *Source, CfgNode *Label);
   Operand *getOperandForTarget(CfgNode *Target) const;
+  void livenessPhiOperand(llvm::BitVector &Live, CfgNode *Target,
+                          Liveness *Liveness);
   Inst *lower(Cfg *Func, CfgNode *Node);
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return Inst->getKind() == Phi; }
@@ -626,6 +657,7 @@
 public:
   virtual void emit(const Cfg *Func) const = 0;
   virtual void dump(const Cfg *Func) const;
+  virtual void dumpExtras(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return Inst->getKind() >= Target; }
 
 protected:
diff --git a/src/IceLiveness.cpp b/src/IceLiveness.cpp
new file mode 100644
index 0000000..47ef358
--- /dev/null
+++ b/src/IceLiveness.cpp
@@ -0,0 +1,116 @@
+//===- subzero/src/IceLiveness.cpp - Liveness analysis implementation -----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides some of the support for the Liveness class.  In
+// particular, it handles the sparsity representation of the mapping
+// between Variables and CfgNodes.  The idea is that since most
+// variables are used only within a single basic block, we can
+// partition the variables into "local" and "global" sets.  Instead of
+// sizing and indexing vectors according to Variable::Number, we
+// create a mapping such that global variables are mapped to low
+// indexes that are common across nodes, and local variables are
+// mapped to a higher index space that is shared across nodes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IceDefs.h"
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceInst.h"
+#include "IceLiveness.h"
+#include "IceOperand.h"
+
+namespace Ice {
+
+void Liveness::init() {
+  // Initialize most of the container sizes.
+  SizeT NumVars = Func->getVariables().size();
+  SizeT NumNodes = Func->getNumNodes();
+  Nodes.resize(NumNodes);
+  VarToLiveMap.resize(NumVars);
+  if (Mode == Liveness_Intervals)
+    LiveRanges.resize(NumVars);
+
+  // Count the number of globals, and the number of locals for each
+  // block.
+  for (SizeT i = 0; i < NumVars; ++i) {
+    Variable *Var = Func->getVariables()[i];
+    if (Var->isMultiblockLife()) {
+      ++NumGlobals;
+    } else {
+      SizeT Index = Var->getLocalUseNode()->getIndex();
+      ++Nodes[Index].NumLocals;
+    }
+  }
+
+  // Resize each LivenessNode::LiveToVarMap, and the global
+  // LiveToVarMap.  Reset the counts to 0.
+  for (SizeT i = 0; i < NumNodes; ++i) {
+    Nodes[i].LiveToVarMap.assign(Nodes[i].NumLocals, NULL);
+    Nodes[i].NumLocals = 0;
+  }
+  LiveToVarMap.assign(NumGlobals, NULL);
+
+  // Sort each variable into the appropriate LiveToVarMap.  Also set
+  // VarToLiveMap.
+  SizeT TmpNumGlobals = 0;
+  for (SizeT i = 0; i < NumVars; ++i) {
+    Variable *Var = Func->getVariables()[i];
+    SizeT VarIndex = Var->getIndex();
+    SizeT LiveIndex;
+    if (Var->isMultiblockLife()) {
+      LiveIndex = TmpNumGlobals++;
+      LiveToVarMap[LiveIndex] = Var;
+    } else {
+      SizeT NodeIndex = Var->getLocalUseNode()->getIndex();
+      LiveIndex = Nodes[NodeIndex].NumLocals++;
+      Nodes[NodeIndex].LiveToVarMap[LiveIndex] = Var;
+      LiveIndex += NumGlobals;
+    }
+    VarToLiveMap[VarIndex] = LiveIndex;
+  }
+  assert(NumGlobals == TmpNumGlobals);
+
+  // Process each node.
+  const NodeList &LNodes = Func->getNodes();
+  SizeT NumLNodes = LNodes.size();
+  for (SizeT i = 0; i < NumLNodes; ++i) {
+    LivenessNode &Node = Nodes[LNodes[i]->getIndex()];
+    // NumLocals, LiveToVarMap already initialized
+    Node.LiveIn.resize(NumGlobals);
+    Node.LiveOut.resize(NumGlobals);
+    // LiveBegin and LiveEnd are reinitialized before each pass over
+    // the block.
+  }
+}
+
+Variable *Liveness::getVariable(SizeT LiveIndex, const CfgNode *Node) const {
+  if (LiveIndex < NumGlobals)
+    return LiveToVarMap[LiveIndex];
+  SizeT NodeIndex = Node->getIndex();
+  return Nodes[NodeIndex].LiveToVarMap[LiveIndex - NumGlobals];
+}
+
+SizeT Liveness::getLiveIndex(const Variable *Var) const {
+  return VarToLiveMap[Var->getIndex()];
+}
+
+void Liveness::addLiveRange(Variable *Var, InstNumberT Start, InstNumberT End,
+                            uint32_t WeightDelta) {
+  LiveRange &LiveRange = LiveRanges[Var->getIndex()];
+  assert(WeightDelta != RegWeight::Inf);
+  LiveRange.addSegment(Start, End);
+  LiveRange.addWeight(WeightDelta);
+}
+
+LiveRange &Liveness::getLiveRange(Variable *Var) {
+  return LiveRanges[Var->getIndex()];
+}
+
+} // end of namespace Ice
diff --git a/src/IceLiveness.h b/src/IceLiveness.h
new file mode 100644
index 0000000..e58f426
--- /dev/null
+++ b/src/IceLiveness.h
@@ -0,0 +1,100 @@
+//===- subzero/src/IceLiveness.h - Liveness analysis ------------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Liveness and LivenessNode classes,
+// which are used for liveness analysis.  The node-specific
+// information tracked for each Variable includes whether it is
+// live on entry, whether it is live on exit, the instruction number
+// that starts its live range, and the instruction number that ends
+// its live range.  At the Cfg level, the actual live intervals are
+// recorded.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICELIVENESS_H
+#define SUBZERO_SRC_ICELIVENESS_H
+
+#include "IceDefs.h"
+#include "IceTypes.h"
+
+namespace Ice {
+
+class LivenessNode {
+public:
+  LivenessNode() : NumLocals(0) {}
+  // NumLocals is the number of Variables local to this block.
+  SizeT NumLocals;
+  // LiveToVarMap maps a liveness bitvector index to a Variable.  This
+  // is generally just for printing/dumping.  The index should be less
+  // than NumLocals + Liveness::NumGlobals.
+  std::vector<Variable *> LiveToVarMap;
+  // LiveIn and LiveOut track the in- and out-liveness of the global
+  // variables.  The size of each vector is
+  // LivenessNode::NumGlobals.
+  llvm::BitVector LiveIn, LiveOut;
+  // LiveBegin and LiveEnd track the instruction numbers of the start
+  // and end of each variable's live range within this block.  The
+  // size of each vector is NumLocals + Liveness::NumGlobals.
+  std::vector<InstNumberT> LiveBegin, LiveEnd;
+
+private:
+  // TODO: Disable these constructors when Liveness::Nodes is no
+  // longer an STL container.
+  // LivenessNode(const LivenessNode &) LLVM_DELETED_FUNCTION;
+  // LivenessNode &operator=(const LivenessNode &) LLVM_DELETED_FUNCTION;
+};
+
+class Liveness {
+public:
+  Liveness(Cfg *Func, LivenessMode Mode)
+      : Func(Func), Mode(Mode), NumGlobals(0) {}
+  void init();
+  Variable *getVariable(SizeT LiveIndex, const CfgNode *Node) const;
+  SizeT getLiveIndex(const Variable *Var) const;
+  SizeT getNumGlobalVars() const { return NumGlobals; }
+  SizeT getNumVarsInNode(const CfgNode *Node) const {
+    return NumGlobals + Nodes[Node->getIndex()].NumLocals;
+  }
+  llvm::BitVector &getLiveIn(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveIn;
+  }
+  llvm::BitVector &getLiveOut(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveOut;
+  }
+  std::vector<InstNumberT> &getLiveBegin(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveBegin;
+  }
+  std::vector<InstNumberT> &getLiveEnd(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].LiveEnd;
+  }
+  LiveRange &getLiveRange(Variable *Var);
+  void addLiveRange(Variable *Var, InstNumberT Start, InstNumberT End,
+                    uint32_t WeightDelta);
+
+private:
+  Cfg *Func;
+  LivenessMode Mode;
+  SizeT NumGlobals;
+  // Size of Nodes is Cfg::Nodes.size().
+  std::vector<LivenessNode> Nodes;
+  // VarToLiveMap maps a Variable's Variable::Number to its live index
+  // within its basic block.
+  std::vector<SizeT> VarToLiveMap;
+  // LiveToVarMap is analogous to LivenessNode::LiveToVarMap, but for
+  // non-local variables.
+  std::vector<Variable *> LiveToVarMap;
+  // LiveRanges maps a Variable::Number to its live range.
+  std::vector<LiveRange> LiveRanges;
+  Liveness(const Liveness &) LLVM_DELETED_FUNCTION;
+  Liveness &operator=(const Liveness &) LLVM_DELETED_FUNCTION;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICELIVENESS_H
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 56520d3..02e85c7 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -7,9 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the Operand class and its
-// target-independent subclasses, primarily for the methods of the
-// Variable class.
+// This file implements the Operand class and its target-independent
+// subclasses, primarily for the methods of the Variable class.
 //
 //===----------------------------------------------------------------------===//
 
@@ -36,6 +35,109 @@
   return !(B < A) && !(A < B);
 }
 
+void LiveRange::addSegment(InstNumberT Start, InstNumberT End) {
+#ifdef USE_SET
+  RangeElementType Element(Start, End);
+  RangeType::iterator Next = Range.lower_bound(Element);
+  assert(Next == Range.upper_bound(Element)); // Element not already present
+
+  // Beginning of code that merges contiguous segments.  TODO: change
+  // "if(true)" to "if(false)" to see if this extra optimization code
+  // gives any performance gain, or is just destabilizing.
+  if (true) {
+    RangeType::iterator FirstDelete = Next;
+    RangeType::iterator Prev = Next;
+    bool hasPrev = (Next != Range.begin());
+    bool hasNext = (Next != Range.end());
+    if (hasPrev)
+      --Prev;
+    // See if Element and Next should be joined.
+    if (hasNext && End == Next->first) {
+      Element.second = Next->second;
+      ++Next;
+    }
+    // See if Prev and Element should be joined.
+    if (hasPrev && Prev->second == Start) {
+      Element.first = Prev->first;
+      FirstDelete = Prev;
+    }
+    Range.erase(FirstDelete, Next);
+  }
+  // End of code that merges contiguous segments.
+
+  Range.insert(Next, Element);
+#else
+  if (Range.empty()) {
+    Range.push_back(RangeElementType(Start, End));
+    return;
+  }
+  // Special case for faking in-arg liveness.
+  if (End < Range.front().first) {
+    assert(Start < 0);
+    Range.push_front(RangeElementType(Start, End));
+    return;
+  }
+  InstNumberT CurrentEnd = Range.back().second;
+  assert(Start >= CurrentEnd);
+  // Check for merge opportunity.
+  if (Start == CurrentEnd) {
+    Range.back().second = End;
+    return;
+  }
+  Range.push_back(RangeElementType(Start, End));
+#endif
+}
+
+// Returns true if this live range ends before Other's live range
+// starts.  This means that the highest instruction number in this
+// live range is less than or equal to the lowest instruction number
+// of the Other live range.
+bool LiveRange::endsBefore(const LiveRange &Other) const {
+  // Neither range should be empty, but let's be graceful.
+  if (Range.empty() || Other.Range.empty())
+    return true;
+  InstNumberT MyEnd = (*Range.rbegin()).second;
+  InstNumberT OtherStart = (*Other.Range.begin()).first;
+  return MyEnd <= OtherStart;
+}
+
+// Returns true if there is any overlap between the two live ranges.
+bool LiveRange::overlaps(const LiveRange &Other) const {
+  // Do a two-finger walk through the two sorted lists of segments.
+  RangeType::const_iterator I1 = Range.begin(), I2 = Other.Range.begin();
+  RangeType::const_iterator E1 = Range.end(), E2 = Other.Range.end();
+  while (I1 != E1 && I2 != E2) {
+    if (I1->second <= I2->first) {
+      ++I1;
+      continue;
+    }
+    if (I2->second <= I1->first) {
+      ++I2;
+      continue;
+    }
+    return true;
+  }
+  return false;
+}
+
+bool LiveRange::overlaps(InstNumberT OtherBegin) const {
+  LiveRange Temp;
+  Temp.addSegment(OtherBegin, OtherBegin + 1);
+  return overlaps(Temp);
+}
+
+// Returns true if the live range contains the given instruction
+// number.  This is only used for validating the live range
+// calculation.
+bool LiveRange::containsValue(InstNumberT Value) const {
+  for (RangeType::const_iterator I = Range.begin(), E = Range.end(); I != E;
+       ++I) {
+    if (I->first <= Value && Value <= I->second)
+      return true;
+  }
+  return false;
+}
+
 void Variable::setUse(const Inst *Inst, const CfgNode *Node) {
   if (DefNode == NULL)
     return;
@@ -115,12 +217,12 @@
   }
 }
 
-void ConstantRelocatable::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
+void ConstantRelocatable::emit(GlobalContext *Ctx) const {
+  Ostream &Str = Ctx->getStrEmit();
   if (SuppressMangling)
     Str << Name;
   else
-    Str << Func->getContext()->mangleName(Name);
+    Str << Ctx->mangleName(Name);
   if (Offset) {
     if (Offset > 0)
       Str << "+";
@@ -128,13 +230,28 @@
   }
 }
 
-void ConstantRelocatable::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
+void ConstantRelocatable::dump(GlobalContext *Ctx) const {
+  Ostream &Str = Ctx->getStrDump();
   Str << "@" << Name;
   if (Offset)
     Str << "+" << Offset;
 }
 
+void LiveRange::dump(Ostream &Str) const {
+  Str << "(weight=" << Weight << ") ";
+  for (RangeType::const_iterator I = Range.begin(), E = Range.end(); I != E;
+       ++I) {
+    if (I != Range.begin())
+      Str << ", ";
+    Str << "[" << (*I).first << ":" << (*I).second << ")";
+  }
+}
+
+Ostream &operator<<(Ostream &Str, const LiveRange &L) {
+  L.dump(Str);
+  return Str;
+}
+
 Ostream &operator<<(Ostream &Str, const RegWeight &W) {
   if (W.getWeight() == RegWeight::Inf)
     Str << "Inf";
diff --git a/src/IceOperand.h b/src/IceOperand.h
index ef4413f..54ac48b 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -81,8 +81,10 @@
 class Constant : public Operand {
 public:
   uint32_t getPoolEntryID() const { return PoolEntryID; }
-  virtual void emit(const Cfg *Func) const = 0;
-  virtual void dump(const Cfg *Func) const = 0;
+  virtual void emit(const Cfg *Func) const { emit(Func->getContext()); }
+  virtual void dump(const Cfg *Func) const { dump(Func->getContext()); }
+  virtual void emit(GlobalContext *Ctx) const = 0;
+  virtual void dump(GlobalContext *Ctx) const = 0;
 
   static bool classof(const Operand *Operand) {
     OperandKind Kind = Operand->getKind();
@@ -116,12 +118,14 @@
         ConstantPrimitive(Ty, Value, PoolEntryID);
   }
   T getValue() const { return Value; }
-  virtual void emit(const Cfg *Func) const {
-    Ostream &Str = Func->getContext()->getStrEmit();
+  using Constant::emit;
+  virtual void emit(GlobalContext *Ctx) const {
+    Ostream &Str = Ctx->getStrEmit();
     Str << getValue();
   }
-  virtual void dump(const Cfg *Func) const {
-    Ostream &Str = Func->getContext()->getStrDump();
+  using Constant::dump;
+  virtual void dump(GlobalContext *Ctx) const {
+    Ostream &Str = Ctx->getStrDump();
     Str << getValue();
   }
 
@@ -178,8 +182,10 @@
   IceString getName() const { return Name; }
   void setSuppressMangling(bool Value) { SuppressMangling = Value; }
   bool getSuppressMangling() const { return SuppressMangling; }
-  virtual void emit(const Cfg *Func) const;
-  virtual void dump(const Cfg *Func) const;
+  using Constant::emit;
+  using Constant::dump;
+  virtual void emit(GlobalContext *Ctx) const;
+  virtual void dump(GlobalContext *Ctx) const;
 
   static bool classof(const Operand *Operand) {
     OperandKind Kind = Operand->getKind();
@@ -228,6 +234,55 @@
 bool operator<=(const RegWeight &A, const RegWeight &B);
 bool operator==(const RegWeight &A, const RegWeight &B);
 
+// LiveRange is a set of instruction number intervals representing
+// a variable's live range.  Generally there is one interval per basic
+// block where the variable is live, but adjacent intervals get
+// coalesced into a single interval.  LiveRange also includes a
+// weight, in case e.g. we want a live range to have higher weight
+// inside a loop.
+class LiveRange {
+public:
+  LiveRange() : Weight(0) {}
+
+  void reset() {
+    Range.clear();
+    Weight.setWeight(0);
+  }
+  void addSegment(InstNumberT Start, InstNumberT End);
+
+  bool endsBefore(const LiveRange &Other) const;
+  bool overlaps(const LiveRange &Other) const;
+  bool overlaps(InstNumberT OtherBegin) const;
+  bool containsValue(InstNumberT Value) const;
+  bool isEmpty() const { return Range.empty(); }
+  InstNumberT getStart() const {
+    return Range.empty() ? -1 : Range.begin()->first;
+  }
+
+  RegWeight getWeight() const { return Weight; }
+  void setWeight(const RegWeight &NewWeight) { Weight = NewWeight; }
+  void addWeight(uint32_t Delta) { Weight.addWeight(Delta); }
+  void dump(Ostream &Str) const;
+
+  // Defining USE_SET uses std::set to hold the segments instead of
+  // std::list.  Using std::list will be slightly faster, but is more
+  // restrictive because new segments cannot be added in the middle.
+
+  //#define USE_SET
+
+private:
+  typedef std::pair<InstNumberT, InstNumberT> RangeElementType;
+#ifdef USE_SET
+  typedef std::set<RangeElementType> RangeType;
+#else
+  typedef std::list<RangeElementType> RangeType;
+#endif
+  RangeType Range;
+  RegWeight Weight;
+};
+
+Ostream &operator<<(Ostream &Str, const LiveRange &L);
+
 // Variable represents an operand that is register-allocated or
 // stack-allocated.  If it is register-allocated, it will ultimately
 // have a non-negative RegNum field.
@@ -263,6 +318,9 @@
     assert(!hasReg() || RegNum == NewRegNum);
     RegNum = NewRegNum;
   }
+  bool hasRegTmp() const { return getRegNumTmp() != NoRegister; }
+  int32_t getRegNumTmp() const { return RegNumTmp; }
+  void setRegNumTmp(int32_t NewRegNum) { RegNumTmp = NewRegNum; }
 
   RegWeight getWeight() const { return Weight; }
   void setWeight(uint32_t NewWeight) { Weight = NewWeight; }
@@ -275,6 +333,19 @@
     AllowRegisterOverlap = Overlap;
   }
 
+  const LiveRange &getLiveRange() const { return Live; }
+  void setLiveRange(const LiveRange &Range) { Live = Range; }
+  void resetLiveRange() { Live.reset(); }
+  void addLiveRange(InstNumberT Start, InstNumberT End, uint32_t WeightDelta) {
+    assert(WeightDelta != RegWeight::Inf);
+    Live.addSegment(Start, End);
+    if (Weight.isInf())
+      Live.setWeight(RegWeight::Inf);
+    else
+      Live.addWeight(WeightDelta * Weight.getWeight());
+  }
+  void setLiveRangeInfiniteWeight() { Live.setWeight(RegWeight::Inf); }
+
   Variable *getLo() const { return LoVar; }
   Variable *getHi() const { return HiVar; }
   void setLoHi(Variable *Lo, Variable *Hi) {
@@ -304,8 +375,8 @@
   Variable(Type Ty, const CfgNode *Node, SizeT Index, const IceString &Name)
       : Operand(kVariable, Ty), Number(Index), Name(Name), DefInst(NULL),
         DefNode(Node), IsArgument(false), StackOffset(0), RegNum(NoRegister),
-        Weight(1), RegisterPreference(NULL), AllowRegisterOverlap(false),
-        LoVar(NULL), HiVar(NULL) {
+        RegNumTmp(NoRegister), Weight(1), RegisterPreference(NULL),
+        AllowRegisterOverlap(false), LoVar(NULL), HiVar(NULL) {
     Vars = VarsReal;
     Vars[0] = this;
     NumVars = 1;
@@ -334,6 +405,8 @@
   // RegNum is the allocated register, or NoRegister if it isn't
   // register-allocated.
   int32_t RegNum;
+  // RegNumTmp is the tentative assignment during register allocation.
+  int32_t RegNumTmp;
   RegWeight Weight; // Register allocation priority
   // RegisterPreference says that if possible, the register allocator
   // should prefer the register that was assigned to this linked
@@ -345,6 +418,7 @@
   // RegisterPreference and "share" a register even if the two live
   // ranges overlap.
   bool AllowRegisterOverlap;
+  LiveRange Live;
   // LoVar and HiVar are needed for lowering from 64 to 32 bits.  When
   // lowering from I64 to I32 on a 32-bit architecture, we split the
   // variable into two machine-size pieces.  LoVar is the low-order
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
new file mode 100644
index 0000000..f610d96
--- /dev/null
+++ b/src/IceRegAlloc.cpp
@@ -0,0 +1,461 @@
+//===- subzero/src/IceRegAlloc.cpp - Linear-scan implementation -----------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LinearScan class, which performs the
+// linear-scan register allocation after liveness analysis has been
+// performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IceCfg.h"
+#include "IceInst.h"
+#include "IceOperand.h"
+#include "IceRegAlloc.h"
+#include "IceTargetLowering.h"
+
+namespace Ice {
+
+// Implements the linear-scan algorithm.  Based on "Linear Scan
+// Register Allocation in the Context of SSA Form and Register
+// Constraints" by Hanspeter Mössenböck and Michael Pfeiffer,
+// ftp://ftp.ssw.uni-linz.ac.at/pub/Papers/Moe02.PDF .  This
+// implementation is modified to take affinity into account and allow
+// two interfering variables to share the same register in certain
+// cases.
+//
+// Requires running Cfg::liveness(Liveness_RangesFull) in
+// preparation.  Results are assigned to Variable::RegNum for each
+// Variable.
+void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull) {
+  assert(RegMaskFull.any()); // Sanity check
+  Unhandled.clear();
+  Handled.clear();
+  Inactive.clear();
+  Active.clear();
+  Ostream &Str = Func->getContext()->getStrDump();
+  Func->setCurrentNode(NULL);
+
+  // Gather the live ranges of all variables and add them to the
+  // Unhandled set.  TODO: Unhandled is a set<> which is based on a
+  // balanced binary tree, so inserting live ranges for N variables is
+  // O(N log N) complexity.  N may be proportional to the number of
+  // instructions, thanks to temporary generation during lowering.  As
+  // a result, it may be useful to design a better data structure for
+  // storing Func->getVariables().
+  const VarList &Vars = Func->getVariables();
+  for (VarList::const_iterator I = Vars.begin(), E = Vars.end(); I != E; ++I) {
+    Variable *Var = *I;
+    // Explicitly don't consider zero-weight variables, which are
+    // meant to be spill slots.
+    if (Var->getWeight() == RegWeight::Zero)
+      continue;
+    // Don't bother if the variable has a null live range, which means
+    // it was never referenced.
+    if (Var->getLiveRange().isEmpty())
+      continue;
+    Unhandled.insert(LiveRangeWrapper(Var));
+    if (Var->hasReg()) {
+      Var->setRegNumTmp(Var->getRegNum());
+      Var->setLiveRangeInfiniteWeight();
+    }
+  }
+
+  // RegUses[I] is the number of live ranges (variables) that register
+  // I is currently assigned to.  It can be greater than 1 as a result
+  // of Variable::AllowRegisterOverlap.
+  std::vector<int> RegUses(RegMaskFull.size());
+  // Unhandled is already set to all ranges in increasing order of
+  // start points.
+  assert(Active.empty());
+  assert(Inactive.empty());
+  assert(Handled.empty());
+  UnorderedRanges::iterator Next;
+
+  while (!Unhandled.empty()) {
+    LiveRangeWrapper Cur = *Unhandled.begin();
+    Unhandled.erase(Unhandled.begin());
+    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      Str << "\nConsidering  ";
+      Cur.dump(Func);
+      Str << "\n";
+    }
+    const llvm::SmallBitVector RegMask =
+        RegMaskFull &
+        Func->getTarget()->getRegisterSetForType(Cur.Var->getType());
+
+    // Check for precolored ranges.  If Cur is precolored, it
+    // definitely gets that register.  Previously processed live
+    // ranges would have avoided that register due to it being
+    // precolored.  Future processed live ranges won't evict that
+    // register because the live range has infinite weight.
+    if (Cur.Var->hasReg()) {
+      int32_t RegNum = Cur.Var->getRegNum();
+      // RegNumTmp should have already been set above.
+      assert(Cur.Var->getRegNumTmp() == RegNum);
+      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        Str << "Precoloring  ";
+        Cur.dump(Func);
+        Str << "\n";
+      }
+      Active.push_back(Cur);
+      assert(RegUses[RegNum] >= 0);
+      ++RegUses[RegNum];
+      continue;
+    }
+
+    // Check for active ranges that have expired or become inactive.
+    for (UnorderedRanges::iterator I = Active.begin(), E = Active.end(); I != E;
+         I = Next) {
+      Next = I;
+      ++Next;
+      LiveRangeWrapper Item = *I;
+      bool Moved = false;
+      if (Item.endsBefore(Cur)) {
+        // Move Item from Active to Handled list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Expiring     ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Active.erase(I);
+        Handled.push_back(Item);
+        Moved = true;
+      } else if (!Item.overlapsStart(Cur)) {
+        // Move Item from Active to Inactive list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Inactivating ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Active.erase(I);
+        Inactive.push_back(Item);
+        Moved = true;
+      }
+      if (Moved) {
+        // Decrement Item from RegUses[].
+        assert(Item.Var->hasRegTmp());
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        --RegUses[RegNum];
+        assert(RegUses[RegNum] >= 0);
+      }
+    }
+
+    // Check for inactive ranges that have expired or reactivated.
+    for (UnorderedRanges::iterator I = Inactive.begin(), E = Inactive.end();
+         I != E; I = Next) {
+      Next = I;
+      ++Next;
+      LiveRangeWrapper Item = *I;
+      if (Item.endsBefore(Cur)) {
+        // Move Item from Inactive to Handled list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Expiring     ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Inactive.erase(I);
+        Handled.push_back(Item);
+      } else if (Item.overlapsStart(Cur)) {
+        // Move Item from Inactive to Active list.
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Reactivating ";
+          Item.dump(Func);
+          Str << "\n";
+        }
+        Inactive.erase(I);
+        Active.push_back(Item);
+        // Increment Item in RegUses[].
+        assert(Item.Var->hasRegTmp());
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        assert(RegUses[RegNum] >= 0);
+        ++RegUses[RegNum];
+      }
+    }
+
+    // Calculate available registers into Free[].
+    llvm::SmallBitVector Free = RegMask;
+    for (SizeT i = 0; i < RegMask.size(); ++i) {
+      if (RegUses[i] > 0)
+        Free[i] = false;
+    }
+
+    // Remove registers from the Free[] list where an Inactive range
+    // overlaps with the current range.
+    for (UnorderedRanges::const_iterator I = Inactive.begin(),
+                                         E = Inactive.end();
+         I != E; ++I) {
+      LiveRangeWrapper Item = *I;
+      if (Item.overlaps(Cur)) {
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        // Don't assert(Free[RegNum]) because in theory (though
+        // probably never in practice) there could be two inactive
+        // variables that were allowed marked with
+        // AllowRegisterOverlap.
+        Free[RegNum] = false;
+      }
+    }
+
+    // Remove registers from the Free[] list where an Unhandled range
+    // overlaps with the current range and is precolored.
+    // Cur.endsBefore(*I) is an early exit check that turns a
+    // guaranteed O(N^2) algorithm into expected linear complexity.
+    for (OrderedRanges::const_iterator I = Unhandled.begin(),
+                                       E = Unhandled.end();
+         I != E && !Cur.endsBefore(*I); ++I) {
+      LiveRangeWrapper Item = *I;
+      if (Item.Var->hasReg() && Item.overlaps(Cur)) {
+        Free[Item.Var->getRegNum()] = false; // Note: getRegNum not getRegNumTmp
+      }
+    }
+
+    // Print info about physical register availability.
+    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      for (SizeT i = 0; i < RegMask.size(); ++i) {
+        if (RegMask[i]) {
+          Str << Func->getTarget()->getRegName(i, IceType_i32)
+              << "(U=" << RegUses[i] << ",F=" << Free[i] << ") ";
+        }
+      }
+      Str << "\n";
+    }
+
+    Variable *Prefer = Cur.Var->getPreferredRegister();
+    int32_t PreferReg = Prefer && Prefer->hasRegTmp() ? Prefer->getRegNumTmp()
+                                                      : Variable::NoRegister;
+    if (PreferReg != Variable::NoRegister &&
+        (Cur.Var->getRegisterOverlap() || Free[PreferReg])) {
+      // First choice: a preferred register that is either free or is
+      // allowed to overlap with its linked variable.
+      Cur.Var->setRegNumTmp(PreferReg);
+      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        Str << "Preferring   ";
+        Cur.dump(Func);
+        Str << "\n";
+      }
+      assert(RegUses[PreferReg] >= 0);
+      ++RegUses[PreferReg];
+      Active.push_back(Cur);
+    } else if (Free.any()) {
+      // Second choice: any free register.  TODO: After explicit
+      // affinity is considered, is there a strategy better than just
+      // picking the lowest-numbered available register?
+      int32_t RegNum = Free.find_first();
+      Cur.Var->setRegNumTmp(RegNum);
+      if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+        Str << "Allocating   ";
+        Cur.dump(Func);
+        Str << "\n";
+      }
+      assert(RegUses[RegNum] >= 0);
+      ++RegUses[RegNum];
+      Active.push_back(Cur);
+    } else {
+      // Fallback: there are no free registers, so we look for the
+      // lowest-weight register and see if Cur has higher weight.
+      std::vector<RegWeight> Weights(RegMask.size());
+      // Check Active ranges.
+      for (UnorderedRanges::const_iterator I = Active.begin(), E = Active.end();
+           I != E; ++I) {
+        LiveRangeWrapper Item = *I;
+        assert(Item.overlaps(Cur));
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        assert(Item.Var->hasRegTmp());
+        Weights[RegNum].addWeight(Item.range().getWeight());
+      }
+      // Same as above, but check Inactive ranges instead of Active.
+      for (UnorderedRanges::const_iterator I = Inactive.begin(),
+                                           E = Inactive.end();
+           I != E; ++I) {
+        LiveRangeWrapper Item = *I;
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        assert(Item.Var->hasRegTmp());
+        if (Item.overlaps(Cur))
+          Weights[RegNum].addWeight(Item.range().getWeight());
+      }
+      // Check Unhandled ranges that overlap Cur and are precolored.
+      // Cur.endsBefore(*I) is an early exit check that turns a
+      // guaranteed O(N^2) algorithm into expected linear complexity.
+      for (OrderedRanges::const_iterator I = Unhandled.begin(),
+                                         E = Unhandled.end();
+           I != E && !Cur.endsBefore(*I); ++I) {
+        LiveRangeWrapper Item = *I;
+        int32_t RegNum = Item.Var->getRegNumTmp();
+        if (RegNum < 0)
+          continue;
+        if (Item.overlaps(Cur))
+          Weights[RegNum].setWeight(RegWeight::Inf);
+      }
+
+      // All the weights are now calculated.  Find the register with
+      // smallest weight.
+      int32_t MinWeightIndex = RegMask.find_first();
+      // MinWeightIndex must be valid because of the initial
+      // RegMask.any() test.
+      assert(MinWeightIndex >= 0);
+      for (SizeT i = MinWeightIndex + 1; i < Weights.size(); ++i) {
+        if (RegMask[i] && Weights[i] < Weights[MinWeightIndex])
+          MinWeightIndex = i;
+      }
+
+      if (Cur.range().getWeight() <= Weights[MinWeightIndex]) {
+        // Cur doesn't have priority over any other live ranges, so
+        // don't allocate any register to it, and move it to the
+        // Handled state.
+        Handled.push_back(Cur);
+        if (Cur.range().getWeight().isInf()) {
+          Func->setError("Unable to find a physical register for an "
+                         "infinite-weight live range");
+        }
+      } else {
+        // Evict all live ranges in Active that register number
+        // MinWeightIndex is assigned to.
+        for (UnorderedRanges::iterator I = Active.begin(), E = Active.end();
+             I != E; I = Next) {
+          Next = I;
+          ++Next;
+          LiveRangeWrapper Item = *I;
+          if (Item.Var->getRegNumTmp() == MinWeightIndex) {
+            if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+              Str << "Evicting     ";
+              Item.dump(Func);
+              Str << "\n";
+            }
+            --RegUses[MinWeightIndex];
+            assert(RegUses[MinWeightIndex] >= 0);
+            Item.Var->setRegNumTmp(Variable::NoRegister);
+            Active.erase(I);
+            Handled.push_back(Item);
+          }
+        }
+        // Do the same for Inactive.
+        for (UnorderedRanges::iterator I = Inactive.begin(), E = Inactive.end();
+             I != E; I = Next) {
+          Next = I;
+          ++Next;
+          LiveRangeWrapper Item = *I;
+          if (Item.Var->getRegNumTmp() == MinWeightIndex) {
+            if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+              Str << "Evicting     ";
+              Item.dump(Func);
+              Str << "\n";
+            }
+            Item.Var->setRegNumTmp(Variable::NoRegister);
+            Inactive.erase(I);
+            Handled.push_back(Item);
+          }
+        }
+        // Assign the register to Cur.
+        Cur.Var->setRegNumTmp(MinWeightIndex);
+        assert(RegUses[MinWeightIndex] >= 0);
+        ++RegUses[MinWeightIndex];
+        Active.push_back(Cur);
+        if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+          Str << "Allocating   ";
+          Cur.dump(Func);
+          Str << "\n";
+        }
+      }
+    }
+    dump(Func);
+  }
+  // Move anything Active or Inactive to Handled for easier handling.
+  for (UnorderedRanges::iterator I = Active.begin(), E = Active.end(); I != E;
+       I = Next) {
+    Next = I;
+    ++Next;
+    Handled.push_back(*I);
+    Active.erase(I);
+  }
+  for (UnorderedRanges::iterator I = Inactive.begin(), E = Inactive.end();
+       I != E; I = Next) {
+    Next = I;
+    ++Next;
+    Handled.push_back(*I);
+    Inactive.erase(I);
+  }
+  dump(Func);
+
+  // Finish up by assigning RegNumTmp->RegNum for each Variable.
+  for (UnorderedRanges::const_iterator I = Handled.begin(), E = Handled.end();
+       I != E; ++I) {
+    LiveRangeWrapper Item = *I;
+    int32_t RegNum = Item.Var->getRegNumTmp();
+    if (Func->getContext()->isVerbose(IceV_LinearScan)) {
+      if (!Item.Var->hasRegTmp()) {
+        Str << "Not assigning ";
+        Item.Var->dump(Func);
+        Str << "\n";
+      } else {
+        Str << (RegNum == Item.Var->getRegNum() ? "Reassigning " : "Assigning ")
+            << Func->getTarget()->getRegName(RegNum, IceType_i32) << "(r"
+            << RegNum << ") to ";
+        Item.Var->dump(Func);
+        Str << "\n";
+      }
+    }
+    Item.Var->setRegNum(Item.Var->getRegNumTmp());
+  }
+
+  // TODO: Consider running register allocation one more time, with
+  // infinite registers, for two reasons.  First, evicted live ranges
+  // get a second chance for a register.  Second, it allows coalescing
+  // of stack slots.  If there is no time budget for the second
+  // register allocation run, each unallocated variable just gets its
+  // own slot.
+  //
+  // Another idea for coalescing stack slots is to initialize the
+  // Unhandled list with just the unallocated variables, saving time
+  // but not offering second-chance opportunities.
+}
+
+// ======================== Dump routines ======================== //
+
+void LiveRangeWrapper::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  const static size_t BufLen = 30;
+  char buf[BufLen];
+  snprintf(buf, BufLen, "%2d", Var->getRegNumTmp());
+  Str << "R=" << buf << "  V=";
+  Var->dump(Func);
+  Str << "  Range=" << range();
+}
+
+void LinearScan::dump(Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (!Func->getContext()->isVerbose(IceV_LinearScan))
+    return;
+  Func->setCurrentNode(NULL);
+  Str << "**** Current regalloc state:\n";
+  Str << "++++++ Handled:\n";
+  for (UnorderedRanges::const_iterator I = Handled.begin(), E = Handled.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+  Str << "++++++ Unhandled:\n";
+  for (OrderedRanges::const_iterator I = Unhandled.begin(), E = Unhandled.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+  Str << "++++++ Active:\n";
+  for (UnorderedRanges::const_iterator I = Active.begin(), E = Active.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+  Str << "++++++ Inactive:\n";
+  for (UnorderedRanges::const_iterator I = Inactive.begin(), E = Inactive.end();
+       I != E; ++I) {
+    I->dump(Func);
+    Str << "\n";
+  }
+}
+
+} // end of namespace Ice
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
new file mode 100644
index 0000000..edcf266
--- /dev/null
+++ b/src/IceRegAlloc.h
@@ -0,0 +1,81 @@
+//===- subzero/src/IceRegAlloc.h - Linear-scan reg. allocation --*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the data structures used during linear-scan
+// register allocation.  This includes LiveRangeWrapper which
+// encapsulates a variable and its live range, and LinearScan which
+// holds the various work queues for the linear-scan algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICEREGALLOC_H
+#define SUBZERO_SRC_ICEREGALLOC_H
+
+#include "IceDefs.h"
+#include "IceTypes.h"
+
+namespace Ice {
+
+// Currently this just wraps a Variable pointer, so in principle we
+// could use containers of Variable* instead of LiveRangeWrapper.  But
+// in the future, we may want to do more complex things such as live
+// range splitting, and keeping a wrapper should make that simpler.
+class LiveRangeWrapper {
+public:
+  LiveRangeWrapper(Variable *Var) : Var(Var) {}
+  const LiveRange &range() const { return Var->getLiveRange(); }
+  bool endsBefore(const LiveRangeWrapper &Other) const {
+    return range().endsBefore(Other.range());
+  }
+  bool overlaps(const LiveRangeWrapper &Other) const {
+    return range().overlaps(Other.range());
+  }
+  bool overlapsStart(const LiveRangeWrapper &Other) const {
+    return range().overlaps(Other.range().getStart());
+  }
+  Variable *const Var;
+  void dump(const Cfg *Func) const;
+
+private:
+  // LiveRangeWrapper(const LiveRangeWrapper &) LLVM_DELETED_FUNCTION;
+  LiveRangeWrapper &operator=(const LiveRangeWrapper &) LLVM_DELETED_FUNCTION;
+};
+
+class LinearScan {
+public:
+  LinearScan(Cfg *Func) : Func(Func) {}
+  void scan(const llvm::SmallBitVector &RegMask);
+  void dump(Cfg *Func) const;
+
+private:
+  Cfg *const Func;
+  // RangeCompare is the comparator for sorting an LiveRangeWrapper
+  // by starting point in a std::set<>.  Ties are broken by variable
+  // number so that sorting is stable.
+  struct RangeCompare {
+    bool operator()(const LiveRangeWrapper &L,
+                    const LiveRangeWrapper &R) const {
+      InstNumberT Lstart = L.Var->getLiveRange().getStart();
+      InstNumberT Rstart = R.Var->getLiveRange().getStart();
+      if (Lstart == Rstart)
+        return L.Var->getIndex() < R.Var->getIndex();
+      return Lstart < Rstart;
+    }
+  };
+  typedef std::set<LiveRangeWrapper, RangeCompare> OrderedRanges;
+  typedef std::list<LiveRangeWrapper> UnorderedRanges;
+  OrderedRanges Unhandled;
+  UnorderedRanges Active, Inactive, Handled;
+  LinearScan(const LinearScan &) LLVM_DELETED_FUNCTION;
+  LinearScan &operator=(const LinearScan &) LLVM_DELETED_FUNCTION;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICEREGALLOC_H
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 0d07475..72a3e8c 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -18,6 +18,7 @@
 #include "IceCfg.h" // setError()
 #include "IceCfgNode.h"
 #include "IceOperand.h"
+#include "IceRegAlloc.h"
 #include "IceTargetLowering.h"
 #include "IceTargetLoweringX8632.h"
 
@@ -66,6 +67,15 @@
   return NULL;
 }
 
+void TargetLowering::doAddressOpt() {
+  if (llvm::isa<InstLoad>(*Context.getCur()))
+    doAddressOptLoad();
+  else if (llvm::isa<InstStore>(*Context.getCur()))
+    doAddressOptStore();
+  Context.advanceCur();
+  Context.advanceNext();
+}
+
 // Lowers a single instruction according to the information in
 // Context, by checking the Context.Cur instruction kind and calling
 // the appropriate lowering method.  The lowering method should insert
@@ -144,4 +154,21 @@
   Context.advanceNext();
 }
 
+// Drives register allocation, allowing all physical registers (except
+// perhaps for the frame pointer) to be allocated.  This set of
+// registers could potentially be parameterized if we want to restrict
+// registers e.g. for performance testing.
+void TargetLowering::regAlloc() {
+  LinearScan LinearScan(Func);
+  RegSetMask RegInclude = RegSet_None;
+  RegSetMask RegExclude = RegSet_None;
+  RegInclude |= RegSet_CallerSave;
+  RegInclude |= RegSet_CalleeSave;
+  RegExclude |= RegSet_StackPointer;
+  if (hasFramePointer())
+    RegExclude |= RegSet_FramePointer;
+  llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude);
+  LinearScan.scan(RegMask);
+}
+
 } // end of namespace Ice
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 92a36af..7f798a8 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -109,6 +109,8 @@
     Func->setError("Target doesn't specify O2 lowering steps.");
   }
 
+  // Tries to do address mode optimization on a single instruction.
+  void doAddressOpt();
   // Lowers a single instruction.
   void lower();
 
@@ -173,6 +175,8 @@
   virtual void lowerSwitch(const InstSwitch *Inst) = 0;
   virtual void lowerUnreachable(const InstUnreachable *Inst) = 0;
 
+  virtual void doAddressOptLoad() {}
+  virtual void doAddressOptStore() {}
   // This gives the target an opportunity to post-process the lowered
   // expansion before returning.  The primary intention is to do some
   // Register Manager activity as necessary, specifically to eagerly
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 4b8bf27..81973d0 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -210,9 +210,10 @@
   TypeToRegisterSet[IceType_f64] = FloatRegisters;
 }
 
-void TargetX8632::translateOm1() {
+void TargetX8632::translateO2() {
   GlobalContext *Context = Func->getContext();
-  Ostream &Str = Context->getStrDump();
+
+  // Lower Phi instructions.
   Timer T_placePhiLoads;
   Func->placePhiLoads();
   if (Func->hasError())
@@ -228,30 +229,108 @@
   if (Func->hasError())
     return;
   T_deletePhis.printElapsedUs(Context, "deletePhis()");
-  if (Context->isVerbose()) {
-    Str << "================ After Phi lowering ================\n";
-    Func->dump();
-  }
+  Func->dump("After Phi lowering");
+
+  // Address mode optimization.
+  Timer T_doAddressOpt;
+  Func->doAddressOpt();
+  T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()");
+
+  // Target lowering.  This requires liveness analysis for some parts
+  // of the lowering decisions, such as compare/branch fusing.  If
+  // non-lightweight liveness analysis is used, the instructions need
+  // to be renumbered first.  TODO: This renumbering should only be
+  // necessary if we're actually calculating live intervals, which we
+  // only do for register allocation.
+  Timer T_renumber1;
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  T_renumber1.printElapsedUs(Context, "renumberInstructions()");
+  // TODO: It should be sufficient to use the fastest liveness
+  // calculation, i.e. livenessLightweight().  However, for some
+  // reason that slows down the rest of the translation.  Investigate.
+  Timer T_liveness1;
+  Func->liveness(Liveness_Basic);
+  if (Func->hasError())
+    return;
+  T_liveness1.printElapsedUs(Context, "liveness()");
+  Func->dump("After x86 address mode opt");
+  Timer T_genCode;
+  Func->genCode();
+  if (Func->hasError())
+    return;
+  T_genCode.printElapsedUs(Context, "genCode()");
+
+  // Register allocation.  This requires instruction renumbering and
+  // full liveness analysis.
+  Timer T_renumber2;
+  Func->renumberInstructions();
+  if (Func->hasError())
+    return;
+  T_renumber2.printElapsedUs(Context, "renumberInstructions()");
+  Timer T_liveness2;
+  Func->liveness(Liveness_Intervals);
+  if (Func->hasError())
+    return;
+  T_liveness2.printElapsedUs(Context, "liveness()");
+  // Validate the live range computations.  Do it outside the timing
+  // code.  TODO: Put this under a flag.
+  bool ValidLiveness = Func->validateLiveness();
+  assert(ValidLiveness);
+  (void)ValidLiveness; // used only in assert()
+  ComputedLiveRanges = true;
+  // The post-codegen dump is done here, after liveness analysis and
+  // associated cleanup, to make the dump cleaner and more useful.
+  Func->dump("After initial x8632 codegen");
+  Timer T_regAlloc;
+  regAlloc();
+  if (Func->hasError())
+    return;
+  T_regAlloc.printElapsedUs(Context, "regAlloc()");
+  Func->dump("After linear scan regalloc");
+
+  // Stack frame mapping.
+  Timer T_genFrame;
+  Func->genFrame();
+  if (Func->hasError())
+    return;
+  T_genFrame.printElapsedUs(Context, "genFrame()");
+  Func->dump("After stack frame mapping");
+}
+
+void TargetX8632::translateOm1() {
+  GlobalContext *Context = Func->getContext();
+  Timer T_placePhiLoads;
+  Func->placePhiLoads();
+  if (Func->hasError())
+    return;
+  T_placePhiLoads.printElapsedUs(Context, "placePhiLoads()");
+  Timer T_placePhiStores;
+  Func->placePhiStores();
+  if (Func->hasError())
+    return;
+  T_placePhiStores.printElapsedUs(Context, "placePhiStores()");
+  Timer T_deletePhis;
+  Func->deletePhis();
+  if (Func->hasError())
+    return;
+  T_deletePhis.printElapsedUs(Context, "deletePhis()");
+  Func->dump("After Phi lowering");
 
   Timer T_genCode;
   Func->genCode();
   if (Func->hasError())
     return;
   T_genCode.printElapsedUs(Context, "genCode()");
-  if (Context->isVerbose()) {
-    Str << "================ After initial x8632 codegen ================\n";
-    Func->dump();
-  }
+  Func->dump("After initial x8632 codegen");
 
   Timer T_genFrame;
   Func->genFrame();
   if (Func->hasError())
     return;
   T_genFrame.printElapsedUs(Context, "genFrame()");
-  if (Context->isVerbose()) {
-    Str << "================ After stack frame mapping ================\n";
-    Func->dump();
-  }
+  Func->dump("After stack frame mapping");
 }
 
 IceString TargetX8632::RegNames[] = {
@@ -327,8 +406,8 @@
 // calls itself recursively on the components, taking care to handle
 // Lo first because of the little-endian architecture.
 void TargetX8632::setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
-                                      int32_t BasicFrameOffset,
-                                      int32_t &InArgsSizeBytes) {
+                                      size_t BasicFrameOffset,
+                                      size_t &InArgsSizeBytes) {
   Variable *Lo = Arg->getLo();
   Variable *Hi = Arg->getHi();
   Type Ty = Arg->getType();
@@ -359,9 +438,9 @@
   // block 1 and C is local to block 2, then C may share a slot with A
   // or B.
   const bool SimpleCoalescing = true;
-  int32_t InArgsSizeBytes = 0;
-  int32_t RetIpSizeBytes = 4;
-  int32_t PreservedRegsSizeBytes = 0;
+  size_t InArgsSizeBytes = 0;
+  size_t RetIpSizeBytes = 4;
+  size_t PreservedRegsSizeBytes = 0;
   LocalsSizeBytes = 0;
   Context.init(Node);
   Context.setInsertPoint(Context.getCur());
@@ -380,8 +459,8 @@
   llvm::SmallBitVector CalleeSaves =
       getRegisterSet(RegSet_CalleeSave, RegSet_None);
 
-  int32_t GlobalsSize = 0;
-  std::vector<int> LocalsSize(Func->getNumNodes());
+  size_t GlobalsSize = 0;
+  std::vector<size_t> LocalsSize(Func->getNumNodes());
 
   // Prepass.  Compute RegsUsed, PreservedRegsSizeBytes, and
   // LocalsSizeBytes.
@@ -398,6 +477,9 @@
     // An argument passed on the stack already has a stack slot.
     if (Var->getIsArg())
       continue;
+    // An unreferenced variable doesn't need a stack slot.
+    if (ComputedLiveRanges && Var->getLiveRange().isEmpty())
+      continue;
     // A spill slot linked to a variable with a stack slot should reuse
     // that stack slot.
     if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) {
@@ -406,7 +488,7 @@
           continue;
       }
     }
-    int32_t Increment = typeWidthInBytesOnStack(Var->getType());
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
     if (SimpleCoalescing) {
       if (Var->isMultiblockLife()) {
         GlobalsSize += Increment;
@@ -461,7 +543,7 @@
   // and if they have no home register, home space will need to be
   // allocated on the stack to copy into.
   Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
-  int32_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
+  size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
   if (!IsEbpBasedFrame)
     BasicFrameOffset += LocalsSizeBytes;
   for (SizeT i = 0; i < Args.size(); ++i) {
@@ -470,10 +552,10 @@
   }
 
   // Fill in stack offsets for locals.
-  int32_t TotalGlobalsSize = GlobalsSize;
+  size_t TotalGlobalsSize = GlobalsSize;
   GlobalsSize = 0;
   LocalsSize.assign(LocalsSize.size(), 0);
-  int32_t NextStackOffset = 0;
+  size_t NextStackOffset = 0;
   for (VarList::const_iterator I = Variables.begin(), E = Variables.end();
        I != E; ++I) {
     Variable *Var = *I;
@@ -483,6 +565,8 @@
     }
     if (Var->getIsArg())
       continue;
+    if (ComputedLiveRanges && Var->getLiveRange().isEmpty())
+      continue;
     if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) {
       if (Variable *Linked = Var->getPreferredRegister()) {
         if (!Linked->hasReg()) {
@@ -493,7 +577,7 @@
         }
       }
     }
-    int32_t Increment = typeWidthInBytesOnStack(Var->getType());
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
     if (SimpleCoalescing) {
       if (Var->isMultiblockLife()) {
         GlobalsSize += Increment;
@@ -1601,6 +1685,37 @@
   Operand *Src1 = legalize(Inst->getSrc(1));
   Variable *Dest = Inst->getDest();
 
+  // If Src1 is an immediate, or known to be a physical register, we can
+  // allow Src0 to be a memory operand.  Otherwise, Src0 must be copied into
+  // a physical register.  (Actually, either Src0 or Src1 can be chosen for
+  // the physical register, but unfortunately we have to commit to one or
+  // the other before register allocation.)
+  bool IsSrc1ImmOrReg = false;
+  if (llvm::isa<Constant>(Src1)) {
+    IsSrc1ImmOrReg = true;
+  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
+    if (Var->hasReg())
+      IsSrc1ImmOrReg = true;
+  }
+
+  // Try to fuse a compare immediately followed by a conditional branch.  This
+  // is possible when the compare dest and the branch source operands are the
+  // same, and are their only uses.  TODO: implement this optimization for i64.
+  if (InstBr *NextBr = llvm::dyn_cast_or_null<InstBr>(Context.getNextInst())) {
+    if (Src0->getType() != IceType_i64 && !NextBr->isUnconditional() &&
+        Dest == NextBr->getSrc(0) && NextBr->isLastUse(Dest)) {
+      Operand *Src0New =
+          legalize(Src0, IsSrc1ImmOrReg ? Legal_All : Legal_Reg, true);
+      _cmp(Src0New, Src1);
+      _br(getIcmp32Mapping(Inst->getCondition()), NextBr->getTargetTrue(),
+          NextBr->getTargetFalse());
+      // Skip over the following branch instruction.
+      NextBr->setDeleted();
+      Context.advanceNext();
+      return;
+    }
+  }
+
   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
   Constant *Zero = Ctx->getConstantInt(IceType_i32, 0);
   Constant *One = Ctx->getConstantInt(IceType_i32, 1);
@@ -1637,19 +1752,6 @@
     return;
   }
 
-  // If Src1 is an immediate, or known to be a physical register, we can
-  // allow Src0 to be a memory operand.  Otherwise, Src0 must be copied into
-  // a physical register.  (Actually, either Src0 or Src1 can be chosen for
-  // the physical register, but unfortunately we have to commit to one or
-  // the other before register allocation.)
-  bool IsSrc1ImmOrReg = false;
-  if (llvm::isa<Constant>(Src1)) {
-    IsSrc1ImmOrReg = true;
-  } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
-    if (Var->hasReg())
-      IsSrc1ImmOrReg = true;
-  }
-
   // cmp b, c
   Operand *Src0New =
       legalize(Src0, IsSrc1ImmOrReg ? Legal_All : Legal_Reg, true);
@@ -1662,6 +1764,135 @@
   Context.insert(Label);
 }
 
+namespace {
+
+bool isAdd(const Inst *Inst) {
+  if (const InstArithmetic *Arith =
+          llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
+    return (Arith->getOp() == InstArithmetic::Add);
+  }
+  return false;
+}
+
+void computeAddressOpt(Variable *&Base, Variable *&Index, int32_t &Shift,
+                       int32_t &Offset) {
+  (void)Offset; // TODO: pattern-match for non-zero offsets.
+  if (Base == NULL)
+    return;
+  // If the Base has more than one use or is live across multiple
+  // blocks, then don't go further.  Alternatively (?), never consider
+  // a transformation that would change a variable that is currently
+  // *not* live across basic block boundaries into one that *is*.
+  if (Base->isMultiblockLife() /* || Base->getUseCount() > 1*/)
+    return;
+
+  while (true) {
+    // Base is Base=Var ==>
+    //   set Base=Var
+    const Inst *BaseInst = Base->getDefinition();
+    Operand *BaseOperand0 = BaseInst ? BaseInst->getSrc(0) : NULL;
+    Variable *BaseVariable0 = llvm::dyn_cast_or_null<Variable>(BaseOperand0);
+    // TODO: Helper function for all instances of assignment
+    // transitivity.
+    if (BaseInst && llvm::isa<InstAssign>(BaseInst) && BaseVariable0 &&
+        // TODO: ensure BaseVariable0 stays single-BB
+        true) {
+      Base = BaseVariable0;
+      continue;
+    }
+
+    // Index is Index=Var ==>
+    //   set Index=Var
+
+    // Index==NULL && Base is Base=Var1+Var2 ==>
+    //   set Base=Var1, Index=Var2, Shift=0
+    Operand *BaseOperand1 =
+        BaseInst && BaseInst->getSrcSize() >= 2 ? BaseInst->getSrc(1) : NULL;
+    Variable *BaseVariable1 = llvm::dyn_cast_or_null<Variable>(BaseOperand1);
+    if (Index == NULL && isAdd(BaseInst) && BaseVariable0 && BaseVariable1 &&
+        // TODO: ensure BaseVariable0 and BaseVariable1 stay single-BB
+        true) {
+      Base = BaseVariable0;
+      Index = BaseVariable1;
+      Shift = 0; // should already have been 0
+      continue;
+    }
+
+    // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
+    //   Index=Var, Shift+=log2(Const)
+    const Inst *IndexInst = Index ? Index->getDefinition() : NULL;
+    if (const InstArithmetic *ArithInst =
+            llvm::dyn_cast_or_null<InstArithmetic>(IndexInst)) {
+      Operand *IndexOperand0 = ArithInst->getSrc(0);
+      Variable *IndexVariable0 = llvm::dyn_cast<Variable>(IndexOperand0);
+      Operand *IndexOperand1 = ArithInst->getSrc(1);
+      ConstantInteger *IndexConstant1 =
+          llvm::dyn_cast<ConstantInteger>(IndexOperand1);
+      if (ArithInst->getOp() == InstArithmetic::Mul && IndexVariable0 &&
+          IndexOperand1->getType() == IceType_i32 && IndexConstant1) {
+        uint64_t Mult = IndexConstant1->getValue();
+        uint32_t LogMult;
+        switch (Mult) {
+        case 1:
+          LogMult = 0;
+          break;
+        case 2:
+          LogMult = 1;
+          break;
+        case 4:
+          LogMult = 2;
+          break;
+        case 8:
+          LogMult = 3;
+          break;
+        default:
+          LogMult = 4;
+          break;
+        }
+        if (Shift + LogMult <= 3) {
+          Index = IndexVariable0;
+          Shift += LogMult;
+          continue;
+        }
+      }
+    }
+
+    // Index is Index=Var<<Const && Const+Shift<=3 ==>
+    //   Index=Var, Shift+=Const
+
+    // Index is Index=Const*Var && log2(Const)+Shift<=3 ==>
+    //   Index=Var, Shift+=log2(Const)
+
+    // Index && Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
+    //   swap(Index,Base)
+    // Similar for Base=Const*Var and Base=Var<<Const
+
+    // Base is Base=Var+Const ==>
+    //   set Base=Var, Offset+=Const
+
+    // Base is Base=Const+Var ==>
+    //   set Base=Var, Offset+=Const
+
+    // Base is Base=Var-Const ==>
+    //   set Base=Var, Offset-=Const
+
+    // Index is Index=Var+Const ==>
+    //   set Index=Var, Offset+=(Const<<Shift)
+
+    // Index is Index=Const+Var ==>
+    //   set Index=Var, Offset+=(Const<<Shift)
+
+    // Index is Index=Var-Const ==>
+    //   set Index=Var, Offset-=(Const<<Shift)
+
+    // TODO: consider overflow issues with respect to Offset.
+    // TODO: handle symbolic constants.
+    break;
+  }
+}
+
+} // anonymous namespace
+
 void TargetX8632::lowerLoad(const InstLoad *Inst) {
   // A Load instruction can be treated the same as an Assign
   // instruction, after the source operand is transformed into an
@@ -1679,10 +1910,64 @@
     Src0 = OperandX8632Mem::create(Func, Ty, Base, Offset);
   }
 
+  // Fuse this load with a subsequent Arithmetic instruction in the
+  // following situations:
+  //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
+  //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
+  //
+  // TODO: Clean up and test thoroughly.
+  //
+  // TODO: Why limit to Arithmetic instructions?  This could probably be
+  // applied to most any instruction type.  Look at all source operands
+  // in the following instruction, and if there is one instance of the
+  // load instruction's dest variable, and that instruction ends that
+  // variable's live range, then make the substitution.  Deal with
+  // commutativity optimization in the arithmetic instruction lowering.
+  InstArithmetic *NewArith = NULL;
+  if (InstArithmetic *Arith =
+          llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) {
+    Variable *DestLoad = Inst->getDest();
+    Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
+    Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
+    if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) &&
+        DestLoad != Src0Arith) {
+      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
+                                        Arith->getSrc(0), Src0);
+    } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
+               Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) {
+      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
+                                        Arith->getSrc(1), Src0);
+    }
+    if (NewArith) {
+      Arith->setDeleted();
+      Context.advanceNext();
+      lowerArithmetic(NewArith);
+      return;
+    }
+  }
+
   InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0);
   lowerAssign(Assign);
 }
 
+void TargetX8632::doAddressOptLoad() {
+  Inst *Inst = *Context.getCur();
+  Variable *Dest = Inst->getDest();
+  Operand *Addr = Inst->getSrc(0);
+  Variable *Index = NULL;
+  int32_t Shift = 0;
+  int32_t Offset = 0; // TODO: make Constant
+  Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  computeAddressOpt(Base, Index, Shift, Offset);
+  if (Base && Addr != Base) {
+    Constant *OffsetOp = Ctx->getConstantInt(IceType_i32, Offset);
+    Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
+                                   Shift);
+    Inst->setDeleted();
+    Context.insert(InstLoad::create(Func, Dest, Addr));
+  }
+}
+
 void TargetX8632::lowerPhi(const InstPhi * /*Inst*/) {
   Func->setError("Phi found in regular instruction list");
 }
@@ -1781,6 +2066,24 @@
   }
 }
 
+void TargetX8632::doAddressOptStore() {
+  InstStore *Inst = llvm::cast<InstStore>(*Context.getCur());
+  Operand *Data = Inst->getData();
+  Operand *Addr = Inst->getAddr();
+  Variable *Index = NULL;
+  int32_t Shift = 0;
+  int32_t Offset = 0; // TODO: make Constant
+  Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  computeAddressOpt(Base, Index, Shift, Offset);
+  if (Base && Addr != Base) {
+    Constant *OffsetOp = Ctx->getConstantInt(IceType_i32, Offset);
+    Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
+                                   Shift);
+    Inst->setDeleted();
+    Context.insert(InstStore::create(Func, Data, Addr));
+  }
+}
+
 void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
   // This implements the most naive possible lowering.
   // cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
@@ -1904,11 +2207,10 @@
       continue;
     if (llvm::isa<InstFakeKill>(Inst))
       continue;
-    SizeT VarIndex = 0;
     for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
       Operand *Src = Inst->getSrc(SrcNum);
       SizeT NumVars = Src->getNumVars();
-      for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      for (SizeT J = 0; J < NumVars; ++J) {
         const Variable *Var = Src->getVar(J);
         if (!Var->hasReg())
           continue;
@@ -1923,11 +2225,10 @@
     const Inst *Inst = *I;
     if (Inst->isDeleted())
       continue;
-    SizeT VarIndex = 0;
     for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
       Operand *Src = Inst->getSrc(SrcNum);
       SizeT NumVars = Src->getNumVars();
-      for (SizeT J = 0; J < NumVars; ++J, ++VarIndex) {
+      for (SizeT J = 0; J < NumVars; ++J) {
         Variable *Var = Src->getVar(J);
         if (Var->hasReg())
           continue;
@@ -1952,15 +2253,15 @@
   }
 }
 
-template <> void ConstantFloat::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
+template <> void ConstantFloat::emit(GlobalContext *Ctx) const {
+  Ostream &Str = Ctx->getStrEmit();
   // It would be better to prefix with ".L$" instead of "L$", but
   // llvm-mc doesn't parse "dword ptr [.L$foo]".
   Str << "dword ptr [L$" << IceType_f32 << "$" << getPoolEntryID() << "]";
 }
 
-template <> void ConstantDouble::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
+template <> void ConstantDouble::emit(GlobalContext *Ctx) const {
+  Ostream &Str = Ctx->getStrEmit();
   Str << "qword ptr [L$" << IceType_f64 << "$" << getPoolEntryID() << "]";
 }
 
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index e5f8be2..3ca9ca3 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -27,6 +27,7 @@
   static TargetX8632 *create(Cfg *Func) { return new TargetX8632(Func); }
 
   virtual void translateOm1();
+  virtual void translateO2();
 
   virtual Variable *getPhysicalRegister(SizeT RegNum);
   virtual IceString getRegName(SizeT RegNum, Type Ty) const;
@@ -56,7 +57,7 @@
   // latter could be done by directly writing to the stack).
   void split64(Variable *Var);
   void setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
-                           int32_t BasicFrameOffset, int32_t &InArgsSizeBytes);
+                           size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   Operand *loOperand(Operand *Operand);
   Operand *hiOperand(Operand *Operand);
 
@@ -89,6 +90,8 @@
   virtual void lowerStore(const InstStore *Inst);
   virtual void lowerSwitch(const InstSwitch *Inst);
   virtual void lowerUnreachable(const InstUnreachable *Inst);
+  virtual void doAddressOptLoad();
+  virtual void doAddressOptStore();
 
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
@@ -248,8 +251,8 @@
   }
 
   bool IsEbpBasedFrame;
-  int32_t FrameSizeLocals;
-  int32_t LocalsSizeBytes;
+  size_t FrameSizeLocals;
+  size_t LocalsSizeBytes;
   llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
   llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
@@ -265,8 +268,8 @@
   template <typename T> void emitConstantPool() const;
 };
 
-template <> void ConstantFloat::emit(const Cfg *Func) const;
-template <> void ConstantDouble::emit(const Cfg *Func) const;
+template <> void ConstantFloat::emit(GlobalContext *Ctx) const;
+template <> void ConstantDouble::emit(GlobalContext *Ctx) const;
 
 } // end of namespace Ice
 
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index e29637d..374d753 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -100,6 +100,29 @@
     return Func;
   }
 
+  // convertConstant() does not use Func or require it to be a valid
+  // Ice::Cfg pointer.  As such, it's suitable for e.g. constructing
+  // global initializers.
+  Ice::Constant *convertConstant(const Constant *Const) {
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(Const)) {
+      return Ctx->getConstantSym(convertType(GV->getType()), 0, GV->getName());
+    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(Const)) {
+      return Ctx->getConstantInt(convertIntegerType(CI->getType()),
+                                 CI->getZExtValue());
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Const)) {
+      Ice::Type Type = convertType(CFP->getType());
+      if (Type == Ice::IceType_f32)
+        return Ctx->getConstantFloat(CFP->getValueAPF().convertToFloat());
+      else if (Type == Ice::IceType_f64)
+        return Ctx->getConstantDouble(CFP->getValueAPF().convertToDouble());
+      llvm_unreachable("Unexpected floating point type");
+      return NULL;
+    } else {
+      llvm_unreachable("Unhandled constant type");
+      return NULL;
+    }
+  }
+
 private:
   // LLVM values (instructions, etc.) are mapped directly to ICE variables.
   // mapValueToIceVar has a version that forces an ICE type on the variable,
@@ -180,24 +203,7 @@
 
   Ice::Operand *convertValue(const Value *Op) {
     if (const Constant *Const = dyn_cast<Constant>(Op)) {
-      if (const GlobalValue *GV = dyn_cast<GlobalValue>(Const)) {
-        return Ctx->getConstantSym(convertType(GV->getType()), 0,
-                                   GV->getName());
-      } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(Const)) {
-        return Ctx->getConstantInt(convertIntegerType(CI->getType()),
-                                   CI->getZExtValue());
-      } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Const)) {
-        Ice::Type Type = convertType(CFP->getType());
-        if (Type == Ice::IceType_f32)
-          return Ctx->getConstantFloat(CFP->getValueAPF().convertToFloat());
-        else if (Type == Ice::IceType_f64)
-          return Ctx->getConstantDouble(CFP->getValueAPF().convertToDouble());
-        llvm_unreachable("Unexpected floating point type");
-        return NULL;
-      } else {
-        llvm_unreachable("Unhandled constant type");
-        return NULL;
-      }
+      return convertConstant(Const);
     } else {
       return mapValueToIceVar(Op);
     }